Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit afc45bf

Browse files
author
Pramod Kumbhar
committed
Various changes (including temporary ones) to make the XL OpenMP offload build work
* todo: need to rebase this branch on latest hackathon_master * todo: temporary changes to OpenAccHelper.cmake, need refinement * todo: investigate the Caliper linking issue * todo: _OPENACC needs to be renamed to CORENRN_ENABLE_GPU so that OpenMP-based builds can use GPU offload * todo: hardcoded CXX flags for quick build
1 parent f49d0cd commit afc45bf

File tree

16 files changed

+93
-50
lines changed

16 files changed

+93
-50
lines changed

CMake/OpenAccHelper.cmake

Lines changed: 46 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -47,39 +47,53 @@ if(CORENRN_ENABLE_GPU)
4747
endif()
4848
set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}")
4949
endif()
50-
# -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be
51-
# required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA
52-
# device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which
53-
# apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when
54-
# compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences
55-
# due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
56-
# more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
57-
# CUDA version as is used for the explicit CUDA code.
58-
set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
59-
set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
60-
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
61-
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
62-
# same default compute capabilities as each other, particularly on GPU-less build machines.
63-
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
64-
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
65-
endforeach()
66-
if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
67-
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
68-
# for a region then prefer OpenMP.
69-
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
70-
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
50+
51+
if(${CMAKE_CXX_COMPILER_ID} STREQUAL "XLClang")
52+
set(NVHPC_ACC_COMP_FLAGS "-qsmp=omp -qoffload -qreport")
53+
set(NVHPC_ACC_LINK_FLAGS "-qcuda -lcaliper")
54+
55+
if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
56+
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
57+
# for a region then prefer OpenMP.
58+
add_compile_definitions(CORENRN_PREFER_OPENMP_OFFLOAD)
59+
endif()
60+
61+
elseif(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
62+
set(NVHPC_ACC_COMP_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Wno-unknown-cuda-version -I${CUDAToolkit_INCLUDE_DIRS}")
63+
set(NVHPC_ACC_LINK_FLAGS)
64+
else()
65+
# -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be
66+
# required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA
67+
# device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which
68+
# apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when
69+
# compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences
70+
# due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for
71+
# more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same
72+
# CUDA version as is used for the explicit CUDA code.
73+
set(NVHPC_ACC_COMP_FLAGS "-acc -Minfo=accel -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo")
74+
set(NVHPC_ACC_LINK_FLAGS "-acc -cuda")
75+
# Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA
76+
# code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the
77+
# same default compute capabilities as each other, particularly on GPU-less build machines.
78+
foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES})
79+
string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}")
80+
endforeach()
81+
if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD)
82+
# Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available
83+
# for a region then prefer OpenMP.
84+
add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD)
85+
string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu -Minfo=mp")
86+
endif()
87+
# avoid PGI adding standard compliant "-A" flags
88+
# set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
89+
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
90+
# Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
91+
# especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
92+
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
93+
# for OpenACC code generation.
94+
set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline")
95+
set(NVHPC_CXX_INLINE_FLAGS)
7196
endif()
72-
set(NVHPC_ACC_COMP_FLAGS "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Wno-unknown-cuda-version -I${CUDAToolkit_INCLUDE_DIRS}")
73-
set(NVHPC_ACC_LINK_FLAGS)
74-
# avoid PGI adding standard compliant "-A" flags
75-
# set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14)
76-
string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}")
77-
# Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is
78-
# especially needed when we compile with -O0 or -O1 optimisation level where we get link errors.
79-
# Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined
80-
# for OpenACC code generation.
81-
set(NVHPC_CXX_INLINE_FLAGS "-Mautoinline")
82-
set(NVHPC_CXX_INLINE_FLAGS)
8397
endif()
8498

8599
# =============================================================================

coreneuron/gpu/nrn_acc_manager.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,14 @@
2727
#ifdef _OPENACC
2828
#include <openacc.h>
2929
#endif
30-
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
30+
#ifdef CORENRN_PREFER_OPENMP_OFFLOAD
3131
#include <omp.h>
3232
#endif
3333

3434
#ifdef CRAYPAT
3535
#include <pat_api.h>
3636
#endif
37+
3738
namespace coreneuron {
3839
extern InterleaveInfo* interleave_info;
3940
void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
@@ -76,9 +77,14 @@ void cnrn_target_delete(void* h_ptr, size_t len) {
7677
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENRN_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
7778
(void)len;
7879
auto device_id = omp_get_default_device();
79-
omp_target_disassociate_ptr(h_ptr, device_id);
80-
auto* d_ptr = omp_get_mapped_ptr(h_ptr, device_id);
80+
void *d_ptr = nullptr;
81+
nrn_pragma_omp(target data device(device_id) use_device_ptr(h_ptr))
82+
{
83+
d_ptr = h_ptr;
84+
}
85+
// todo: disassociate first or free first
8186
omp_target_free(d_ptr, device_id);
87+
omp_target_disassociate_ptr(h_ptr, device_id);
8288
#else
8389
throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
8490
#endif
@@ -89,7 +95,12 @@ void* cnrn_target_deviceptr(void* h_ptr) {
8995
return acc_deviceptr(h_ptr);
9096
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENRN_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
9197
auto device_id = omp_get_default_device();
92-
return omp_get_mapped_ptr(h_ptr, device_id);
98+
void *d_ptr = nullptr;
99+
nrn_pragma_omp(target data device(device_id) use_device_ptr(h_ptr))
100+
{
101+
d_ptr = h_ptr;
102+
}
103+
return d_ptr;
93104
#else
94105
throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
95106
#endif
@@ -1416,7 +1427,7 @@ void init_gpu() {
14161427

14171428
int device_num = local_rank % num_devices_per_node;
14181429
acc_set_device_num(device_num, device_type);
1419-
#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
1430+
#ifdef CORENRN_PREFER_OPENMP_OFFLOAD
14201431
omp_set_default_device(device_num);
14211432
#endif
14221433

coreneuron/kinderiv.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ def write_out_kinderiv(fout):
5959
fout.write("\n/* declarations */\n")
6060
fout.write("\nnamespace coreneuron {\n")
6161

62+
if deriv or kin or euler:
63+
fout.write('nrn_pragma_omp(declare target)\n')
64+
6265
for item in deriv:
6366
fout.write('#pragma acc routine seq\n')
6467
fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
@@ -73,6 +76,9 @@ def write_out_kinderiv(fout):
7376
fout.write('#pragma acc routine seq\n')
7477
fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
7578

79+
if deriv or kin or euler:
80+
fout.write('nrn_pragma_omp(end declare target)\n')
81+
7682
fout.write("\n/* callback indices */\n")
7783
derivoffset = 1
7884
kinoffset = 1

coreneuron/mechanism/eion.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {
177177
}
178178
}
179179

180+
nrn_pragma_omp(declare target)
180181
void nrn_wrote_conc(int type,
181182
double* p1,
182183
int p2,
@@ -193,6 +194,7 @@ void nrn_wrote_conc(int type,
193194
pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
194195
}
195196
}
197+
nrn_pragma_omp(end declare target)
196198

197199
static double efun(double x) {
198200
if (fabs(x) < 1e-4) {

coreneuron/mechanism/mech/dimplic.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
2525
#include "_kinderiv.h"
2626
namespace coreneuron {
27+
nrn_pragma_omp(declare target)
2728
int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) {
2829
difun(fun);
2930
return 0;
@@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) {
4849
switch (fun) { _NRN_KINETIC_CASES }
4950
return 0;
5051
}
52+
nrn_pragma_omp(end declare target)
5153

5254
} // namespace coreneuron

coreneuron/mechanism/register_mech.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919

2020
namespace coreneuron {
2121
int secondorder = 0;
22+
nrn_pragma_omp(declare target)
2223
double t, dt, celsius, pi;
24+
nrn_pragma_omp(end declare target)
2325
int rev_dt;
2426

2527
using Pfrv = void (*)();

coreneuron/network/cvodestb.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,12 +86,14 @@ void fixed_play_continuous(NrnThread* nt) {
8686

8787
// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"
8888
// for the ISPC backend. If changes are required, make sure to change ISPC as well.
89+
nrn_pragma_omp(declare target)
8990
int at_time(NrnThread* nt, double te) {
9091
double x = te - 1e-11;
9192
if (x <= nt->_t && x > (nt->_t - nt->_dt)) {
9293
return 1;
9394
}
9495
return 0;
9596
}
97+
nrn_pragma_omp(end declare target)
9698

9799
} // namespace coreneuron

coreneuron/network/netcvode.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -537,7 +537,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method
537537
nrn_pragma_acc(parallel loop present(
538538
nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
539539
copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
540-
nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu))
540+
nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
541541
for (int i = 0; i < nt->ncell; ++i) {
542542
PreSyn* ps = presyns + i;
543543
PreSynHelper* psh = presyns_helper + i;

coreneuron/network/partrans.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ void nrnthread_v_transfer(NrnThread* _nt) {
114114
int* insrc_indices = ttd.insrc_indices.data();
115115
double* tar_data = _nt->_data;
116116
// last element in the displacement vector gives total length
117-
#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
117+
#if defined(_OPENACC) && !defined(CORENRN_PREFER_OPENMP_OFFLOAD)
118118
int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
119119
int ndata = _nt->_ndata;
120120
#endif

coreneuron/permute/cellorder.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -598,7 +598,7 @@ void solve_interleaved2(int ith) {
598598
int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize)
599599
int* rootbegin = ii.firstnode; // nwarp+1 of these
600600
int* nodebegin = ii.lastnode; // nwarp+1 of these
601-
#if defined(_OPENACC) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD)
601+
#if defined(_OPENACC) && !defined(CORENRN_PREFER_OPENMP_OFFLOAD)
602602
int nstride = stridedispl[nwarp];
603603
#endif
604604
nrn_pragma_acc(parallel loop gang vector vector_length(

0 commit comments

Comments (0)