@@ -482,21 +482,22 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
482
482
int icycle = ncycle - 1 ;
483
483
int istride = stride[icycle];
484
484
int i = lastnode - istride + icore;
485
- #ifndef CORENEURON_ENABLE_GPU
485
+ // #ifndef CORENEURON_ENABLE_GPU
486
486
int ii = i;
487
- #endif
487
+ // #endif
488
488
489
489
// execute until all tree depths are executed
490
490
bool has_subtrees_to_compute = true ;
491
491
492
492
// clang-format off
493
493
nrn_pragma_acc (loop seq)
494
494
for (; has_subtrees_to_compute; ) { // ncycle loop
495
- #ifndef CORENEURON_ENABLE_GPU
495
+ // #ifndef CORENEURON_ENABLE_GPU
496
496
// serial test, gpu does this in parallel
497
+ nrn_pragma_acc (loop)
497
498
for (int icore = 0 ; icore < warpsize; ++icore) {
498
499
int i = ii + icore;
499
- #endif
500
+ // #endif
500
501
if (icore < istride) { // most efficient if istride equal warpsize
501
502
// what is the index
502
503
int ip = GPU_PARENT (i);
@@ -508,9 +509,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
508
509
nrn_pragma_omp (atomic update)
509
510
GPU_RHS (ip) -= p * GPU_RHS (i);
510
511
}
511
- #ifndef CORENEURON_ENABLE_GPU
512
+ // #ifndef CORENEURON_ENABLE_GPU
512
513
}
513
- #endif
514
+ // #endif
514
515
// if finished with all tree depths then ready to break
515
516
// (note that break is not allowed in OpenACC)
516
517
if (icycle == 0 ) {
@@ -520,9 +521,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
520
521
--icycle;
521
522
istride = stride[icycle];
522
523
i -= istride;
523
- #ifndef CORENEURON_ENABLE_GPU
524
+ // #ifndef CORENEURON_ENABLE_GPU
524
525
ii -= istride;
525
- #endif
526
+ // #endif
526
527
}
527
528
// clang-format on
528
529
}
@@ -535,36 +536,37 @@ static void bksub_interleaved2(NrnThread* nt,
535
536
int ncycle,
536
537
int * stride,
537
538
int firstnode) {
538
- #ifndef CORENEURON_ENABLE_GPU
539
+ // #ifndef CORENEURON_ENABLE_GPU
539
540
for (int i = root; i < lastroot; i += 1 ) {
540
- #else
541
- nrn_pragma_acc (loop seq)
542
- for (int i = root; i < lastroot; i += warpsize) {
543
- #endif
541
+ // #else
542
+ // nrn_pragma_acc(loop seq)
543
+ // for (int i = root; i < lastroot; i += warpsize) {
544
+ // #endif
544
545
GPU_RHS (i) /= GPU_D (i); // the root
545
546
}
546
547
547
548
int i = firstnode + icore;
548
- #ifndef CORENEURON_ENABLE_GPU
549
+ // #ifndef CORENEURON_ENABLE_GPU
549
550
int ii = i;
550
- #endif
551
+ // #endif
551
552
for (int icycle = 0 ; icycle < ncycle; ++icycle) {
552
553
int istride = stride[icycle];
553
- #ifndef CORENEURON_ENABLE_GPU
554
+ // #ifndef CORENEURON_ENABLE_GPU
555
+ nrn_pragma_acc (loop)
554
556
// serial test, gpu does this in parallel
555
557
for (int icore = 0 ; icore < warpsize; ++icore) {
556
558
int i = ii + icore;
557
- #endif
559
+ // #endif
558
560
if (icore < istride) {
559
561
int ip = GPU_PARENT (i);
560
562
GPU_RHS (i) -= GPU_B (i) * GPU_RHS (ip);
561
563
GPU_RHS (i) /= GPU_D (i);
562
564
}
563
565
i += istride;
564
- #ifndef CORENEURON_ENABLE_GPU
566
+ // #ifndef CORENEURON_ENABLE_GPU
565
567
}
566
568
ii += istride;
567
- #endif
569
+ // #endif
568
570
}
569
571
}
570
572
@@ -617,14 +619,14 @@ void solve_interleaved2(int ith) {
617
619
int lastroot = rootbegin[iwarp + 1 ];
618
620
int firstnode = nodebegin[iwarp];
619
621
int lastnode = nodebegin[iwarp + 1 ];
620
- #ifndef CORENEURON_ENABLE_GPU
622
+ // #ifndef CORENEURON_ENABLE_GPU
621
623
if (ic == 0 ) { // serial test mode. triang and bksub do all cores in warp
622
- #endif
624
+ // #endif
623
625
triang_interleaved2 (nt, ic, ncycle, stride, lastnode);
624
626
bksub_interleaved2 (nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
625
- #ifndef CORENEURON_ENABLE_GPU
627
+ // #ifndef CORENEURON_ENABLE_GPU
626
628
} // serial test mode
627
- #endif
629
+ // #endif
628
630
}
629
631
nrn_pragma_acc (wait (nt->stream_id ))
630
632
#ifdef _OPENACC
0 commit comments