diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu
index 82198410f..1226b4bf7 100644
--- a/coreneuron/permute/cellorder.cu
+++ b/coreneuron/permute/cellorder.cu
@@ -72,25 +72,32 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int
     int* rootbegin = ii->firstnode;  // nwarp+1 of these
     int* nodebegin = ii->lastnode;   // nwarp+1 of these
 
-    int iwarp = icore / warpsize;     // figure out the >> value
-    int ic = icore & (warpsize - 1);  // figure out the & mask
-    int ncycle = ncycles[iwarp];
-    int* stride = strides + stridedispl[iwarp];
-    int root = rootbegin[iwarp];
-    int lastroot = rootbegin[iwarp + 1];
-    int firstnode = nodebegin[iwarp];
-    int lastnode = nodebegin[iwarp + 1];
-
-    triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
-    bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+    while (icore < ncore) {
+        int iwarp = icore / warpsize;     // figure out the >> value
+        int ic = icore & (warpsize - 1);  // figure out the & mask
+        int ncycle = ncycles[iwarp];
+        int* stride = strides + stridedispl[iwarp];
+        int root = rootbegin[iwarp];
+        int lastroot = rootbegin[iwarp + 1];
+        int firstnode = nodebegin[iwarp];
+        int lastnode = nodebegin[iwarp + 1];
+
+        triang_interleaved2_device(nt, ic, ncycle, stride, lastnode);
+        bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
+
+        icore += blockDim.x * gridDim.x;
+    }
 }
 
 void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) {
     auto cuda_stream = static_cast<cudaStream_t>(stream);
 
-    int threadsPerBlock = warpsize;
-    // TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel?
-    int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock;
+    // These parameters were selected after running the channel-benchmark for typical production runs, i.e.
+    // 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells.
+    // The main idea is to have multiple warps per SM and enough blocks to fill the GPU.
+    // In our case, given that multiple threads share the available GPUs, we "guarantee" sufficient occupancy of the GPUs.
+    int threadsPerBlock = 128;
+    int blocksPerGrid = 512;
 
     solve_interleaved2_kernel<<<blocksPerGrid, threadsPerBlock, 0, cuda_stream>>>(nt, info, ncore);
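
For context, this patch replaces the one-thread-per-core launch (where blocksPerGrid was derived from ncore) with a grid-stride loop: every thread starts at its global index and advances by the total number of launched threads, so the fixed 512x128 configuration covers any ncore. Below is a minimal, self-contained sketch of the same pattern, assuming a hypothetical scale_kernel and buffer setup that are illustrative only and not part of this patch:

#include <cuda_runtime.h>

// Illustrative grid-stride loop: each thread begins at its global index and
// strides by the total thread count until the work runs out, so correctness
// does not depend on the chosen grid/block sizes.
__global__ void scale_kernel(float* data, float factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while (i < n) {
        data[i] *= factor;
        i += blockDim.x * gridDim.x;  // same stride as in solve_interleaved2_kernel
    }
}

int main() {
    const int n = 1 << 20;
    float* d = nullptr;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemset(d, 0, n * sizeof(float));

    // Fixed launch configuration, as in the patch: enough blocks and warps to
    // fill the GPU; the loop above absorbs any mismatch between grid size and n.
    scale_kernel<<<512, 128>>>(d, 2.0f, n);
    cudaDeviceSynchronize();

    cudaFree(d);
    return 0;
}

Compared with the previous launch, this decouples grid size from problem size, answers the removed TODO, and avoids recomputing blocksPerGrid from ncore on every call; threads beyond ncore simply skip the loop body.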