diff --git a/ComputeForce.cuh b/ComputeForce.cuh index b144b8b07bbac876093a84dfc698b4cbc576bbc3..4564bbba8f0fca24287593061065b60bd12edc72 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -210,6 +210,8 @@ void clearPairlists(Vector3 pos[], int num, int numReplicas, } +__device__ int g_nextBlock; + __global__ void createPairlists(Vector3 pos[], int num, int numReplicas, BaseGrid* sys, CellDecomposition* decomp) { @@ -217,14 +219,14 @@ void createPairlists(Vector3 pos[], int num, int numReplicas, // Each thread has designated values in shared memory as a buffer // A sync operation periodically moves data from shared to global const int NUMTHREADS = 128; /* RBTODO: fix */ - __shared__ int spid[NUMTHREADS]; + __shared__ int pairCount[NUMTHREADS]; const int tid = threadIdx.x; const int ai = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */ const int MAXPAIRSPERTHREAD = 8; /* optimized for 32 threads on K40 */ const int MAXPAIRSPERBLOCK = MAXPAIRSPERTHREAD*NUMTHREADS*4; - - spid[tid] = 0; + + pairCount[tid] = 0; // int numPairs = g_numPairs; if (ai == 0) { g_numPairs = 0; /* RBTODO: be more efficient */ @@ -235,23 +237,31 @@ void createPairlists(Vector3 pos[], int num, int numReplicas, g_pairJ = (int*) malloc( maxPairs * sizeof(int)); assert( g_pairI != NULL ); assert( g_pairJ != NULL ); + g_nextBlock = gridDim.x; /* number of blocks */ } - __shared__ int b_numPairs; /* block's index into global numPairs object */ - if (tid == 0) b_NumPairs = blockIdx.x * NUMTHREADS * MAXPAIRSPERTHREAD; - - __syncthreads(); - int pid = 0; + + // __shared__ int currentBlock; /* for index into g_pairX arrays */ + /* __shared__ int b_numPairs; /\* block's index into global numPairs object *\/ */ + /* if (tid == 0) { */ + /* currentBlock = blockIdx.x; */ + /* b_numPairs = currentBlock * NUMTHREADS * MAXPAIRSPERTHREAD; */ + /* } */ + __shared__ int gpidOff; + __shared__ int gpidRem; + if (tid == 0) { + gpidOff = 0; + gpidRem = MAXPAIRSPERBLOCK; + } + + int currentBlock = blockIdx.x; + __syncthreads(); + int pid = 0; // ai - index of the particle in the original, unsorted array if (ai < (num-1) * numReplicas) { const int repID = ai / (num-1); - // const int typei = type[i]; // maybe irrelevent const Vector3 posi = pos[ai]; - // RBTODO: if this approach works well, make cell decomposition finer so limit still works - - // TODO: Fix this: Find correct celli (add a new function to - // CellDecomposition, binary search over cells) CellDecomposition::cell_t celli = decomp->getCellForParticle(ai); const CellDecomposition::cell_t* pairs = decomp->getCells(); for (int x = -1; x <= 1; ++x) { @@ -267,7 +277,7 @@ void createPairlists(Vector3 pos[], int num, int numReplicas, last = range.last; } - // go through possible pairs 32 at a time per threa + // go through possible pairs in chunks for each thread __shared__ int pairI[MAXPAIRSPERTHREAD*NUMTHREADS]; __shared__ int pairJ[MAXPAIRSPERTHREAD*NUMTHREADS]; @@ -290,59 +300,82 @@ void createPairlists(Vector3 pos[], int num, int numReplicas, } n += MAXPAIRSPERTHREAD; done = __all( n >= last ); - { // SYNC THREADS - // assert(pid <= MAXPAIRSPERTHREAD); + { // SYNC THREADS // find where each thread should put its pairs in global list - if (tid == NUMTHREADS) atomicAdd( &g_numPairs, spid[tid] + pidLast ); - spid[tid] = pid; - inclIntCumSum(spid,NUMTHREADS); // returns cumsum with spid[0] = 0 - int total = spid[NUMTHREADS-1]; /* number of new pairs */ - - // RBTODO: fix this - // RBTODO: test that this gives the same numpairs as original method - // __shared__ int offset = 0; - // int stride = 0; + pairCount[tid] = pid; + inclIntCumSum(pairCount,NUMTHREADS); // returns cumsum with pairCoun[t0] = 0 + int total = pairCount[NUMTHREADS-1]; /* number of new pairs */ // loop over new pairs, with stride of threads + int gpid = currentBlock * MAXPAIRSPERBLOCK + gpidOff; int tmpTid = 0; /* thread id of the thread we are reading from */ - for ( int gTmpPid = tid; gTmpPid < total; gTmpPid += NUMTHREADS ) { - - while ( tmpTid < NUMTHREADS-1 && spid[tmpTid] < gTmpPid ) - tmpTid++; - int sTmpPid = tmpTid > 0 ? spid[tmpTid-1] : 0; // numPairs created by threads [0..tmpTid] - sTmpPid = (gTmpPid - sTmpPid) + MAXPAIRSPERTHREAD*(tmpTid); - g_pairI[g_numPairs + gTmpPid] = pairI[ sTmpPid ]; - g_pairJ[g_numPairs + gTmpPid] = pairJ[ sTmpPid ]; + if ( total <= gpidRem) { // send it all at once + + for (int gTmpPid = tid; gTmpPid < total; gTmpPid += NUMTHREADS) { + // RBTODO: binary search for tmpTid + while ( tmpTid < NUMTHREADS-1 && pairCount[tmpTid] < gTmpPid ) + tmpTid++; + int sTmpPid = tmpTid > 0 ? pairCount[tmpTid-1] : 0; // numPairs created by threads [0..tmpTid] + sTmpPid = (gTmpPid - sTmpPid) + MAXPAIRSPERTHREAD*(tmpTid); + g_pairI[gpid + gTmpPid] = pairI[ sTmpPid ]; + g_pairJ[gpid + gTmpPid] = pairJ[ sTmpPid ]; + } + } else { // too much data; finish block's chunk first + + for (int gTmpPid = tid; gTmpPid < gpidRem; gTmpPid += NUMTHREADS) { + while ( tmpTid < NUMTHREADS-1 && pairCount[tmpTid] < gTmpPid ) + tmpTid++; + int sTmpPid = tmpTid > 0 ? pairCount[tmpTid-1] : 0; // numPairs created by threads [0..tmpTid] + sTmpPid = (gTmpPid - sTmpPid) + MAXPAIRSPERTHREAD*(tmpTid); + g_pairI[gpid + gTmpPid] = pairI[ sTmpPid ]; + g_pairJ[gpid + gTmpPid] = pairJ[ sTmpPid ]; + } + + // update block + __shared__ int tmp; + if ( tid == 0 ) tmp = atomicAdd( &g_nextBlock, 1 ); + __syncthreads; + currentBlock = tmp; + + gpid = currentBlock * MAXPAIRSPERBLOCK; + + // keep going + for (int gTmpPid = tid; gTmpPid < total-gpidRem; gTmpPid += NUMTHREADS) { + while ( tmpTid < NUMTHREADS-1 && pairCount[tmpTid] < gTmpPid ) + tmpTid++; + int sTmpPid = tmpTid > 0 ? pairCount[tmpTid-1] : 0; // numPairs created by threads [0..tmpTid] + sTmpPid = (gTmpPid - sTmpPid) + MAXPAIRSPERTHREAD*(tmpTid); + g_pairI[gpid + gTmpPid] = pairI[ sTmpPid ]; + g_pairJ[gpid + gTmpPid] = pairJ[ sTmpPid ]; + } + + if (tid == 0) { + gpidOff = 0; + gpidRem = gpidRem + MAXPAIRSPERBLOCK - total; + } + } - /* while ( total > 0 ) { */ - /* if ( tid < total ) { */ - /* int offset = stride*NUMTHREADS; */ - /* g_pairI[g_numPairs + tid + stride*NUMTHREADS] = pairI[stride]; */ - /* } */ - /* total -= NUMTHREADS; */ - /* stride++; */ - /* } */ - - /* // commenting this block brings from 37 ms to 6.5 ms *\/ */ - /* for (int d = 0; d < pid; d++) { */ - /* g_pairI[g_numPairs + d + spid[tid]] = pairI[d]; */ - /* g_pairJ[g_numPairs + d + spid[tid]] = pairJ[d]; */ - /* } */ - - // update global index - /* __syncthreads(); */ - // if (tid == NUMTHREADS) pidLast = pid; pid = 0; + /* if (tid == NUMTHREADS) { */ + /* const int tmp = pairCount[tid] + pidLast; */ + /* b_numPairs += tmp; */ + /* atomicAdd( &g_numPairs, tmp ); */ + /* } */ + } // END SYNC THREADS } // n } // z } // y } // x + + // RBTODO: extra processing for jagged elements of g_pairI + } /* replicas */ } + __global__ void createPairlistsOLD(Vector3 pos[], int num, int numReplicas, BaseGrid* sys, CellDecomposition* decomp) { @@ -353,10 +386,8 @@ void createPairlistsOLD(Vector3 pos[], int num, int numReplicas, __shared__ int spid[NUMTHREADS]; const int tid = threadIdx.x; const int ai = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */ - __shared__ int b_numPairs; const int MAXPAIRSPERTHREAD = 8; /* optimized for 32 threads on K40 */ - const int MAXPAIRSPERBLOCK = MAXPAIRSPERTHREAD*NUMTHREADS*4; // int numPairs = g_numPairs; if (ai == 0) { @@ -369,8 +400,7 @@ void createPairlistsOLD(Vector3 pos[], int num, int numReplicas, assert( g_pairI != NULL ); assert( g_pairJ != NULL ); } - if (tid == 0) b_NumPairs = blockIdx.x * NUMTHREADS * MAXPAIRSPERTHREAD; /* */ - + __syncthreads(); int pid = 0; @@ -431,12 +461,12 @@ void createPairlistsOLD(Vector3 pos[], int num, int numReplicas, exclIntCumSum(spid,NUMTHREADS); // returns cumsum with spid[0] = 0 for (int d = 0; d < pid; d++) { - g_pairI[b_numPairs + d + spid[tid]] = pairI[d]; - g_pairJ[b_numPairs + d + spid[tid]] = pairJ[d]; + g_pairI[g_numPairs + d + spid[tid]] = pairI[d]; + g_pairJ[g_numPairs + d + spid[tid]] = pairJ[d]; } // update global index __syncthreads(); - if (tid == NUMTHREADS) b_numPairs += spid[tid] + pid; + if (tid == NUMTHREADS) g_numPairs += spid[tid] + pid; pid = 0; } // END SYNC THREADS