diff --git a/ComputeForce.cu b/ComputeForce.cu
index 68468cb261726e491111ea42f2a8d53579c4a39b..10fc45a23cd81665ff7d5849175892f5922070dc 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -366,14 +366,33 @@ void ComputeForce::decompose(Vector3* pos) {
 	// Update pairlists using cell decomposition (not sure this is really needed or good)
 	//RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d);
-	
-	// initializePairlistArrays
+
+	size_t free, total;
+	cuMemGetInfo(&free,&total);
+	printf("Free memory: %zu / %zu\n", free, total);
+	// initializePairlistArrays
 	int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
 	int blocksPerCell = 10;
 	if (newDecomp) {
-		initializePairlistArrays<<< 1, 32 >>>(nCells*blocksPerCell);
+		// initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell);
+
+		const int nLists = blocksPerCell*nCells*100;
+		const size_t s = sizeof(int*) * nLists;
+		gpuErrchk(cudaMalloc(&pairListList_d, s));
+
+		int *tmpPairLists[nLists];
+		for (int i = 0; i < nLists; i++) {
+			gpuErrchk(cudaMalloc(&tmpPairLists[i], sizeof(int)*(1<<14)));
+		}
+		gpuErrchk(cudaMemcpyAsync(pairListList_d, tmpPairLists,
+				sizeof(int*)*nLists, cudaMemcpyHostToDevice));
 		gpuErrchk(cudaDeviceSynchronize());
 	}
+
+
+	cuMemGetInfo(&free,&total);
+	printf("Free memory: %zu / %zu\n", free, total);
+
 	const int NUMTHREADS = 128;
 	//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
 	const size_t nBlocks = nCells*blocksPerCell;
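Note on the hunk above: pairlist allocation moves from the device to the host. Each list is cudaMalloc'ed individually and the resulting device pointers are staged in a host array before being copied into pairListList_d. Two caveats: `int *tmpPairLists[nLists]` is a runtime-sized stack array (a GCC extension, not standard C++), and the cudaMemcpyAsync out of that stack buffer is only safe because cudaDeviceSynchronize() runs before the buffer goes out of scope. Below is a minimal sketch of the same staging pattern that avoids both pitfalls; it assumes this codebase's gpuErrchk macro, and the helper name makeDevicePointerArray is a placeholder, not part of the source.

#include <vector>

// Sketch: allocate nLists device arrays of elemsPerList ints and return a
// device-resident array of the device pointers (indexable as dev[i][j] in kernels).
static int** makeDevicePointerArray(int nLists, size_t elemsPerList) {
	std::vector<int*> host(nLists);    // host staging for the device pointers
	for (int i = 0; i < nLists; i++)
		gpuErrchk(cudaMalloc(&host[i], sizeof(int) * elemsPerList));

	int** dev = NULL;
	gpuErrchk(cudaMalloc(&dev, sizeof(int*) * nLists));
	// Synchronous copy, so the host vector may safely be destroyed afterwards.
	gpuErrchk(cudaMemcpy(dev, host.data(), sizeof(int*) * nLists,
			cudaMemcpyHostToDevice));
	return dev;
}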
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index f7ca68358400ae2fe1068b16ee035b499bd79f84..1e2a6a36140e05b830e7d103b13418b0b3300b42 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -196,12 +196,13 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 
 // RBTODO: remove global device variables for fast prototyping of pairlist kernel
-// #define MAXPAIRS 25500*25500 // (num/numReplicas) * ((num/numReplicas)-1);
+#define MAXPAIRS 25500*25500 // (num/numReplicas) * ((num/numReplicas)-1);
 __device__ int* g_numPairs;
 __device__ int** g_pairI;
 __device__ int** g_pairJ;
 __device__ int g_nextPairlist;
-const __device__ int maxPairs = 1 << 12;
+__device__ int g_numPairlistArrays;
+const __device__ int maxPairs = 1 << 14;
 
 __global__
 void initializePairlistArrays(const int nLists) {
@@ -211,25 +212,58 @@ void initializePairlistArrays(const int nLists) {
 
 	// RBTODO: free later
 	if (tid == 0) {
-		printf("Initializing device pairlists for %d cells\n", nLists);
-		g_numPairs = (int*) malloc( nLists * sizeof(int) );
+		printf("Initializing %d device pairlists.\n", nLists);
+		g_numPairs  = (int*)  malloc( nLists * sizeof(int) );
 		g_pairI = (int**) malloc( nLists * sizeof(int*));
 		g_pairJ = (int**) malloc( nLists * sizeof(int*));
 		g_nextPairlist = 0;
-	}
-	__syncthreads();
-	assert( g_numPairs != NULL );
-	assert( g_pairI != NULL );
-	assert( g_pairJ != NULL );
+		g_numPairlistArrays = nLists;
+
+		assert( g_numPairs != NULL );
+		assert( g_pairI != NULL );
+		assert( g_pairJ != NULL );
 
-	for (int i = tid; i < nLists; i += blockDim.x) {
-		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
-		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
-		g_numPairs[i] = 0;
-		assert( g_pairI[i] != NULL );
-		assert( g_pairJ[i] != NULL );
+		for (int i = 0; i < nLists; i++) {
+			g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
+			g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
+			g_numPairs[i] = 0;
+			assert( g_pairI[i] != NULL );
+			assert( g_pairJ[i] != NULL );
+		}
 	}
 }
+/* __global__ */
+/* void initializePairlistArrays(const int nLists) { */
+/* 	const int tid = threadIdx.x; */
+/* 	// const int maxPairs = 1 << 12; /\* ~120,000 per cell *\/ */
+/* 	if (blockIdx.x > 0) return; */
+
+/* 	// RBTODO: free later */
+/* 	if (tid == 0) { */
+/* 		printf("Initializing %d device pairlists.\n", nLists); */
+/* 		g_numPairs = (int*) malloc( nLists * sizeof(int) ); */
+/* 		g_pairI = (int**) malloc( nLists * sizeof(int*)); */
+/* 		g_pairJ = (int**) malloc( nLists * sizeof(int*)); */
+/* 		g_nextPairlist = 0; */
+/* 		g_numPairlistArrays = nLists; */
+/* 	} */
+/* 	__syncthreads(); */
+/* 	assert( g_numPairs != NULL ); */
+/* 	assert( g_pairI != NULL ); */
+/* 	assert( g_pairJ != NULL ); */
+
+/* 	for (int i = tid; i < nLists; i += blockDim.x) { */
+/* 		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int)); */
+/* 		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int)); */
+/* 		g_numPairs[i] = 0; */
+/* 	} */
+
+/* 	__syncthreads(); */
+/* 	for (int i = tid; i < nLists; i += blockDim.x) { */
+/* 		assert( g_pairI[i] != NULL ); */
+/* 		assert( g_pairJ[i] != NULL ); */
+/* 	} */
+/* } */
 
 __global__
 void createPairlists(Vector3 pos[], int num, int numReplicas,
@@ -247,12 +281,8 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 	const int wid = tid/WARPSIZE;
 	const int blockLane = bid % blocksPerCell;
 
-	__shared__ int pid[NUMTHREADS/WARPSIZE];
-	if (warpLane == 0) pid[wid] = 0;
-
-	/* if (warpLane == 0) */
-	/* 	pid = atomicAdd( &g_nextPairlist, 1 ); */
-	/* res = warp_bcast(res,leader); */
+	volatile __shared__ int pid[NUMTHREADS/WARPSIZE];
+	if (warpLane == 0) pid[wid] = bid;
 
 	if (cID >= nCells) return;
 	int count = 0; /* debug */
@@ -285,40 +315,23 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 					if (aj <= ai) continue;
 
 					// RBTODO: skip exclusions
-					/* int gid; */
-					/* { // per-warp atomic add to get global inices */
-					/* 	int t_active = __ballot(1); */
-					/* 	int leader = __ffs(t_active)-1; */
-					/* 	int res; */
-
-					/* 	// RBTODO: see if performance improves with __any(gid >= maxPairs */
-					/* 	if ( warpLane == leader ) { */
-					/* 		const int t_count = __popc(t_active); */
-					/* 		res = atomicAdd( &g_numPairs[pid], t_count ); */
-					/* 		if ( res + t_count >= maxPairs ) { // went too far; mark invalid and go again */
-					/* 			int tmp = atomicSub( &g_numPairs[pid], t_count ); */
-					/* 			assert( tmp == res + t_count ); */
-					/* 			pid++; */
-					/* 			res = atomicAdd( &g_numPairs[pid], t_count ); */
-					/* 		} */
-					/* 	} */
-					/* 	pid = warp_bcast(pid,leader); */
-					/* 	res = warp_bcast(res,leader); */
-					/* 	gid = res + __popc( t_active & ((1 << warpLane) - 1) ); */
-					/* } */
-					int gid = atomicAggInc( &g_numPairs[pid], warpLane ); // fails
-					if (__any(gid >= maxPairs)) { // a little inefficient, but important
-						g_pairI[pid][gid] = -1;
-						g_pairJ[pid][gid] = -1;
-						pid++; /* needs to apply to ALL warp threads */
+					int gid = atomicAggInc( pid[wid], &g_numPairs[pid[wid]], warpLane ); // fails
+					while (__any(gid >= maxPairs)) { // does any thread in the warp have too large an index?
+						if (gid < maxPairs) {
+							g_pairI[pid[wid]][gid] = -1;
+							g_pairJ[pid[wid]][gid] = -1;
+						}
+
+						// Have 'leader' thread in warp increment counter
+						if ( warpLane + 1 == __ffs(__ballot(1)) ) pid[wid]++;
+
 						// we assume arrays at pid are nearly empty (no while loop)
-						gid = atomicAggInc( &g_numPairs[pid], warpLane ); /* assume this hasn't filled */
+						gid = atomicAggInc( pid[wid], &g_numPairs[pid[wid]], warpLane ); /* assume this hasn't filled */
 					}
-					// int wid = atomicAdd( &g_numPairs[pid], 1 ); // works
-					g_pairI[pid][gid] = ai;
-					g_pairJ[pid][gid] = aj;
+					g_pairI[pid[wid]][gid] = ai;
+					g_pairJ[pid[wid]][gid] = aj;
 				} // atoms J
 			} // z
 		} // y
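For reference, the retired kernel kept above as a comment allocated g_pairI[i] and g_pairJ[i] with in-kernel malloc, which draws from the CUDA device heap rather than ordinary global memory. That heap defaults to just 8 MB, while nLists lists of maxPairs ints now need orders of magnitude more, so the device-side mallocs (and the asserts guarding them) would likely fail; this is plausibly what motivated the host-side cudaMalloc path in ComputeForce.cu. If the in-kernel path were ever revived, the heap would have to be enlarged before the first kernel launch. A minimal sketch, assuming the nLists and maxPairs values used above:

// Hypothetical host-side sizing for the in-kernel malloc path:
// g_pairI plus g_pairJ, each nLists arrays of maxPairs (1<<14) ints.
const size_t heapNeeded = 2 * (size_t) nLists * (1 << 14) * sizeof(int);
gpuErrchk(cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapNeeded));

size_t heapSize = 0;   // read the limit back to confirm it was accepted
gpuErrchk(cudaDeviceGetLimit(&heapSize, cudaLimitMallocHeapSize));
printf("Device malloc heap: %zu bytes\n", heapSize);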
diff --git a/ComputeForce.h b/ComputeForce.h
index c9f6fcb57fd1ea9fde2ea1cffcb74228938d2e4f..a0abaace11418f42b30dad91ef52469f5b653d01 100644
--- a/ComputeForce.h
+++ b/ComputeForce.h
@@ -112,9 +112,10 @@ private:
 	TabulatedDihedralPotential **tableDihedral_d, **tableDihedral_addr;
 
 	// Pairlists
-	int *pairIds_d;
-	int numPairs;
-	int numPairs_d;
+	int **pairListList_d;
+	int **pairLists_d;
+
+	int *numPairs_d;
 };
 
 #endif
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 70d7e78e1bdc276a1cb3297d77f978116d6d5b2a..84067175209b3c1ec2d71e8561933798fa69e110 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -3,16 +3,21 @@
 #define WARPSIZE 32
 
 __device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
-__device__ int atomicAggInc(int *ctr, int warpLane) {
+__device__ int atomicAggInc(int tmpidx, int *ctr, int warpLane) {
 	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
 	int mask = __ballot(1);
 	int leader = __ffs(mask)-1;
+	int tmp4 = tmpidx;
 	int res;
 	if ( warpLane == leader )
 		res = atomicAdd(ctr, __popc(mask));
 	res = warp_bcast(res,leader);
+
+	int tmp2 = warpLane; /* ditto */
+	int* tmp = ctr; /* avoid optimizing */
+	int tmp3 = tmpidx;
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
diff --git a/makefile b/makefile
index 435923b2eb045884ce780d21b067110adbf94231..015abdf56a0d4c1b94865998e27c0b72998cc827 100644
--- a/makefile
+++ b/makefile
@@ -49,10 +49,11 @@ CODE_20 := -arch=sm_20
 # NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35)
 NV_FLAGS += -arch=sm_35
 
-NVLD_FLAGS := $(NV_FLAGS) --device-link 
+NVLD_FLAGS := $(NV_FLAGS) --device-link
 # NV_FLAGS += -rdc=true
 
 LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY)
+LD_FLAGS += -lcuda
 
 ### Sources
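A note on the makefile change: -lcuda is needed because cuMemGetInfo, called in ComputeForce.cu above, belongs to the CUDA Driver API (libcuda), whereas everything else here links only against the runtime (libcudart). The runtime provides an equivalent query, cudaMemGetInfo, which would make the extra link flag unnecessary; a sketch of the drop-in replacement:

// Runtime-API equivalent of the cuMemGetInfo calls; avoids linking libcuda.
size_t free = 0, total = 0;
gpuErrchk(cudaMemGetInfo(&free, &total));
printf("Free memory: %zu / %zu\n", free, total);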