diff --git a/ComputeForce.cu b/ComputeForce.cu index 0569f6c20dd461a584fef06f57982fd4df6603dd..6494959ab1b01693da5d4fc9ec731a331e590226 100644 --- a/ComputeForce.cu +++ b/ComputeForce.cu @@ -4,6 +4,7 @@ #include "ComputeForce.h" #include "ComputeForce.cuh" +#include <cuda_profiler_api.h> #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { @@ -351,6 +352,7 @@ bool ComputeForce::addDihedralPotential(String fileName, int ind, } void ComputeForce::decompose(Vector3* pos) { + gpuErrchk( cudaProfilerStart() ); // Reset the cell decomposition. if (decomp_d) cudaFree(decomp_d); @@ -358,9 +360,12 @@ void ComputeForce::decompose(Vector3* pos) { decomp_d = decomp.copyToCUDA(); // Update pairlists using cell decomposition (not sure this is really needed or good) - //RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d); + //RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d); const int NUMTHREADS = 32; const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1; + + /* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d, decomp_d); */ + /* gpuErrchk(cudaDeviceSynchronize()); */ createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d); gpuErrchk(cudaDeviceSynchronize()); diff --git a/ComputeForce.cuh b/ComputeForce.cuh index c954f005dc00f8d444c8b9d8fd824e8ece43eeb5..6850074c0a64e3f92c62206159f92bbd21df138c 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -197,14 +197,23 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[], // RBTODO: remove global device variables for fast prototyping of pairlist kernel -__device__ int g_numPairs = 0; +// #define MAXPAIRS 25500*25500 // (num/numReplicas) * ((num/numReplicas)-1); +__device__ int g_numPairs; __device__ int* g_pairI; __device__ int* g_pairJ; +__global__ +void clearPairlists(Vector3 pos[], int num, int numReplicas, + BaseGrid* sys, CellDecomposition* decomp) { + const int tid = threadIdx.x; + const int ai = blockIdx.x * blockDim.x + threadIdx.x; + +} + __global__ void createPairlists(Vector3 pos[], int num, int numReplicas, BaseGrid* sys, CellDecomposition* decomp) { - // Loop over threads searching for atom pairs + // Loop over threads searching for atom pairs // Each thread has designated values in shared memory as a buffer // A sync operation periodically moves data from shared to global const int NUMTHREADS = 32; /* RBTODO: fix */ @@ -212,14 +221,18 @@ void createPairlists(Vector3 pos[], int num, int numReplicas, const int tid = threadIdx.x; const int ai = blockIdx.x * blockDim.x + threadIdx.x; - if (tid == 0) g_numPairs = 0; /* RBTODO: be more efficient */ - __syncthreads(); - - int numPairs = g_numPairs; - - const int MAXPAIRSPERTHREAD = 256; /* RBTODO Hard limit likely too low */ - int pairI[MAXPAIRSPERTHREAD]; - int pairJ[MAXPAIRSPERTHREAD]; + // int numPairs = g_numPairs; + if (ai == 0) { + g_numPairs = 0; /* RBTODO: be more efficient */ + // RBTODO: free later, do this elsewhere + // const int maxPairs = (num/numReplicas) * ((num/numReplicas)-1) / 1000; + const int maxPairs = 1e5; + g_pairI = (int*) malloc( maxPairs * sizeof(int)); + g_pairJ = (int*) malloc( maxPairs * sizeof(int)); + assert( g_pairI != NULL ); + assert( g_pairJ != NULL ); + } + __syncthreads(); int pid = 0; // ai - index of the particle in the original, unsorted array @@ -237,37 +250,57 @@ void createPairlists(Vector3 pos[], int num, int numReplicas, for (int x = -1; x <= 1; ++x) { for (int y = -1; y <= 1; ++y) { for (int z = -1; z <= 1; ++z) { - if (x+y+z != -3) { // SYNC THREADS - __syncthreads(); - assert(pid < MAXPAIRSPERTHREAD); - - // find where each thread should put its pairs in global list - spid[tid] = pid; - exclIntCumSum(spid,NUMTHREADS); // returns cumsum with spid[0] = 0 - - for (int d = 0; d < pid; d++) { - g_pairI[g_numPairs + d + spid[tid]] = pairI[d]; - g_pairJ[g_numPairs + d + spid[tid]] = pairJ[d]; - } - // update global index - __syncthreads(); - if (tid == NUMTHREADS) g_numPairs += spid[tid] + pid; - pid = 0; - } // END SYNC THREADS + int n = 0; + int last = -1; const int nID = decomp->getNeighborID(celli, x, y, z); - if (nID < 0) continue; // Skip if got wrong or duplicate cell. - const CellDecomposition::range_t range = decomp->getRange(nID, repID); + if (nID >= 0) { // set range for valid cells + const CellDecomposition::range_t range = decomp->getRange(nID, repID); + n = range.first; + last = range.last; + } - for (int n = range.first; n < range.last; n++) { - const int aj = pairs[n].particle; - - // RBTODO: skip exclusions - if (aj <= ai) continue; - - pairI[pid] = ai; - pairJ[pid] = aj; - pid++; /* RBTODO synchronize across threads somehow */ + // go through possible pairs 32 at a time per thread + const int MAXPAIRSPERTHREAD = 32; + int pairI[MAXPAIRSPERTHREAD]; + int pairJ[MAXPAIRSPERTHREAD]; + + bool findingPairs = true; + while (findingPairs) { + // if (tid == 0) + // printf("Working to find pairlist: numPairs = %d\n", g_numPairs); + + for (int dn = 0; dn < MAXPAIRSPERTHREAD; dn++) { + if ( n + dn >= last ) continue; + + const int aj = pairs[n+dn].particle; + // RBTODO: skip exclusions + if (aj <= ai) continue; + + pairI[pid] = ai; + pairJ[pid] = aj; + pid++; + } + n += MAXPAIRSPERTHREAD; + findingPairs = __all( n < last ); + + { // SYNC THREADS + __syncthreads(); + assert(pid <= MAXPAIRSPERTHREAD); + + // find where each thread should put its pairs in global list + spid[tid] = pid; + exclIntCumSum(spid,NUMTHREADS); // returns cumsum with spid[0] = 0 + + for (int d = 0; d < pid; d++) { + g_pairI[g_numPairs + d + spid[tid]] = pairI[d]; + g_pairJ[g_numPairs + d + spid[tid]] = pairJ[d]; + } + // update global index + __syncthreads(); + if (tid == NUMTHREADS) g_numPairs += spid[tid] + pid; + pid = 0; + } // END SYNC THREADS } // n } // z } // y diff --git a/makefile b/makefile index 8d5c8eb1ddc329cae1398ea5d348be94f645e106..460f00ce47fd1f33d7600b86ee0c22136496bf78 100644 --- a/makefile +++ b/makefile @@ -11,13 +11,13 @@ include ./findcudalib.mk INCLUDE = $(CUDA_PATH)/include -DEBUG = -g -O0 +# DEBUG = -g -O0 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no NV_FLAGS = -Xptxas -v # -v,-abi=no EX_FLAGS = -O3 -m$(OS_SIZE) # EX_FLAGS = -O0 -m$(OS_SIZE) -NV_FLAGS += -g -G #debug +# NV_FLAGS += -g -G #debug ifneq ($(MAVERICKS),) CC = $(CLANG)