diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 7fca72cf7db6506cc4d1905702a793167b54bb27..69c192986617af9def3fe51a9ea7541616acab54 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -386,7 +386,6 @@ void ComputeForce::decompose() {
 	// Update pairlists using cell decomposition (not sure this is really needed or good)
 	//RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d);
 
-
 	/* size_t free, total; */
 	/* { */
 	/*   cuMemGetInfo(&free,&total); */
@@ -399,8 +398,7 @@
 	if (newDecomp) {
 		// RBTODO: free memory elsewhere
 		// allocate device data
-		// initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell);
-		const int maxPairs = 1<<25;
+		const int maxPairs = 1<<20;
 		gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int)));
 		gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs));
 
@@ -431,12 +429,14 @@
 
 	float pairlistdist2 = (sqrt(cutoff2) + 2.0f);
 	pairlistdist2 = pairlistdist2*pairlistdist2;
 
+#if __CUDA_ARCH__ >= 300
 	createPairlists<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2);
-	/* createPairlistsOld<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */
-	/* 		sys_d, decomp_d, nCells, blocksPerCell, */
-	/* 		numPairs_d, pairLists_d, */
-	/* 		numParts, type, pairTabPotType_d, pairlistdist2); */
+#else
+	// Use shared memory for warp_bcast function
+	createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2);
+#endif
+
 	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
 
 	// if (false) { // sort pairlist
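NOTE on the #if above: __CUDA_ARCH__ is defined only while nvcc compiles device
code, so a preprocessor test wrapped around a kernel launch (host code) always
sees it as undefined and compiles the #else branch, whatever GPU is present at
runtime. If the intent is to supply dynamic shared memory only on pre-Kepler
hardware, a runtime capability check is the usual pattern. A minimal sketch,
assuming device 0 and the surrounding decompose() scope; shMem is a name
introduced here, not part of the patch:

    cudaDeviceProp prop;
    gpuErrchk(cudaGetDeviceProperties(&prop, 0));
    if (prop.major >= 3) {
        // Kepler and newer: warp_bcast uses __shfl, no shared memory needed
        createPairlists<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d,
                nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d,
                excludes_d, excludeMap_d, numExcludes, pairlistdist2);
    } else {
        // pre-Kepler: one int of dynamic shared memory per warp for warp_bcast,
        // sized in bytes for the 64-thread blocks used here
        const size_t shMem = (64/WARPSIZE)*sizeof(int);
        createPairlists<<< 2048, 64, shMem >>>(pos_d, num, numReplicas, sys_d, decomp_d,
                nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d,
                excludes_d, excludeMap_d, numExcludes, pairlistdist2);
    }

Incidentally, the 2048/WARPSIZE third launch argument in the patch divides the
grid size rather than the block size; it over-allocates harmlessly, since the
two warps of a 64-thread block only need (64/WARPSIZE)*sizeof(int) = 8 bytes.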
diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh
index 73ec7766664ca9c4db40e62bf25d196247bc83ea..bd524dc23d8420bcf534fc90931f67a9478a72d8 100644
--- a/src/ComputeForce.cuh
+++ b/src/ComputeForce.cuh
@@ -213,78 +213,6 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 /* 	} */
 /* } */
 
-__global__
-void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas,
-		BaseGrid* sys, CellDecomposition* __restrict__ decomp,
-		const int nCells, const int blocksPerCell,
-		int* g_numPairs, int2* g_pair,
-		int numParts, int type[], int* __restrict__ g_pairTabPotType,
-		float pairlistdist2) {
-	// Loop over threads searching for atom pairs
-	//   Each thread has designated values in shared memory as a buffer
-	//   A sync operation periodically moves data from shared to global
-	const int tid = threadIdx.x;
-	const int bid = blockIdx.x;
-
-	const int cID = bid / blocksPerCell; /* cellID */
-
-	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
-	// const int wid = tid/WARPSIZE;
-	const int blockLane = bid % blocksPerCell;
-
-	if (cID >= nCells) return;
-	int count = 0; /* debug */
-	const CellDecomposition::cell_t* pairs = decomp->getCells();
-
-	for (int repID = 0; repID < numReplicas; repID++) {
-		const CellDecomposition::range_t rangeI = decomp->getRange(cID, repID);
-		/* if (tid == 0) printf(" Cell%d: Working on %d atoms for repID %d\n", */
-		/* 		cID, rangeI.last - rangeI.first, repID); */
-
-		for (int ci = rangeI.first + blockLane; ci < rangeI.last; ci+=blocksPerCell) {
-			// ai - index of the particle in the original, unsorted array
-			const int ai = pairs[ci].particle;
-			// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
-			const CellDecomposition::cell_t celli = pairs[ci];
-			const Vector3 posi = pos[ai];
-
-			for (int x = -1; x <= 1; ++x) {
-				for (int y = -1; y <= 1; ++y) {
-					for (int z = -1; z <= 1; ++z) {
-
-						const int nID = decomp->getNeighborID(celli, x, y, z);
-						if (nID < 0) continue;
-						const CellDecomposition::range_t range = decomp->getRange(nID, repID);
-						const int last = range.last;
-						// for (int n = 0; n < last; n++) {}
-						for (int n = range.first + tid; n < last; n+=blockDim.x) {
-							count++;
-							const int aj = pairs[n].particle;
-							if (aj <= ai) continue;
-
-							// skip ones that are too far away
-							float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2();
-							if (dr > pairlistdist2) continue;
-
-							int gid = atomicAggInc( g_numPairs, warpLane );
-
-							// RBTODO: skip exclusions, non-interacting types
-							int pairType = type[ai] + type[aj] * numParts;
-							/* assert( ai >= 0 ); */
-							/* assert( aj >= 0 ); */
-
-							g_pair[gid] = make_int2(ai,aj);
-							g_pairTabPotType[gid] = pairType;
-							// g_pairDists[gid] = dr;
-						} // atoms J
-					} // z
-				} // y
-			} // x
-		} // atoms I
-	} // replicas
-	/* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */
-}
-
 __global__
 void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas,
 		const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
@@ -294,8 +222,8 @@ void createPairlists(Vector3* __restrict__ pos, const int num, const int numRepl
 		const Exclude* __restrict__ excludes, const int2* __restrict__ excludeMap,
 		const int numExcludes, float pairlistdist2) {
 	// Loop over threads searching for atom pairs
-	//   Each thread has designated values in shared memory as a buffer
-	//   A sync operation periodically moves data from shared to global
+	// Each thread has designated values in shared memory as a buffer
+	// A sync operation periodically moves data from shared to global
 	const int tid = threadIdx.x;
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
 	const int split = 32;	/* numblocks should be divisible by split */
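NOTE: maxPairs shrinks from 1<<25 to 1<<20 in ComputeForce.cu, yet
createPairlists writes g_pair[gid] through whatever index atomicAggInc hands
back, so a large system can now silently overrun pairLists_d. A minimal guard
sketch; the maxPairs bound is hypothetical here, since the patch does not pass
it into the kernel:

    int gid = atomicAggInc( g_numPairs, warpLane );
    if (gid < maxPairs) {  // hypothetical: maxPairs would need to become a kernel argument
        g_pair[gid] = make_int2(ai,aj);
        g_pairTabPotType[gid] = pairType;
    }

The host could then copy numPairs_d back after the launch and reallocate (or
abort) whenever the count exceeds maxPairs.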
diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh
index 670041ed15164ff4b3a3cf9511a472087da9c9ec..6150ae6726405e84c4cc49538ef6c8802d8d8dfd 100644
--- a/src/ComputeGridGrid.cuh
+++ b/src/ComputeGridGrid.cuh
@@ -2,6 +2,7 @@
 #pragma once
 #include "useful.h"
 #define NUMTHREADS 96
+#define WARPSIZE 32
 
 class RigidBodyGrid;
 
diff --git a/src/CudaUtil.cu b/src/CudaUtil.cu
index 40f8ad2d7182d6dc3d20876fb429f462e9ea5d87..c7bb55a1ad75d2d8f6e891b2207fa7dbed3ef0be 100644
--- a/src/CudaUtil.cu
+++ b/src/CudaUtil.cu
@@ -1,6 +1,19 @@
 #include "CudaUtil.cuh"
 
-__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
+#if __CUDA_ARCH__ >= 300
+__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
+#else
+volatile extern __shared__ int sh[];
+__device__ int warp_bcast(int v, int leader) {
+	// WARNING: might not be safe to call in divergent branches
+	const int tid = threadIdx.x;
+	const int warpLane = tid % WARPSIZE;
+	if (warpLane == leader)
+		sh[tid/WARPSIZE] = v;
+	return sh[tid/WARPSIZE];
+}
+#endif
+
 __device__ int atomicAggInc(int *ctr, int warpLane) {
 	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
 	int mask = __ballot(1);
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index 873e42973ff849a1a277c1ba89d3901fe2367e15..54d1b17aa80bbb71232d3bc480c941a50bbd3269 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -103,10 +103,15 @@ void RigidBody::updateParticleList(Vector3* pos_d) {
 		gpuErrchk(cudaMemcpy( tmp_d, &numParticles[i], sizeof(int), cudaMemcpyHostToDevice ));
 
 		int nb = floor(tnp/NUMTHREADS) + 1;
+#if __CUDA_ARCH__ >= 300
 		createPartlist<<<NUMTHREADS,nb>>>(pos_d, tnp, t->particles_d[i],
 				tmp_d, particles_d[i],
 				gridCenter + position, cutoff*cutoff);
-
+#else
+		createPartlist<<<NUMTHREADS,nb,(NUMTHREADS/WARPSIZE)*sizeof(int)>>>(pos_d, tnp, t->particles_d[i],
+				tmp_d, particles_d[i],
+				gridCenter + position, cutoff*cutoff);
+#endif
 		gpuErrchk(cudaMemcpy(&numParticles[i], tmp_d, sizeof(int), cudaMemcpyDeviceToHost ));
 	}
 }
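NOTE: both warp_bcast variants exist to serve the warp-aggregated increment in
atomicAggInc, whose body the hunk context above cuts off after the __ballot
call. For reference, a sketch of the full pattern from the devblog post linked
in CudaUtil.cu, written with the pre-Volta intrinsics this codebase uses;
everything after the mask line is reconstructed from the post, not copied from
the file:

    __device__ int atomicAggInc(int *ctr, int warpLane) {
        int mask = __ballot(1);                 // bitmask of the warp's active lanes
        int leader = __ffs(mask) - 1;           // lowest-numbered active lane
        int res;
        if (warpLane == leader)
            res = atomicAdd(ctr, __popc(mask)); // one atomic per warp, not per thread
        res = warp_bcast(res, leader);          // leader shares the base index
        // each lane claims base + its rank among lower-numbered active lanes
        return res + __popc(mask & ((1 << warpLane) - 1));
    }

On devices below compute capability 3.0, the shared-memory warp_bcast makes
every launch site responsible for passing enough dynamic shared memory (one
int per warp per block, in bytes); that is why the launches in ComputeForce.cu
and RigidBody.cu gain a third <<< >>> argument, and why the RigidBody.cu size
is written as (NUMTHREADS/WARPSIZE)*sizeof(int) rather than a bare warp count.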