From 8c67cfb978c60ebfa0cad47a6ef507ea2d36d5d8 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Mon, 22 Feb 2016 14:06:53 -0600
Subject: [PATCH] pairlist kernel being rewritten; amend this broken commit

---
 ComputeForce.cu  |  7 +++--
 ComputeForce.cuh | 80 +++++++++++++++++++++++++++++++++++-------------
 CudaUtil.cuh     |  2 +-
 makefile         |  2 +-
 4 files changed, 65 insertions(+), 26 deletions(-)

diff --git a/ComputeForce.cu b/ComputeForce.cu
index 3e0d912..68468cb 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -369,17 +369,18 @@ void ComputeForce::decompose(Vector3* pos) {
 
 	// initializePairlistArrays
 	int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
+	int blocksPerCell = 10;
 	if (newDecomp) {
-		initializePairlistArrays<<< 1, 32 >>>(nCells);
+		initializePairlistArrays<<< 1, 32 >>>(nCells*blocksPerCell);
 		gpuErrchk(cudaDeviceSynchronize());
 	}
 	const int NUMTHREADS = 128;
 	//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
-	const size_t nBlocks = nCells;
+	const size_t nBlocks = nCells*blocksPerCell;
 
 	/* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d, decomp_d); */
 	/* gpuErrchk(cudaDeviceSynchronize()); */
 
-	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells);
+	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells, blocksPerCell);
 	gpuErrchk(cudaDeviceSynchronize());
 
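A note on the launch change above: decompose() now launches blocksPerCell
blocks per cell, and initializePairlistArrays allocates one pairlist per
block, but in this work-in-progress state createPairlists never binds a
block to its own list (pid is initialized to 0 below). The following is a
minimal sketch of one plausible block-to-list mapping; the kernel name is
hypothetical and the final code may settle on something different:

    // Hypothetical sketch: how a block in the nCells*blocksPerCell grid
    // could locate its cell and its private pairlist.
    __global__ void listIndexSketch(const int nCells, const int blocksPerCell) {
        const int cID       = blockIdx.x / blocksPerCell; // cell this block serves
        const int blockLane = blockIdx.x % blocksPerCell; // block's stripe of the cell
        if (cID >= nCells) return;
        // One list per (cell, lane) pair, laid out cell-major; with this
        // layout the flat list index is simply blockIdx.x.
        const int listId = cID * blocksPerCell + blockLane;
        assert(listId == blockIdx.x);
    }
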
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index c93022e..f7ca683 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -6,7 +6,6 @@
 #include "CudaUtil.cuh"
 #include <assert.h>
 
-
 __host__ __device__
 EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha,
 									   float start, float len) {
@@ -201,27 +200,29 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 __device__ int* g_numPairs;
 __device__ int** g_pairI;
 __device__ int** g_pairJ;
-__device__ bool g_pairsSet = false;
+__device__ int g_nextPairlist;
+const __device__ int maxPairs = 1 << 12;
 
 __global__
-void initializePairlistArrays(const int nCells) {
+void initializePairlistArrays(const int nLists) {
 	const int tid = threadIdx.x;
-	const int maxPairs = 1 << 16; /* ~60,000 per cell */
+// 	const int maxPairs = 1 << 12; /* ~4,000 per list */
 	if (blockIdx.x > 0) return;
-	
+
 	// RBTODO: free later
 	if (tid == 0) {
-		printf("Initializing device pairlists for %d cells\n", nCells);
-		g_numPairs = (int*) malloc( nCells * sizeof(int) );
-		g_pairI = (int**) malloc( nCells * sizeof(int*));
-		g_pairJ = (int**) malloc( nCells * sizeof(int*));
+		printf("Initializing %d device pairlists\n", nLists);
+		g_numPairs = (int*) malloc( nLists * sizeof(int) );
+		g_pairI = (int**) malloc( nLists * sizeof(int*));
+		g_pairJ = (int**) malloc( nLists * sizeof(int*));
+		g_nextPairlist = 0;
 	}
 	__syncthreads();
 
 	assert( g_numPairs != NULL );
 	assert( g_pairI != NULL );
 	assert( g_pairJ != NULL );
 
-	for (int i = tid; i < nCells; i += blockDim.x) {
+	for (int i = tid; i < nLists; i += blockDim.x) {
 		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
 		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
 		g_numPairs[i] = 0;
@@ -232,20 +233,27 @@ void initializePairlistArrays(const int nCells) {
 
 __global__
 void createPairlists(Vector3 pos[], int num, int numReplicas,
-					 BaseGrid* sys, CellDecomposition* decomp, int nCells) {
+					 BaseGrid* sys, CellDecomposition* decomp, const int nCells, const int blocksPerCell) {
 	// Loop over threads searching for atom pairs
 	//   Each thread has designated values in shared memory as a buffer
 	//   A sync operation periodically moves data from shared to global
 	const int NUMTHREADS = 128; /* RBTODO: fix */
 	const int tid = threadIdx.x;
+	const int bid = blockIdx.x;
 
-	const int cID = blockIdx.x; // * blockDim.x + threadIdx.x;
-	// const int cID = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */
+	const int cID = bid / blocksPerCell;
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
+	const int wid = tid/WARPSIZE;
+	const int blockLane = bid % blocksPerCell;
+
+	__shared__ int pid[NUMTHREADS/WARPSIZE];
+	if (warpLane == 0) pid[wid] = 0;
+
+	/* if (warpLane == 0) */
+	/* 	pid = atomicAdd( &g_nextPairlist, 1 ); */
+	/* res = warp_bcast(res,leader); */
 
-	/* const int3& nCells = decomp->nCells; */
 	if (cID >= nCells) return;
 	int count = 0; /* debug */
 	const CellDecomposition::cell_t* pairs = decomp->getCells();
@@ -255,7 +263,7 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 		if (tid == 0)
 			printf("  Cell%d: Working on %d atoms for repID %d\n",
 				   cID, rangeI.last - rangeI.first, repID);
-		for (int ci = rangeI.first; ci < rangeI.last; ci++) {
+		for (int ci = rangeI.first + blockLane; ci < rangeI.last; ci+=blocksPerCell) {
 			// ai - index of the particle in the original, unsorted array
 			const int ai = pairs[ci].particle;
 			// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
@@ -276,11 +284,41 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 							const int aj = pairs[n].particle;
 							if (aj <= ai) continue;
 							// RBTODO: skip exclusions
-
-							int wid = atomicAggInc( &g_numPairs[cID], warpLane ); // fails
-							// int wid = atomicAdd( &g_numPairs[cID], 1 ); // works
-							g_pairI[cID][wid] = ai;
-							g_pairJ[cID][wid] = aj;
+
+							/* int gid; */
+							/* { // per-warp atomic add to get global indices */
+							/* 	int t_active = __ballot(1); */
+							/* 	int leader = __ffs(t_active)-1; */
+							/* 	int res; */
+
+							/* 	// RBTODO: see if performance improves with __any(gid >= maxPairs) */
+							/* 	if ( warpLane == leader ) { */
+							/* 		const int t_count = __popc(t_active); */
+							/* 		res = atomicAdd( &g_numPairs[pid], t_count ); */
+							/* 		if ( res + t_count >= maxPairs ) { // went too far; mark invalid and go again */
+							/* 			int tmp = atomicSub( &g_numPairs[pid], t_count ); */
+							/* 			assert( tmp == res + t_count ); */
+							/* 			pid++; */
+							/* 			res = atomicAdd( &g_numPairs[pid], t_count ); */
+							/* 		} */
+							/* 	} */
+							/* 	pid = warp_bcast(pid,leader); */
+							/* 	res = warp_bcast(res,leader); */
+							/* 	gid = res + __popc( t_active & ((1 << warpLane) - 1) ); */
+							/* } */
+							int gid = atomicAggInc( &g_numPairs[pid], warpLane ); // fails
+							if (__any(gid >= maxPairs)) { // a little inefficient, but important
+								g_pairI[pid][gid] = -1;
+								g_pairJ[pid][gid] = -1;
+								pid++; /* needs to apply to ALL warp threads */
+								// we assume arrays at pid are nearly empty (no while loop)
+								gid = atomicAggInc( &g_numPairs[pid], warpLane ); /* assume this hasn't filled */
+							}
+
+
+							// int wid = atomicAdd( &g_numPairs[pid], 1 ); // works
+							g_pairI[pid][gid] = ai;
+							g_pairJ[pid][gid] = aj;
 						} // atoms J
 					} // z
 				} // y
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 7241f5c..70d7e78 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -15,7 +15,7 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
 
-	
+
 __device__ inline void exclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
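For reference, atomicAggInc (whose tail appears in the CudaUtil.cuh hunk
above) follows the standard warp-aggregated atomic increment pattern: one
leader lane issues a single atomicAdd on behalf of every active lane, and
each lane then adds its rank among the active lanes. Below is a minimal
sketch using the same pre-CUDA-9 intrinsics this codebase already uses
(__ballot, __ffs, __popc, plus __shfl for the broadcast); the name is
hypothetical and the real helper in CudaUtil.cuh may differ in detail:

    // Hypothetical re-derivation of the warp-aggregated increment:
    // replaces up to 32 atomics per warp with one.
    __device__ int atomicAggIncSketch(int* ctr, const int warpLane) {
        const unsigned int active = __ballot(1);    // mask of lanes executing this call
        const int leader = __ffs(active) - 1;       // lowest active lane does the atomic
        int base = 0;
        if (warpLane == leader)
            base = atomicAdd(ctr, __popc(active)); // one atomic covers the whole warp
        base = __shfl(base, leader);                // broadcast the starting offset
        // each lane's slot: base plus its rank among the lower active lanes
        return base + __popc(active & ((1u << warpLane) - 1));
    }

This is also why createPairlists passes warpLane through explicitly: the
rank computation needs each lane's position within its warp.
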
diff --git a/makefile b/makefile
index 83fcb81..435923b 100644
--- a/makefile
+++ b/makefile
@@ -11,7 +11,7 @@ include ./findcudalib.mk
 
 INCLUDE = $(CUDA_PATH)/include
 
-# DEBUG = -g -O0
+DEBUG = -g -O0
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
 # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
 NV_FLAGS = -Xptxas -v # -v,-abi=no
-- 
GitLab
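
On the overflow branch in createPairlists: when __any(gid >= maxPairs)
fires, the warp writes -1 sentinels, advances pid, and retries once. Two
caveats, consistent with the "broken commit" subject line: the sentinel
store g_pairI[pid][gid] = -1 is out of bounds for any lane whose gid is
already past the maxPairs-sized allocation, and pid is declared as a shared
array, so expressions like g_numPairs[pid] will not compile as written. A
guarded sketch of the apparent intent, using a hypothetical emitPair helper
with pid held warp-uniform in a register instead of shared memory:

    // Hypothetical spill-to-next-list helper, with the bounds guard the
    // patch omits. Assumes the next list is nearly empty (no while loop),
    // matching the comment in the patch.
    __device__ void emitPair(int& pid, const int ai, const int aj, const int warpLane) {
        int gid = atomicAggInc(&g_numPairs[pid], warpLane);
        if (__any(gid >= maxPairs)) {     // warp vote: all active lanes spill together
            if (gid < maxPairs) {         // only in-range lanes own a slot to mark
                g_pairI[pid][gid] = -1;   // sentinel: readers must skip these
                g_pairJ[pid][gid] = -1;
            }
            pid++;                        // uniform across the warp, so no divergence
            gid = atomicAggInc(&g_numPairs[pid], warpLane);
        }
        // note: g_numPairs on an abandoned list stays above maxPairs, so
        // consumers should clamp counts to maxPairs and skip -1 entries
        g_pairI[pid][gid] = ai;
        g_pairJ[pid][gid] = aj;
    }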