From acd1ecbed7269d1a6dce030437d16338b511937a Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Tue, 17 May 2016 11:16:00 -0500 Subject: [PATCH] slightly simplified pairlist kernel --- ComputeForce.cu | 78 ++++++++++++++++++++----------- ComputeForce.cuh | 119 +++++++++++++++++++++++++++++++++++++---------- makefile | 10 ++-- notes.org | 10 ++-- 4 files changed, 160 insertions(+), 57 deletions(-) diff --git a/ComputeForce.cu b/ComputeForce.cu index c311281..6566354 100644 --- a/ComputeForce.cu +++ b/ComputeForce.cu @@ -6,6 +6,7 @@ #include "ComputeForce.cuh" #include <cuda_profiler_api.h> + #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -16,6 +17,10 @@ inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { cudaEvent_t start, stop; +void runSort(int2 *d1, int *d2, float *key, + int2 *scratch1, int *scratch2, float *scratchKey, + unsigned int count); + ComputeForce::ComputeForce(int num, const BrownianParticleType part[], int numParts, const BaseGrid* g, float switchStart, float switchLen, float electricConst, @@ -367,7 +372,7 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { //RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d); - size_t free, total; + /* size_t free, total; */ /* { */ /* cuMemGetInfo(&free,&total); */ /* printf("Free memory: %zu / %zu\n", free, total); */ @@ -380,14 +385,12 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { // RBTODO: free memory elsewhere // allocate device data // initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell); - const int maxPairs = 1<<26; - gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int)*maxPairs)); - gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs)); + const int maxPairs = 1<<25; + gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int))); + + gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs)); gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs)); - int tmp = 0; - gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp, - sizeof(int), cudaMemcpyHostToDevice)); - + gpuErrchk(cudaDeviceSynchronize()); } @@ -417,14 +420,32 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { float pairlistdist2 = (sqrt(cutoff2) + 2.0f); pairlistdist2 = pairlistdist2*pairlistdist2; - createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, - sys_d, decomp_d, nCells, blocksPerCell, - numPairs_d, pairLists_d, - numParts, type, pairTabPotType_d,pairlistdist2); - - gpuErrchk(cudaDeviceSynchronize()); - - + createPairlists<<< 2048, 64 >>>(pos, num, numReplicas, + sys_d, decomp_d, nCells, + numPairs_d, pairLists_d, + numParts, type, pairTabPotType_d, pairlistdist2); + /* createPairlistsOld<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */ + /* sys_d, decomp_d, nCells, blocksPerCell, */ + /* numPairs_d, pairLists_d, */ + /* numParts, type, pairTabPotType_d, pairlistdist2); */ + + gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ + // if (false) + { // sort pairlist + int numPairs; + gpuErrchk(cudaMemcpyAsync( &numPairs, numPairs_d, sizeof(int), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ + printf("here, %d pairs\n", numPairs); + /* runSort(pairLists_d, pairTabPotType_d, pairDists_d, */ + /* pairLists_s, pairTabPotType_s, pairDists_s, */ + /* numPairs); */ + /* printf("done\n"); */ + + /* // RBTODO: sort pairListInd as well!!! (i.e. roll your own sort!) */ + /* // thrust::sort_by_key( pairDists_d, pairDists_d+numPairs_d, pairLists_d ); */ + /* // thrust::sort_by_key( pairDists_d, pairDists_d+numPairs_d, pairLists_d ); */ + /* gpuErrchk(cudaDeviceSynchronize()); /\* RBTODO: sync needed here? *\/ */ + } } IndexList ComputeForce::decompDim() const { @@ -549,21 +570,25 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type, // int nb = (1+(decomp.nCells.x * decomp.nCells.y * decomp.nCells.z)) * 75; /* RBTODO: number of pairLists */ const int nb = 800; // printf("ComputeTabulated\n"); - computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type, - tablePot_d, tableBond_d, - num, sys_d, - bonds, bondMap, numBonds, - cutoff2, - numPairs_d, pairLists_d, - pairTabPotType_d); - - gpuErrchk(cudaDeviceSynchronize()); + + if (get_energy) { + clearEnergies<<< nb, numThreads >>>(energies_d,num); + gpuErrchk(cudaDeviceSynchronize()); + computeTabulatedEnergyKernel<<< nb, numThreads >>>(force, pos, type, + tablePot_d, tableBond_d, sys_d, + bonds, bondMap, numBonds, energies_d, cutoff2, + numPairs_d, pairLists_d, pairTabPotType_d); + } else { + computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type, + tablePot_d, tableBond_d, sys_d, + bonds, bondMap, numBonds, cutoff2, + numPairs_d, pairLists_d, pairTabPotType_d); + } /* printPairForceCounter<<<1,32>>>(); */ /* gpuErrchk(cudaDeviceSynchronize()); */ computeAngles<<<numBlocks, numThreads>>>(force, pos, angles, tableAngle_d, numAngles, num, sys_d, energies_d, get_energy); - gpuErrchk(cudaDeviceSynchronize()); computeDihedrals<<<numBlocks, numThreads>>>(force, pos, dihedrals, tableDihedral_d, numDihedrals, num, sys_d, energies_d, get_energy); @@ -611,4 +636,3 @@ float ComputeForce::computeTabulatedFull(Vector3* force, Vector3* pos, int* type return energy; } - diff --git a/ComputeForce.cuh b/ComputeForce.cuh index 6d6350c..10454f2 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -197,7 +197,7 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[], } -const __device__ int maxPairs = 1 << 14; +/* const __device__ int maxPairs = 1 << 14; */ /* __global__ */ /* void pairlistTest(Vector3 pos[], int num, int numReplicas, */ @@ -213,7 +213,7 @@ const __device__ int maxPairs = 1 << 14; /* } */ __global__ -void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, +void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas, BaseGrid* sys, CellDecomposition* __restrict__ decomp, const int nCells, const int blocksPerCell, int* g_numPairs, int2* g_pair, @@ -262,18 +262,19 @@ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, if (aj <= ai) continue; // skip ones that are too far away - const Vector3 dr = sys->wrapDiff(pos[aj] - pos[ai]); - if (dr.length2() > pairlistdist2) continue; - + float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2(); + if (dr > pairlistdist2) continue; + + int gid = atomicAggInc( g_numPairs, warpLane ); + // RBTODO: skip exclusions, non-interacting types int pairType = type[ai] + type[aj] * numParts; - int gid = atomicAggInc( g_numPairs, warpLane ); /* assert( ai >= 0 ); */ /* assert( aj >= 0 ); */ g_pair[gid] = make_int2(ai,aj); g_pairTabPotType[gid] = pairType; - + // g_pairDists[gid] = dr; } // atoms J } // z } // y @@ -282,8 +283,75 @@ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, } // replicas /* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */ } - + __global__ +void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, + BaseGrid* sys, CellDecomposition* __restrict__ decomp, + const int nCells, + int* g_numPairs, int2* g_pair, + int numParts, int type[], int* __restrict__ g_pairTabPotType, + float pairlistdist2) { + // Loop over threads searching for atom pairs + // Each thread has designated values in shared memory as a buffer + // A sync operation periodically moves data from shared to global + const int tid = threadIdx.x; + const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */ + const int split = 32; /* numblocks should be divisible by split */ + /* const int blocksPerCell = gridDim.x/split; */ + + const CellDecomposition::cell_t* pairs = decomp->getCells(); + for (int cID = 0 + (blockIdx.x % split); cID < nCells; cID += split) { + // for (int cID = blockIdx.x/blocksPerCell; cID < nCells; cID += split ) { + for (int repID = 0; repID < numReplicas; repID++) { + const CellDecomposition::range_t rangeI = decomp->getRange(cID, repID); + + for (int ci = rangeI.first + blockIdx.x/split; ci < rangeI.last; ci += gridDim.x/split) { + /* for (int ci = rangeI.first + (blockIdx.x % blocksPerCell); */ + /* ci < rangeI.last; */ + /* ci += blocksPerCell) { */ + + // ai - index of the particle in the original, unsorted array + const int ai = pairs[ci].particle; + // const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai); + const CellDecomposition::cell_t celli = pairs[ci]; + const Vector3 posi = pos[ai]; + + for (int x = -1; x <= 1; ++x) { + for (int y = -1; y <= 1; ++y) { + for (int z = -1; z <= 1; ++z) { + + const int nID = decomp->getNeighborID(celli, x, y, z); + if (nID < 0) continue; + const CellDecomposition::range_t range = decomp->getRange(nID, repID); + + for (int n = range.first + tid; n < range.last; n+=blockDim.x) { + const int aj = pairs[n].particle; + if (aj <= ai) continue; + + // skip ones that are too far away + float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2(); + if (dr > pairlistdist2) continue; + + int gid = atomicAggInc( g_numPairs, warpLane ); + + // RBTODO: skip exclusions, non-interacting types + int pairType = type[ai] + type[aj] * numParts; + /* assert( ai >= 0 ); */ + /* assert( aj >= 0 ); */ + + g_pair[gid] = make_int2(ai,aj); + g_pairTabPotType[gid] = pairType; + // g_pairDists[gid] = dr; + } // atoms J + } // z + } // y + } // x + } // atoms I + } // replicas + /* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */ + } +} + __global__ void computeKernel(Vector3 force[], Vector3 pos[], int type[], float tableAlpha[], float tableEps[], float tableRad6[], int num, int numParts, BaseGrid* sys, @@ -365,14 +433,13 @@ __global__ void printPairForceCounter() { // Kernel computes forces between Brownian particles (ions) // using cell decomposition // -__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, int* type, - TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond, - int num, const BaseGrid* __restrict__ sys, - const Bond* __restrict__ bonds, const int2* __restrict__ bondMap, int numBonds, - float cutoff2, - const int* __restrict__ g_numPairs, - const int2* __restrict__ g_pair, - const int* __restrict__ g_pairTabPotType) { +__global__ void computeTabulatedKernel( + Vector3* force, const Vector3* __restrict__ pos, int* type, + TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond, + const BaseGrid* __restrict__ sys, + const Bond* __restrict__ bonds, const int2* __restrict__ bondMap, int numBonds, + float cutoff2, const int* __restrict__ g_numPairs, + const int2* __restrict__ g_pair, const int* __restrict__ g_pairTabPotType) { const int numPairs = *g_numPairs; @@ -401,14 +468,18 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict } -__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos, int* type, - TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond, - int num, BaseGrid* __restrict__ sys, - Bond* __restrict__ bonds, int2* __restrict__ bondMap, int numBonds, - float* g_energies, float cutoff2, - int* __restrict__ g_numPairs, - int2* __restrict__ g_pair, - int* __restrict__ g_pairTabPotType) { +__global__ void clearEnergies(float* g_energies, int num) { + for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) { + g_energies[i] = 0.0f; + } +} +__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos, + int* type, TabulatedPotential** __restrict__ tablePot, + TabulatedPotential** __restrict__ tableBond, + BaseGrid* __restrict__ sys, Bond* __restrict__ bonds, + int2* __restrict__ bondMap, int numBonds, float* g_energies, float cutoff2, + int* __restrict__ g_numPairs, int2* __restrict__ g_pair, + int* __restrict__ g_pairTabPotType) { const int numPairs = *g_numPairs; // RBTODO: BONDS (handle through an independent kernel call?) diff --git a/makefile b/makefile index 015abdf..6ec127e 100644 --- a/makefile +++ b/makefile @@ -2,7 +2,8 @@ # CUDA_PATH ?= /Developer/NVIDIA/CUDA-6.5 # CUDA_PATH ?= /software/cuda-toolkit-6.5-x86_64 -CUDA_PATH ?= /usr/local/cuda-7.0 +# CUDA_PATH ?= /usr/local/cuda-7.0 +CUDA_PATH ?= /usr/local/cuda-7.5 #CUDA_PATH ?= /usr/local/encap/cuda-5.5 @@ -11,7 +12,7 @@ include ./findcudalib.mk INCLUDE = $(CUDA_PATH)/include -DEBUG = -g -O0 +# DEBUG = -g -O0 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no NV_FLAGS = -Xptxas -v # -v,-abi=no @@ -47,7 +48,10 @@ CODE_20 := -arch=sm_20 # CODE_35 := -arch=compute_35 # NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35) -NV_FLAGS += -arch=sm_35 +# NV_FLAGS += -arch=sm_35 +SM=52 +NV_FLAGS += -gencode arch=compute_$(SM),code=compute_$(SM) + NVLD_FLAGS := $(NV_FLAGS) --device-link # NV_FLAGS += -rdc=true diff --git a/notes.org b/notes.org index 06de8c0..996b47a 100644 --- a/notes.org +++ b/notes.org @@ -1,8 +1,12 @@ * TODO active -** move to efficient linear interpolation everywhere -** update pairlists -** statistical tests of results +** split computation into multiple regions +*** simulate regions, also in overlap +**** atoms at edge of overlap can have DoF frozen +*** periodically synchronize positions +*** optimize to minimize communication & error +**** monitor energy jumps associated with this process + * TODO eventually ** organize code a bit better -- GitLab