diff --git a/CellDecomposition.cu b/CellDecomposition.cu index 300b19fe187ee680d1f6d5272253e1db471f18d7..07a7b028c8f1404aa94f7bf2e4e84615788bf22c 100644 --- a/CellDecomposition.cu +++ b/CellDecomposition.cu @@ -113,6 +113,8 @@ void CellDecomposition::decompose_d(Vector3 pos_d[], size_t num) { numCells, numReplicas); gpuErrchk(cudaMemcpy(ranges, ranges_d, numCellRep, cudaMemcpyDeviceToHost)); gpuErrchk( cudaFree(temp_ranges) ); + + } // ***************************************************************************** diff --git a/ComputeForce.cu b/ComputeForce.cu index 309845a60bfba3bd4dbc10770b32dd3700e81043..0569f6c20dd461a584fef06f57982fd4df6603dd 100644 --- a/ComputeForce.cu +++ b/ComputeForce.cu @@ -356,6 +356,15 @@ void ComputeForce::decompose(Vector3* pos) { cudaFree(decomp_d); decomp.decompose_d(pos, num); decomp_d = decomp.copyToCUDA(); + + // Update pairlists using cell decomposition (not sure this is really needed or good) + //RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d); + const int NUMTHREADS = 32; + const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1; + createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d); + gpuErrchk(cudaDeviceSynchronize()); + + } IndexList ComputeForce::decompDim() const { @@ -475,10 +484,12 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type, dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); + // Call the kernel to calculate the forces computeTabulatedKernel<<< numBlocks, numThreads >>>(force, pos, type, - tablePot_d, tableBond_d, num, numParts, sys_d, bonds, bondMap, - numBonds, excludes, excludeMap, numExcludes, decomp_d, + tablePot_d, tableBond_d, + num, numParts, sys_d, + bonds, bondMap, numBonds, energies_d, cutoff2, gridSize, numReplicas, get_energy); gpuErrchk(cudaDeviceSynchronize()); diff --git a/ComputeForce.cuh b/ComputeForce.cuh index 1f1e1c5ceabea867442c3739a5fdd52c51195841..c954f005dc00f8d444c8b9d8fd824e8ece43eeb5 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -3,25 +3,21 @@ // Terrance Howard <heyterrance@gmail.com> #pragma once +#include "CudaUtil.cuh" +#include <assert.h> + __host__ __device__ EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha, float start, float len) { float d = r.length(); - + if (d >= start + len) return EnergyForce(); if (d <= start) { float energy = alpha/d - alpha/start + 0.5f*alpha/(start*start)*len; Vector3 force = -alpha/(d*d*d)*r; - if (isnan(force.x)) - printf(">> DamnX.\n"); - if (isnan(force.y)) - printf(">> DamnY.\n"); - if (isnan(force.z)) - printf(">> DamnZ.\n"); - return EnergyForce(energy, force); } @@ -199,6 +195,86 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[], } } + +// RBTODO: remove global device variables for fast prototyping of pairlist kernel +__device__ int g_numPairs = 0; +__device__ int* g_pairI; +__device__ int* g_pairJ; + +__global__ +void createPairlists(Vector3 pos[], int num, int numReplicas, + BaseGrid* sys, CellDecomposition* decomp) { + // Loop over threads searching for atom pairs + // Each thread has designated values in shared memory as a buffer + // A sync operation periodically moves data from shared to global + const int NUMTHREADS = 32; /* RBTODO: fix */ + __shared__ int spid[NUMTHREADS]; + const int tid = threadIdx.x; + const int ai = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid == 0) g_numPairs = 0; /* RBTODO: be more efficient */ + __syncthreads(); + + int numPairs = g_numPairs; + + const int MAXPAIRSPERTHREAD = 256; /* RBTODO Hard limit likely too low */ + int pairI[MAXPAIRSPERTHREAD]; + int pairJ[MAXPAIRSPERTHREAD]; + int pid = 0; + + // ai - index of the particle in the original, unsorted array + if (ai < (num-1) * numReplicas) { + const int repID = ai / (num-1); + // const int typei = type[i]; // maybe irrelevent + const Vector3 posi = pos[ai]; + + // RBTODO: if this approach works well, make cell decomposition finer so limit still works + + // TODO: Fix this: Find correct celli (add a new function to + // CellDecomposition, binary search over cells) + CellDecomposition::cell_t celli = decomp->getCellForParticle(ai); + const CellDecomposition::cell_t* pairs = decomp->getCells(); + for (int x = -1; x <= 1; ++x) { + for (int y = -1; y <= 1; ++y) { + for (int z = -1; z <= 1; ++z) { + if (x+y+z != -3) { // SYNC THREADS + __syncthreads(); + assert(pid < MAXPAIRSPERTHREAD); + + // find where each thread should put its pairs in global list + spid[tid] = pid; + exclIntCumSum(spid,NUMTHREADS); // returns cumsum with spid[0] = 0 + + for (int d = 0; d < pid; d++) { + g_pairI[g_numPairs + d + spid[tid]] = pairI[d]; + g_pairJ[g_numPairs + d + spid[tid]] = pairJ[d]; + } + // update global index + __syncthreads(); + if (tid == NUMTHREADS) g_numPairs += spid[tid] + pid; + pid = 0; + } // END SYNC THREADS + + const int nID = decomp->getNeighborID(celli, x, y, z); + if (nID < 0) continue; // Skip if got wrong or duplicate cell. + const CellDecomposition::range_t range = decomp->getRange(nID, repID); + + for (int n = range.first; n < range.last; n++) { + const int aj = pairs[n].particle; + + // RBTODO: skip exclusions + if (aj <= ai) continue; + + pairI[pid] = ai; + pairJ[pid] = aj; + pid++; /* RBTODO synchronize across threads somehow */ + } // n + } // z + } // y + } // x + } /* replicas */ +} + __global__ void computeKernel(Vector3 force[], Vector3 pos[], int type[], float tableAlpha[], float tableEps[], float tableRad6[], @@ -272,7 +348,7 @@ void computeKernel(Vector3 force[], Vector3 pos[], int type[], } -// =================================================================================== +// ============================================================================ // Kernel computes forces between Brownian particles (ions) // using cell decomposition // @@ -280,142 +356,46 @@ __global__ void computeTabulatedKernel(Vector3* force, Vector3* pos, int* type, TabulatedPotential** tablePot, TabulatedPotential** tableBond, int num, int numParts, BaseGrid* sys, Bond* bonds, int2* bondMap, int numBonds, - Exclude* excludes, int2* excludeMap, int numExcludes, - CellDecomposition* decomp, float* g_energies, float cutoff2, int gridSize, + float* g_energies, float cutoff2, int gridSize, int numReplicas, bool get_energy) { + + const int NUMTHREADS = 32; /* RBTODO: fix */ + __shared__ EnergyForce fe[NUMTHREADS]; + __shared__ int atomI[NUMTHREADS]; + __shared__ int atomJ[NUMTHREADS]; - // Thread's unique ID. - int i = blockIdx.x * blockDim.x + threadIdx.x; - - // Initialize interaction energy (per particle) - float energy_local = 0; + int numPairs = g_numPairs; - // Loop over ALL particles in ALL replicas - if (i < num * numReplicas) { - const int repID = i / num; + const int tid = threadIdx.x; + const int gid = blockIdx.x * blockDim.x + threadIdx.x; + // loop over particle pairs in pairlist + if (gid < numPairs) { // BONDS: RBTODO: handle through an independent kernel call - /* Each particle may have a varying number of bonds - * bondMap is an array with one element for each particle - * which keeps track of where a particle's bonds are stored - * in the bonds array. - * bondMap[i].x is the index in the bonds array where the ith particle's bonds begin - * bondMap[i].y is the index in the bonds array where the ith particle's bonds end - */ - - // Initialize bond_start and bond_end to -1 - // If these are not changed later, then we know this particle does not have any bonds - /* int bond_start = -1; */ - /* int bond_end = -1; */ - - /* // Get bond_start and bond_end from the bondMap */ - /* if (bondMap != NULL) { */ - /* bond_start = bondMap[i - repID * num].x; */ - /* bond_end = bondMap[i - repID * num].y; */ - /* } */ - - /* // currBond is the index in the bonds array that we should look at next */ - /* // currBond is initialized to bond_start because that is the first index of the */ - /* // bonds array where this particle's bonds are stored */ - /* int currBond = bond_start; */ - - /* // nextBond is the ID number of the next particle that this particle is bonded to */ - /* // If this particle has at least one bond, then nextBond is initialized to be the */ - /* // first particle that this particle is bonded to */ - /* int nextBond = (bond_start >= 0) ? bonds[bond_start].ind2 : -1; */ - - // RBTODO: handle exclusions in other kernel call + const int ai = g_pairI[tid]; + const int aj = g_pairJ[tid]; - /* // Same as for bonds, but for exclusions now */ - /* int ex_start = -1; */ - /* int ex_end = -1; */ - /* if (excludeMap != NULL) { */ - /* ex_start = excludeMap[i].x; */ - /* ex_end = excludeMap[i].y; */ - /* } */ - /* int currEx = ex_start; */ - /* int nextEx = (ex_start >= 0) ? excludes[ex_start].ind2 : -1; */ - // Particle's type and position - int typei = type[i]; - Vector3 posi = pos[i]; - const CellDecomposition::cell_t celli = decomp->getCellForParticle(i); - - // Initialize force_local - force on a particle (i) - Vector3 force_local(0.0f); - const CellDecomposition::cell_t* cells = decomp->getCells(); - - // Loop over neighboring cells - for (int x = -1; x <= 1; ++x) { - for (int y = -1; y <= 1; ++y) { - for (int z = -1; z <= 1; ++z) { - // Get ID of the neighboring cell (x, y, z) - const int nID = decomp->getNeighborID(celli, x, y, z); - - // Skip if bad neighbor. - if (nID == -1) continue; + const int ind = type[ai] + type[aj] * numParts; /* RBTODO: why is numParts here? */ - const CellDecomposition::range_t range = decomp->getRange(nID, repID); + // RBTODO: implement wrapDiff2, returns dr2 + const Vector3 dr = sys->wrapDiff(pos[aj] - pos[ai]); - int typej = -1; - int ind = -1; - for (int n = range.first; n < range.last; ++n) { - const int j = cells[n].particle; - if (j == i) continue; + // Calculate the force based on the tabulatedPotential file + fe[tid] = EnergyForce(0.0f, Vector3(0.0f)); + if (tablePot[ind] != NULL && dr.length2() <= cutoff2) + fe[tid] = tablePot[ind]->compute(dr); - const int newj = type[j]; - if (typej != newj) { - typej = newj; - ind = typei + typej * numParts; - } - - // Find the distance between particles i and j, - // wrapping this value if necessary - const Vector3 dr = sys->wrapDiff(pos[j] - posi); + // RBTODO: think of a better approach; e.g. shared atomicAdds that are later reduced in global memory + atomicAdd( &force[ai], fe[tid].f ); + atomicAdd( &force[aj], -fe[tid].f ); - // First, calculate the force based on the tabulatedPotential file - EnergyForce ft(0.0f, Vector3(0.0f)); - /* bool exclude = (nextEx == j); */ - /* if (exclude) */ - /* nextEx = (currEx < ex_end - 1) ? excludes[++currEx].ind2 : -1; */ - - if (tablePot[ind] != NULL && dr.length2() <= cutoff2) - ft = tablePot[ind]->compute(dr); - - /* // If the next bond we want is the same as j, then there is a bond */ - /* // between particles i and j. */ - /* if (nextBond == j && tableBond != NULL) { */ - - /* // Calculate the force on the particle from the bond */ - /* // If the user has specified the REPLACE option for this bond, */ - /* // then overwrite the force we calculated from the regular */ - /* // tabulated potential */ - /* // If the user has specified the ADD option, then add the bond */ - /* // force to the tabulated potential value */ - /* EnergyForce bond_ef = */ - /* tableBond[ bonds[currBond].tabFileIndex ]->compute(dr); */ - /* switch (bonds[currBond].flag) { */ - /* case Bond::REPLACE: ft = bond_ef; break; */ - /* case Bond::ADD: ft += bond_ef; break; */ - /* default: break; */ - /* } */ - - /* // Increment currBond, so that we can find the index of the */ - /* // next particle that this particle is bonded to. */ - /* nextBond = (currBond < bond_end - 1) ? bonds[++currBond].ind2 : -1; */ - /* } */ - force_local += ft.f; - if (get_energy && j > i) - energy_local += ft.e; - } // n - } // z - } // y - } // z - force[i] = force_local; - if (get_energy) - g_energies[i] = energy_local; + // RBTODO: why are energies calculated for each atom? Could be reduced + if (get_energy && aj > ai) + atomicAdd( &(g_energies[ai]), fe[tid].e ); } + } diff --git a/ComputeForce.h b/ComputeForce.h index 38fdf36d4887921b36471068de7aed0f9d5258e1..c9f6fcb57fd1ea9fde2ea1cffcb74228938d2e4f 100644 --- a/ComputeForce.h +++ b/ComputeForce.h @@ -111,6 +111,10 @@ private: TabulatedAnglePotential **tableAngle_d, **tableAngle_addr; TabulatedDihedralPotential **tableDihedral_d, **tableDihedral_addr; + // Pairlists + int *pairIds_d; + int numPairs; + int numPairs_d; }; #endif diff --git a/CudaUtil.cuh b/CudaUtil.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f44034f68ef35acc7651bca5932dc88fc536ba03 --- /dev/null +++ b/CudaUtil.cuh @@ -0,0 +1,76 @@ +#pragma once + +__device__ inline void exclIntCumSum(int* in, const int n) { + // 1) int* in must point to shared memory + // 2) int n must be power of 2 + const int tid = threadIdx.x; + // RBTODO: worry about possible bank conflicts http://www.eecs.umich.edu/courses/eecs570/hw/parprefix.pdf + + // build tree of sums + int stride = 1; + for (int d = n>>1; d > 0; d >>= 1) { + __syncthreads(); + if (tid < d) { + int id = 2*stride*(tid+1)-1; + in[id] += in[id-stride]; + } + stride *= 2; + } + if (tid == 0) in[n-1] = 0; /* exclusive cumsum (starts at 0) */ + + // traverse down tree and build 'scan' + for (int d = 1; d < n; d*= 2) { + stride >>= 1; + __syncthreads(); + + if (tid < d) { // RBTODO: this could be incorrect ==> test + int id = 2*stride*(tid+1)-1; + int t = in[id]; + in[id] += in[id-stride]; + in[id-stride] = t; + } + } + __syncthreads(); +} + + +__device__ inline void inclIntCumSum(int* in, const int n) { + // 1) int* in must point to shared memory + // 2) int n must be power of 2 + const int tid = threadIdx.x; + + // RBTODO: worry about possible bank conflicts http://www.eecs.umich.edu/courses/eecs570/hw/parprefix.pdf + + // build tree of sums + int stride = 1; + for (int d = n>>1; d > 0; d >>= 1) { + __syncthreads(); + if (tid < d) { + int id = 2*stride*(tid+1)-1; + in[id] += in[id-stride]; + } + stride *= 2; + } + // if (tid == 0) in[n-1] = 0; /* exclusive cumsum (starts at 0) */ + + // traverse down tree and build 'scan' + for (int d = 1; d < n; d*= 2) { + stride >>= 1; + __syncthreads(); + + if (tid < d) { // RBTODO: this could be incorrect ==> test + int id = 2*stride*(tid+1)-1; + in[id+stride] += in[id]; + /* int t = in[id]; */ + /* in[id] += in[id-stride]; */ + /* in[id-stride] = t; */ + } + } + __syncthreads(); +} + +__device__ void atomicAdd( Vector3* address, Vector3 val) { + atomicAdd( &(address->x), val.x); + atomicAdd( &(address->y), val.y); + atomicAdd( &(address->z), val.z); +} diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu index 92c6dfd694a7083938925a973a1271a977effdcb..1fdd9e5d2683b83ebe18d9984fc721ecf8651ead 100644 --- a/GrandBrownTown.cu +++ b/GrandBrownTown.cu @@ -334,8 +334,10 @@ void GrandBrownTown::run() { // 1 - do not use cutoff [ N^2 ] switch (fullLongRange) { case 0: // [ N*log(N) ] interactions, + cutoff | decomposition - if (s % decompPeriod == 0) + if (s % decompPeriod == 0) { internal->decompose(pos_d); + //internal->updatePairlists(pos_d); // perhaps this way? + } energy = internal->computeTabulated(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d, diff --git a/makefile b/makefile index a7de250d1b0280edfcadf243c2f76c0bb5bbaf75..8d5c8eb1ddc329cae1398ea5d348be94f645e106 100644 --- a/makefile +++ b/makefile @@ -11,13 +11,13 @@ include ./findcudalib.mk INCLUDE = $(CUDA_PATH)/include -# DEBUG = -g -O0 +DEBUG = -g -O0 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic -# NV_FLAGS = -g -G #debug # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no NV_FLAGS = -Xptxas -v # -v,-abi=no EX_FLAGS = -O3 -m$(OS_SIZE) # EX_FLAGS = -O0 -m$(OS_SIZE) +NV_FLAGS += -g -G #debug ifneq ($(MAVERICKS),) CC = $(CLANG)