diff --git a/BaseGrid.h b/BaseGrid.h index b4ac7c5452736af9ba02ad2a4ce7b44f2fe9e36f..4eca85da1652936533a0b3907249b341b96a6495 100644 --- a/BaseGrid.h +++ b/BaseGrid.h @@ -26,6 +26,8 @@ using namespace std; #define STRLEN 512 +DEVICE float __fdividef(float x, float y); + class NeighborList { public: float v[3][3][3]; @@ -649,7 +651,14 @@ public: } // Wrap distance: -0.5*l <= x < 0.5*l - HOST DEVICE static inline float wrapDiff(float x, float l) { + DEVICE static inline float d_wrapDiff(float x, float l) { + // int image = int(floor(x/l)); + int image = int(floorf( __fdividef(x,l) )); + x -= image*l; + x = (x >= 0.5f * l) ? x-l : x; + return x; + } + HOST DEVICE static inline float wrapDiff(float x, float l) { int image = int(floor(x/l)); x -= image*l; if (x >= 0.5f * l) @@ -681,6 +690,13 @@ public: l.z = wrapDiff(l.z, nz); return basis.transform(l); } + DEVICE inline Vector3 d_wrapDiff(Vector3 r) const { + Vector3 l = basisInv.transform(r); + l.x = d_wrapDiff(l.x, nx); + l.y = d_wrapDiff(l.y, ny); + l.z = d_wrapDiff(l.z, nz); + return basis.transform(l); + } Vector3 wrapDiffNearest(Vector3 r) const; diff --git a/ComputeForce.cu b/ComputeForce.cu index 3ef58db501944731fa3e7142d27fb56a47ae3b2a..c311281fbcd0808ddc835cc09c0fe0aa9e8d54a6 100644 --- a/ComputeForce.cu +++ b/ComputeForce.cu @@ -380,10 +380,9 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { // RBTODO: free memory elsewhere // allocate device data // initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell); - const int maxPairs = 1<<29; + const int maxPairs = 1<<26; gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int)*maxPairs)); - gpuErrchk(cudaMalloc(&pairListsI_d, sizeof(int)*maxPairs)); - gpuErrchk(cudaMalloc(&pairListsJ_d, sizeof(int)*maxPairs)); + gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs)); gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs)); int tmp = 0; gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp, @@ -420,7 +419,7 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells, blocksPerCell, - numPairs_d, pairListsI_d, pairListsJ_d, + numPairs_d, pairLists_d, numParts, type, pairTabPotType_d,pairlistdist2); gpuErrchk(cudaDeviceSynchronize()); @@ -548,14 +547,14 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type, // Call the kernel to calculate the forces // int nb = (decomp.nCells.x * decomp.nCells.y * decomp.nCells.z); // int nb = (1+(decomp.nCells.x * decomp.nCells.y * decomp.nCells.z)) * 75; /* RBTODO: number of pairLists */ - const int nb = 200; + const int nb = 800; // printf("ComputeTabulated\n"); computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type, tablePot_d, tableBond_d, - num, numParts, sys_d, - bonds, bondMap, numBonds, - energies_d, cutoff2, gridSize, numReplicas, get_energy, - numPairs_d, pairListsI_d, pairListsJ_d, + num, sys_d, + bonds, bondMap, numBonds, + cutoff2, + numPairs_d, pairLists_d, pairTabPotType_d); gpuErrchk(cudaDeviceSynchronize()); diff --git a/ComputeForce.cuh b/ComputeForce.cuh index 15a3a1804c222cf7097dcc7c8b1a3704bf621893..6d6350c7aac03f058028fdd0ee4db66b0eceb006 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -213,11 +213,11 @@ const __device__ int maxPairs = 1 << 14; /* } */ __global__ -void createPairlists(Vector3 pos[], int num, int numReplicas, - BaseGrid* sys, CellDecomposition* decomp, +void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas, + BaseGrid* sys, CellDecomposition* __restrict__ decomp, const int nCells, const int blocksPerCell, - int* g_numPairs, int* g_pairI, int* g_pairJ, - int numParts, int type[], int* g_pairTabPotType, + int* g_numPairs, int2* g_pair, + int numParts, int type[], int* __restrict__ g_pairTabPotType, float pairlistdist2) { // Loop over threads searching for atom pairs // Each thread has designated values in shared memory as a buffer @@ -267,12 +267,11 @@ void createPairlists(Vector3 pos[], int num, int numReplicas, // RBTODO: skip exclusions, non-interacting types int pairType = type[ai] + type[aj] * numParts; - int gid = atomicAggInc( g_numPairs, warpLane ); + int gid = atomicAggInc( g_numPairs, warpLane ); /* assert( ai >= 0 ); */ /* assert( aj >= 0 ); */ - g_pairI[gid] = ai; - g_pairJ[gid] = aj; + g_pair[gid] = make_int2(ai,aj); g_pairTabPotType[gid] = pairType; } // atoms J @@ -366,83 +365,75 @@ __global__ void printPairForceCounter() { // Kernel computes forces between Brownian particles (ions) // using cell decomposition // -__global__ void computeTabulatedKernel(Vector3* force, Vector3* pos, int* type, - TabulatedPotential** tablePot, TabulatedPotential** tableBond, - int num, int numParts, BaseGrid* sys, - Bond* bonds, int2* bondMap, int numBonds, - float* g_energies, float cutoff2, int gridSize, - int numReplicas, bool get_energy, - int* g_numPairs, int* g_pairI, int* g_pairJ, - int* g_pairTabPotType) { +__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, int* type, + TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond, + int num, const BaseGrid* __restrict__ sys, + const Bond* __restrict__ bonds, const int2* __restrict__ bondMap, int numBonds, + float cutoff2, + const int* __restrict__ g_numPairs, + const int2* __restrict__ g_pair, + const int* __restrict__ g_pairTabPotType) { - // const int NUMTHREADS = 256; /* RBTODO: fix */ - // __shared__ EnergyForce fe[NUMTHREADS]; - /* __shared__ int atomI[NUMTHREADS]; */ - /* __shared__ int atomJ[NUMTHREADS]; */ - // const int& tid = threadIdx.x; - // const int blockStride = (*g_numPairs) / gridDim.x > 0 ? (*g_numPairs) / gridDim.x : 1 ; /* RBTODO: fix this */ const int numPairs = *g_numPairs; - const int blockStride = ceil(((float)(numPairs)) / gridDim.x); - - - /* if (threadIdx.x == 0) { */ - /* printf( "block %d running from pair %d to %d\n", blockIdx.x, blockIdx.x*blockStride, blockIdx.x*blockStride + blockStride -1 );\ */ - /* } */ - - - // RBTODO: fix this - // loop over particle pairs in pairlist - /* if (threadIdx.x == 0 && blockIdx.x == 0) { */ - /* printf("%d %d %d\n", *g_numPairs, gridDim.x, blockStride); */ - /* printf("%d %d\n", blockDim.x, gridDim.x); */ - /* } */ - - /* if (threadIdx.x == 0 && blockIdx.x == 0) { */ - /* printf("blockStride: %d\n",blockStride); */ - /* for (int i = 1; i < 1; i++) */ - /* printf("TEST FAILED\n"); */ - - /* printf("g_numPairs: %d\n", numPairs); */ - - for (int i = threadIdx.x + blockIdx.x*blockStride; i < (blockIdx.x+1)*blockStride && i < numPairs; i+=blockDim.x) { - + for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < numPairs; i+=blockDim.x*gridDim.x) { + const int2 pair = g_pair[i]; + const int ai = pair.x; + const int aj = pair.y; // BONDS: RBTODO: handle through an independent kernel call - const int ai = g_pairI[i]; - const int aj = g_pairJ[i]; - if (ai < 0) continue; // Particle's type and position - // const int ind = type[ai] + type[aj] * numParts; /* RBTODO: why is numParts here? */ const int ind = g_pairTabPotType[i]; - // if (tablePot[ind] == NULL) continue; - - // RBTODO: implement wrapDiff2, returns dr2 + /* printf("tid,bid,pos[ai(%d)]: %d %d %f %f %f\n", ai, threadIdx.x, blockIdx.x, pos[ai].x, pos[ai].y, pos[ai].z); */ - /* printf("pos[ai(%d)]: %f %f %f\n", ai, pos[ai].x, pos[ai].y, pos[ai].z); */ - /* printf("pos[aj(%d)]: %f %f %f\n", aj, pos[aj].x, pos[aj].y, pos[aj].z); */ - const Vector3 dr = sys->wrapDiff(pos[aj] - pos[ai]); - // const Vector3 dr = pos[aj] - pos[ai]; - + Vector3 dr = pos[aj] - pos[ai]; + dr = sys->d_wrapDiff(dr); + // Calculate the force based on the tabulatedPotential file - // if (dr.length2() > cutoff2) continue; - if (tablePot[ind] != NULL && dr.length2() <= cutoff2) { - EnergyForce fe = tablePot[ind]->compute(dr); - - // RBTODO: think of a better approach; e.g. shared atomicAdds that are later reduced in global memory - atomicAdd( &force[ai], -fe.f ); - atomicAdd( &force[aj], fe.f ); + float d2 = dr.length2(); + if (tablePot[ind] != NULL && d2 <= cutoff2) { + Vector3 f = tablePot[ind]->computef(dr,d2); + atomicAdd( &force[ai], -f ); + atomicAdd( &force[aj], f ); + } + } +} - /* printf("force[ai(%d)]: %0.2f %0.2f %0.2f\n", ai, fe.f.x, fe.f.y, fe.f.z); */ - // atomicAdd( &pairForceCounter, 1 ); /* DEBUG */ - +__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos, int* type, + TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond, + int num, BaseGrid* __restrict__ sys, + Bond* __restrict__ bonds, int2* __restrict__ bondMap, int numBonds, + float* g_energies, float cutoff2, + int* __restrict__ g_numPairs, + int2* __restrict__ g_pair, + int* __restrict__ g_pairTabPotType) { + + const int numPairs = *g_numPairs; + // RBTODO: BONDS (handle through an independent kernel call?) + for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < numPairs; i+=blockDim.x*gridDim.x) { + const int2 pair = g_pair[i]; + const int ai = pair.x; + const int aj = pair.y; + const int ind = g_pairTabPotType[i]; + + // RBTODO: implement wrapDiff2, returns dr2 (???) + Vector3 dr = pos[aj] - pos[ai]; + dr = sys->d_wrapDiff(dr); + // Calculate the force based on the tabulatedPotential file + float d2 = dr.length2(); + /* RBTODO: filter tabpot particles ahead of time */ + // RBTODO: order pairs according to distance to reduce divergence + + if (tablePot[ind] != NULL && d2 <= cutoff2) { + EnergyForce fe = tablePot[ind]->compute(dr,d2); + // RBTODO: is this the best approach? + atomicAdd( &force[ai], -fe.f ); + atomicAdd( &force[aj], fe.f ); // RBTODO: why are energies calculated for each atom? Could be reduced - if (get_energy) { - atomicAdd( &(g_energies[ai]), fe.e ); - atomicAdd( &(g_energies[aj]), fe.e ); - } + atomicAdd( &(g_energies[ai]), fe.e ); + atomicAdd( &(g_energies[aj]), fe.e ); } } } diff --git a/ComputeForce.h b/ComputeForce.h index f1f56feb5f4ab6e7ff194126673da37178c8030b..ea1761d9fbec60137a02f36f33a3dc5fb054b041 100644 --- a/ComputeForce.h +++ b/ComputeForce.h @@ -113,8 +113,7 @@ private: TabulatedDihedralPotential **tableDihedral_d, **tableDihedral_addr; // Pairlists - int *pairListsI_d; - int *pairListsJ_d; + int2 *pairLists_d; int *pairTabPotType_d; int *numPairs_d; diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh index 586108b455f68f88784d6160cac5b2c7dab2b18c..066680dff4697656c13e89ab0825500a94b03dfd 100644 --- a/ComputeGridGrid.cuh +++ b/ComputeGridGrid.cuh @@ -1,7 +1,7 @@ // Included in RigidBodyController.cu #pragma once - +//RBTODO: add __restrict__, benchmark (Q: how to restrict member data?) __global__ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, const Matrix3 basis_rho, const Matrix3 basis_u_inv, diff --git a/GPUManager.cpp b/GPUManager.cpp index 393b649bbd0b1e11123734c3c2359cd7d5bdbc05..b3f7d1d6db8cc4b054f9fe1b1d08fb09c8f41063 100644 --- a/GPUManager.cpp +++ b/GPUManager.cpp @@ -86,6 +86,7 @@ void GPUManager::init_devices() { void GPUManager::set(int gpu_id) { gpu_id = gpu_id % (int) gpus.size(); cudaSetDevice(gpus[gpu_id]); + cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 ); } int GPUManager::current() { diff --git a/TabulatedPotential.h b/TabulatedPotential.h index 1c47343e80e244d396a5eedadc8ade74096e8314..91a401a46a85d0c06bf3a38fbc50e29b5cff49c4 100644 --- a/TabulatedPotential.h +++ b/TabulatedPotential.h @@ -80,6 +80,40 @@ public: Vector3 force = (-du/(d*dr))*r; return EnergyForce(energy,force); } + HOST DEVICE inline EnergyForce compute(Vector3 r, float d) { + d = sqrt(d); + // float d = r.length(); + float w = (d - r0)/dr; + int home = int( floorf(w) ); + w = w - home; + // if (home < 0) return EnergyForce(v0[0], Vector3(0.0f)); + home = home < 0 ? 0 : home; + if (home >= n) return EnergyForce(e0, Vector3(0.0f)); + + float u0 = v0[home]; + float du = home+1 < n ? v0[home+1]-u0 : 0; + + // Interpolate. + float energy = du*w+u0; + Vector3 force = (-du/(d*dr))*r; + return EnergyForce(energy,force); + } + HOST DEVICE inline Vector3 computef(Vector3 r, float d) { + d = sqrt(d); + // float d = r.length(); + // RBTODO: precompute so that initial blocks are zero; reduce computation here + float w = (d - r0)/dr; + int home = int( floorf(w) ); + w = w - home; + // if (home < 0) return EnergyForce(v0[0], Vector3(0.0f)); + home = home < 0 ? 0 : home; + if (home >= n) return Vector3(0.0f); + + if (home+1 < n) + return (-(v0[home+1]-v0[home])/(d*dr))*r; + else + return Vector3(0.0f); + } // private: public: diff --git a/runBrownTown.cpp b/runBrownTown.cpp index fd1bd4bbfa0cec2d0022119829d5d53892eff204..49b272df357592d2f2ff18f41e0f3b3145de5581 100644 --- a/runBrownTown.cpp +++ b/runBrownTown.cpp @@ -19,6 +19,7 @@ int main(int argc, char* argv[]) { printf("Usage: %s [OPTIONS [ARGS]] CONFIGFILE OUTPUT [SEED]\n", argv[0]); printf("\n"); printf(" -r, --replicas= Number of replicas to run\n"); + printf(" -g, --gpu= Index of gpu to use (defaults to 0)\n"); printf(" -i, --imd= IMD port (defaults to %d)\n", kIMDPort); printf(" -d, --debug Debug mode: allows user to choose which forces are computed\n"); printf(" --safe Do not use GPUs that may timeout (default)\n"); @@ -42,7 +43,10 @@ int main(int argc, char* argv[]) { printf("Try '%s --help' for more information.\n", argv[0]); return 1; } - + size_t n_gpus = max(GPUManager::gpus.size(), 1lu); + + int gpuID = 0; + bool debug = false, safe = true; int replicas = 1; unsigned int imd_port; @@ -59,6 +63,16 @@ int main(int argc, char* argv[]) { } else if (strcmp(arg, "-d") == 0 || strcmp(arg, "--debug") == 0) { debug = true; num_flags++; + + } else if (strcmp(arg, "-g") == 0 || strcmp(arg, "--gpu") == 0) { + int arg_val = atoi(argv[pos + 1]); + gpuID = arg_val; + num_flags += 2; + if (arg_val < 0 || arg_val > n_gpus) { + printf("Invalid argument given to %s\n", arg); + return 1; + } + } else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--replicas") == 0) { int arg_val = atoi(argv[pos + 1]); if (arg_val <= 0) { @@ -104,7 +118,7 @@ int main(int argc, char* argv[]) { GPUManager::safe(safe); Configuration config(configFile, replicas, debug); // GPUManager::set(0); - GPUManager::set(1); + GPUManager::set(gpuID); config.copyToCUDA(); GrandBrownTown brown(config, outArg, randomSeed,