From 202d5daaca7fb122412096ab50fd6bc558ec790e Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Thu, 6 Oct 2016 13:54:33 -0500
Subject: [PATCH] Refactored CudaUtil; Added RandomCUDA routines that take
 state as parameter

---
 ComputeForce.cuh     | 16 ++++++++--------
 CudaUtil.cu          | 43 +++++++++++++++++++++++++++++++++++++++++++
 CudaUtil.cuh         | 20 +++++---------------
 GrandBrownTown.cu    | 18 +++++++++++++++++-
 GrandBrownTown.cuh   | 11 ++++++++---
 RandomCUDA.h         | 17 ++++++++++++++---
 TabulatedMethods.cuh |  4 ++--
 notes.org            | 14 ++++++++++++++
 8 files changed, 111 insertions(+), 32 deletions(-)
 create mode 100644 CudaUtil.cu

diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 0fa23ba..6e78f8e 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -286,7 +286,7 @@ void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas,
 }
 
 __global__
-void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
+void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas,
 			const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
 			const int nCells, int* g_numPairs, int2* g_pair,
@@ -462,11 +462,12 @@ __global__ void computeTabulatedKernel(
 }
 
-__global__ void clearEnergies(float* g_energies, int num) {
+__global__ void clearEnergies(float* __restrict__ g_energies, int num) {
 	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
 		g_energies[i] = 0.0f;
 	}
 }
+
 __global__ void computeTabulatedEnergyKernel(Vector3* force, const Vector3* __restrict__ pos,
 				const BaseGrid* __restrict__ sys, float cutoff2,
 				const int* __restrict__ g_numPairs, const int2* __restrict__ g_pair,
 				const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot, float* g_energies) {
@@ -692,7 +693,7 @@ void computeAngles(Vector3 force[], Vector3 pos[],
 	}
 }*/
 
-__global__ void computeTabulatedBonds(Vector3* __restrict__ force,
+__global__ void computeTabulatedBonds(Vector3* force,
 				Vector3* __restrict__ pos,
 				BaseGrid* __restrict__ sys,
 				int numBonds, int3* __restrict__ bondList_d, TabulatedPotential** tableBond) {
@@ -727,7 +728,7 @@ __global__ void computeTabulatedBonds(Vector3* __restrict__ force,
 
 // TODO: add kernel for energy calculation
 __global__
-void computeTabulatedAngles(Vector3* __restrict__ force,
+void computeTabulatedAngles(Vector3* force,
 			Vector3* __restrict__ pos,
 			BaseGrid* __restrict__ sys,
 			int numAngles, int4* __restrict__ angleList_d, TabulatedAnglePotential** tableAngle) {
@@ -834,12 +835,12 @@ void computeDihedrals(Vector3 force[], Vector3 pos[],
 //			bool get_energy, TabulatedDihedralPotential** __restrict__ tableDihedral) {
 __global__
-void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+void computeTabulatedDihedrals(Vector3* force, const Vector3* __restrict__ pos,
 			const BaseGrid* __restrict__ sys,
-			int numDihedrals, const int4* __restrict__ dihedralList_d,
+			int numDihedrals, const int4* const __restrict__ dihedralList_d,
 			const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral) {
 
-	int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
+	// int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
 
 	// Loop over ALL dihedrals in ALL replicas
 	// TODO make grid stride loop
@@ -856,4 +857,3 @@ void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __res
 		// }
 	}
 }
-
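The last hunk above comments out the per-thread dihedral index and leaves "TODO make grid stride loop". A minimal sketch of the grid-stride form that TODO suggests (not part of this patch; it assumes dihedralList_d holds the four particle IDs of dihedral i and dihedralPotList_d the index into tableDihedral, consistent with the signatures above and with computeDihedral() in TabulatedMethods.cuh):

__global__
void computeTabulatedDihedrals(Vector3* force, const Vector3* __restrict__ pos,
			const BaseGrid* __restrict__ sys,
			int numDihedrals, const int4* const __restrict__ dihedralList_d,
			const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral) {
	// Grid-stride loop: thread t handles dihedrals t, t+stride, t+2*stride, ...
	for (int i = threadIdx.x + blockIdx.x*blockDim.x; i < numDihedrals;
			i += blockDim.x*gridDim.x) {
		const int4 ids = dihedralList_d[i];     // the four particle IDs
		const int  pot = dihedralPotList_d[i];  // which tabulated potential to use
		computeDihedral(tableDihedral[pot], sys, force, pos, ids.x, ids.y, ids.z, ids.w);
	}
}

Unlike the one-thread-per-dihedral version, this form is correct for any launch configuration, since each thread keeps consuming work until i reaches numDihedrals.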
diff --git a/CudaUtil.cu b/CudaUtil.cu
new file mode 100644
index 0000000..40f8ad2
--- /dev/null
+++ b/CudaUtil.cu
@@ -0,0 +1,43 @@
+#include "CudaUtil.cuh"
+
+__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
+__device__ int atomicAggInc(int *ctr, int warpLane) {
+	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
+	int mask = __ballot(1);
+	int leader = __ffs(mask)-1;
+
+	int res;
+	if ( warpLane == leader )
+		res = atomicAdd(ctr, __popc(mask));
+	res = warp_bcast(res,leader);
+
+	return res + __popc( mask & ((1 << warpLane) - 1) );
+}
+
+__global__
+void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector) {
+	extern __shared__ Vector3 blockVector[];
+	const int tid = threadIdx.x;
+
+	// grid-stride loop over vector[]
+	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
+		// assign vector to shared memory
+		blockVector[tid] = vector[i];
+		// blockVector[tid] = Vector3(0.0f);
+		__syncthreads();
+
+
+		// Reduce vectors in shared memory
+		// http://www.cuvilib.com/Reduction.pdf
+		for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
+			if (tid < offset) {
+				int oid = tid + offset;
+				blockVector[tid] = blockVector[tid] + blockVector[oid];
+			}
+			__syncthreads();
+		}
+
+		if (tid == 0)
+			atomicAdd( netVector, blockVector[0] );
+	}
+}
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 49e0f83..3442969 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -1,22 +1,13 @@
 #pragma once
-
+#include "useful.h"
 #define WARPSIZE 32
 
-__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
-__device__ int atomicAggInc(int *ctr, int warpLane) {
-	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
-	int mask = __ballot(1);
-	int leader = __ffs(mask)-1;
-
-	int res;
-	if ( warpLane == leader )
-		res = atomicAdd(ctr, __popc(mask));
-	res = warp_bcast(res,leader);
-	return res + __popc( mask & ((1 << warpLane) - 1) );
-}
+extern __device__ int warp_bcast(int v, int leader);
+extern __device__ int atomicAggInc(int *ctr, int warpLane);
+extern __global__
+void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);
 
-
 __device__ inline void exclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
 	// 2) int n must be power of 2
@@ -50,7 +41,6 @@ __device__ inline void exclIntCumSum(int* in, const int n) {
 	__syncthreads();
 }
 
-
 __device__ inline void inclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
 	// 2) int n must be power of 2
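atomicAggInc() implements the warp-aggregated atomic increment from the NVIDIA blog post cited in the new file: the active lanes elect a leader via __ballot()/__ffs(), the leader performs a single atomicAdd() for the whole warp, and each lane derives a unique offset from the ballot mask. A hedged usage sketch of the stream-compaction pattern this enables (filterPositive and its arguments are hypothetical, not from this codebase):

// Hypothetical kernel: compact the positive entries of in[] into out[],
// paying one atomicAdd per warp rather than one per passing thread.
__global__ void filterPositive(const int num, const int* __restrict__ in,
			int* __restrict__ out, int* outCount) {
	const int warpLane = threadIdx.x % WARPSIZE;
	for (int i = threadIdx.x + blockIdx.x*blockDim.x; i < num;
			i += blockDim.x*gridDim.x) {
		if (in[i] > 0) {
			const int dst = atomicAggInc(outCount, warpLane); // unique slot
			out[dst] = in[i];
		}
	}
}

Because __ballot(1) executes inside the branch, the mask captures exactly the lanes that passed the test, so the aggregation remains correct under divergence (with the pre-Volta __ballot/__shfl intrinsics used here). One caveat on reduceVector(): its grid-stride loop calls __syncthreads(), and only threads with i < num refresh blockVector[tid], so unless num is a multiple of blockDim.x*gridDim.x a block's threads can make different numbers of iterations and fold stale shared-memory entries into the sum.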
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index 4e53b81..4eb2572 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -305,7 +305,8 @@ GrandBrownTown::~GrandBrownTown() {
 // Run the Brownian Dynamics steps.
 void GrandBrownTown::run() {
 	printf("\n");
-
+	Vector3 runningNetForce(0.0f);
+
 	// Open the files for recording ionic currents
 	for (int repID = 0; repID < numReplicas; ++repID) {
 		newCurrent(repID);
@@ -452,6 +453,21 @@ void GrandBrownTown::run() {
 
 		RBC.updateForces(s);	/* update RB forces before update particle positions... */
+		/* DEBUG: reduce net force on particles
+		{
+			Vector3 netForce(0.0f);
+			Vector3* netForce_d;
+			gpuErrchk(cudaMalloc(&netForce_d, sizeof(Vector3)));
+			gpuErrchk(cudaMemcpy(netForce_d, &netForce, sizeof(Vector3), cudaMemcpyHostToDevice));
+			reduceVector<<< 500, NUM_THREADS, NUM_THREADS*sizeof(Vector3)>>>(
+				num, internal->getForceInternal_d(), netForce_d);
+			gpuErrchk(cudaDeviceSynchronize());
+			gpuErrchk(cudaMemcpy(&netForce, netForce_d, sizeof(Vector3), cudaMemcpyDeviceToHost));
+			runningNetForce = runningNetForce + netForce;
+			printf("Net Force: %f %f %f\n", runningNetForce.x,runningNetForce.y,runningNetForce.z);
+		}
+		// */
+
 		//MLog: Call the kernel to update the positions of each particle
 		// cudaSetDevice(0);
 		updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
diff --git a/GrandBrownTown.cuh b/GrandBrownTown.cuh
index d5a1d16..da0ffd9 100644
--- a/GrandBrownTown.cuh
+++ b/GrandBrownTown.cuh
@@ -1,9 +1,10 @@
 // GrandBrownTown.cuh
 //
 // Terrance Howard <heyterrance@gmail.com>
-
 #pragma once
 
+#include "CudaUtil.cuh"
+
 __device__
 Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 			 float timestep, BaseGrid *sys, Random *randoGen, int num);
@@ -20,7 +21,7 @@ void clearInternalForces(Vector3* __restrict__ forceInternal, const int num) {
 	forceInternal[idx] = Vector3(0.0f);
 }
 __global__
-void updateKernel(Vector3 pos[], Vector3 forceInternal[],
+void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 				  int type[], BrownianParticleType* part[],
 				  float kT, BaseGrid* kTGrid,
 				  float electricField, int tGridLength,
@@ -29,7 +30,9 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 	// Calculate this thread's ID
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
-	// Loop over ALL particles in ALL replicas
+
+	// TODO: Make this a grid-stride loop to make efficient reuse of RNG states
+	// Loop over ALL particles in ALL replicas
 	if (idx < num * numReplicas) {
 		const int t = type[idx];
 		Vector3& p = pos[idx];
@@ -102,6 +105,7 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 			 Vector3 diffGrad, float timestep, BaseGrid *sys,
 			 Random *randoGen, int num) {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	// TODO: improve performance by storing state locally, then sending it back to GPU
 	Vector3 rando = randoGen->gaussian_vector(idx, num);
 
 	diffusion *= timestep;
@@ -109,6 +113,7 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 	Vector3 r = r0 + (diffusion / kTlocal) * force
 					+ timestep * diffGrad
 					+ sqrtf(2.0f * diffusion) * rando;
+	// Wrap about periodic boundaries
 	Vector3 l = sys->getInverseBasis().transform(r - sys->getOrigin());
 	l.x = sys->wrapFloat(l.x, sys->getNx());
 	l.y = sys->wrapFloat(l.y, sys->getNy());
diff --git a/RandomCUDA.h b/RandomCUDA.h
index e031785..b3f719a 100644
--- a/RandomCUDA.h
+++ b/RandomCUDA.h
@@ -46,12 +46,23 @@ public:
 			return curand_normal(&states[idx]);
 		return 0.0f;
 	}
+	DEVICE inline float gaussian(curandState* state) {
+		return curand_normal(state);
+	}
 	DEVICE inline Vector3 gaussian_vector(int idx, int num) {
 		// TODO do stuff
-		float x = gaussian(idx, num);
-		float y = gaussian(idx, num);
-		float z = gaussian(idx, num);
+		if (idx < num) {
+			curandState localState = states[idx];
+			Vector3 v = gaussian_vector(&localState);
+			states[idx] = localState;
+			return v;
+		} else return Vector3(0.0f);
+	}
+	DEVICE inline Vector3 gaussian_vector(curandState* state) {
+		float x = gaussian(state);
+		float y = gaussian(state);
+		float z = gaussian(state);
 		return Vector3(x, y, z);
 	}
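The new state-based overloads let a kernel copy its curandState into registers once, draw from it repeatedly, and store it back, which is what the TODOs added to GrandBrownTown.cuh anticipate. A rough sketch of that usage pattern (an assumption, not code in this patch; the get_state() accessor is hypothetical and stands in for however the per-thread state array ends up being exposed):

// Sketch: per-thread RNG state cached in registers across a grid-stride loop.
__global__ void stepParticles(Vector3* pos, const int num, Random* randoGen) {
	const int tid = threadIdx.x + blockIdx.x*blockDim.x;
	curandState localState = *randoGen->get_state(tid);	// load state once
	for (int idx = tid; idx < num; idx += blockDim.x*gridDim.x) {
		const Vector3 rando = randoGen->gaussian_vector(&localState);
		pos[idx] = pos[idx] + rando;	// placeholder for the real BD update
	}
	*randoGen->get_state(tid) = localState;	// store state back once
}

This pays the global-memory traffic for the bulky curandState once per thread instead of once per particle per step.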
diff --git a/TabulatedMethods.cuh b/TabulatedMethods.cuh
index 3a07b7c..af169c8 100644
--- a/TabulatedMethods.cuh
+++ b/TabulatedMethods.cuh
@@ -2,7 +2,7 @@
 
 #define BD_PI 3.1415927f
 
-__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* force, const Vector3* __restrict__ pos,
 				const int& i, const int& j, const int& k) {
 
 
@@ -67,7 +67,7 @@ __device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__
 
 
 __device__ inline void computeDihedral(const TabulatedDihedralPotential* __restrict__ d,
-				const BaseGrid* __restrict__ sys, Vector3* __restrict__ forces, const Vector3* __restrict__ pos,
+				const BaseGrid* __restrict__ sys, Vector3* forces, const Vector3* __restrict__ pos,
 				const int& i, const int& j, const int& k, const int& l) {
 	const Vector3 posa = pos[i];
 	const Vector3 posb = pos[j];
diff --git a/notes.org b/notes.org
index 996b47a..9d64c91 100644
--- a/notes.org
+++ b/notes.org
@@ -1,5 +1,19 @@
 * TODO active
+
+** test RB code
+
+** re-implement exclusions
+how?
+
+*** in-thread approach, with particles i,j?
+
+ 1) Look at whether i*j is in an array? --> potentially very memory intensive; could run out of integers!
+ 2) Look at find i,j in array? --> potentially slow
+ 3) Look at
+*** post-pairlist filtering step
+
+
 ** split computation into multiple regions
 *** simulate regions, also in overlap
 **** atoms at edge of overlap can have DoF frozen
-- 
GitLab
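A note on the exclusion options sketched in notes.org: packing the ordered pair (min(i,j), max(i,j)) into one 64-bit key sidesteps the integer-overflow worry of option 1, while keeping option 2's lookup at O(log n) via binary search of a sorted key array. A rough device-side sketch under those assumptions (nothing here is a design decision recorded in this commit):

// Sketch: exclusion test against a sorted array of 64-bit pair keys.
__device__ inline bool isExcluded(const int i, const int j,
			const long long* __restrict__ exclusions, const int numExclusions) {
	// Smaller ID in the high word, so keys sort like (min,max) pairs
	const long long key = ((long long) min(i,j) << 32) | (unsigned int) max(i,j);
	int lo = 0;
	int hi = numExclusions - 1;
	while (lo <= hi) {
		const int mid = lo + (hi - lo)/2;
		if      (exclusions[mid] < key) lo = mid + 1;
		else if (exclusions[mid] > key) hi = mid - 1;
		else    return true;
	}
	return false;
}

A test like this could back either the in-thread approach (called per candidate pair inside createPairlists) or the post-pairlist filtering step.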