Commit 202d5daa authored by cmaffeo2

Refactored CudaUtil; Added RandomCUDA routines that take state as parameter

parent 1c46ffbe
@@ -286,7 +286,7 @@ void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas,
}
__global__
-void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
+void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas,
const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
const int nCells,
int* g_numPairs, int2* g_pair,
@@ -462,11 +462,12 @@ __global__ void computeTabulatedKernel(
}
-__global__ void clearEnergies(float* g_energies, int num) {
+__global__ void clearEnergies(float* __restrict__ g_energies, int num) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
g_energies[i] = 0.0f;
}
}
__global__ void computeTabulatedEnergyKernel(Vector3* force, const Vector3* __restrict__ pos,
const BaseGrid* __restrict__ sys, float cutoff2,
const int* __restrict__ g_numPairs, const int2* __restrict__ g_pair, const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot, float* g_energies) {
@@ -692,7 +693,7 @@ void computeAngles(Vector3 force[], Vector3 pos[],
}
}*/
-__global__ void computeTabulatedBonds(Vector3* __restrict__ force,
+__global__ void computeTabulatedBonds(Vector3* force,
Vector3* __restrict__ pos,
BaseGrid* __restrict__ sys,
int numBonds, int3* __restrict__ bondList_d, TabulatedPotential** tableBond) {
@@ -727,7 +728,7 @@ __global__ void computeTabulatedBonds(Vector3* __restrict__ force,
// TODO: add kernel for energy calculation
__global__
-void computeTabulatedAngles(Vector3* __restrict__ force,
+void computeTabulatedAngles(Vector3* force,
Vector3* __restrict__ pos,
BaseGrid* __restrict__ sys,
int numAngles, int4* __restrict__ angleList_d, TabulatedAnglePotential** tableAngle) {
@@ -834,12 +835,12 @@ void computeDihedrals(Vector3 force[], Vector3 pos[],
// bool get_energy, TabulatedDihedralPotential** __restrict__ tableDihedral) {
__global__
-void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+void computeTabulatedDihedrals(Vector3* force, const Vector3* __restrict__ pos,
const BaseGrid* __restrict__ sys,
-int numDihedrals, const int4* __restrict__ dihedralList_d,
+int numDihedrals, const int4* const __restrict__ dihedralList_d,
const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral) {
-int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
+// int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
// Loop over ALL dihedrals in ALL replicas
// TODO make grid stride loop
@@ -856,4 +857,3 @@ void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __res
// }
}
}
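The TODO above asks for a grid-stride loop over dihedrals. The pattern is the same one clearEnergies already uses; a minimal, self-contained sketch of it (hypothetical kernel, not part of this commit):

__global__ void scaleVectors(Vector3* __restrict__ v, const float s, const int num) {
	// Grid-stride loop: each thread strides through the array instead of owning one index,
	// so the launch no longer needs one thread per element.
	for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < num; i += blockDim.x * gridDim.x)
		v[i] = s * v[i];
}

Applied to computeTabulatedDihedrals, currDihedral would become the loop index and the grid size could be chosen independently of numDihedrals.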
#include "CudaUtil.cuh"
__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
__device__ int atomicAggInc(int *ctr, int warpLane) {
// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
int mask = __ballot(1);
int leader = __ffs(mask)-1;
int res;
if ( warpLane == leader )
res = atomicAdd(ctr, __popc(mask));
res = warp_bcast(res,leader);
return res + __popc( mask & ((1 << warpLane) - 1) );
}
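atomicAggInc implements the warp-aggregated filtering trick from the linked NVIDIA post: the warp's active threads elect a leader, the leader performs a single atomicAdd for the whole group, and each thread recovers a unique slot from its rank among the active lanes. (__ballot and __shfl are the pre-CUDA-9 intrinsics; CUDA 9+ replaces them with the _sync variants.) A hypothetical usage sketch, not part of this commit, that compacts the indices of particles passing a distance test:

// Hypothetical stream-compaction kernel built on atomicAggInc: threads passing the
// predicate receive unique, densely packed slots in out[] with one atomicAdd per warp.
__global__ void collectNearby(const Vector3* __restrict__ pos, const Vector3 center,
                              const float cutoff2, const int num,
                              int* out, int* outCount) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	const int warpLane = threadIdx.x % WARPSIZE;
	if (idx < num) {
		const Vector3 d = pos[idx] - center;
		// Only threads inside this branch participate in the ballot inside atomicAggInc
		if (d.x*d.x + d.y*d.y + d.z*d.z < cutoff2)
			out[atomicAggInc(outCount, warpLane)] = idx;
	}
}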
__global__
void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector) {
extern __shared__ Vector3 blockVector[];
const int tid = threadIdx.x;
// grid-stride loop over vector[]; iterate block-by-block so that every thread reaches
// the __syncthreads() below even when num is not a multiple of blockDim.x*gridDim.x
for (int base = blockIdx.x*blockDim.x; base < num; base += blockDim.x*gridDim.x) {
const int i = base + tid;
// assign vector to shared memory, zero-padding threads that fall past the end of vector[]
blockVector[tid] = (i < num) ? vector[i] : Vector3(0.0f);
__syncthreads();
// Tree-reduce vectors in shared memory (assumes blockDim.x is a power of two)
// http://www.cuvilib.com/Reduction.pdf
for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
if (tid < offset) {
int oid = tid + offset;
blockVector[tid] = blockVector[tid] + blockVector[oid];
}
__syncthreads();
}
if (tid == 0)
atomicAdd( netVector, blockVector[0] );
}
}
#pragma once
#include "useful.h"
#define WARPSIZE 32
-__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
-__device__ int atomicAggInc(int *ctr, int warpLane) {
-// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
-int mask = __ballot(1);
-int leader = __ffs(mask)-1;
-int res;
-if ( warpLane == leader )
-res = atomicAdd(ctr, __popc(mask));
-res = warp_bcast(res,leader);
-return res + __popc( mask & ((1 << warpLane) - 1) );
-}
+extern __device__ int warp_bcast(int v, int leader);
+extern __device__ int atomicAggInc(int *ctr, int warpLane);
+extern __global__
+void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);
__device__ inline void exclIntCumSum(int* in, const int n) {
// 1) int* in must point to shared memory
// 2) int n must be power of 2
@@ -50,7 +41,6 @@ __device__ inline void exclIntCumSum(int* in, const int n) {
__syncthreads();
}
__device__ inline void inclIntCumSum(int* in, const int n) {
// 1) int* in must point to shared memory
// 2) int n must be power of 2
......
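Both cumulative-sum helpers share the same contract: in must point to shared memory and n must be a power of two. Their bodies are collapsed in this view; for reference, a standard Hillis-Steele inclusive scan under those assumptions looks like this sketch (editor's illustration, not necessarily the implementation here):

// Sketch of a Hillis-Steele inclusive scan over a shared-memory array.
// Assumes: in[] is in shared memory, n is a power of two, blockDim.x >= n.
__device__ inline void inclIntCumSumSketch(int* in, const int n) {
	const int tid = threadIdx.x;
	for (int offset = 1; offset < n; offset <<= 1) {
		// read the partner value before any thread overwrites it
		const int t = (tid < n && tid >= offset) ? in[tid - offset] : 0;
		__syncthreads();
		if (tid < n)
			in[tid] += t;
		__syncthreads();
	}
}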
@@ -305,7 +305,8 @@ GrandBrownTown::~GrandBrownTown() {
// Run the Brownian Dynamics steps.
void GrandBrownTown::run() {
printf("\n");
+Vector3 runningNetForce(0.0f);
// Open the files for recording ionic currents
for (int repID = 0; repID < numReplicas; ++repID) {
newCurrent(repID);
@@ -452,6 +453,21 @@ void GrandBrownTown::run() {
RBC.updateForces(s); /* update RB forces before update particle positions... */
+/* DEBUG: reduce net force on particles
+{
+Vector3 netForce(0.0f);
+Vector3* netForce_d;
+gpuErrchk(cudaMalloc(&netForce_d, sizeof(Vector3)));
+gpuErrchk(cudaMemcpy(netForce_d, &netForce, sizeof(Vector3), cudaMemcpyHostToDevice));
+reduceVector<<< 500, NUM_THREADS, NUM_THREADS*sizeof(Vector3)>>>(
+num, internal->getForceInternal_d(), netForce_d);
+gpuErrchk(cudaDeviceSynchronize());
+gpuErrchk(cudaMemcpy(&netForce, netForce_d, sizeof(Vector3), cudaMemcpyDeviceToHost));
+runningNetForce = runningNetForce + netForce;
printf("Net Force: %f %f %f\n", runningNetForce.x,runningNetForce.y,runningNetForce.z);
}
// */
//MLog: Call the kernel to update the positions of each particle
// cudaSetDevice(0);
updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
......
// GrandBrownTown.cuh
//
// Terrance Howard <heyterrance@gmail.com>
#pragma once
#include "CudaUtil.cuh"
__device__
Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
float timestep, BaseGrid *sys, Random *randoGen, int num);
@@ -20,7 +21,7 @@ void clearInternalForces(Vector3* __restrict__ forceInternal, const int num) {
forceInternal[idx] = Vector3(0.0f);
}
__global__
-void updateKernel(Vector3 pos[], Vector3 forceInternal[],
+void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
int type[], BrownianParticleType* part[],
float kT, BaseGrid* kTGrid,
float electricField, int tGridLength,
@@ -29,7 +30,9 @@ void updateKernel(Vector3 pos[], Vector3 forceInternal[],
// Calculate this thread's ID
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-// Loop over ALL particles in ALL replicas
+// TODO: Make this a grid-stride loop to make efficient reuse of RNG states
+// Loop over ALL particles in ALL replicas
if (idx < num * numReplicas) {
const int t = type[idx];
Vector3& p = pos[idx];
@@ -102,6 +105,7 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
Vector3 diffGrad, float timestep, BaseGrid *sys,
Random *randoGen, int num) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+// TODO: improve performance by storing state locally, then sending it back to GPU
Vector3 rando = randoGen->gaussian_vector(idx, num);
diffusion *= timestep;
@@ -109,6 +113,7 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
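// Editor's note: the line below is the standard overdamped Brownian-dynamics update,
// dr = (D*F/kT)*dt + (grad D)*dt + sqrt(2*D*dt)*xi; 'diffusion' was already
// multiplied by timestep above, so it carries the factor of dt here.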
Vector3 r = r0 + (diffusion / kTlocal) * force
+ timestep * diffGrad
+ sqrtf(2.0f * diffusion) * rando;
+// Wrap about periodic boundaries
Vector3 l = sys->getInverseBasis().transform(r - sys->getOrigin());
l.x = sys->wrapFloat(l.x, sys->getNx());
l.y = sys->wrapFloat(l.y, sys->getNy());
......
@@ -46,12 +46,23 @@ public:
return curand_normal(&states[idx]);
return 0.0f;
}
+DEVICE inline float gaussian(curandState* state) {
+return curand_normal(state);
+}
DEVICE inline Vector3 gaussian_vector(int idx, int num) {
-// TODO do stuff
-float x = gaussian(idx, num);
-float y = gaussian(idx, num);
-float z = gaussian(idx, num);
+if (idx < num) {
+curandState localState = states[idx];
+Vector3 v = gaussian_vector(&localState);
+states[idx] = localState;
+return v;
+} else return Vector3(0.0f);
}
+DEVICE inline Vector3 gaussian_vector(curandState* state) {
+float x = gaussian(state);
+float y = gaussian(state);
+float z = gaussian(state);
+return Vector3(x, y, z);
+}
......
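These state-as-parameter overloads are what the commit title refers to: a kernel can copy its curandState into a local (register) variable once, draw several values, and write the state back, instead of dereferencing states[idx] in global memory for every sample, which is the optimization the TODO in step() anticipates. A hypothetical sketch of that pattern, assuming the states array is accessible to the caller (names not in this commit):

// Hypothetical kernel drawing many Gaussian samples per thread while keeping
// the RNG state in registers; one global load and one store of the state total.
__global__ void sampleMany(Random* randoGen, Vector3* out, const int num, const int perThread) {
	const int idx = threadIdx.x + blockIdx.x * blockDim.x;
	if (idx >= num) return;
	curandState localState = randoGen->states[idx]; // assumes states is accessible here
	for (int s = 0; s < perThread; ++s)
		out[idx * perThread + s] = randoGen->gaussian_vector(&localState);
	randoGen->states[idx] = localState;             // write the advanced state back once
}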
@@ -2,7 +2,7 @@
#define BD_PI 3.1415927f
-__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* force, const Vector3* __restrict__ pos,
const int& i, const int& j, const int& k) {
@@ -67,7 +67,7 @@ __device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__
__device__ inline void computeDihedral(const TabulatedDihedralPotential* __restrict__ d,
-const BaseGrid* __restrict__ sys, Vector3* __restrict__ forces, const Vector3* __restrict__ pos,
+const BaseGrid* __restrict__ sys, Vector3* forces, const Vector3* __restrict__ pos,
const int& i, const int& j, const int& k, const int& l) {
const Vector3 posa = pos[i];
const Vector3 posb = pos[j];
......
* TODO active
** test RB code
** re-implement exclusions
how?
*** in-thread approach, with particles i,j?
1) Look at whether i*j is in an array? --> potentially very memory intensive; could run out of integers!
2) Search for the pair (i,j) in a sorted array? --> potentially slow (see the sketch after this list)
3) Look at
*** post-pairlist filtering step
** split computation into multiple regions
*** simulate regions, also in overlap
**** atoms at edge of overlap can have DoF frozen
......
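For the post-pairlist filtering idea above, one way to sidestep both the integer-overflow concern of option 1 and a linear search is to pack each excluded pair into a 64-bit key and binary-search a sorted device array of keys. A hypothetical sketch (names and layout are illustrative, not part of this commit):

// Pack an unordered particle pair into one 64-bit key; avoids overflowing int
// for large particle indices, the concern raised in option 1 above.
__device__ inline unsigned long long pairKey(int i, int j) {
	const unsigned long long a = min(i, j);
	const unsigned long long b = max(i, j);
	return (a << 32) | b;
}

// Binary search over a sorted array of excluded-pair keys.
__device__ inline bool isExcluded(const unsigned long long* __restrict__ sortedExcl,
                                  const int nExcl, int i, int j) {
	const unsigned long long key = pairKey(i, j);
	int lo = 0, hi = nExcl - 1;
	while (lo <= hi) {
		const int mid = lo + (hi - lo) / 2;
		if (sortedExcl[mid] == key) return true;
		if (sortedExcl[mid] < key) lo = mid + 1;
		else hi = mid - 1;
	}
	return false;
}

Candidate pairs surviving this test could then be appended to the pairlist with atomicAggInc, as in the sketch earlier in this commit.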