From 202d5daaca7fb122412096ab50fd6bc558ec790e Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Thu, 6 Oct 2016 13:54:33 -0500
Subject: [PATCH] Refactored CudaUtil; Added RandomCUDA routines that take
 state as parameter

---
 ComputeForce.cuh     | 16 ++++++++--------
 CudaUtil.cu          | 43 +++++++++++++++++++++++++++++++++++++++++++
 CudaUtil.cuh         | 20 +++++---------------
 GrandBrownTown.cu    | 18 +++++++++++++++++-
 GrandBrownTown.cuh   | 11 ++++++++---
 RandomCUDA.h         | 17 ++++++++++++++---
 TabulatedMethods.cuh |  4 ++--
 notes.org            | 14 ++++++++++++++
 8 files changed, 111 insertions(+), 32 deletions(-)
 create mode 100644 CudaUtil.cu

diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 0fa23ba..6e78f8e 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -286,7 +286,7 @@ void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas,
 }
 
 __global__
-void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
+void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas,
 				const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
 				const int nCells,
 				int* g_numPairs, int2* g_pair,
@@ -462,11 +462,12 @@ __global__ void computeTabulatedKernel(
 }
 
 
-__global__ void clearEnergies(float* g_energies, int num) {
+__global__ void clearEnergies(float* __restrict__ g_energies, int num) {
 	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
 		g_energies[i] = 0.0f;
 	}
 }
+
 __global__ void computeTabulatedEnergyKernel(Vector3* force, const Vector3* __restrict__ pos,
 				const BaseGrid* __restrict__ sys, float cutoff2,
 				const int* __restrict__ g_numPairs,	const int2* __restrict__ g_pair, const int* __restrict__ g_pairTabPotType, 	TabulatedPotential** __restrict__ tablePot, float* g_energies) {
@@ -692,7 +693,7 @@ void computeAngles(Vector3 force[], Vector3 pos[],
 	}
 }*/
 
-__global__ void computeTabulatedBonds(Vector3* __restrict__ force,
+__global__ void computeTabulatedBonds(Vector3* force,
 				Vector3* __restrict__ pos,
 				BaseGrid* __restrict__ sys,
 				int numBonds, int3* __restrict__ bondList_d, TabulatedPotential** tableBond) {
@@ -727,7 +728,7 @@ __global__ void computeTabulatedBonds(Vector3* __restrict__ force,
 
 // TODO: add kernel for energy calculation 
 __global__
-void computeTabulatedAngles(Vector3* __restrict__ force,
+void computeTabulatedAngles(Vector3* force,
 				Vector3* __restrict__ pos,
 				BaseGrid* __restrict__ sys,
 				int numAngles, int4* __restrict__ angleList_d, TabulatedAnglePotential** tableAngle) {
@@ -834,12 +835,12 @@ void computeDihedrals(Vector3 force[], Vector3 pos[],
     // 			    bool get_energy, TabulatedDihedralPotential** __restrict__ tableDihedral) {
 
 __global__
-void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+void computeTabulatedDihedrals(Vector3* force, const Vector3* __restrict__ pos,
 			       const BaseGrid* __restrict__ sys,
-			       int numDihedrals, const int4* __restrict__ dihedralList_d,
+			       int numDihedrals, const int4* const __restrict__ dihedralList_d,
 			       const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral) {
 
-    int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
+	// int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
 
     // Loop over ALL dihedrals in ALL replicas
     // TODO make grid stride loop
@@ -856,4 +857,3 @@ void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __res
 	// }
     }
 }
-
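
A note on the qualifier changes above: __restrict__ promises the compiler that the pointed-to data is reached through no other pointer in the function, which licenses keeping loads in registers and reordering stores. Keeping it on the read-only pos arguments is safe; dropping it from the force arguments is presumably the conservative choice here, since the same force array is also written through pointers handed to helpers such as computeAngle and computeDihedral. A toy illustration of the distinction (addScaled is a hypothetical device function, not part of this commit):

    // 'in' is read-only and unaliased, so __restrict__ lets the compiler
    // keep in[i] in registers; 'out' stays unqualified because the same
    // array may be written through other pointers elsewhere.
    __device__ void addScaled(Vector3* out, const Vector3* __restrict__ in,
                              const float s, const int i) {
        atomicAdd(&out[i].x, s * in[i].x);  // float atomicAdd: several threads
        atomicAdd(&out[i].y, s * in[i].y);  // may safely target the same element
        atomicAdd(&out[i].z, s * in[i].z);
    }
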
diff --git a/CudaUtil.cu b/CudaUtil.cu
new file mode 100644
index 0000000..40f8ad2
--- /dev/null
+++ b/CudaUtil.cu
@@ -0,0 +1,43 @@
+#include "CudaUtil.cuh"
+
+__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
+__device__ int atomicAggInc(int *ctr, int warpLane) {
+	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
+	int mask = __ballot(1);
+	int leader = __ffs(mask)-1;
+
+	int res;
+	if ( warpLane == leader )
+		res = atomicAdd(ctr, __popc(mask));
+	res = warp_bcast(res,leader);
+
+	return res + __popc( mask & ((1 << warpLane) - 1) );
+}
+
+__global__
+void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector) {
+	extern __shared__ Vector3 blockVector[];
+	const int tid = threadIdx.x;
+
+	// grid-stride loop over vector[]
+	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
+		// assign vector to shared memory
+		blockVector[tid] = vector[i];
+		// blockVector[tid] = Vector3(0.0f);
+		__syncthreads();
+		
+		
+		// Reduce vectors in shared memory
+		// http://www.cuvilib.com/Reduction.pdf
+		for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
+			if (tid < offset) {
+				int oid = tid + offset;
+				blockVector[tid] = blockVector[tid] + blockVector[oid];
+			}
+			__syncthreads();
+		}
+
+		if (tid == 0)
+			atomicAdd( netVector, blockVector[0] );
+	}
+}
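
For reference, a sketch of the call pattern atomicAggInc is built for (filterPositive is illustrative, not part of this commit): every thread that survives the predicate calls it from the same branch, the __ballot inside captures exactly those threads, and each warp issues one atomicAdd instead of up to 32. Note the intrinsics above are the pre-CUDA-9 forms; on later toolkits they would be __ballot_sync and __shfl_sync.

    __global__ void filterPositive(const int* __restrict__ in, int* __restrict__ out,
                                   int* outCount, const int num) {
        const int i = threadIdx.x + blockIdx.x * blockDim.x;
        const int warpLane = threadIdx.x % WARPSIZE;
        if (i < num && in[i] > 0) {                            // arbitrary predicate
            const int dst = atomicAggInc(outCount, warpLane);  // one atomic per warp
            out[dst] = in[i];                                  // compacted write
        }
    }
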
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 49e0f83..3442969 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -1,22 +1,13 @@
 #pragma once
-
+#include "useful.h"
 #define WARPSIZE 32
 
-__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
-__device__ int atomicAggInc(int *ctr, int warpLane) {
-	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
-	int mask = __ballot(1);
-	int leader = __ffs(mask)-1;
-
-	int res;
-	if ( warpLane == leader )
-		res = atomicAdd(ctr, __popc(mask));
-	res = warp_bcast(res,leader);
 
-	return res + __popc( mask & ((1 << warpLane) - 1) );
-}
+extern __device__ int warp_bcast(int v, int leader);
+extern __device__ int atomicAggInc(int *ctr, int warpLane);
+extern __global__
+void reduceVector(const int num, Vector3* __restrict__ vector, Vector3* netVector);
 
-	
 __device__ inline void exclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
 	// 2) int n must be power of 2
@@ -50,7 +41,6 @@ __device__ inline void exclIntCumSum(int* in, const int n) {
 	__syncthreads();
 }
 
-
 __device__ inline void inclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
 	// 2) int n must be power of 2
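
For the scan helpers declared above, a usage sketch (scanCounts is hypothetical; it assumes a single-block launch with blockDim.x == n and n a power of two, per the documented contract):

    __global__ void scanCounts(const int* __restrict__ counts,
                               int* __restrict__ offsets, const int n) {
        extern __shared__ int s[];    // n ints of dynamic shared memory
        const int tid = threadIdx.x;
        s[tid] = counts[tid];         // stage input in shared memory
        __syncthreads();
        exclIntCumSum(s, n);          // in-place exclusive cumulative sum
        offsets[tid] = s[tid];        // each thread's write offset
    }
    // launch: scanCounts<<< 1, n, n*sizeof(int) >>>(counts_d, offsets_d, n);
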
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index 4e53b81..4eb2572 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -305,7 +305,8 @@ GrandBrownTown::~GrandBrownTown() {
 // Run the Brownian Dynamics steps.
 void GrandBrownTown::run() {
 	printf("\n");
-
+	Vector3 runningNetForce(0.0f);
+	
 	// Open the files for recording ionic currents
 	for (int repID = 0; repID < numReplicas; ++repID) {
 		newCurrent(repID);
@@ -452,6 +453,21 @@ void GrandBrownTown::run() {
 
 		RBC.updateForces(s);					/* update RB forces before update particle positions... */
 
+		/* DEBUG: reduce net force on particles
+		{  
+			Vector3 netForce(0.0f);
+			Vector3* netForce_d;
+			gpuErrchk(cudaMalloc(&netForce_d, sizeof(Vector3)));
+			gpuErrchk(cudaMemcpy(netForce_d, &netForce, sizeof(Vector3), cudaMemcpyHostToDevice));
+			reduceVector<<< 500, NUM_THREADS, NUM_THREADS*sizeof(Vector3)>>>(
+				num, internal->getForceInternal_d(), netForce_d);
+			gpuErrchk(cudaDeviceSynchronize());
+			gpuErrchk(cudaMemcpy(&netForce, netForce_d, sizeof(Vector3), cudaMemcpyDeviceToHost));
+			runningNetForce = runningNetForce + netForce;
+			printf("Net Force: %f %f %f\n", runningNetForce.x,runningNetForce.y,runningNetForce.z);
+		}		
+		// */
+		
 		//MLog: Call the kernel to update the positions of each particle
 		// cudaSetDevice(0);
 		updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
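
One caveat on the reduceVector debug launch above: when num is not a multiple of blockDim.x, the threads of a partially covered block skip the grid-stride loop body entirely, so the __syncthreads() calls inside diverge and the unwritten shared-memory slots enter the sum. A guarded variant might look like the following sketch (illustrative only; like the committed kernel, it assumes a power-of-two blockDim.x and a Vector3 overload of atomicAdd):

    __global__ void reduceVectorGuarded(const int num, const Vector3* __restrict__ vector,
                                        Vector3* netVector) {
        extern __shared__ Vector3 blockVector[];
        const int tid = threadIdx.x;
        // round the bound up to a full block so whole blocks iterate together
        const int numRounded = ((num + blockDim.x - 1) / blockDim.x) * blockDim.x;
        for (int i = tid + blockIdx.x*blockDim.x; i < numRounded; i += blockDim.x*gridDim.x) {
            blockVector[tid] = (i < num) ? vector[i] : Vector3(0.0f);
            __syncthreads();
            for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
                if (tid < offset)
                    blockVector[tid] = blockVector[tid] + blockVector[tid + offset];
                __syncthreads();
            }
            if (tid == 0)
                atomicAdd(netVector, blockVector[0]);
            __syncthreads();  // keep the next iteration from overwriting blockVector[0] early
        }
    }
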
diff --git a/GrandBrownTown.cuh b/GrandBrownTown.cuh
index d5a1d16..da0ffd9 100644
--- a/GrandBrownTown.cuh
+++ b/GrandBrownTown.cuh
@@ -1,9 +1,10 @@
 // GrandBrownTown.cuh
 //
 // Terrance Howard <heyterrance@gmail.com>
-
 #pragma once
 
+#include "CudaUtil.cuh"
+
 __device__
 Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 						 float timestep, BaseGrid *sys, Random *randoGen, int num);
@@ -20,7 +21,7 @@ void clearInternalForces(Vector3* __restrict__ forceInternal, const int num) {
 		forceInternal[idx] = Vector3(0.0f);
 }
 __global__
-void updateKernel(Vector3 pos[], Vector3 forceInternal[],
+void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 									int type[], BrownianParticleType* part[],
 									float kT, BaseGrid* kTGrid,
 									float electricField, int tGridLength,
@@ -29,7 +30,9 @@ void updateKernel(Vector3 pos[], Vector3 forceInternal[],
 	// Calculate this thread's ID
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
-	// Loop over ALL particles in ALL replicas
+
+	// TODO: Make this a grid-stride loop to reuse RNG states efficiently
+	// Loop over ALL particles in ALL replicas
 	if (idx < num * numReplicas) {
 		const int t = type[idx];
 		Vector3& p = pos[idx];
@@ -102,6 +105,7 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 						 Vector3 diffGrad, float timestep, BaseGrid *sys,
 						 Random *randoGen, int num) {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	// TODO: improve performance by storing state locally, then sending it back to GPU
 	Vector3 rando = randoGen->gaussian_vector(idx, num);
 
 	diffusion *= timestep;
@@ -109,6 +113,7 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 	Vector3 r = r0 + (diffusion / kTlocal) * force
 							+ timestep * diffGrad
 							+ sqrtf(2.0f * diffusion) * rando;
+	// Wrap about periodic boundaries
 	Vector3 l = sys->getInverseBasis().transform(r - sys->getOrigin());
 	l.x = sys->wrapFloat(l.x, sys->getNx());
  	l.y = sys->wrapFloat(l.y, sys->getNy());
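
A sketch of the grid-stride form the TODO above refers to (updateKernelStrided is illustrative, not part of this commit; it assumes the curandState array is passed in alongside the Random object and that the launch uses no more threads than there are allocated states): a fixed-size grid walks all particles, so each thread loads its RNG state once and writes it back once, instead of paying a global round trip per particle.

    __global__ void updateKernelStrided(Vector3* pos, Random* randoGen,
                                        curandState* states, const int num,
                                        const int numReplicas, const float noiseScale) {
        const int tid = threadIdx.x + blockIdx.x * blockDim.x;
        curandState localState = states[tid];       // one global load per thread
        for (int idx = tid; idx < num * numReplicas; idx += blockDim.x * gridDim.x) {
            // draw noise from the register-resident copy
            pos[idx] = pos[idx] + noiseScale * randoGen->gaussian_vector(&localState);
        }
        states[tid] = localState;                   // persist the advanced state once
    }
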
diff --git a/RandomCUDA.h b/RandomCUDA.h
index e031785..b3f719a 100644
--- a/RandomCUDA.h
+++ b/RandomCUDA.h
@@ -46,12 +46,23 @@ public:
 			return curand_normal(&states[idx]);
 		return 0.0f;
 	}
+	DEVICE inline float gaussian(curandState* state) {
+		return curand_normal(state);
+	}
 
 	DEVICE inline Vector3 gaussian_vector(int idx, int num) {
 		// TODO do stuff
-		float x = gaussian(idx, num);
-		float y = gaussian(idx, num);
-		float z = gaussian(idx, num);
+		if (idx < num) {
+			curandState localState = states[idx];
+			Vector3 v = gaussian_vector(&localState);
+			states[idx] = localState;
+			return v;
+		} else return Vector3(0.0f);			
+	}
+	DEVICE inline Vector3 gaussian_vector(curandState* state) {
+		float x = gaussian(state);
+		float y = gaussian(state);
+		float z = gaussian(state);
 		return Vector3(x, y, z);
 	}
 
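The point of the new state-as-parameter overloads, in sketch form (kickParticles is a hypothetical kernel, not part of this commit): a thread that needs several draws loads the state once and stores it once, whereas repeated calls to gaussian(idx, num) or the index-based gaussian_vector touch global memory on every call.

    __global__ void kickParticles(Vector3* mom, Random* rng, curandState* states,
                                  const int num, const float kick) {
        const int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx >= num) return;
        curandState local = states[idx];                 // one global load
        Vector3 dp = kick * rng->gaussian_vector(&local);
        dp = dp + kick * rng->gaussian_vector(&local);   // further draws stay in registers
        mom[idx] = mom[idx] + dp;
        states[idx] = local;                             // one global store
    }
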
diff --git a/TabulatedMethods.cuh b/TabulatedMethods.cuh
index 3a07b7c..af169c8 100644
--- a/TabulatedMethods.cuh
+++ b/TabulatedMethods.cuh
@@ -2,7 +2,7 @@
 
 #define BD_PI 3.1415927f
 
-__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* __restrict__ force, const Vector3* __restrict__ pos,
+__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* force, const Vector3* __restrict__ pos,
 				const int& i, const int& j, const int& k) {
 	    
 	    
@@ -67,7 +67,7 @@ __device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__
 
 
 __device__ inline void computeDihedral(const TabulatedDihedralPotential* __restrict__ d,
-				const BaseGrid* __restrict__ sys, Vector3* __restrict__ forces, const Vector3* __restrict__ pos,
+				const BaseGrid* __restrict__ sys, Vector3* forces, const Vector3* __restrict__ pos,
 				const int& i, const int& j, const int& k, const int& l) {
 	const Vector3 posa = pos[i];
 	const Vector3 posb = pos[j];
diff --git a/notes.org b/notes.org
index 996b47a..9d64c91 100644
--- a/notes.org
+++ b/notes.org
@@ -1,5 +1,19 @@
 
 * TODO active
+
+** test RB code 
+
+** re-implement exclusions
+how?
+
+*** in-thread approach, with particles i,j?
+
+ 1) Check whether the key i*j is in an array? --> potentially very memory intensive; and i*j could overflow the integer range!
+ 2) Search for the pair (i,j) in an array? --> potentially slow
+ 3) Look at 
+*** post-pairlist filtering step
+
+
 ** split computation into multiple regions
 *** simulate regions, also in overlap
 **** atoms at edge of overlap can have DoF frozen
-- 
GitLab