From 4650488b8f6bcdf99210c6a397b0af626d47bc6a Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Fri, 16 Dec 2016 17:07:29 -0600
Subject: [PATCH] Multiple bug fixes:   (1) PairlistDistance is properly passed
 on to ComputeForce objects;   (2) Power of 2 threads are now used in
 particle-grid force reduction;   (3) Fixed accidental transpose of threads
 and blocks in grids' particleList kernel call

---
 src/ComputeGridGrid.cu     | 10 +++++-----
 src/ComputeGridGrid.cuh    |  2 +-
 src/Configuration.cpp      | 15 +++++++++------
 src/CudaUtil.cuh           |  2 +-
 src/GrandBrownTown.cu      | 12 ++++++------
 src/GrandBrownTown.cuh     |  8 ++++++++
 src/RigidBody.cu           | 11 +++++++----
 src/RigidBodyController.cu | 12 +++---------
 src/RigidBodyController.h  |  4 ----
 9 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/src/ComputeGridGrid.cu b/src/ComputeGridGrid.cu
index b074fde..1cbba92 100644
--- a/src/ComputeGridGrid.cu
+++ b/src/ComputeGridGrid.cu
@@ -36,7 +36,7 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 		/* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */
 		const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
 		force[tid] = fe.f;
-
+		
 		const float r_val = rho->val[r_id]; /* maybe move to beginning of function?  */
 		force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */
 
@@ -47,6 +47,7 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 	// Reduce force and torques
 	// http://www.cuvilib.com/Reduction.pdf
 	// RBTODO optimize further, perhaps
+	// assert( NUMTHREADS==32 || NUMTHREADS==64 || NUMTHREADS==128 || NUMTHREADS==256 || NUMTHREADS==512 );
 	__syncthreads();
 	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
 		if (tid < offset) {
@@ -73,7 +74,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 	extern __shared__ Vector3 s[];
 	Vector3 *force = s;
 	Vector3 *torque = &s[NUMTHREADS];
-  
+  	
 	const int tid = threadIdx.x;
 	const int i = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -94,6 +95,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 	}
 
 	// Reduce force and torques
+	// assert( NUMTHREADS==32 || NUMTHREADS==64 || NUMTHREADS==128 || NUMTHREADS==256 || NUMTHREADS==512 );
 	__syncthreads();
 	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
 		if (tid < offset) {
@@ -103,7 +105,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 		}
 		__syncthreads();
 	}
-
+	
 	if (tid == 0) {
 		retForce[blockIdx.x] = force[0];
 		retTorque[blockIdx.x] = torque[0];
@@ -117,8 +119,6 @@ void createPartlist(const Vector3* __restrict__ pos,
 				const Vector3 gridCenter, const float radius2) {
 	const int tid = threadIdx.x;
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
-	const int split = 32;					/* numblocks should be divisible by split */
-	/* const int blocksPerCell = gridDim.x/split;  */
 	
 	const int i = blockIdx.x * blockDim.x + threadIdx.x;
 	if (i < numTypeParticles) {
diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh
index 6150ae6..0140616 100644
--- a/src/ComputeGridGrid.cuh
+++ b/src/ComputeGridGrid.cuh
@@ -1,7 +1,7 @@
 // Included in RigidBodyController.cu
 #pragma once
 #include "useful.h"
-#define NUMTHREADS 96
+#define NUMTHREADS 128
 #define WARPSIZE 32
 
 class RigidBodyGrid;
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index 4440e42..c38ba2a 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -1,4 +1,5 @@
 #include "Configuration.h"
+#include "myAssert.h"
 
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
@@ -1223,14 +1224,16 @@ void Configuration::readExcludes()
 	for (int i = 0; i < numExcludes; i++) {
 		if (excludes[i].ind1 != currPart) {
 			currPart = excludes[i].ind1;
-			if (currPart < num) {
-				excludeMap[currPart].x = i;
-				if (lastPart >= 0)
-					excludeMap[lastPart].y = i;
-				lastPart = currPart;
-			}
+			myAssert(currPart < num);
+			excludeMap[currPart].x = i;
+			if (lastPart >= 0)
+			    excludeMap[lastPart].y = i;
+			lastPart = currPart;
 		}
 	}
+	if (excludeMap[lastPart].x > 0)
+	    excludeMap[lastPart].y = numExcludes;
+
 }
 
 void Configuration::readAngles() {
diff --git a/src/CudaUtil.cuh b/src/CudaUtil.cuh
index 3442969..6775ea4 100644
--- a/src/CudaUtil.cuh
+++ b/src/CudaUtil.cuh
@@ -76,7 +76,7 @@ __device__ inline void inclIntCumSum(int* in, const int n) {
 	__syncthreads();
 }
 
-__device__ inline void atomicAdd(Vector3* address, Vector3 val) {
+__device__ inline void atomicAdd(Vector3* address, const Vector3 val) {
 	atomicAdd( &(address->x), val.x);
 	atomicAdd( &(address->y), val.y);
 	atomicAdd( &(address->z), val.z);
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 356d854..9e85ef4 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -179,8 +179,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 
 	// Prepare internal force computation
 	internal = new ComputeForce(num, part, numParts, sys, switchStart, switchLen, coulombConst,
-			fullLongRange, numBonds, numTabBondFiles, numExcludes, numAngles, numTabAngleFiles,
-			numDihedrals, numTabDihedralFiles, numReplicas);
+				    fullLongRange, numBonds, numTabBondFiles, numExcludes, numAngles, numTabAngleFiles,
+				    numDihedrals, numTabDihedralFiles, c.pairlistDistance, numReplicas);
 	
 	//MLog: I did the other halve of the copyToCUDA function from the Configuration class here, keep an eye on any mistakes that may occur due to the location.
 	internal -> copyToCUDA(c.simNum, c.type, c.bonds, c.bondMap, c.excludes, c.excludeMap, c.angles, c.dihedrals);
@@ -480,7 +480,7 @@ void GrandBrownTown::run() {
 		// Make sure the force computation has completed without errors before continuing.
 		//gpuErrchk(cudaPeekAtLastError()); // Does not work on old GPUs (like mine). TODO: write a better wrapper around Peek
 		gpuErrchk(cudaDeviceSynchronize());
-
+		
 		// printf("  Computed energies\n");
 
 		// int numBlocks = (num * numReplicas) / NUM_THREADS + 1;
@@ -503,7 +503,7 @@ void GrandBrownTown::run() {
 			printf("Net Force: %f %f %f\n", runningNetForce.x,runningNetForce.y,runningNetForce.z);
 		}		
 		// */
-		
+
 		//MLog: Call the kernel to update the positions of each particle
 		// cudaSetDevice(0);
 		updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
@@ -520,10 +520,10 @@ void GrandBrownTown::run() {
 		/* computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d); */
 		
 		// int numBlocks = (numRB ) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-		
-		Vector3 force0(0.0f);
 
+		Vector3 force0(0.0f);		
 
+		
 		if (s % outputPeriod == 0) {
 			// Copy particle positions back to CPU
 			// cudaSetDevice(0);
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index 401d066..fab9c77 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -153,3 +153,11 @@ __global__ void devicePrint(RigidBodyType* rb[]) {
 // 	// i, j, k, rb[i]->rawPotentialGrids[j]).val[k];
 	
 // }
+
+
+// __device__ Vector3* totalForce;
+// Vector3* totalForce_h;
+// void initTotalForce() {
+//     cudaMalloc( &totalForce_h, sizeof(Vector3) );
+//     cudaMemcpyToSymbol(totalForce, &totalForce_h, sizeof(totalForce_h));
+// }
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index 54d1b17..bb498d5 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -47,10 +47,12 @@ void RigidBody::init() {
 	particleForces_d = new Vector3*[numGrids];
 	particleTorques_d = new Vector3*[numGrids];
 	for (int i = 0; i < numGrids; ++i) {
+	    numParticles[i] = -1;
 		const int& n = t->numParticles[i];
 		const int nb = (n/NUMTHREADS)+1; // max number of blocks
 		if (n > 0) {
-			gpuErrchk(cudaMalloc( &particles_d[i], 0.5*sizeof(int)*n ));
+		    // gpuErrchk(cudaMalloc( &particles_d[i], 0.5*sizeof(int)*n )); // not sure why 0.5 was here; prolly bug
+		        gpuErrchk(cudaMalloc( &particles_d[i], sizeof(int)*n )); // TODO: dynamically allocate memory as needed
 			particleForces[i] = new Vector3[nb];
 			particleTorques[i] = new Vector3[nb];
 			gpuErrchk(cudaMalloc( &particleForces_d[i], sizeof(Vector3)*nb ));
@@ -104,15 +106,16 @@ void RigidBody::updateParticleList(Vector3* pos_d) {
 
 			int nb = floor(tnp/NUMTHREADS) + 1;
 #if __CUDA_ARCH__ >= 300
-			createPartlist<<<NUMTHREADS,nb>>>(pos_d, tnp, t->particles_d[i],
+			createPartlist<<<nb,NUMTHREADS>>>(pos_d, tnp, t->particles_d[i],
 							tmp_d, particles_d[i],
 							gridCenter + position, cutoff*cutoff);
 #else
-			createPartlist<<<NUMTHREADS,nb,NUMTHREADS/WARPSIZE>>>(pos_d, tnp, t->particles_d[i],
+			createPartlist<<<nb,NUMTHREADS,NUMTHREADS/WARPSIZE>>>(pos_d, tnp, t->particles_d[i],
 							tmp_d, particles_d[i],
 							gridCenter + position, cutoff*cutoff);
 #endif			
 			gpuErrchk(cudaMemcpy(&numParticles[i], tmp_d, sizeof(int), cudaMemcpyDeviceToHost ));
+			gpuErrchk(cudaFree( tmp_d ));
 		}
 	}
 }
@@ -220,7 +223,7 @@ void RigidBody::integrate(int startFinishAll) {
 	Vector3 trans; // = *p_trans;
 	Matrix3 rot = Matrix3(1); // = *p_rot;
 
-	/* printf("Rigid Body force\n"); */
+	// printf("Rigid Body force\n"); force.print();
 	
 #ifdef DEBUGM
 	switch (startFinishAll) {
diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu
index 8b57969..1fc9a6a 100644
--- a/src/RigidBodyController.cu
+++ b/src/RigidBodyController.cu
@@ -1,9 +1,3 @@
-/* #ifndef MIN_DEBUG_LEVEL */
-/* #define MIN_DEBUG_LEVEL 5 */
-/* #endif */
-/* #define DEBUGM */
-/* #include "Debug.h" */
-
 /* #include "RigidBody.h" */
 #include "RigidBodyController.h"
 #include "Configuration.h"
@@ -436,12 +430,12 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 		
 		// RBTODO: get energy
 		if (!isPmf) {								/* pair of RBs */
-			computeGridGridForce<<< nb, numThreads, NUMTHREADS*2*sizeof(Vector3), s >>>
+			computeGridGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3), s >>>
 				(type1->rawDensityGrids_d[k1], type2->rawPotentialGrids_d[k2],
 				 B1, B2, c,
 				 forces_d[i], torques_d[i]);
 		} else {										/* RB with a PMF */
-			computeGridGridForce<<< nb, numThreads, NUMTHREADS*2*sizeof(Vector3), s >>>
+			computeGridGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3), s >>>
 				(type1->rawDensityGrids_d[k1], type2->rawPmfs_d[k2],
 				 B1, B2, c,
 				 forces_d[i], torques_d[i]);
@@ -618,7 +612,7 @@ int RigidBodyForcePair::initialize() {
 	for (int i = 0; i < numGrids; i++) {
 		const int k1 = gridKeyId1[i];
 		const int sz = type1->rawDensityGrids[k1].getSize();
-		const int nb = sz / numThreads + ((sz % numThreads == 0) ? 0:1 );
+		const int nb = sz / NUMTHREADS + ((sz % NUMTHREADS == 0) ? 0:1 );
 		streamID.push_back( nextStreamID % NUMSTREAMS );
 		nextStreamID++;
 
diff --git a/src/RigidBodyController.h b/src/RigidBodyController.h
index ad06f20..35ffe73 100644
--- a/src/RigidBodyController.h
+++ b/src/RigidBodyController.h
@@ -7,8 +7,6 @@
 #include <cuda_runtime.h>
 #include "useful.h"
 
-// #define NUMTHREADS 128					/* try with 64, every 32+ */
-#define NUMTHREADS 96
 #define NUMSTREAMS 8
 
 // #include "RigidBody.h"
@@ -55,8 +53,6 @@ private:
 
 	int updatePeriod;
 	
-	static const int numThreads = NUMTHREADS;
-	
 	RigidBodyType* type1;
 	RigidBodyType* type2;
 	RigidBody* rb1;
-- 
GitLab