From acd1ecbed7269d1a6dce030437d16338b511937a Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 17 May 2016 11:16:00 -0500
Subject: [PATCH] slightly simplified pairlist kernel

---
 ComputeForce.cu  |  78 ++++++++++++++++++++-----------
 ComputeForce.cuh | 119 +++++++++++++++++++++++++++++++++++++----------
 makefile         |  10 ++--
 notes.org        |  10 ++--
 4 files changed, 160 insertions(+), 57 deletions(-)

diff --git a/ComputeForce.cu b/ComputeForce.cu
index c311281..6566354 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -6,6 +6,7 @@
 #include "ComputeForce.cuh"
 #include <cuda_profiler_api.h>
 
+
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
@@ -16,6 +17,10 @@ inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
 
 cudaEvent_t start, stop;
 
+void runSort(int2 *d1, int *d2, float *key,
+				int2 *scratch1, int  *scratch2, float *scratchKey,
+				unsigned int count);
+
 ComputeForce::ComputeForce(int num, const BrownianParticleType part[],
 													 int numParts, const BaseGrid* g, float switchStart,
 													 float switchLen, float electricConst,
@@ -367,7 +372,7 @@ void ComputeForce::decompose(Vector3* pos, int type[]) {
 	//RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d);	
 
 
-	size_t free, total;
+	/* size_t free, total; */
 	/* { */
 	/* 	cuMemGetInfo(&free,&total); */
 	/* 	printf("Free memory: %zu / %zu\n", free, total); */
@@ -380,14 +385,12 @@ void ComputeForce::decompose(Vector3* pos, int type[]) {
 		// RBTODO: free memory elsewhere
 		// allocate device data
 		// initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell);
-		const int maxPairs = 1<<26;
-		gpuErrchk(cudaMalloc(&numPairs_d,       sizeof(int)*maxPairs));
-		gpuErrchk(cudaMalloc(&pairLists_d,     sizeof(int2)*maxPairs));
+		const int maxPairs = 1<<25;
+		gpuErrchk(cudaMalloc(&numPairs_d,       sizeof(int)));
+
+		gpuErrchk(cudaMalloc(&pairLists_d,      sizeof(int2)*maxPairs));
 		gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs));
-		int tmp = 0;
-		gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp,
-															sizeof(int), cudaMemcpyHostToDevice));
-		
+
 		gpuErrchk(cudaDeviceSynchronize());
 	}
 
@@ -417,14 +420,32 @@ void ComputeForce::decompose(Vector3* pos, int type[]) {
 	float pairlistdist2 = (sqrt(cutoff2) + 2.0f);
 	pairlistdist2 = pairlistdist2*pairlistdist2;
 	
-	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas,
-																						 sys_d, decomp_d, nCells, blocksPerCell,
-																						 numPairs_d, pairLists_d,
-																						 numParts, type, pairTabPotType_d,pairlistdist2);
-
-	gpuErrchk(cudaDeviceSynchronize());
-	
-	
+	createPairlists<<< 2048, 64 >>>(pos, num, numReplicas,
+					sys_d, decomp_d, nCells,
+					numPairs_d, pairLists_d,
+					numParts, type, pairTabPotType_d, pairlistdist2);
+	/* createPairlistsOld<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */
+	/* 																					 sys_d, decomp_d, nCells, blocksPerCell, */
+	/* 																					 numPairs_d, pairLists_d, */
+	/* 																					 numParts, type, pairTabPotType_d, pairlistdist2); */
+
+	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
+	// if (false)
+	{ // sort pairlist
+		int numPairs;
+		gpuErrchk(cudaMemcpyAsync( &numPairs, numPairs_d, sizeof(int), cudaMemcpyDeviceToHost));
+		gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
+		printf("here, %d pairs\n", numPairs);
+		/* runSort(pairLists_d, pairTabPotType_d, pairDists_d, */
+		/* 				pairLists_s, pairTabPotType_s, pairDists_s, */
+		/* 				numPairs); */
+		/* printf("done\n"); */
+		
+		/* // RBTODO: sort pairListInd as well!!! (i.e. roll your own sort!) */
+		/* // thrust::sort_by_key( pairDists_d, pairDists_d+numPairs_d, pairLists_d ); */
+		/* // thrust::sort_by_key( pairDists_d, pairDists_d+numPairs_d, pairLists_d ); */
+		/* gpuErrchk(cudaDeviceSynchronize()); /\* RBTODO: sync needed here? *\/ */
+	}
 }
 
 IndexList ComputeForce::decompDim() const {
@@ -549,21 +570,25 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type,
 	// int nb = (1+(decomp.nCells.x * decomp.nCells.y * decomp.nCells.z)) * 75; /* RBTODO: number of pairLists */
 	const int nb = 800;
 	// printf("ComputeTabulated\n");
-	computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type,
-			tablePot_d, tableBond_d,
-			num, sys_d,
-			bonds, bondMap,	numBonds,
-																							 cutoff2,
-																							 numPairs_d, pairLists_d,
-																							 pairTabPotType_d);
-
-	gpuErrchk(cudaDeviceSynchronize());
+	
+	if (get_energy) {
+		clearEnergies<<< nb, numThreads >>>(energies_d,num);
+		gpuErrchk(cudaDeviceSynchronize());
+		computeTabulatedEnergyKernel<<< nb, numThreads >>>(force, pos, type,
+						tablePot_d, tableBond_d, sys_d,
+						bonds, bondMap,	numBonds,	energies_d,	cutoff2,
+						numPairs_d, pairLists_d, pairTabPotType_d);
+	} else {
+		computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type,
+						tablePot_d, tableBond_d, sys_d,
+						bonds, bondMap,	numBonds, cutoff2,
+						numPairs_d, pairLists_d, pairTabPotType_d);
+	}
 	/* printPairForceCounter<<<1,32>>>(); */
 	/* gpuErrchk(cudaDeviceSynchronize()); */
 
 	computeAngles<<<numBlocks, numThreads>>>(force, pos, angles,
 			tableAngle_d, numAngles, num, sys_d, energies_d, get_energy);
-	gpuErrchk(cudaDeviceSynchronize());
 
 	computeDihedrals<<<numBlocks, numThreads>>>(force, pos, dihedrals,
 			tableDihedral_d, numDihedrals, num, sys_d, energies_d, get_energy);
@@ -611,4 +636,3 @@ float ComputeForce::computeTabulatedFull(Vector3* force, Vector3* pos, int* type
 
 	return energy;
 }
-
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 6d6350c..10454f2 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -197,7 +197,7 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 }
 
 
-const __device__ int maxPairs = 1 << 14;
+/* const __device__ int maxPairs = 1 << 14; */
 
 /* __global__ */
 /* void pairlistTest(Vector3 pos[], int num, int numReplicas, */
@@ -213,7 +213,7 @@ const __device__ int maxPairs = 1 << 14;
 /* } */
 
 __global__
-void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
+void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas,
 										 BaseGrid* sys, CellDecomposition* __restrict__ decomp,
 										 const int nCells, const int blocksPerCell,
 										 int* g_numPairs, int2* g_pair,
@@ -262,18 +262,19 @@ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
 							if (aj <= ai) continue;
 
 							// skip ones that are too far away
-							const Vector3 dr = sys->wrapDiff(pos[aj] - pos[ai]);
-							if (dr.length2() > pairlistdist2) continue;
-							
+							float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2();
+							if (dr > pairlistdist2) continue;
+
+							int gid = atomicAggInc( g_numPairs, warpLane );
+
 							// RBTODO: skip exclusions, non-interacting types
 							int pairType = type[ai] + type[aj] * numParts;
-							int gid = atomicAggInc( g_numPairs, warpLane );
 							/* assert( ai >= 0 ); */
 							/* assert( aj >= 0 ); */
 							
 							g_pair[gid] = make_int2(ai,aj);
 							g_pairTabPotType[gid] = pairType;
-
+							// g_pairDists[gid] = dr; 
 						} 	// atoms J
 					} 		// z				
 				} 			// y
@@ -282,8 +283,75 @@ void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
 	} // replicas
 	/* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */
 }
-	
+
 __global__
+void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
+				BaseGrid* sys, CellDecomposition* __restrict__ decomp,
+				const int nCells,
+				int* g_numPairs, int2* g_pair,
+				int numParts, int type[], int* __restrict__ g_pairTabPotType,
+				float pairlistdist2) {
+	// Loop over threads searching for atom pairs
+  //   Each thread has designated values in shared memory as a buffer
+  //   A sync operation periodically moves data from shared to global
+	const int tid = threadIdx.x;
+	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
+	const int split = 32;					/* numblocks should be divisible by split */
+	/* const int blocksPerCell = gridDim.x/split;  */
+	
+	const CellDecomposition::cell_t* pairs = decomp->getCells();
+	for (int cID = 0 + (blockIdx.x % split); cID < nCells; cID += split) {
+	// for (int cID = blockIdx.x/blocksPerCell; cID < nCells; cID += split ) {
+		for (int repID = 0; repID < numReplicas; repID++) {
+			const CellDecomposition::range_t rangeI = decomp->getRange(cID, repID);
+
+			for (int ci = rangeI.first + blockIdx.x/split; ci < rangeI.last; ci += gridDim.x/split) {
+			/* for (int ci = rangeI.first + (blockIdx.x % blocksPerCell); */
+			/* 		 ci < rangeI.last; */
+			/* 		 ci += blocksPerCell) { */
+
+			// ai - index of the particle in the original, unsorted array
+				const int ai = pairs[ci].particle;
+				// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
+				const CellDecomposition::cell_t celli = pairs[ci];
+				const Vector3 posi = pos[ai];
+				
+				for (int x = -1; x <= 1; ++x) {
+					for (int y = -1; y <= 1; ++y) {
+						for (int z = -1; z <= 1; ++z) {					
+							
+							const int nID = decomp->getNeighborID(celli, x, y, z);				
+							if (nID < 0) continue; 
+							const CellDecomposition::range_t range = decomp->getRange(nID, repID);
+							
+							for (int n = range.first + tid; n < range.last; n+=blockDim.x) {
+								const int aj = pairs[n].particle;
+								if (aj <= ai) continue;
+								
+								// skip ones that are too far away
+								float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2();
+								if (dr > pairlistdist2) continue;
+								
+								int gid = atomicAggInc( g_numPairs, warpLane );
+								
+								// RBTODO: skip exclusions, non-interacting types
+								int pairType = type[ai] + type[aj] * numParts;
+								/* assert( ai >= 0 ); */
+								/* assert( aj >= 0 ); */
+								
+								g_pair[gid] = make_int2(ai,aj);
+								g_pairTabPotType[gid] = pairType;
+								// g_pairDists[gid] = dr; 
+							} 	// atoms J
+						} 		// z				
+					} 			// y
+				} 				// x
+			} // atoms I					
+		} // replicas
+		/* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */
+	}
+}
+	__global__
 void computeKernel(Vector3 force[], Vector3 pos[], int type[],
 									 float tableAlpha[], float tableEps[], float tableRad6[],
 									 int num, int numParts, BaseGrid* sys,
@@ -365,14 +433,13 @@ __global__ void printPairForceCounter() {
 // Kernel computes forces between Brownian particles (ions)
 // using cell decomposition
 //
-__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, int* type,
-		TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond,
-		int num, const BaseGrid* __restrict__ sys,
-		const Bond* __restrict__ bonds, const int2* __restrict__ bondMap, int numBonds,
-																			 float cutoff2,
-																			 const int* __restrict__ g_numPairs,
-																			 const int2* __restrict__ g_pair,
-																			 const int* __restrict__ g_pairTabPotType) {
+__global__ void computeTabulatedKernel(
+	Vector3* force, const Vector3* __restrict__ pos, int* type,
+	TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond,
+	const BaseGrid* __restrict__ sys,
+	const Bond* __restrict__ bonds, const int2* __restrict__ bondMap, int numBonds,
+	float cutoff2, const int* __restrict__ g_numPairs,
+	const int2* __restrict__ g_pair, const int* __restrict__ g_pairTabPotType) {
 	
 
 	const int numPairs = *g_numPairs;
@@ -401,14 +468,18 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict
 }
 
 
-__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos, int* type,
-		TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond,
-		int num, BaseGrid* __restrict__ sys,
-		Bond* __restrict__ bonds, int2* __restrict__ bondMap, int numBonds,
-		float* g_energies, float cutoff2,
-																			 int* __restrict__ g_numPairs,
-																			 int2* __restrict__ g_pair,
-																			 int* __restrict__ g_pairTabPotType) {
+__global__ void clearEnergies(float* g_energies, int num) {
+	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
+		g_energies[i] = 0.0f;
+	}
+}
+__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos,
+				int* type, TabulatedPotential** __restrict__ tablePot,
+				TabulatedPotential** __restrict__ tableBond,
+				BaseGrid* __restrict__ sys, Bond* __restrict__ bonds,
+				int2* __restrict__ bondMap, int numBonds, float* g_energies, float cutoff2,
+				int* __restrict__ g_numPairs,	int2* __restrict__ g_pair,
+				int* __restrict__ g_pairTabPotType) {
 	
 	const int numPairs = *g_numPairs;
 	// RBTODO: BONDS (handle through an independent kernel call?)
diff --git a/makefile b/makefile
index 015abdf..6ec127e 100644
--- a/makefile
+++ b/makefile
@@ -2,7 +2,8 @@
 
 # CUDA_PATH ?= /Developer/NVIDIA/CUDA-6.5
 # CUDA_PATH ?= /software/cuda-toolkit-6.5-x86_64
-CUDA_PATH ?= /usr/local/cuda-7.0
+# CUDA_PATH ?= /usr/local/cuda-7.0
+CUDA_PATH ?= /usr/local/cuda-7.5
 
 #CUDA_PATH ?= /usr/local/encap/cuda-5.5
 
@@ -11,7 +12,7 @@ include ./findcudalib.mk
 INCLUDE = $(CUDA_PATH)/include
 
 
-DEBUG = -g -O0
+# DEBUG = -g -O0
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
 # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
 NV_FLAGS = -Xptxas -v # -v,-abi=no 
@@ -47,7 +48,10 @@ CODE_20 := -arch=sm_20
 # CODE_35 := -arch=compute_35
 
 # NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35)
-NV_FLAGS += -arch=sm_35
+# NV_FLAGS += -arch=sm_35
+SM=52
+NV_FLAGS += -gencode arch=compute_$(SM),code=compute_$(SM)
+
 
 NVLD_FLAGS := $(NV_FLAGS) --device-link 
 # NV_FLAGS += -rdc=true
diff --git a/notes.org b/notes.org
index 06de8c0..996b47a 100644
--- a/notes.org
+++ b/notes.org
@@ -1,8 +1,12 @@
 
 * TODO active
-** move to efficient linear interpolation everywhere
-** update pairlists
-** statistical tests of results
+** split computation into multiple regions
+*** simulate regions, also in overlap
+**** atoms at edge of overlap can have DoF frozen
+*** periodically synchronize positions
+*** optimize to minimize communication & error
+**** monitor energy jumps associated with this process
+
 
 * TODO eventually
 ** organize code a bit better
-- 
GitLab