From 8733d418cd6c54b536573d1bd9ed319d83e6c608 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Fri, 19 Feb 2016 12:46:11 -0600
Subject: [PATCH] pairlist kernels work but need optimization
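
Store pairlists per cell instead of in one global list: g_numPairs,
g_pairI and g_pairJ are now indexed by cell ID, and
initializePairlistArrays() allocates them on the device heap the first
time the cell decomposition is built. createPairlists() and
computeTabulatedKernel() are launched with one thread block per cell;
within a block, threads cooperate on that cell's atoms
(createPairlists) and stride over its pairlist (computeTabulatedKernel).
The old shared-memory createPairlistsOLD() kernel is removed.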

---
 ComputeForce.cu  |  24 +++--
 ComputeForce.cuh | 235 ++++++++++++++++-------------------------------
 CudaUtil.cuh     |   1 +
 makefile         |   7 +-
 4 files changed, 99 insertions(+), 168 deletions(-)

diff --git a/ComputeForce.cu b/ComputeForce.cu
index 563762c..3e0d912 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -95,10 +95,6 @@ ComputeForce::ComputeForce(int num, const BrownianParticleType part[],
 
 	// Create and allocate the energy arrays
 	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * num));
-
-	// initializePairlistArrays
-	initializePairlistArrays<<< 1, 32 >>>();
-	gpuErrchk(cudaDeviceSynchronize());
 	
 	cudaEventCreate(&start);
 	cudaEventCreate(&stop);
@@ -358,19 +354,32 @@ bool ComputeForce::addDihedralPotential(String fileName, int ind,
 void ComputeForce::decompose(Vector3* pos) {
 	gpuErrchk( cudaProfilerStart() );
 	// Reset the cell decomposition.
+	bool newDecomp = false;
 	if (decomp_d)
 		cudaFree(decomp_d);
+	else
+		newDecomp = true;
+		
 	decomp.decompose_d(pos, num);
 	decomp_d = decomp.copyToCUDA();
 
 	// Update pairlists using cell decomposition (not sure this is really needed or good) 
 	//RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d);	
+
+	// Initialize the per-cell pairlist arrays the first time
+	// the decomposition is built
+	int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
+	if (newDecomp) {
+		initializePairlistArrays<<< 1, 32 >>>(nCells);
+		gpuErrchk(cudaDeviceSynchronize());
+	}
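+	// launch one block per cell below; threads within a block cooperate on that cell's atoms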
 	const int NUMTHREADS = 128;
-	const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
+	//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
+	const size_t nBlocks = nCells;
 
 	/* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d, decomp_d); */
 	/* gpuErrchk(cudaDeviceSynchronize()); */
-	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d);
+	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells);
 	gpuErrchk(cudaDeviceSynchronize());
 	
 	
@@ -495,7 +504,8 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type,
 
 	
 	// Call the kernel to calculate the forces
-	computeTabulatedKernel<<< numBlocks, numThreads >>>(force, pos, type,
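+	// launch one thread block per cell to match the per-cell pairlists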
+	int nb = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
+	computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type,
 			tablePot_d, tableBond_d,
 			num, numParts, sys_d,
 			bonds, bondMap,	numBonds, 
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 4347449..c93022e 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -198,180 +198,96 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 
 // RBTODO: remove global device variables for fast prototyping of pairlist kernel
 // #define MAXPAIRS 25500*25500 // (num/numReplicas) * ((num/numReplicas)-1);
-__device__ int g_numPairs;
-__device__ int* g_pairI;
-__device__ int* g_pairJ;
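+// Per-cell pairlists: g_numPairs[c] is the number of pairs found in cell c,
+// and g_pairI[c][k] / g_pairJ[c][k] hold the k-th pair of particle indices for that cell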
+__device__ int* g_numPairs;
+__device__ int** g_pairI;
+__device__ int** g_pairJ;
 __device__ bool g_pairsSet = false;
 
 __global__
-void initializePairlistArrays() {
+void initializePairlistArrays(const int nCells) {
 	const int tid = threadIdx.x;
-	const int ai = blockIdx.x * blockDim.x + threadIdx.x;
-
-	const int maxPairs = 1 << 20;
-	if ( ai == 0 ) {
-		// RBTODO: free later, do this elsewhere
-		g_pairI = (int*) malloc( maxPairs * sizeof(int));
-		g_pairJ = (int*) malloc( maxPairs * sizeof(int));
-		assert( g_pairI != NULL );
-		assert( g_pairJ != NULL );
-		g_numPairs = 0;
+	const int maxPairs = 1 << 16;	/* 65,536 pairs per cell */
+	if (blockIdx.x > 0) return;
+
+	// RBTODO: free later
+	if (tid == 0) {
+		printf("Initializing device pairlists for %d cells\n", nCells);
+		g_numPairs = (int*) malloc( nCells * sizeof(int) );  
+		g_pairI = (int**) malloc( nCells * sizeof(int*));
+		g_pairJ = (int**) malloc( nCells * sizeof(int*));
+	}
+	__syncthreads();
+	assert( g_numPairs != NULL );
+	assert( g_pairI != NULL );
+	assert( g_pairJ != NULL );
+	
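+	// NOTE: device-side malloc draws from the device heap (8 MB by default); for large
+	// nCells the host may need to raise it with cudaDeviceSetLimit(cudaLimitMallocHeapSize, ...)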
+	for (int i = tid; i < nCells; i += blockDim.x) {
+		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
+		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
+		g_numPairs[i] = 0;
+		assert( g_pairI[i] != NULL );
+		assert( g_pairJ[i] != NULL );
 	}
 }
 
 __global__
 void createPairlists(Vector3 pos[], int num, int numReplicas,
-										 BaseGrid* sys, CellDecomposition* decomp) {
+										 BaseGrid* sys, CellDecomposition* decomp, int nCells) {
 	// Loop over threads searching for atom pairs
   //   Each thread has designated values in shared memory as a buffer
   //   A sync operation periodically moves data from shared to global
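+	//   (per-cell version: each block handles one cell; for each of its atoms, threads
+	//    stride over neighbor-cell atoms and append pairs with a warp-aggregated atomic)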
 	const int NUMTHREADS = 128;		/* RBTODO: fix */
 	const int tid = threadIdx.x;
-	const int ai = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */
+
+	const int cID = blockIdx.x;		/* cell index: one thread block per cell */
 
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
-	
-	if (ai == 0) {
-		g_numPairs = 0;	/* RBTODO: be more efficient */
-		assert( g_pairI != NULL );
-		assert( g_pairJ != NULL );
-	}
 
-	// ai - index of the particle in the original, unsorted array
-	if (ai < (num-1) * numReplicas) {
-		const int repID = ai / (num-1);
-		const Vector3 posi = pos[ai];
-		
-		CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
-		const CellDecomposition::cell_t* pairs = decomp->getCells();
-		for (int x = -1; x <= 1; ++x) {
-			for (int y = -1; y <= 1; ++y) {
-				for (int z = -1; z <= 1; ++z) {					
+	/* const int3& nCells = decomp->nCells; */
+	if (cID >= nCells) return;
+	int count = 0;								/* debug */
+	const CellDecomposition::cell_t* pairs = decomp->getCells();
+
+	for (int repID = 0; repID < numReplicas; repID++) {
+		const CellDecomposition::range_t rangeI = decomp->getRange(cID, repID);
+		if (tid == 0) printf("  Cell%d: Working on %d atoms for repID %d\n",
+												 cID, rangeI.last - rangeI.first, repID);
+
+		for (int ci = rangeI.first; ci < rangeI.last; ci++) {
+			// ai - index of the particle in the original, unsorted array
+			const int ai = pairs[ci].particle;
+			// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
+			const CellDecomposition::cell_t celli = pairs[ci];
+			const Vector3 posi = pos[ai];
+				
+			for (int x = -1; x <= 1; ++x) {
+				for (int y = -1; y <= 1; ++y) {
+					for (int z = -1; z <= 1; ++z) {					
 					
-					int n = 0;
-					int last = -1;
-					const int nID = decomp->getNeighborID(celli, x, y, z);				
-					if (nID >= 0) { // set range for valid cells
+						const int nID = decomp->getNeighborID(celli, x, y, z);				
+						if (nID < 0) continue; 
 						const CellDecomposition::range_t range = decomp->getRange(nID, repID);
-						n = range.first;
-						last = range.last;
-					}
-					
-					for (; n < last; n++) {
-						const int aj = pairs[n].particle;
-						if (aj <= ai) continue;
-						// RBTODO: skip exclusions
-						
-						int wid = atomicAggInc( &g_numPairs, warpLane );
-						g_pairI[wid] = ai;
-						g_pairJ[wid] = aj;
-					} 	// n
-				} 		// z				
-			} 			// y
-		} 				// x
-	}						/* replicas */
-}
-
-__global__
-void createPairlistsOLD(Vector3 pos[], int num, int numReplicas,
-										 BaseGrid* sys, CellDecomposition* decomp) {
-	// Loop over threads searching for atom pairs
-  //   Each thread has designated values in shared memory as a buffer
-  //   A sync operation periodically moves data from shared to global
-	const int NUMTHREADS = 32;		/* RBTODO: fix */
-	__shared__ int spid[NUMTHREADS];
-	const int tid = threadIdx.x;
-	const int ai = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */
-
-	const int MAXPAIRSPERTHREAD = 8; /* optimized for 32 threads on K40 */
-	
-	// int numPairs = g_numPairs; 
-	if (ai == 0) {
-		g_numPairs = 0;	/* RBTODO: be more efficient */
-		// RBTODO: free later, do this elsewhere
-		// const int maxPairs = (num/numReplicas) * ((num/numReplicas)-1) / 1000;
-		const int maxPairs = 1e5;
-		g_pairI = (int*) malloc( maxPairs * sizeof(int));
-		g_pairJ = (int*) malloc( maxPairs * sizeof(int));
-		assert( g_pairI != NULL );
-		assert( g_pairJ != NULL );
-	}
-	
-	__syncthreads();
-	int pid = 0;
-	
-	// ai - index of the particle in the original, unsorted array
-	if (ai < (num-1) * numReplicas) {
-		const int repID = ai / (num-1);
-		// const int typei = type[i]; // maybe irrelevent
-		const Vector3 posi = pos[ai];
-		
-		// RBTODO: if this approach works well, make cell decomposition finer so limit still works
-
-		// TODO: Fix this: Find correct celli (add a new function to
-		//       CellDecomposition, binary search over cells)
-		CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
-		const CellDecomposition::cell_t* pairs = decomp->getCells();
-		for (int x = -1; x <= 1; ++x) {
-			for (int y = -1; y <= 1; ++y) {
-				for (int z = -1; z <= 1; ++z) {					
-	
-					int n = 0;
-					int last = -1;
-					const int nID = decomp->getNeighborID(celli, x, y, z);				
-					if (nID >= 0) { // set range for valid cells
-						const CellDecomposition::range_t range = decomp->getRange(nID, repID);
-						n = range.first;
-						last = range.last;
-					}
-					
-					// go through possible pairs 32 at a time per thread
-					int pairI[MAXPAIRSPERTHREAD];
-					int pairJ[MAXPAIRSPERTHREAD];
-
-					bool done = false;
-					while (!done) {
-						// if (tid == 0)
-						// printf("Working to find pairlist: numPairs = %d\n", g_numPairs);
-								
-						for (int dn = 0; dn < MAXPAIRSPERTHREAD; dn++) {
-							if ( n + dn >= last ) continue;
-
-							const int aj = pairs[n+dn].particle;
-							// RBTODO: skip exclusions
+						const int last = range.last;
+						for (int n = range.first + tid; n < last; n += NUMTHREADS) {
+							count++;
+							const int aj = pairs[n].particle;
 							if (aj <= ai) continue;
-							
-							pairI[pid] = ai;
-							pairJ[pid] = aj;
-							pid++;
-						}
-						n += MAXPAIRSPERTHREAD;
-						done = __all( n >= last );
+							// RBTODO: skip exclusions
 						
-						{ // SYNC THREADS
-							//__syncthreads();				
-							// assert(pid <= MAXPAIRSPERTHREAD);
-
-							// find where each thread should put its pairs in global list
-							spid[tid] = pid;
-							exclIntCumSum(spid,NUMTHREADS); // returns cumsum with spid[0] = 0
-
-							for (int d = 0; d < pid; d++) {
-								g_pairI[g_numPairs + d + spid[tid]] = pairI[d];
-								g_pairJ[g_numPairs + d + spid[tid]] = pairJ[d];
-							}
-							// update global index
-							__syncthreads();
-							if (tid == NUMTHREADS) g_numPairs += spid[tid] + pid;
-							pid = 0;
-						} // END SYNC THREADS 
-
-
-					} 	// n
-				} 		// z				
-			} 			// y
-		} 				// x
-	}						/* replicas */
+							int wid = atomicAggInc( &g_numPairs[cID], warpLane ); // fails
+							// int wid = atomicAdd( &g_numPairs[cID], 1 ); // works
+							g_pairI[cID][wid] = ai;
+							g_pairJ[cID][wid] = aj;
+						} 	// atoms J
+					} 		// z				
+				} 			// y
+			} 				// x
+		} // atoms I					
+	} // replicas
+	if (tid == 0) printf("Cell%d: found %d pairs\n", cID, g_numPairs[cID]);
 }
 	
 __global__
@@ -463,17 +379,20 @@ __global__ void computeTabulatedKernel(Vector3* force, Vector3* pos, int* type,
 	__shared__ int atomI[NUMTHREADS];
 	__shared__ int atomJ[NUMTHREADS];
 
-	int numPairs = g_numPairs; 
+
 
 	const int tid = threadIdx.x;
-	const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+	const int cID = blockIdx.x;
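+	// each thread block handles one cell; threads stride over that cell's pairlist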
+	int numPairs = g_numPairs[cID]; 
+	// const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 
 	// loop over particle pairs in pairlist
-	if (gid < numPairs) {
+	for (int i = tid; i < numPairs; i+=NUMTHREADS) {
 		// BONDS: RBTODO: handle through an independent kernel call
 		// RBTODO: handle exclusions in other kernel call
-		const int ai = g_pairI[tid];
-		const int aj = g_pairJ[tid];
+		const int ai = g_pairI[cID][i];
+		const int aj = g_pairJ[cID][i];
 		
 		// Particle's type and position
 		const int ind = type[ai] + type[aj] * numParts; /* RBTODO: why is numParts here? */
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 1adb2af..7241f5c 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -12,6 +12,7 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
 	if ( warpLane == leader )
 		res = atomicAdd(ctr, __popc(mask));
 	res = warp_bcast(res,leader);
+	
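+	// each active lane gets a unique slot: the leader's base index plus the lane's rank among lower active lanes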
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
 	
diff --git a/makefile b/makefile
index 8f9b8fc..83fcb81 100644
--- a/makefile
+++ b/makefile
@@ -11,14 +11,15 @@ include ./findcudalib.mk
 INCLUDE = $(CUDA_PATH)/include
 
 
-DEBUG = -g -O0
+# DEBUG = -g -O0
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
 # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
 NV_FLAGS = -Xptxas -v # -v,-abi=no 
-EX_FLAGS = -O3 -m$(OS_SIZE)
-# EX_FLAGS = -O0 -m$(OS_SIZE)
 ifneq ($(DEBUG),)
 	NV_FLAGS += -g -G								#debug
+	EX_FLAGS = -O0 -m$(OS_SIZE)
+else
+	EX_FLAGS = -O3 -m$(OS_SIZE)
 endif
 NV_FLAGS += -lineinfo
 
-- 
GitLab