From 8c67cfb978c60ebfa0cad47a6ef507ea2d36d5d8 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Mon, 22 Feb 2016 14:06:53 -0600
Subject: [PATCH] pairlist kernel being rewritten; amend this broken commit
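
The pairlist-creation kernel is being split so that several blocks
(blocksPerCell = 10) cooperate on each cell: decompose() now allocates
nCells*blocksPerCell device pairlists and launches one block per list,
with each block striding over its cell's atoms by blockLane.  Each list
holds at most 1 << 12 pairs; when a warp's current list fills up, it
marks the abandoned slots and moves on to the next list index.  Slot
reservation still goes through the warp-aggregated atomicAggInc helper.
Debug flags are enabled in the makefile while this is being worked on.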

---
 ComputeForce.cu  |  7 +++--
 ComputeForce.cuh | 80 +++++++++++++++++++++++++++++++++++-------------
 CudaUtil.cuh     |  2 +-
 makefile         |  2 +-
 4 files changed, 65 insertions(+), 26 deletions(-)
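
Note: createPairlists reserves slots in the device pairlists through the
warp-aggregated increment in CudaUtil.cuh; only the tail of that helper is
visible in the hunk below.  For reference, here is a minimal sketch of what
atomicAggInc is assumed to look like (the __ballot()/leader-election part is
reconstructed and is not taken from this patch; warp_bcast is the project's
own warp-broadcast helper):

    __device__ int atomicAggInc(int *ctr, int warpLane) {
    	// one atomicAdd per warp: the lowest active lane reserves a slot
    	// for every thread that is currently active
    	int mask   = __ballot(1);
    	int leader = __ffs(mask) - 1;
    	int res;
    	if (warpLane == leader)
    		res = atomicAdd(ctr, __popc(mask));
    	res = warp_bcast(res, leader);	// share the base offset with the warp
    	// each lane takes base + (number of active lanes below it)
    	return res + __popc( mask & ((1 << warpLane) - 1) );
    }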

diff --git a/ComputeForce.cu b/ComputeForce.cu
index 3e0d912..68468cb 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -369,17 +369,18 @@ void ComputeForce::decompose(Vector3* pos) {
 	
 		// initializePairlistArrays
 	int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
+	const int blocksPerCell = 10;	// number of blocks launched per decomposition cell
 	if (newDecomp) {
-		initializePairlistArrays<<< 1, 32 >>>(nCells);
+		initializePairlistArrays<<< 1, 32 >>>(nCells*blocksPerCell);
 		gpuErrchk(cudaDeviceSynchronize());
 	}
 	const int NUMTHREADS = 128;
 	//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
-	const size_t nBlocks = nCells;
+	const size_t nBlocks = nCells*blocksPerCell;
 
 	/* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d, decomp_d); */
 	/* gpuErrchk(cudaDeviceSynchronize()); */
-	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells);
+	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells, blocksPerCell);
 	gpuErrchk(cudaDeviceSynchronize());
 	
 	
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index c93022e..f7ca683 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -6,7 +6,6 @@
 #include "CudaUtil.cuh"
 #include <assert.h>
 
-
 __host__ __device__
 EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha,
 																			 float start, float len) {
@@ -201,27 +200,29 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 __device__ int* g_numPairs;
 __device__ int** g_pairI;
 __device__ int** g_pairJ;
-__device__ bool g_pairsSet = false;
+__device__ int g_nextPairlist;
+const __device__ int maxPairs = 1 << 12;	/* capacity of each pairlist (~4,000 pairs) */
 
 __global__
-void initializePairlistArrays(const int nCells) {
+void initializePairlistArrays(const int nLists) {
 	const int tid = threadIdx.x;
-	const int maxPairs = 1 << 16;	/* ~60,000 per cell */
+//	const int maxPairs = 1 << 12;	/* ~4,000 per pairlist */
 	if (blockIdx.x > 0) return;
-
+	
 	// RBTODO: free later
 	if (tid == 0) {
-		printf("Initializing device pairlists for %d cells\n", nCells);
-		g_numPairs = (int*) malloc( nCells * sizeof(int) );  
-		g_pairI = (int**) malloc( nCells * sizeof(int*));
-		g_pairJ = (int**) malloc( nCells * sizeof(int*));
+		printf("Initializing %d device pairlists\n", nLists);
+		g_numPairs = (int*) malloc( nLists * sizeof(int) );  
+		g_pairI = (int**) malloc( nLists * sizeof(int*));
+		g_pairJ = (int**) malloc( nLists * sizeof(int*));
+		g_nextPairlist = 0;
 	}		
 	__syncthreads();	
 	assert( g_numPairs != NULL );
 	assert( g_pairI != NULL );
 	assert( g_pairJ != NULL );
 	
-	for (int i = tid; i < nCells; i += blockDim.x) {
+	for (int i = tid; i < nLists; i += blockDim.x) {
 		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
 		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
 		g_numPairs[i] = 0;
@@ -232,20 +233,27 @@ void initializePairlistArrays(const int nCells) {
 
 __global__
 void createPairlists(Vector3 pos[], int num, int numReplicas,
-										 BaseGrid* sys, CellDecomposition* decomp, int nCells) {
+										 BaseGrid* sys, CellDecomposition* decomp, const int nCells, const int blocksPerCell) {
 	// Loop over threads searching for atom pairs
   //   Each thread has designated values in shared memory as a buffer
   //   A sync operation periodically moves data from shared to global
 	const int NUMTHREADS = 128;		/* RBTODO: fix */
 	const int tid = threadIdx.x;
+	const int bid = blockIdx.x;
 
-	const int cID = blockIdx.x;
-	// * blockDim.x + threadIdx.x; 
-	// const int cID = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */
+	const int cID = bid / blocksPerCell;
 
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
+	const int wid = tid/WARPSIZE;
+	const int blockLane = bid % blocksPerCell;
+
+	__shared__ int pid[NUMTHREADS/WARPSIZE];
+	if (warpLane == 0) pid[wid] = bid;	// each warp starts on its own block's pairlist
+
+	/* if (warpLane == 0) */
+	/* 	pid = atomicAdd( &g_nextPairlist, 1 ); */
+	/* res = warp_bcast(res,leader); */
 
-	/* const int3& nCells = decomp->nCells; */
 	if (cID >= nCells) return;
 	int count = 0;								/* debug */
 	const CellDecomposition::cell_t* pairs = decomp->getCells();
@@ -255,7 +263,7 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 		if (tid == 0) printf("  Cell%d: Working on %d atoms for repID %d\n",
 												 cID, rangeI.last - rangeI.first, repID);
 
-		for (int ci = rangeI.first; ci < rangeI.last; ci++) {
+		for (int ci = rangeI.first + blockLane; ci < rangeI.last; ci+=blocksPerCell) {
 			// ai - index of the particle in the original, unsorted array
 			const int ai = pairs[ci].particle;
 			// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
@@ -276,11 +284,41 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 							const int aj = pairs[n].particle;
 							if (aj <= ai) continue;
 							// RBTODO: skip exclusions
-						
-							int wid = atomicAggInc( &g_numPairs[cID], warpLane ); // fails
-							// int wid = atomicAdd( &g_numPairs[cID], 1 ); // works
-							g_pairI[cID][wid] = ai;
-							g_pairJ[cID][wid] = aj;
+
+							/* int gid; */
+							/* 	{ // per-warp atomic add to get global indices */
+							/* 	int t_active = __ballot(1); */
+							/* 	int leader = __ffs(t_active)-1; */
+							/* 	int res; */
+
+							/* 	// RBTODO: see if performance improves with __any(gid >= maxPairs  */
+							/* 	if ( warpLane == leader ) { */
+							/* 		const int t_count = __popc(t_active); */
+							/* 		res = atomicAdd( &g_numPairs[pid], t_count ); */
+							/* 		if ( res + t_count >= maxPairs ) { // went too far; mark invalid and go again */
+							/* 			int tmp = atomicSub( &g_numPairs[pid], t_count ); */
+							/* 			assert( tmp == res + t_count ); */
+							/* 			pid++; */
+							/* 			res = atomicAdd( &g_numPairs[pid], t_count ); */
+							/* 		} */
+							/* 	} */
+							/* 	pid = warp_bcast(pid,leader); */
+							/* 	res = warp_bcast(res,leader); */
+							/* 	gid = res + __popc( t_active & ((1 << warpLane) - 1) ); */
+							/* } */
+							int gid = atomicAggInc( &g_numPairs[pid[wid]], warpLane ); // fails
+							if (__any(gid >= maxPairs)) { // a little inefficient, but important
+								if (gid < maxPairs) g_pairI[pid[wid]][gid] = -1;	// mark abandoned slots invalid
+								if (gid < maxPairs) g_pairJ[pid[wid]][gid] = -1;
+								pid[wid]++;					/* needs to apply to ALL warp threads */
+								// we assume arrays at pid are nearly empty (no while loop)  
+								gid = atomicAggInc( &g_numPairs[pid[wid]], warpLane ); /* assume this hasn't filled */
+							}
+							
+							
+							// int gid = atomicAdd( &g_numPairs[pid[wid]], 1 ); // works
+							g_pairI[pid[wid]][gid] = ai;
+							g_pairJ[pid[wid]][gid] = aj;
 						} 	// atoms J
 					} 		// z				
 				} 			// y
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 7241f5c..70d7e78 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -15,7 +15,7 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
 	
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
-	
+
 	
 __device__ inline void exclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
diff --git a/makefile b/makefile
index 83fcb81..435923b 100644
--- a/makefile
+++ b/makefile
@@ -11,7 +11,7 @@ include ./findcudalib.mk
 INCLUDE = $(CUDA_PATH)/include
 
 
-# DEBUG = -g -O0
+DEBUG = -g -O0
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
 # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
 NV_FLAGS = -Xptxas -v # -v,-abi=no 
-- 
GitLab