From 7ce58c53c46c2e338ba65dc5dc520fc22e7e628b Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Thu, 18 Feb 2016 18:57:56 -0600
Subject: [PATCH] pairlist code in horrible shape; about to clean for a
 simplified approach; committing for posterity

---
 ComputeForce.cuh |  48 +++++++++++------
 CudaUtil.cuh     | 132 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+), 16 deletions(-)

diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 4564bbb..57d652d 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -225,6 +225,8 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 
 	const int MAXPAIRSPERTHREAD = 8; /* optimized for 32 threads on K40 */
 	const int MAXPAIRSPERBLOCK = MAXPAIRSPERTHREAD*NUMTHREADS*4;
+	const int WARPSIZE = 32;			
+	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
 	
 	pairCount[tid] = 0;
 	// int numPairs = g_numPairs; 
@@ -278,8 +280,10 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 					}
 					
 					// go through possible pairs in chunks for each thread
-					__shared__ int pairI[MAXPAIRSPERTHREAD*NUMTHREADS];
-					__shared__ int pairJ[MAXPAIRSPERTHREAD*NUMTHREADS];
+					/* __shared__ int pairI[NUMTHREADS]; */
+					__shared__ int pairJ[NUMTHREADS];
+					/* __shared__ sharedIntBuffer pairI( */
+					/* 	MAXPAIRSPERTHREAD*NUMTHREADS, NUMTHREADS, 1); */
 					
 					bool done = false;
 					int pidLast = 0;
@@ -287,24 +291,36 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 						// if (tid == 0)
 						// printf("Working to find pairlist: numPairs = %d\n", g_numPairs);
 								
-						for (int dn = 0; dn < MAXPAIRSPERTHREAD; dn++) {
-							if ( n + dn >= last ) break;
-
-							const int aj = pairs[n+dn].particle;
-							// RBTODO: skip exclusions
-							if (aj <= ai) continue;
-							
-							pairI[pid+MAXPAIRSPERTHREAD*tid] = ai;
-							pairJ[pid+MAXPAIRSPERTHREAD*tid] = aj;
-							pid++;
-						}
-						n += MAXPAIRSPERTHREAD;
+						if ( n >= last ) break;
+						const int aj = pairs[n].particle;
+						bool valid = true;
+						// RBTODO: skip exclusions
+						if (aj <= ai) continue;
+						/* pairI[pid+MAXPAIRSPERTHREAD*tid] = ai; */
+						pairJ[tid] = aj;
+						n++;
 						done = __all( n >= last );
 
 						{ // SYNC THREADS
 							// find where each thread should put its pairs in global list
-							pairCount[tid] = pid;
-							inclIntCumSum(pairCount,NUMTHREADS); // returns cumsum with pairCoun[t0] = 0
+
+							// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ 
+							int validVote = __ballot(valid);
+							/* int mask      = INT_MAX >> ( WARPSIZE - warpLane - 1 ); */
+							/* int warpPairs = __popc( validVote & mask ); */
+							
+							int leader = __ffs(validVote)-1; /* select a leader */
+
+							int wid;
+							if ( warpLane == leader )
+								wid = atomicAdd( &gpid , __popc(validVote) );
+							wid = warp_bcast( wid, leader );
+							wid = wid + __popc( validVote & ((1 << warpLane) - 1) );
+							g_pairI[wid] = ai;
+							g_pairJ[wid] = aj;
+						}
+								// inclIntCumSum(pairCount,NUMTHREADS); // returns cumsum with pairCoun[t0] = 0
+							
 							int total = pairCount[NUMTHREADS-1];		/* number of new pairs */
 
 							// loop over new pairs, with stride of threads
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index f44034f..2bbee58 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -1,5 +1,137 @@
 #pragma once
 
+#define WARPSIZE 32
+
+/* __device__ int atomicAggInc(int *ctr, int warpLane) { */
+/* 	int mask = __ballot(1); */
+/* 	int leadThread = __ffs(mask)-1; */
+
+/* 	int res; */
+/* 	if ( warpLane == leadThread ) */
+/* 		res = atomicAdd(ctr, __popc(mask)); */
+
+/* 	res = warp_ */
+										
+	
+class localIntBuffer {
+public:
+	__device__ localIntBuffer(const int s, const int nt, const int c) :
+		size(s), numThreads(nt), subChunksPerBlock(c),
+		chunkSize(c*nt), numChunks(s/(c*nt)) {
+		// numChunks = size / chunkSize; /* assume 0 remainder */
+		
+		buffer = (int*) malloc( size * sizeof(int) );
+
+		freeChunk = (bool*) malloc( numChunks * sizeof(bool) );
+		for (int i = 0; i < numChunks; i++)	freeChunk[i] = true;
+		chunk = getNextChunk();
+		chunkOffset = 0;
+	}
+	__device__ ~localIntBuffer() {
+		free(buffer); free(freeChunk);
+	}
+	__device__ void append( const int tid, const int val ) {
+		// RBTODO
+		buffer[chunk*chunkSize + chunkOffset + tid] = val;
+
+		// update offset
+	}
+
+	__device__ void sendChunks( const int tid, int* globalDest, int& g_offset ) {
+		int chunksSent = 0;
+		for (int i = 0; i < numChunks; i++) {
+			if (!freeChunk[i] && i != chunk) { // found a chunk to send
+				for (int c = 0; c < subChunksPerBlock; c++) {
+					globalDest[g_offset + tid] = buffer[i*chunkSize + c*numThreads + tid];
+					g_offset += numThreads;
+				}
+				// free chunk
+				freeChunk[i] = true;
+				// NOT DONE
+			}
+		}
+	}
+
+private:
+	__device__ int getNextChunk() {
+		for (int i = 0; i < numChunks; i++) {
+			if (freeChunk[i]) {
+				freeChunk[i] = false;
+				return i;
+			}
+		}
+		return -1;									/* out of chunks */
+	}
+
+	const int size;
+  const int numThreads;
+	const int subChunksPerBlock;
+	const int chunkSize;
+  const int numChunks;
+	int* buffer;
+  bool* freeChunk;
+  int chunk;
+	int chunkOffset;
+};
+
+class sharedIntBuffer {
+public:
+	__device__ sharedIntBuffer(const int s, const int nt, const int c) :
+		size(s), numThreads(nt), subChunksPerBlock(c),
+		chunkSize(c*nt), numChunks(s/(c*nt)) {
+		// numChunks = size / chunkSize; /* assume 0 remainder */
+		
+		buffer = (int*) malloc( size * sizeof(int) );
+
+		freeChunk = (bool*) malloc( numChunks * sizeof(bool) );
+		for (int i = 0; i < numChunks; i++)	freeChunk[i] = true;
+		chunk = getNextChunk();
+		chunkOffset = 0;
+	}
+	__device__ ~sharedIntBuffer() {
+		free(buffer); free(freeChunk);
+	}
+	__device__ void append( const int tid, const int val ) {
+		// RBTODO
+		buffer[chunk*chunkSize + chunkOffset + tid] = val;
+
+		// update offset
+	}
+
+	__device__ void sendChunks( const int tid, int* globalDest, int& g_offset ) {
+		int chunksSent = 0;
+		for (int i = 0; i < numChunks; i++) {
+			if (!freeChunk[i] && i != chunk) { // found a chunk to send
+				for (int c = 0; c < subChunksPerBlock; c++) {
+					globalDest[g_offset + tid] = buffer[i*chunkSize + c*numThreads + tid];
+					g_offset += numThreads;
+				}
+			}
+		}
+	}
+
+private:
+	__device__ int getNextChunk() {
+		for (int i = 0; i < numChunks; i++) {
+			if (freeChunk[i]) {
+				freeChunk[i] = false;
+				return i;
+			}
+		}
+		return -1;									/* out of chunks */
+	}
+
+	const int size;
+  const int numThreads;
+	const int subChunksPerBlock;
+	const int chunkSize;
+  const int numChunks;
+	int* buffer;
+  bool* freeChunk;
+  int chunk;
+	int chunkOffset;
+};
+	
 __device__ inline void exclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
 	// 2) int n must be power of 2
-- 
GitLab