From 7ce58c53c46c2e338ba65dc5dc520fc22e7e628b Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Thu, 18 Feb 2016 18:57:56 -0600
Subject: [PATCH] pairlist code in horrible shape; about to clean for a
 simplified approach; committing for posterity

---
 ComputeForce.cuh |  48 +++++++++++------
 CudaUtil.cuh     | 132 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 164 insertions(+), 16 deletions(-)

diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 4564bbb..57d652d 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -225,6 +225,8 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 	const int MAXPAIRSPERTHREAD = 8; /* optimized for 32 threads on K40 */
 	const int MAXPAIRSPERBLOCK = MAXPAIRSPERTHREAD*NUMTHREADS*4;
+	const int WARPSIZE = 32;
+	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
 	pairCount[tid] = 0;
 	// int numPairs = g_numPairs;
@@ -278,8 +280,10 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 	}
 
 	// go through possible pairs in chunks for each thread
-	__shared__ int pairI[MAXPAIRSPERTHREAD*NUMTHREADS];
-	__shared__ int pairJ[MAXPAIRSPERTHREAD*NUMTHREADS];
+	/* __shared__ int pairI[NUMTHREADS]; */
+	__shared__ int pairJ[NUMTHREADS];
+	/* __shared__ sharedIntBuffer pairI( */
+	/* 	MAXPAIRSPERTHREAD*NUMTHREADS, NUMTHREADS, 1); */
 
 	bool done = false;
 	int pidLast = 0;
@@ -287,24 +291,36 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 		// if (tid == 0)
 		// 	printf("Working to find pairlist: numPairs = %d\n", g_numPairs);
 
-		for (int dn = 0; dn < MAXPAIRSPERTHREAD; dn++) {
-			if ( n + dn >= last ) break;
-
-			const int aj = pairs[n+dn].particle;
-			// RBTODO: skip exclusions
-			if (aj <= ai) continue;
-
-			pairI[pid+MAXPAIRSPERTHREAD*tid] = ai;
-			pairJ[pid+MAXPAIRSPERTHREAD*tid] = aj;
-			pid++;
-		}
-		n += MAXPAIRSPERTHREAD;
+		if ( n >= last ) break;
+		const int aj = pairs[n].particle; /* chunk loop (and its dn) removed */
+		// RBTODO: skip exclusions
+		bool valid = (aj > ai); /* no early continue: every thread must reach the warp vote */
+		/* pairI[pid+MAXPAIRSPERTHREAD*tid] = ai; */
+		pairJ[tid] = aj;
+		n++;
 		done = __all( n >= last );
 
 		{ // SYNC THREADS
 			// find where each thread should put its pairs in global list
-			pairCount[tid] = pid;
-			inclIntCumSum(pairCount,NUMTHREADS); // returns cumsum with pairCoun[t0] = 0
+
+			// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
+			int validVote = __ballot(valid);
+			int warpPairs = __popc(validVote); /* pairs found by this warp */
+
+			int leader = __ffs(validVote)-1; /* select a leader: lowest lane with a valid pair */
+
+			int wid;
+			if ( warpLane == leader )
+				wid = atomicAdd( &gpid , warpPairs ); /* leader reserves room for the whole warp */
+			wid = warp_bcast( wid, leader );
+			wid = wid + __popc( validVote & ((1u << warpLane) - 1) ); /* rank among valid lanes */
+			if (valid) {
+				g_pairI[wid] = ai;
+				g_pairJ[wid] = aj;
+			}
+		}
+
+		// inclIntCumSum(pairCount,NUMTHREADS); // returns cumsum with pairCount[0] = 0
+
 		int total = pairCount[NUMTHREADS-1]; /* number of new pairs */
 
 		// loop over new pairs, with stride of threads
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index f44034f..2bbee58 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -1,5 +1,137 @@
 #pragma once
 
+#define WARPSIZE 32
+
+/* __device__ int atomicAggInc(int *ctr, int warpLane) { */
+/* 	int mask = __ballot(1); */
+/* 	int leadThread = __ffs(mask)-1; */
+
+/* 	int res; */
+/* 	if ( warpLane == leadThread ) */
+/* 		res = atomicAdd(ctr, __popc(mask)); */
+
+/* 	res = warp_ */
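+
+/* A possible completion of the fragment above, following the linked NVIDIA */
+/* warp-aggregated-atomics post; a sketch only, with __shfl standing in for */
+/* the warp_bcast helper (which this patch does not define):                */
+/* __device__ int atomicAggInc(int *ctr, int warpLane) { */
+/* 	unsigned int mask = __ballot(1);        // lanes making this call    */
+/* 	int leadThread = __ffs(mask)-1;         // lowest active lane leads  */
+/* 	int res; */
+/* 	if ( warpLane == leadThread ) */
+/* 		res = atomicAdd(ctr, __popc(mask));  // reserve room for the warp  */
+/* 	res = __shfl(res, leadThread);          // broadcast the base index  */
+/* 	return res + __popc( mask & ((1u << warpLane) - 1) ); // lane's rank */
+/* } */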
+
+
+class localIntBuffer {
+public:
+	__device__ localIntBuffer(const int s, const int nt, const int c) :
+		size(s), numThreads(nt), subChunksPerBlock(c),
+		chunkSize(c*nt), numChunks(s/(c*nt)) {
+		// numChunks = size / chunkSize; /* assume 0 remainder */
+
+		buffer = (int*) malloc( size * sizeof(int) ); /* device-side malloc: global heap */
+
+		freeChunk = (bool*) malloc( numChunks * sizeof(bool) );
+		for (int i = 0; i < numChunks; i++) freeChunk[i] = true;
+		chunk = getNextChunk();
+		chunkOffset = 0;
+	}
+	__device__ ~localIntBuffer() {
+		free(buffer);
+		free(freeChunk);
+	}
+	__device__ void append( const int tid, const int val ) {
+		// RBTODO: handle a full chunk
+		buffer[chunk*chunkSize + chunkOffset + tid] = val;
+		chunkOffset += numThreads; // update offset past the values just written
+	}
+
+	__device__ void sendChunks( const int tid, int* globalDest, int& g_offset ) {
+		for (int i = 0; i < numChunks; i++) {
+			if (!freeChunk[i] && i != chunk) { // found a chunk to send
+				for (int c = 0; c < subChunksPerBlock; c++) {
+					globalDest[g_offset + tid] = buffer[i*chunkSize + c*numThreads + tid];
+					g_offset += numThreads;
+				}
+				// free chunk
+				freeChunk[i] = true;
+				// NOT DONE
+			}
+		}
+	}
+
+private:
+	__device__ int getNextChunk() {
+		for (int i = 0; i < numChunks; i++) {
+			if (freeChunk[i]) {
+				freeChunk[i] = false;
+				return i;
+			}
+		}
+		return -1; /* out of chunks */
+	}
+
+	const int size;
+	const int numThreads;
+	const int subChunksPerBlock;
+	const int chunkSize;
+	const int numChunks;
+	int* buffer;
+	bool* freeChunk;
+	int chunk;
+	int chunkOffset;
+};
+
+class sharedIntBuffer {
+public:
+	__device__ sharedIntBuffer(const int s, const int nt, const int c) :
+		size(s), numThreads(nt), subChunksPerBlock(c),
+		chunkSize(c*nt), numChunks(s/(c*nt)) {
+		// numChunks = size / chunkSize; /* assume 0 remainder */
+
+		buffer = (int*) malloc( size * sizeof(int) );
+
+		freeChunk = (bool*) malloc( numChunks * sizeof(bool) );
+		for (int i = 0; i < numChunks; i++) freeChunk[i] = true;
+		chunk = getNextChunk();
+		chunkOffset = 0;
+	}
+	__device__ ~sharedIntBuffer() {
+		free(buffer);
+		free(freeChunk);
+	}
+	__device__ void append( const int tid, const int val ) {
+		// RBTODO: handle a full chunk
+		buffer[chunk*chunkSize + chunkOffset + tid] = val;
+		chunkOffset += numThreads; // update offset past the values just written
+	}
+
+	__device__ void sendChunks( const int tid, int* globalDest, int& g_offset ) {
+		for (int i = 0; i < numChunks; i++) {
+			if (!freeChunk[i] && i != chunk) { // found a chunk to send
+				for (int c = 0; c < subChunksPerBlock; c++) {
+					globalDest[g_offset + tid] = buffer[i*chunkSize + c*numThreads + tid];
+					g_offset += numThreads;
+				}
+				freeChunk[i] = true; // mark the chunk free once sent
+			}
+		}
+	}
+
+private:
+	__device__ int getNextChunk() {
+		for (int i = 0; i < numChunks; i++) {
+			if (freeChunk[i]) {
+				freeChunk[i] = false;
+				return i;
+			}
+		}
+		return -1; /* out of chunks */
+	}
+
+	const int size;
+	const int numThreads;
+	const int subChunksPerBlock;
+	const int chunkSize;
+	const int numChunks;
+	int* buffer;
+	bool* freeChunk;
+	int chunk;
+	int chunkOffset;
+};
+
 __device__ inline void exclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
 	// 2) int n must be power of 2
--
GitLab
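A note on the pattern: the createPairlists hunk above does warp-aggregated stream compaction, where one lane per warp issues the atomicAdd and the remaining lanes derive their output slots from a ballot. Below is a minimal self-contained sketch of the same shape outside the pairlist machinery, assuming the pre-CUDA-9 warp intrinsics (__ballot, __shfl, __popc, __ffs) used throughout this patch; atomicAggInc and filterPositive are illustrative names, not identifiers from this codebase.

	#define WARPSIZE 32

	// One atomicAdd per warp instead of one per thread: the lowest active
	// lane reserves space for all active lanes, then each lane takes its rank.
	__device__ int atomicAggInc(int *ctr) {
		unsigned int active = __ballot(1);          // lanes that reached this call
		int leader = __ffs(active) - 1;             // lowest active lane leads
		int warpLane = threadIdx.x % WARPSIZE;
		int base;
		if (warpLane == leader)
			base = atomicAdd(ctr, __popc(active));  // reserve room for the warp
		base = __shfl(base, leader);                // broadcast the base index
		return base + __popc(active & ((1u << warpLane) - 1)); // rank among active lanes
	}

	// Usage: compact the positive entries of src into dst, counting in *counter.
	__global__ void filterPositive(int *dst, int *counter, const int *src, int n) {
		int i = blockIdx.x * blockDim.x + threadIdx.x;
		if (i < n && src[i] > 0)
			dst[atomicAggInc(counter)] = src[i];
	}

In createPairlists the vote is __ballot(valid) rather than __ballot(1), so every thread in the warp participates in the vote while only lanes holding a real pair (aj > ai) consume a slot and write their entry.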