Commit 8c67cfb9 authored by cmaffeo2

pairlist kernel being rewritten; amend this broken commit

parent 8733d418
@@ -369,17 +369,18 @@ void ComputeForce::decompose(Vector3* pos) {
 	// initializePairlistArrays
 	int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
+	int blocksPerCell = 10;
 	if (newDecomp) {
-		initializePairlistArrays<<< 1, 32 >>>(nCells);
+		initializePairlistArrays<<< 1, 32 >>>(nCells*blocksPerCell);
 		gpuErrchk(cudaDeviceSynchronize());
 	}
 	const int NUMTHREADS = 128;
 	//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
-	const size_t nBlocks = nCells;
+	const size_t nBlocks = nCells*blocksPerCell;
 	/* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d, decomp_d); */
 	/* gpuErrchk(cudaDeviceSynchronize()); */
-	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells);
+	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d, nCells, blocksPerCell);
 	gpuErrchk(cudaDeviceSynchronize());
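The launch-geometry change above fans each cell out over blocksPerCell blocks rather than one block per cell. A minimal sketch of the resulting block-to-cell mapping, reusing the names the kernel below uses (sketch only, not part of the commit):

	// Each cell owns blocksPerCell consecutive blocks; a block recovers its
	// cell and its lane within the cell from blockIdx.x alone.
	__global__ void blockMappingSketch(const int nCells, const int blocksPerCell) {
		const int bid = blockIdx.x;                // 0 .. nCells*blocksPerCell - 1
		const int cID = bid / blocksPerCell;       // which cell this block serves
		const int blockLane = bid % blocksPerCell; // which of the cell's blocks this is
		if (cID >= nCells) return;
		// each lane then strides the cell's atoms: ci = first + blockLane, then += blocksPerCell
	}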
@@ -6,7 +6,6 @@
 #include "CudaUtil.cuh"
 #include <assert.h>

 __host__ __device__
 EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha,
 		float start, float len) {
@@ -201,27 +200,29 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 __device__ int* g_numPairs;
 __device__ int** g_pairI;
 __device__ int** g_pairJ;
+__device__ bool g_pairsSet = false;
+__device__ int g_nextPairlist;
+const __device__ int maxPairs = 1 << 12;

 __global__
-void initializePairlistArrays(const int nCells) {
+void initializePairlistArrays(const int nLists) {
 	const int tid = threadIdx.x;
-	const int maxPairs = 1 << 16; /* ~60,000 per cell */
-	// const int maxPairs = 1 << 12; /* ~120,000 per cell */
 	if (blockIdx.x > 0) return;

 	// RBTODO: free later
 	if (tid == 0) {
-		printf("Initializing device pairlists for %d cells\n", nCells);
-		g_numPairs = (int*) malloc( nCells * sizeof(int) );
-		g_pairI = (int**) malloc( nCells * sizeof(int*));
-		g_pairJ = (int**) malloc( nCells * sizeof(int*));
+		printf("Initializing device pairlists for %d cells\n", nLists);
+		g_numPairs = (int*) malloc( nLists * sizeof(int) );
+		g_pairI = (int**) malloc( nLists * sizeof(int*));
+		g_pairJ = (int**) malloc( nLists * sizeof(int*));
+		g_nextPairlist = 0;
 	}
 	__syncthreads();

 	assert( g_numPairs != NULL );
 	assert( g_pairI != NULL );
 	assert( g_pairJ != NULL );

-	for (int i = tid; i < nCells; i += blockDim.x) {
+	for (int i = tid; i < nLists; i += blockDim.x) {
 		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
 		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
 		g_numPairs[i] = 0;
@@ -232,20 +233,27 @@ void initializePairlistArrays(const int nCells) {
 __global__
 void createPairlists(Vector3 pos[], int num, int numReplicas,
-		BaseGrid* sys, CellDecomposition* decomp, int nCells) {
+		BaseGrid* sys, CellDecomposition* decomp, const int nCells, const int blocksPerCell) {
 	// Loop over threads searching for atom pairs
 	//   Each thread has designated values in shared memory as a buffer
 	//   A sync operation periodically moves data from shared to global
+	const int NUMTHREADS = 128; /* RBTODO: fix */
 	const int tid = threadIdx.x;
-	const int cID = blockIdx.x;
-	// * blockDim.x + threadIdx.x;
+	const int bid = blockIdx.x;
+	// const int cID = blockIdx.x * blockDim.x + threadIdx.x; /* atom index 0 */
+	const int cID = bid / blocksPerCell;
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
+	const int wid = tid/WARPSIZE;
+	const int blockLane = bid % blocksPerCell;

+	__shared__ int pid[NUMTHREADS/WARPSIZE];
+	if (warpLane == 0) pid[wid] = 0;
+	/* if (warpLane == 0) */
+	/* 	pid = atomicAdd( &g_nextPairlist, 1 ); */
+	/* res = warp_bcast(res,leader); */

 	/* const int3& nCells = decomp->nCells; */
 	if (cID >= nCells) return;
 	int count = 0; /* debug */
 	const CellDecomposition::cell_t* pairs = decomp->getCells();
@@ -255,7 +263,7 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 	if (tid == 0) printf(" Cell%d: Working on %d atoms for repID %d\n",
 		cID, rangeI.last - rangeI.first, repID);

-	for (int ci = rangeI.first; ci < rangeI.last; ci++) {
+	for (int ci = rangeI.first + blockLane; ci < rangeI.last; ci += blocksPerCell) {
 		// ai - index of the particle in the original, unsorted array
 		const int ai = pairs[ci].particle;
 		// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
@@ -276,11 +284,41 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 				const int aj = pairs[n].particle;
 				if (aj <= ai) continue;
 				// RBTODO: skip exclusions
-				int wid = atomicAggInc( &g_numPairs[cID], warpLane ); // fails
-				// int wid = atomicAdd( &g_numPairs[cID], 1 ); // works
-				g_pairI[cID][wid] = ai;
-				g_pairJ[cID][wid] = aj;
+				/* int gid; */
+				/* { // per-warp atomic add to get global indices */
+				/* 	int t_active = __ballot(1); */
+				/* 	int leader = __ffs(t_active)-1; */
+				/* 	int res; */
+				/* 	// RBTODO: see if performance improves with __any(gid >= maxPairs) */
+				/* 	if ( warpLane == leader ) { */
+				/* 		const int t_count = __popc(t_active); */
+				/* 		res = atomicAdd( &g_numPairs[pid], t_count ); */
+				/* 		if ( res + t_count >= maxPairs ) { // went too far; mark invalid and go again */
+				/* 			int tmp = atomicSub( &g_numPairs[pid], t_count ); */
+				/* 			assert( tmp == res + t_count ); */
+				/* 			pid++; */
+				/* 			res = atomicAdd( &g_numPairs[pid], t_count ); */
+				/* 		} */
+				/* 	} */
+				/* 	pid = warp_bcast(pid,leader); */
+				/* 	res = warp_bcast(res,leader); */
+				/* 	gid = res + __popc( t_active & ((1 << warpLane) - 1) ); */
+				/* } */
+				int gid = atomicAggInc( &g_numPairs[pid], warpLane ); // fails
+				if (__any(gid >= maxPairs)) { // a little inefficient, but important
+					g_pairI[pid][gid] = -1;
+					g_pairJ[pid][gid] = -1;
+					pid++; /* needs to apply to ALL warp threads */
+					// we assume arrays at pid are nearly empty (no while loop)
+					gid = atomicAggInc( &g_numPairs[pid], warpLane ); /* assume this hasn't filled */
+				}
+				// int gid = atomicAdd( &g_numPairs[pid], 1 ); // works
+				g_pairI[pid][gid] = ai;
+				g_pairJ[pid][gid] = aj;
 			} // atoms J
 		} // z
 	} // y
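The overflow path above stores -1 sentinels in the slots that ran past maxPairs before moving the warp to the next list. Presumably whatever later walks g_pairI/g_pairJ has to tolerate those entries; a hypothetical consumer loop (sketch with assumed names, not from this commit):

	__global__ void consumePairsSketch(const int list) {
		const int n = g_numPairs[list];
		for (int p = threadIdx.x + blockIdx.x*blockDim.x; p < n; p += blockDim.x*gridDim.x) {
			const int ai = g_pairI[list][p];
			const int aj = g_pairJ[list][p];
			if (ai < 0) continue; // -1 sentinel left by an overflowed warp write
			// ... compute the (ai, aj) interaction here
		}
	}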
@@ -15,7 +15,7 @@ __device__ int atomicAggInc(int *ctr, int warpLane) {
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }

 __device__ inline void exclIntCumSum(int* in, const int n) {
 	// 1) int* in must point to shared memory
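For reference, the warp-aggregated increment that atomicAggInc implements (and that the commented-out block in createPairlists spells out long-hand) can be written self-contained with the pre-Volta intrinsics this code already uses; __shfl stands in here for the repo's warp_bcast helper, so treat this as a sketch rather than the repo's exact code:

	__device__ int warpAggIncSketch(int* ctr, int warpLane) {
		unsigned int active = __ballot(1);        // lanes currently executing this call
		int leader = __ffs(active) - 1;           // lowest active lane issues the atomic
		int res;
		if (warpLane == leader)
			res = atomicAdd(ctr, __popc(active)); // one atomicAdd for the whole warp
		res = __shfl(res, leader);                // broadcast the base index to the warp
		return res + __popc(active & ((1u << warpLane) - 1)); // base + rank among active lanes
	}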
@@ -11,7 +11,7 @@ include ./findcudalib.mk
 INCLUDE = $(CUDA_PATH)/include

-# DEBUG = -g -O0
+DEBUG = -g -O0
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
 # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
 NV_FLAGS = -Xptxas -v # -v,-abi=no