diff --git a/ComputeForce.cu b/ComputeForce.cu
index 68468cb261726e491111ea42f2a8d53579c4a39b..10fc45a23cd81665ff7d5849175892f5922070dc 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -366,14 +366,33 @@ void ComputeForce::decompose(Vector3* pos) {
 	// Update pairlists using cell decomposition (not sure this is really needed or good) 
 	//RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d);	
 
-	
-		// initializePairlistArrays
+
+	size_t free, total;
+	cuMemGetInfo(&free,&total);
+	printf("Free memory: %zu / %zu\n", free, total);
+	// initializePairlistArrays
 	int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
 	int blocksPerCell = 10;
 	if (newDecomp) {
-		initializePairlistArrays<<< 1, 32 >>>(nCells*blocksPerCell);
+		// initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell);
+
+		const int nLists = blocksPerCell*nCells*100;
+		const size_t s = sizeof(int*) * nLists;
+		gpuErrchk(cudaMalloc(&pairListList_d, s));
+
+		// Stage the per-list device pointers in a heap-allocated array;
+		// nLists can be large enough to overflow a stack-allocated one.
+		int **tmpPairLists = new int*[nLists];
+		for (int i = 0; i < nLists; i++) {
+			gpuErrchk(cudaMalloc(&tmpPairLists[i], sizeof(int)*(1<<14)));
+		}
+		gpuErrchk(cudaMemcpy(pairListList_d, tmpPairLists,
+				sizeof(int*)*nLists, cudaMemcpyHostToDevice));
+		delete [] tmpPairLists;
 		gpuErrchk(cudaDeviceSynchronize());
 	}
+
+	cuMemGetInfo(&free,&total);
+	printf("Free memory: %zu / %zu\n", free, total);
+
 	const int NUMTHREADS = 128;
 	//const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
 	const size_t nBlocks = nCells*blocksPerCell;
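Note: the hunk above allocates pairListList_d plus one (1<<14)-int buffer per list, and the in-kernel mallocs further down still carry an "RBTODO: free later"; nothing in this patch releases any of it. A teardown matching this allocation pattern might look like the sketch below (cleanupPairlistArrays is a hypothetical helper, not part of the codebase; it reuses the project's gpuErrchk macro):

    #include <vector>

    // Copy the device-side pointer table back to the host, free each
    // per-list buffer, then free the table itself.
    static void cleanupPairlistArrays(int **pairListList_d, int nLists) {
        std::vector<int*> lists(nLists);
        gpuErrchk(cudaMemcpy(lists.data(), pairListList_d,
                             sizeof(int*) * nLists, cudaMemcpyDeviceToHost));
        for (int i = 0; i < nLists; i++)
            gpuErrchk(cudaFree(lists[i]));
        gpuErrchk(cudaFree(pairListList_d));
    }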
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index f7ca68358400ae2fe1068b16ee035b499bd79f84..1e2a6a36140e05b830e7d103b13418b0b3300b42 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -196,12 +196,13 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 
 
 // RBTODO: remove global device variables for fast prototyping of pairlist kernel
-// #define MAXPAIRS 25500*25500 // (num/numReplicas) * ((num/numReplicas)-1);
+#define MAXPAIRS (25500*25500) // (num/numReplicas) * ((num/numReplicas)-1);
 __device__ int* g_numPairs;
 __device__ int** g_pairI;
 __device__ int** g_pairJ;
 __device__ int g_nextPairlist;
-const __device__ int maxPairs = 1 << 12;
+__device__ int g_numPairlistArrays;
+const __device__ int maxPairs = 1 << 14;
 
 __global__
 void initializePairlistArrays(const int nLists) {
@@ -211,25 +212,58 @@ void initializePairlistArrays(const int nLists) {
 	
 	// RBTODO: free later
 	if (tid == 0) {
-		printf("Initializing device pairlists for %d cells\n", nLists);
-		g_numPairs = (int*) malloc( nLists * sizeof(int) );  
+		printf("Initializing %d device pairlists.\n", nLists);
+		g_numPairs = (int*) malloc( nLists * sizeof(int) );
 		g_pairI = (int**) malloc( nLists * sizeof(int*));
 		g_pairJ = (int**) malloc( nLists * sizeof(int*));
 		g_nextPairlist = 0;
-	}		
-	__syncthreads();	
-	assert( g_numPairs != NULL );
-	assert( g_pairI != NULL );
-	assert( g_pairJ != NULL );
+		g_numPairlistArrays = nLists;
+
+		assert( g_numPairs != NULL );
+		assert( g_pairI != NULL );
+		assert( g_pairJ != NULL );
 	
-	for (int i = tid; i < nLists; i += blockDim.x) {
-		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
-		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
-		g_numPairs[i] = 0;
-		assert( g_pairI[i] != NULL );
-		assert( g_pairJ[i] != NULL );
+		for (int i = 0; i < nLists; i++) {
+			g_pairI[i] = (int*) malloc( maxPairs * sizeof(int));
+			g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int));
+			g_numPairs[i] = 0;
+			assert( g_pairI[i] != NULL );
+			assert( g_pairJ[i] != NULL );
+		}
 	}
 }
+/* __global__ */
+/* void initializePairlistArrays(const int nLists) { */
+/* 	const int tid = threadIdx.x; */
+/* //	const int maxPairs = 1 << 12;	/\* ~120,000 per cell *\/ */
+/* 	if (blockIdx.x > 0) return; */
+	
+/* 	// RBTODO: free later */
+/* 	if (tid == 0) { */
+/* 		printf("Initializing %d device pairlists.\n", nLists); */
+/* 		g_numPairs = (int*) malloc( nLists * sizeof(int) ); */
+/* 		g_pairI = (int**) malloc( nLists * sizeof(int*)); */
+/* 		g_pairJ = (int**) malloc( nLists * sizeof(int*)); */
+/* 		g_nextPairlist = 0; */
+/* 		g_numPairlistArrays = nLists; */
+/* 	} */
+/* 	__syncthreads(); */
+/* 	assert( g_numPairs != NULL ); */
+/* 	assert( g_pairI != NULL ); */
+/* 	assert( g_pairJ != NULL ); */
+	
+/* 	for (int i = tid; i < nLists; i += blockDim.x) { */
+/* 		g_pairI[i] = (int*) malloc( maxPairs * sizeof(int)); */
+/* 		g_pairJ[i] = (int*) malloc( maxPairs * sizeof(int)); */
+/* 		g_numPairs[i] = 0; */
+/* 	} */
+
+/* 	__syncthreads(); */
+/* 	for (int i = tid; i < nLists; i += blockDim.x) { */
+/* 		assert( g_pairI[i] != NULL ); */
+/* 		assert( g_pairJ[i] != NULL ); */
+/* 	} */
+/* } */
 
 __global__
 void createPairlists(Vector3 pos[], int num, int numReplicas,
@@ -247,12 +281,8 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 	const int wid = tid/WARPSIZE;
 	const int blockLane = bid % blocksPerCell;
 
-	__shared__ int pid[NUMTHREADS/WARPSIZE];
-	if (warpLane == 0) pid[wid] = 0;
-
-	/* if (warpLane == 0) */
-	/* 	pid = atomicAdd( &g_nextPairlist, 1 ); */
-	/* res = warp_bcast(res,leader); */
+	volatile __shared__ int pid[NUMTHREADS/WARPSIZE];
+	if (warpLane == 0) pid[wid] = bid;
 
 	if (cID >= nCells) return;
 	int count = 0;								/* debug */
@@ -285,40 +315,23 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 							if (aj <= ai) continue;
 							// RBTODO: skip exclusions
 
-							/* int gid; */
-							/* { // per-warp atomic add to get global inices */
-							/* 	int t_active = __ballot(1); */
-							/* 	int leader = __ffs(t_active)-1; */
-							/* 	int res; */
-
-							/* 	// RBTODO: see if performance improves with __any(gid >= maxPairs  */
-							/* 	if ( warpLane == leader ) { */
-							/* 		const int t_count = __popc(t_active); */
-							/* 		res = atomicAdd( &g_numPairs[pid], t_count ); */
-							/* 		if ( res + t_count >= maxPairs ) { // went too far; mark invalid and go again */
-							/* 			int tmp = atomicSub( &g_numPairs[pid], t_count ); */
-							/* 			assert( tmp == res + t_count ); */
-							/* 			pid++; */
-							/* 			res = atomicAdd( &g_numPairs[pid], t_count ); */
-							/* 		} */
-							/* 	} */
-							/* 	pid = warp_bcast(pid,leader); */
-							/* 	res = warp_bcast(res,leader); */
-							/* 	gid = res + __popc( t_active & ((1 << warpLane) - 1) ); */
-							/* } */
-							int gid = atomicAggInc( &g_numPairs[pid], warpLane ); // fails
-							if (__any(gid >= maxPairs)) { // a little inefficient, but important
-								g_pairI[pid][gid] = -1;
-								g_pairJ[pid][gid] = -1;
-								pid++;					/* needs to apply to ALL warp threads */
+							int gid = atomicAggInc( pid[wid], &g_numPairs[pid[wid]], warpLane ); // fails
+							while (__any(gid >= maxPairs)) { // does any thread in the warp have too large an index?
+								if (gid < maxPairs) {
+									g_pairI[pid[wid]][gid] = -1;
+									g_pairJ[pid[wid]][gid] = -1;
+								}
+
+								// Have 'leader' thread in warp increment counter
+								if ( warpLane + 1 == __ffs(__ballot(1)) )	pid[wid]++;
+								
-								// we assume arrays at pid are nearly empty (no while loop)  
-								gid = atomicAggInc( &g_numPairs[pid], warpLane ); /* assume this hasn't filled */
+								// retry, assuming the arrays at the new pid are nearly empty
+								gid = atomicAggInc( pid[wid], &g_numPairs[pid[wid]], warpLane );
 							}
 							
-							
 							// int wid = atomicAdd( &g_numPairs[pid], 1 ); // works
-							g_pairI[pid][gid] = ai;
-							g_pairJ[pid][gid] = aj;
+							g_pairI[pid[wid]][gid] = ai;
+							g_pairJ[pid[wid]][gid] = aj;
 						} 	// atoms J
 					} 		// z				
 				} 			// y
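Note: when a warp spills to the next pairlist above, the slots it had already claimed in the full list are marked with -1, and g_numPairs for that list is left larger than maxPairs (the aggregated atomic bumps the counter before the spill is detected). Any kernel that later walks these lists therefore has to clamp the count and skip the sentinels; a minimal sketch under those assumptions (processPairlists and computePair are hypothetical names, not part of this patch):

    // One block per pairlist; threads stride over its entries.
    __global__ void processPairlists(int nLists) {
        const int listId = blockIdx.x;
        if (listId >= nLists) return;
        const int n = min(g_numPairs[listId], maxPairs); // counter may overshoot capacity
        for (int k = threadIdx.x; k < n; k += blockDim.x) {
            const int ai = g_pairI[listId][k];
            const int aj = g_pairJ[listId][k];
            if (ai < 0 || aj < 0) continue;  // slot abandoned during a spill
            // computePair(ai, aj);          // hypothetical per-pair work
        }
    }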
diff --git a/ComputeForce.h b/ComputeForce.h
index c9f6fcb57fd1ea9fde2ea1cffcb74228938d2e4f..a0abaace11418f42b30dad91ef52469f5b653d01 100644
--- a/ComputeForce.h
+++ b/ComputeForce.h
@@ -112,9 +112,10 @@ private:
 	TabulatedDihedralPotential **tableDihedral_d, **tableDihedral_addr;
 
 	// Pairlists
-	int *pairIds_d;
-	int numPairs;
-	int numPairs_d;	
+	int **pairListList_d;
+	int **pairLists_d;
+
+	int *numPairs_d;	
 };
 
 #endif
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
index 70d7e78e1bdc276a1cb3297d77f978116d6d5b2a..84067175209b3c1ec2d71e8561933798fa69e110 100644
--- a/CudaUtil.cuh
+++ b/CudaUtil.cuh
@@ -3,16 +3,21 @@
 #define WARPSIZE 32
 
 __device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
-__device__ int atomicAggInc(int *ctr, int warpLane) {
+__device__ int atomicAggInc(int tmpidx, int *ctr, int warpLane) {
 	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
 	int mask = __ballot(1);
 	int leader = __ffs(mask)-1;
 
+	int tmp4 = tmpidx;
 	int res;
 	if ( warpLane == leader )
 		res = atomicAdd(ctr, __popc(mask));
 	res = warp_bcast(res,leader);
+
 	
+	int tmp2 = warpLane;					/* ditto */
+	int* tmp = ctr;								/* avoid optimizing */
+	int tmp3 = tmpidx;
 	return res + __popc( mask & ((1 << warpLane) - 1) );
 }
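Note: atomicAggInc implements the warp-aggregated atomics pattern from the NVIDIA post linked above; the new first argument is only a debugging aid and does not affect the result. A usage sketch with the three-argument form (filterPositive, in, out, and outCount are hypothetical names, not part of the codebase):

    // Stream-compaction style use: each surviving thread gets a unique slot
    // in out[] with one atomicAdd per warp rather than one per lane.
    __global__ void filterPositive(const int *in, int *out, int *outCount, int n) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n || in[i] <= 0) return;  // inactive lanes leave before the ballot
        const int warpLane = threadIdx.x % WARPSIZE;
        const int slot = atomicAggInc(0 /* unused debug index */, outCount, warpLane);
        out[slot] = in[i];
    }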
 
diff --git a/makefile b/makefile
index 435923b2eb045884ce780d21b067110adbf94231..015abdf56a0d4c1b94865998e27c0b72998cc827 100644
--- a/makefile
+++ b/makefile
@@ -49,10 +49,11 @@ CODE_20 := -arch=sm_20
 # NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35)
 NV_FLAGS += -arch=sm_35
 
 NVLD_FLAGS := $(NV_FLAGS) --device-link
 # NV_FLAGS += -rdc=true
 
 LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY)
+# cuMemGetInfo is a CUDA driver-API call, so link the driver library too
+LD_FLAGS += -lcuda
 
 
 ### Sources
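Note: -lcuda is required because cuMemGetInfo in ComputeForce.cu is a CUDA driver-API call. If linking the driver library ever becomes undesirable, the runtime API provides an equivalent query that needs only the already-linked -lcudart (an alternative, not what this patch does):

    size_t free, total;
    cudaMemGetInfo(&free, &total);  // runtime-API counterpart of cuMemGetInfo
    printf("Free memory: %zu / %zu\n", free, total);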