diff --git a/CellDecomposition.cu b/CellDecomposition.cu
index 300b19fe187ee680d1f6d5272253e1db471f18d7..07a7b028c8f1404aa94f7bf2e4e84615788bf22c 100644
--- a/CellDecomposition.cu
+++ b/CellDecomposition.cu
@@ -113,6 +113,8 @@ void CellDecomposition::decompose_d(Vector3 pos_d[], size_t num) {
 																								numCells, numReplicas);
 	gpuErrchk(cudaMemcpy(ranges, ranges_d, numCellRep, cudaMemcpyDeviceToHost));
 	gpuErrchk( cudaFree(temp_ranges) );
+
+
 }
 
 // *****************************************************************************
diff --git a/ComputeForce.cu b/ComputeForce.cu
index 309845a60bfba3bd4dbc10770b32dd3700e81043..0569f6c20dd461a584fef06f57982fd4df6603dd 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -356,6 +356,15 @@ void ComputeForce::decompose(Vector3* pos) {
 		cudaFree(decomp_d);
 	decomp.decompose_d(pos, num);
 	decomp_d = decomp.copyToCUDA();
+
+	// Update pairlists using cell decomposition (not sure this is really needed or good) 
+	//RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d);
+	const int NUMTHREADS = 32;
+	const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1;
+	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, sys_d, decomp_d);
+	gpuErrchk(cudaDeviceSynchronize());
+	
+	
 }
 
 IndexList ComputeForce::decompDim() const {
@@ -475,10 +484,12 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type,
 	dim3 numBlocks(gridSize, 1, 1);
 	dim3 numThreads(NUM_THREADS, 1, 1);
 
+	
 	// Call the kernel to calculate the forces
 	computeTabulatedKernel<<< numBlocks, numThreads >>>(force, pos, type,
-			tablePot_d, tableBond_d, num, numParts, sys_d, bonds, bondMap,
-			numBonds, excludes, excludeMap, numExcludes, decomp_d,
+			tablePot_d, tableBond_d,
+			num, numParts, sys_d,
+			bonds, bondMap,	numBonds, 
 			energies_d, cutoff2, gridSize, numReplicas, get_energy);
 	gpuErrchk(cudaDeviceSynchronize());
 
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 1f1e1c5ceabea867442c3739a5fdd52c51195841..c954f005dc00f8d444c8b9d8fd824e8ece43eeb5 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -3,25 +3,21 @@
 // Terrance Howard <heyterrance@gmail.com>
 
 #pragma once
+#include "CudaUtil.cuh"
+#include <assert.h>
+
 
 __host__ __device__
 EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha,
 																			 float start, float len) {
 	float d = r.length();
-
+	
 	if (d >= start + len)
 		return EnergyForce();
 	if (d <= start) {
 		float energy = alpha/d - alpha/start + 0.5f*alpha/(start*start)*len;
 		Vector3 force = -alpha/(d*d*d)*r;
 
-		if (isnan(force.x))
-			printf(">> DamnX.\n");
-		if (isnan(force.y))
-			printf(">> DamnY.\n");
-		if (isnan(force.z))
-			printf(">> DamnZ.\n");
-
 		return EnergyForce(energy, force);
 	}
 
@@ -199,6 +195,86 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 	}
 }
 
+
+// RBTODO: remove global device variables for fast prototyping of pairlist kernel
+__device__ int g_numPairs = 0;
+__device__ int* g_pairI;
+__device__ int* g_pairJ;
+
+__global__
+void createPairlists(Vector3 pos[], int num, int numReplicas,
+										 BaseGrid* sys, CellDecomposition* decomp) {
+	  // Loop over threads searching for atom pairs
+  //   Each thread has designated values in shared memory as a buffer
+  //   A sync operation periodically moves data from shared to global
+	const int NUMTHREADS = 32;		/* RBTODO: fix */
+	__shared__ int spid[NUMTHREADS];
+	const int tid = threadIdx.x;
+	const int ai = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if (tid == 0) g_numPairs = 0;	/* RBTODO: be more efficient */
+	__syncthreads(); 
+	
+	int numPairs = g_numPairs; 
+
+	const int MAXPAIRSPERTHREAD = 256; /* RBTODO Hard limit likely too low */
+	int pairI[MAXPAIRSPERTHREAD];
+	int pairJ[MAXPAIRSPERTHREAD];
+	int pid = 0;
+	
+	// ai - index of the particle in the original, unsorted array
+	if (ai < (num-1) * numReplicas) {
+		const int repID = ai / (num-1);
+		// const int typei = type[i]; // maybe irrelevent
+		const Vector3 posi = pos[ai];
+		
+		// RBTODO: if this approach works well, make cell decomposition finer so limit still works
+
+		// TODO: Fix this: Find correct celli (add a new function to
+		//       CellDecomposition, binary search over cells)
+		CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
+		const CellDecomposition::cell_t* pairs = decomp->getCells();
+		for (int x = -1; x <= 1; ++x) {
+			for (int y = -1; y <= 1; ++y) {
+				for (int z = -1; z <= 1; ++z) {					
+					if (x+y+z != -3) { // SYNC THREADS
+						__syncthreads();				
+						assert(pid < MAXPAIRSPERTHREAD);
+
+						// find where each thread should put its pairs in global list
+						spid[tid] = pid;
+						exclIntCumSum(spid,NUMTHREADS); // returns cumsum with spid[0] = 0
+
+						for (int d = 0; d < pid; d++) {
+							g_pairI[g_numPairs + d + spid[tid]] = pairI[d];
+							g_pairJ[g_numPairs + d + spid[tid]] = pairJ[d];
+						}
+						// update global index
+						__syncthreads();
+						if (tid == NUMTHREADS) g_numPairs += spid[tid] + pid;
+						pid = 0;
+					} // END SYNC THREADS 
+	
+					const int nID = decomp->getNeighborID(celli, x, y, z);				
+					if (nID < 0) continue; // Skip if got wrong or duplicate cell.
+					const CellDecomposition::range_t range = decomp->getRange(nID, repID);
+					
+					for (int n = range.first; n < range.last; n++) {
+						const int aj = pairs[n].particle;
+
+						// RBTODO: skip exclusions
+						if (aj <= ai) continue;
+
+						pairI[pid] = ai;
+						pairJ[pid] = aj;
+						pid++;							/* RBTODO synchronize across threads somehow */
+					} 	// n
+				} 		// z				
+			} 			// y
+		} 				// x
+	}						/* replicas */
+}
+	
 __global__
 void computeKernel(Vector3 force[], Vector3 pos[], int type[],
 									 float tableAlpha[], float tableEps[], float tableRad6[],
@@ -272,7 +348,7 @@ void computeKernel(Vector3 force[], Vector3 pos[], int type[],
 }
 
 
-// ===================================================================================
+// ============================================================================
 // Kernel computes forces between Brownian particles (ions)
 // using cell decomposition
 //
@@ -280,142 +356,46 @@ __global__ void computeTabulatedKernel(Vector3* force, Vector3* pos, int* type,
 		TabulatedPotential** tablePot, TabulatedPotential** tableBond,
 		int num, int numParts, BaseGrid* sys,
 		Bond* bonds, int2* bondMap, int numBonds,
-		Exclude* excludes, int2* excludeMap, int numExcludes,
-		CellDecomposition* decomp, float* g_energies, float cutoff2, int gridSize,
+		float* g_energies, float cutoff2, int gridSize,
 		int numReplicas, bool get_energy) {
+	
+	const int NUMTHREADS = 32;		/* RBTODO: fix */
+	__shared__ EnergyForce fe[NUMTHREADS];
+	__shared__ int atomI[NUMTHREADS];
+	__shared__ int atomJ[NUMTHREADS];
 
-	// Thread's unique ID.
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-
-	// Initialize interaction energy (per particle)
-	float energy_local = 0;
+	int numPairs = g_numPairs; 
 
-	// Loop over ALL particles in ALL replicas
-	if (i < num * numReplicas) {
-		const int repID = i / num;
+	const int tid = threadIdx.x;
+	const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 
+	// loop over particle pairs in pairlist
+	if (gid < numPairs) {
 		// BONDS: RBTODO: handle through an independent kernel call
-		/* Each particle may have a varying number of bonds
-		 * bondMap is an array with one element for each particle
-		 * which keeps track of where a particle's bonds are stored
-		 * in the bonds array.
-		 * bondMap[i].x is the index in the bonds array where the ith particle's bonds begin
-		 * bondMap[i].y is the index in the bonds array where the ith particle's bonds end
-		 */
-
-		// Initialize bond_start and bond_end to -1
-		// If these are not changed later, then we know this particle does not have any bonds
-		/* int bond_start = -1; */
-		/* int bond_end = -1; */
-
-		/* // Get bond_start and bond_end from the bondMap */
-		/* if (bondMap != NULL) { */
-		/* 	bond_start = bondMap[i - repID * num].x; */
-		/* 	bond_end = bondMap[i - repID * num].y; */
-		/* } */
-
-		/* // currBond is the index in the bonds array that we should look at next */
-		/* // currBond is initialized to bond_start because that is the first index of the */
-		/* // bonds array where this particle's bonds are stored */
-		/* int currBond = bond_start; */
-
-		/* // nextBond is the ID number of the next particle that this particle is bonded to */
-		/* // If this particle has at least one bond, then nextBond is initialized to be the */
-	 	/* // first particle that this particle is bonded to */
-		/* int nextBond = (bond_start >= 0) ? bonds[bond_start].ind2 : -1; */
-
-		
 		// RBTODO: handle exclusions in other kernel call
+		const int ai = g_pairI[tid];
+		const int aj = g_pairJ[tid];
 		
-		/* // Same as for bonds, but for exclusions now */
-		/* int ex_start = -1; */
-		/* int ex_end = -1; */
-		/* if (excludeMap != NULL) { */
-		/* 	ex_start = excludeMap[i].x; */
-		/* 	ex_end = excludeMap[i].y; */
-		/* } */
-		/* int currEx = ex_start; */
-		/* int nextEx = (ex_start >= 0) ? excludes[ex_start].ind2 : -1; */
-
 		// Particle's type and position
-    int typei = type[i];
-		Vector3 posi = pos[i];
-		const CellDecomposition::cell_t celli = decomp->getCellForParticle(i);
-
-    // Initialize force_local - force on a particle (i)
-		Vector3 force_local(0.0f);
-		const CellDecomposition::cell_t* cells = decomp->getCells();
-
-		// Loop over neighboring cells
-		for (int x = -1; x <= 1; ++x) {
-			for (int y = -1; y <= 1; ++y) {
-				for (int z = -1; z <= 1; ++z) {
-          // Get ID of the neighboring cell (x, y, z)
-					const int nID = decomp->getNeighborID(celli, x, y, z);
-
-					// Skip if bad neighbor.
-					if (nID == -1) continue;
+		const int ind = type[ai] + type[aj] * numParts; /* RBTODO: why is numParts here? */
 
-					const CellDecomposition::range_t range = decomp->getRange(nID, repID);
+		// RBTODO: implement wrapDiff2, returns dr2
+		const Vector3 dr = sys->wrapDiff(pos[aj] - pos[ai]);
 
-					int typej = -1;
-					int ind = -1;
-					for (int n = range.first; n < range.last; ++n) {
-						const int j = cells[n].particle;
-						if (j == i) continue;
+    // Calculate the force based on the tabulatedPotential file
+		fe[tid] = EnergyForce(0.0f, Vector3(0.0f));
+		if (tablePot[ind] != NULL && dr.length2() <= cutoff2)
+			fe[tid] = tablePot[ind]->compute(dr);
 
-						const int newj = type[j];
-						if (typej != newj) {
-							typej = newj;
-							ind = typei + typej * numParts;
-						}
-
-						// Find the distance between particles i and j,
-						// wrapping this value if necessary
-						const Vector3 dr = sys->wrapDiff(pos[j] - posi);
+		// RBTODO: think of a better approach; e.g. shared atomicAdds that are later reduced in global memory
+		atomicAdd( &force[ai], fe[tid].f );
+		atomicAdd( &force[aj], -fe[tid].f );
 
-						// First, calculate the force based on the tabulatedPotential file
-						EnergyForce ft(0.0f, Vector3(0.0f));
-						/* bool exclude = (nextEx == j); */
-						/* if (exclude) */
-						/* 	nextEx = (currEx < ex_end - 1) ? excludes[++currEx].ind2 : -1; */
-
-						if (tablePot[ind] != NULL && dr.length2() <= cutoff2)
-								ft = tablePot[ind]->compute(dr);
-
-						/* // If the next bond we want is the same as j, then there is a bond */
-						/* // between particles i and j. */
-						/* if (nextBond == j && tableBond != NULL) { */
-
-						/* 	// Calculate the force on the particle from the bond */
-						/* 	// If the user has specified the REPLACE option for this bond, */
-						/* 	// then overwrite the force we calculated from the regular */
-						/* 	// tabulated potential */
-						/* 	// If the user has specified the ADD option, then add the bond */
-						/* 	// force to the tabulated potential value */
-						/* 	EnergyForce bond_ef = */
-						/* 			tableBond[ bonds[currBond].tabFileIndex ]->compute(dr); */
-						/* 	switch (bonds[currBond].flag) { */
-						/* 		case Bond::REPLACE: ft = bond_ef; break; */
-						/* 		case Bond::ADD: ft += bond_ef; break; */
-						/* 		default: break; */
-						/* 	} */
-
-						/* 	// Increment currBond, so that we can find the index of the */
-						/* 	// next particle that this particle is bonded to. */
-						/* 	nextBond = (currBond < bond_end - 1) ? bonds[++currBond].ind2 : -1; */
-						/* } */
-						force_local += ft.f;
-						if (get_energy && j > i)
-							energy_local += ft.e;
-					}		// n
-				}			// z
-			}				// y
-		}					// z
-		force[i] = force_local;
-		if (get_energy)
-			g_energies[i] = energy_local;
+		// RBTODO: why are energies calculated for each atom? Could be reduced
+		if (get_energy && aj > ai)
+			atomicAdd( &(g_energies[ai]), fe[tid].e );		
 	}
+	
 }
 
 
diff --git a/ComputeForce.h b/ComputeForce.h
index 38fdf36d4887921b36471068de7aed0f9d5258e1..c9f6fcb57fd1ea9fde2ea1cffcb74228938d2e4f 100644
--- a/ComputeForce.h
+++ b/ComputeForce.h
@@ -111,6 +111,10 @@ private:
 	TabulatedAnglePotential **tableAngle_d, **tableAngle_addr;
 	TabulatedDihedralPotential **tableDihedral_d, **tableDihedral_addr;
 
+	// Pairlists
+	int *pairIds_d;
+	int numPairs;
+	int numPairs_d;	
 };
 
 #endif
diff --git a/CudaUtil.cuh b/CudaUtil.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f44034f68ef35acc7651bca5932dc88fc536ba03
--- /dev/null
+++ b/CudaUtil.cuh
@@ -0,0 +1,76 @@
+#pragma once
+
+__device__ inline void exclIntCumSum(int* in, const int n) {
+	// 1) int* in must point to shared memory
+	// 2) int n must be power of 2
+	const int tid = threadIdx.x;
+	// RBTODO: worry about possible bank conflicts http://www.eecs.umich.edu/courses/eecs570/hw/parprefix.pdf
+	
+	// build tree of sums
+	int stride = 1;
+	for (int d = n>>1; d > 0; d >>= 1) {
+		__syncthreads();
+		if (tid < d) {
+			int id = 2*stride*(tid+1)-1;
+			in[id] += in[id-stride];
+		}
+		stride *= 2;
+	}
+	if (tid == 0) in[n-1] = 0;		/* exclusive cumsum (starts at 0) */
+
+	// traverse down tree and build 'scan'
+	for (int d = 1; d < n; d*= 2) {
+		stride >>= 1;
+		__syncthreads();
+
+		if (tid < d) { // RBTODO: this could be incorrect ==> test
+			int id = 2*stride*(tid+1)-1;
+			int t = in[id];
+			in[id] += in[id-stride];
+			in[id-stride] = t;
+		}
+	}
+	__syncthreads();
+}
+
+
+__device__ inline void inclIntCumSum(int* in, const int n) {
+	// 1) int* in must point to shared memory
+	// 2) int n must be power of 2
+	const int tid = threadIdx.x;
+	
+	// RBTODO: worry about possible bank conflicts http://www.eecs.umich.edu/courses/eecs570/hw/parprefix.pdf
+	
+	// build tree of sums
+	int stride = 1;
+	for (int d = n>>1; d > 0; d >>= 1) {
+		__syncthreads();
+		if (tid < d) {
+			int id = 2*stride*(tid+1)-1;
+			in[id] += in[id-stride];
+		}
+		stride *= 2;
+	}
+	// if (tid == 0) in[n-1] = 0;		/* exclusive cumsum (starts at 0) */
+
+	// traverse down tree and build 'scan'
+	for (int d = 1; d < n; d*= 2) {
+		stride >>= 1;
+		__syncthreads();
+
+		if (tid < d) { // RBTODO: this could be incorrect ==> test
+			int id = 2*stride*(tid+1)-1;
+			in[id+stride] += in[id];
+			/* int t = in[id]; */
+			/* in[id] += in[id-stride]; */
+			/* in[id-stride] = t; */
+		}
+	}
+	__syncthreads();
+}
+
+__device__ void atomicAdd( Vector3* address, Vector3 val) {
+	atomicAdd( &(address->x), val.x);
+	atomicAdd( &(address->y), val.y);
+	atomicAdd( &(address->z), val.z);
+}
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index 92c6dfd694a7083938925a973a1271a977effdcb..1fdd9e5d2683b83ebe18d9984fc721ecf8651ead 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -334,8 +334,10 @@ void GrandBrownTown::run() {
 				// 1 - do not use cutoff [ N^2 ]
 				switch (fullLongRange) {
 					case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
-						if (s % decompPeriod == 0)
+						if (s % decompPeriod == 0) {
 							internal->decompose(pos_d);
+							//internal->updatePairlists(pos_d); // perhaps this way?
+						}
 						energy = internal->computeTabulated(forceInternal_d, pos_d, type_d,
 																								bonds_d, bondMap_d,
 																								excludes_d, excludeMap_d,
diff --git a/makefile b/makefile
index a7de250d1b0280edfcadf243c2f76c0bb5bbaf75..8d5c8eb1ddc329cae1398ea5d348be94f645e106 100644
--- a/makefile
+++ b/makefile
@@ -11,13 +11,13 @@ include ./findcudalib.mk
 INCLUDE = $(CUDA_PATH)/include
 
 
-# DEBUG = -g -O0
+DEBUG = -g -O0
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
-# NV_FLAGS = -g -G								#debug
 # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
 NV_FLAGS = -Xptxas -v # -v,-abi=no 
 EX_FLAGS = -O3 -m$(OS_SIZE)
 # EX_FLAGS = -O0 -m$(OS_SIZE)
+NV_FLAGS += -g -G								#debug
 
 ifneq ($(MAVERICKS),)
     CC = $(CLANG)