diff --git a/BaseGrid.h b/BaseGrid.h
index b4ac7c5452736af9ba02ad2a4ce7b44f2fe9e36f..4eca85da1652936533a0b3907249b341b96a6495 100644
--- a/BaseGrid.h
+++ b/BaseGrid.h
@@ -26,6 +26,8 @@ using namespace std;
 
 #define STRLEN 512
 
+DEVICE float __fdividef(float x, float y);
+
 class NeighborList {
 public:
   float v[3][3][3];
@@ -649,7 +651,14 @@ public:
   }
   
   // Wrap distance: -0.5*l <= x < 0.5*l
-  HOST DEVICE static inline float wrapDiff(float x, float l) {
+	DEVICE static inline float d_wrapDiff(float x, float l) {
+		// int image = int(floor(x/l));
+		int image = int(floorf( __fdividef(x,l) ));
+		x -= image*l;
+		x = (x >= 0.5f * l) ? x-l : x;
+		return x;
+  }
+	HOST DEVICE static inline float wrapDiff(float x, float l) {
 		int image = int(floor(x/l));
 		x -= image*l;
 		if (x >= 0.5f * l)
@@ -681,6 +690,13 @@ public:
     l.z = wrapDiff(l.z, nz);
     return basis.transform(l);
   }
+  DEVICE inline Vector3 d_wrapDiff(Vector3 r) const {
+		Vector3 l = basisInv.transform(r);
+    l.x = d_wrapDiff(l.x, nx);
+    l.y = d_wrapDiff(l.y, ny);
+    l.z = d_wrapDiff(l.z, nz);
+    return basis.transform(l);
+  }
 
   Vector3 wrapDiffNearest(Vector3 r) const;
 
diff --git a/ComputeForce.cu b/ComputeForce.cu
index 3ef58db501944731fa3e7142d27fb56a47ae3b2a..c311281fbcd0808ddc835cc09c0fe0aa9e8d54a6 100644
--- a/ComputeForce.cu
+++ b/ComputeForce.cu
@@ -380,10 +380,9 @@ void ComputeForce::decompose(Vector3* pos, int type[]) {
 		// RBTODO: free memory elsewhere
 		// allocate device data
 		// initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell);
-		const int maxPairs = 1<<29;
+		const int maxPairs = 1<<26;
 		gpuErrchk(cudaMalloc(&numPairs_d,       sizeof(int)*maxPairs));
-		gpuErrchk(cudaMalloc(&pairListsI_d,     sizeof(int)*maxPairs));
-		gpuErrchk(cudaMalloc(&pairListsJ_d,     sizeof(int)*maxPairs));
+		gpuErrchk(cudaMalloc(&pairLists_d,     sizeof(int2)*maxPairs));
 		gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs));
 		int tmp = 0;
 		gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp,
@@ -420,7 +419,7 @@ void ComputeForce::decompose(Vector3* pos, int type[]) {
 	
 	createPairlists<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas,
 																						 sys_d, decomp_d, nCells, blocksPerCell,
-																						 numPairs_d, pairListsI_d, pairListsJ_d,
+																						 numPairs_d, pairLists_d,
 																						 numParts, type, pairTabPotType_d,pairlistdist2);
 
 	gpuErrchk(cudaDeviceSynchronize());
@@ -548,14 +547,14 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type,
 	// Call the kernel to calculate the forces
 	// int nb = (decomp.nCells.x * decomp.nCells.y * decomp.nCells.z);
 	// int nb = (1+(decomp.nCells.x * decomp.nCells.y * decomp.nCells.z)) * 75; /* RBTODO: number of pairLists */
-	const int nb = 200;
+	const int nb = 800;
 	// printf("ComputeTabulated\n");
 	computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type,
 			tablePot_d, tableBond_d,
-			num, numParts, sys_d,
-			bonds, bondMap,	numBonds, 
-			energies_d, cutoff2, gridSize, numReplicas, get_energy,
-																							 numPairs_d, pairListsI_d, pairListsJ_d,
+			num, sys_d,
+			bonds, bondMap,	numBonds,
+																							 cutoff2,
+																							 numPairs_d, pairLists_d,
 																							 pairTabPotType_d);
 
 	gpuErrchk(cudaDeviceSynchronize());
diff --git a/ComputeForce.cuh b/ComputeForce.cuh
index 15a3a1804c222cf7097dcc7c8b1a3704bf621893..6d6350c7aac03f058028fdd0ee4db66b0eceb006 100644
--- a/ComputeForce.cuh
+++ b/ComputeForce.cuh
@@ -213,11 +213,11 @@ const __device__ int maxPairs = 1 << 14;
 /* } */
 
 __global__
-void createPairlists(Vector3 pos[], int num, int numReplicas,
-										 BaseGrid* sys, CellDecomposition* decomp,
+void createPairlists(Vector3* __restrict__ pos, int num, int numReplicas,
+										 BaseGrid* sys, CellDecomposition* __restrict__ decomp,
 										 const int nCells, const int blocksPerCell,
-										 int* g_numPairs, int* g_pairI, int* g_pairJ,
-										 int numParts, int type[], int* g_pairTabPotType,
+										 int* g_numPairs, int2* g_pair,
+										 int numParts, int type[], int* __restrict__ g_pairTabPotType,
 	float pairlistdist2) {
 	// Loop over threads searching for atom pairs
   //   Each thread has designated values in shared memory as a buffer
@@ -267,12 +267,11 @@ void createPairlists(Vector3 pos[], int num, int numReplicas,
 							
 							// RBTODO: skip exclusions, non-interacting types
 							int pairType = type[ai] + type[aj] * numParts;
- 							int gid = atomicAggInc( g_numPairs, warpLane );
+							int gid = atomicAggInc( g_numPairs, warpLane );
 							/* assert( ai >= 0 ); */
 							/* assert( aj >= 0 ); */
 							
-							g_pairI[gid] = ai;
-							g_pairJ[gid] = aj;
+							g_pair[gid] = make_int2(ai,aj);
 							g_pairTabPotType[gid] = pairType;
 
 						} 	// atoms J
@@ -366,83 +365,75 @@ __global__ void printPairForceCounter() {
 // Kernel computes forces between Brownian particles (ions)
 // using cell decomposition
 //
-__global__ void computeTabulatedKernel(Vector3* force, Vector3* pos, int* type,
-		TabulatedPotential** tablePot, TabulatedPotential** tableBond,
-		int num, int numParts, BaseGrid* sys,
-		Bond* bonds, int2* bondMap, int numBonds,
-		float* g_energies, float cutoff2, int gridSize,
-																			 int numReplicas, bool get_energy,
-																			 int* g_numPairs, int* g_pairI, int* g_pairJ,
-																			 int* g_pairTabPotType) {
+__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, int* type,
+		TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond,
+		int num, const BaseGrid* __restrict__ sys,
+		const Bond* __restrict__ bonds, const int2* __restrict__ bondMap, int numBonds,
+																			 float cutoff2,
+																			 const int* __restrict__ g_numPairs,
+																			 const int2* __restrict__ g_pair,
+																			 const int* __restrict__ g_pairTabPotType) {
 	
-	// const int NUMTHREADS = 256;		/* RBTODO: fix */
-	// __shared__ EnergyForce fe[NUMTHREADS];
-	/* __shared__ int atomI[NUMTHREADS]; */
-	/* __shared__ int atomJ[NUMTHREADS]; */
 
-	// const int& tid = threadIdx.x;
-	// const int blockStride = (*g_numPairs) / gridDim.x > 0 ? (*g_numPairs) / gridDim.x : 1 ; /* RBTODO: fix this */
 	const int numPairs = *g_numPairs;
-	const int blockStride = ceil(((float)(numPairs)) / gridDim.x);
-	
-	
-	/* if (threadIdx.x == 0) { */
-	/* 	printf( "block %d running from pair %d to %d\n", blockIdx.x, blockIdx.x*blockStride, blockIdx.x*blockStride + blockStride -1 );\ */
-	/* } */
-		
-	
-	// RBTODO: fix this
-	// loop over particle pairs in pairlist
-	/* if (threadIdx.x == 0 && blockIdx.x == 0) { */
-	/* 	printf("%d %d %d\n", *g_numPairs, gridDim.x, blockStride); */
-	/* 	printf("%d %d\n", blockDim.x, gridDim.x); */
-	/* } */
-
-	/* if (threadIdx.x == 0 && blockIdx.x == 0) { */
-	/* 	printf("blockStride: %d\n",blockStride); */
-	/* for (int i = 1; i < 1; i++) */
-	/* 	printf("TEST FAILED\n"); */
-
-	/* printf("g_numPairs: %d\n", numPairs); */
-	
-	for (int i = threadIdx.x + blockIdx.x*blockStride; i < (blockIdx.x+1)*blockStride && i < numPairs; i+=blockDim.x) {
-	
+	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < numPairs; i+=blockDim.x*gridDim.x) {
+		const int2 pair = g_pair[i];
+		const int ai = pair.x;
+		const int aj = pair.y;
 		// BONDS: RBTODO: handle through an independent kernel call
-		const int ai = g_pairI[i];
-		const int aj = g_pairJ[i];
-		if (ai < 0) continue;
 			
 		// Particle's type and position
-		// const int ind = type[ai] + type[aj] * numParts; /* RBTODO: why is numParts here? */
 		const int ind = g_pairTabPotType[i];
-		// if (tablePot[ind] == NULL) continue;
-		
-		// RBTODO: implement wrapDiff2, returns dr2
+
 	 	/* printf("tid,bid,pos[ai(%d)]: %d %d %f %f %f\n", ai, threadIdx.x, blockIdx.x, pos[ai].x, pos[ai].y, pos[ai].z); */
-	 	/* printf("pos[ai(%d)]: %f %f %f\n", ai, pos[ai].x, pos[ai].y, pos[ai].z); */
-		/* printf("pos[aj(%d)]: %f %f %f\n", aj, pos[aj].x, pos[aj].y, pos[aj].z); */
 
-		const Vector3 dr = sys->wrapDiff(pos[aj] - pos[ai]);
-		// const Vector3 dr = pos[aj] - pos[ai];
-		
+		Vector3 dr = pos[aj] - pos[ai];
+		dr = sys->d_wrapDiff(dr);
+	
     // Calculate the force based on the tabulatedPotential file
-		// if (dr.length2() > cutoff2) continue;
-		if (tablePot[ind] != NULL && dr.length2() <= cutoff2) {
-			EnergyForce fe = tablePot[ind]->compute(dr);
-			
-			// RBTODO: think of a better approach; e.g. shared atomicAdds that are later reduced in global memory
-			atomicAdd( &force[ai], -fe.f );
-			atomicAdd( &force[aj],  fe.f );
+		float d2 = dr.length2();
+		if (tablePot[ind] != NULL && d2 <= cutoff2) {
+			Vector3 f = tablePot[ind]->computef(dr,d2);
+			atomicAdd( &force[ai], -f );
+			atomicAdd( &force[aj],  f );
+		}
+	}
+}
 
-			/* printf("force[ai(%d)]: %0.2f %0.2f %0.2f\n", ai, fe.f.x, fe.f.y, fe.f.z);  */
 
-			// atomicAdd( &pairForceCounter, 1 ); /* DEBUG */
-			
+__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos, int* type,
+		TabulatedPotential** __restrict__ tablePot, TabulatedPotential** __restrict__ tableBond,
+		int num, BaseGrid* __restrict__ sys,
+		Bond* __restrict__ bonds, int2* __restrict__ bondMap, int numBonds,
+		float* g_energies, float cutoff2,
+																			 int* __restrict__ g_numPairs,
+																			 int2* __restrict__ g_pair,
+																			 int* __restrict__ g_pairTabPotType) {
+	
+	const int numPairs = *g_numPairs;
+	// RBTODO: BONDS (handle through an independent kernel call?)
+	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < numPairs; i+=blockDim.x*gridDim.x) {
+		const int2 pair = g_pair[i];
+		const int ai = pair.x;
+		const int aj = pair.y;
+		const int ind = g_pairTabPotType[i];
+
+		// RBTODO: implement wrapDiff2, returns dr2 (???)
+		Vector3 dr = pos[aj] - pos[ai];
+		dr = sys->d_wrapDiff(dr);
+    // Calculate the force based on the tabulatedPotential file
+		float d2 = dr.length2();
+		/* RBTODO: filter tabpot particles ahead of time */
+		// RBTODO: order pairs according to distance to reduce divergence
+		
+		if (tablePot[ind] != NULL && d2 <= cutoff2) { 
+			EnergyForce fe = tablePot[ind]->compute(dr,d2);
+			// RBTODO: is this the best approach?
+			atomicAdd( &force[ai], -fe.f );
+			atomicAdd( &force[aj],  fe.f );
 			// RBTODO: why are energies calculated for each atom? Could be reduced
-			if (get_energy) {
-				atomicAdd( &(g_energies[ai]), fe.e );
-				atomicAdd( &(g_energies[aj]), fe.e );
-			}
+			atomicAdd( &(g_energies[ai]), fe.e );
+			atomicAdd( &(g_energies[aj]), fe.e );
 		}
 	}
 }
diff --git a/ComputeForce.h b/ComputeForce.h
index f1f56feb5f4ab6e7ff194126673da37178c8030b..ea1761d9fbec60137a02f36f33a3dc5fb054b041 100644
--- a/ComputeForce.h
+++ b/ComputeForce.h
@@ -113,8 +113,7 @@ private:
 	TabulatedDihedralPotential **tableDihedral_d, **tableDihedral_addr;
 
 	// Pairlists
-	int *pairListsI_d;
-	int *pairListsJ_d;
+	int2 *pairLists_d;
 	int *pairTabPotType_d;
 
 	int *numPairs_d;	
diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index 586108b455f68f88784d6160cac5b2c7dab2b18c..066680dff4697656c13e89ab0825500a94b03dfd 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -1,7 +1,7 @@
 // Included in RigidBodyController.cu
 #pragma once
 
-
+//RBTODO: add __restrict__, benchmark (Q: how to restrict member data?)
 __global__
 void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 													const Matrix3 basis_rho, const Matrix3 basis_u_inv,
diff --git a/GPUManager.cpp b/GPUManager.cpp
index 393b649bbd0b1e11123734c3c2359cd7d5bdbc05..b3f7d1d6db8cc4b054f9fe1b1d08fb09c8f41063 100644
--- a/GPUManager.cpp
+++ b/GPUManager.cpp
@@ -86,6 +86,7 @@ void GPUManager::init_devices() {
 void GPUManager::set(int gpu_id) {
 	gpu_id = gpu_id % (int) gpus.size();
 	cudaSetDevice(gpus[gpu_id]);
+	cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
 }
 
 int GPUManager::current() {
diff --git a/TabulatedPotential.h b/TabulatedPotential.h
index 1c47343e80e244d396a5eedadc8ade74096e8314..91a401a46a85d0c06bf3a38fbc50e29b5cff49c4 100644
--- a/TabulatedPotential.h
+++ b/TabulatedPotential.h
@@ -80,6 +80,40 @@ public:
 		Vector3 force = (-du/(d*dr))*r;
 		return EnergyForce(energy,force);
 	}
+  HOST DEVICE inline EnergyForce compute(Vector3 r, float d) {
+		d = sqrt(d);
+		// float d = r.length();
+		float w = (d - r0)/dr;
+		int home = int( floorf(w) );
+		w = w - home;
+		// if (home < 0) return EnergyForce(v0[0], Vector3(0.0f));
+		home = home < 0 ? 0 : home;
+		if (home >= n) return EnergyForce(e0, Vector3(0.0f));
+		
+		float u0 = v0[home];
+		float du = home+1 < n ? v0[home+1]-u0 : 0;
+				
+		// Interpolate.
+		float energy = du*w+u0;
+		Vector3 force = (-du/(d*dr))*r;
+		return EnergyForce(energy,force);
+	}
+  HOST DEVICE inline Vector3 computef(Vector3 r, float d) {
+		d = sqrt(d);
+		// float d = r.length();
+		// RBTODO: precompute so that initial blocks are zero; reduce computation here
+		float w = (d - r0)/dr;
+		int home = int( floorf(w) );
+		w = w - home;
+		// if (home < 0) return EnergyForce(v0[0], Vector3(0.0f));
+		home = home < 0 ? 0 : home;
+		if (home >= n) return Vector3(0.0f);
+		
+		if (home+1 < n) 
+			return (-(v0[home+1]-v0[home])/(d*dr))*r;
+		else
+			return Vector3(0.0f);
+	}
 
 // private:
 public:
diff --git a/runBrownTown.cpp b/runBrownTown.cpp
index fd1bd4bbfa0cec2d0022119829d5d53892eff204..49b272df357592d2f2ff18f41e0f3b3145de5581 100644
--- a/runBrownTown.cpp
+++ b/runBrownTown.cpp
@@ -19,6 +19,7 @@ int main(int argc, char* argv[]) {
 		printf("Usage: %s [OPTIONS [ARGS]] CONFIGFILE OUTPUT [SEED]\n", argv[0]);
 		printf("\n");
 		printf("  -r, --replicas=    Number of replicas to run\n");
+		printf("  -g, --gpu=         Index of gpu to use (defaults to 0)\n");
 		printf("  -i, --imd=         IMD port (defaults to %d)\n", kIMDPort);
 		printf("  -d, --debug        Debug mode: allows user to choose which forces are computed\n");
 		printf("  --safe             Do not use GPUs that may timeout (default)\n");
@@ -42,7 +43,10 @@ int main(int argc, char* argv[]) {
     printf("Try '%s --help' for more information.\n", argv[0]);
     return 1;
   }
-  
+	size_t n_gpus = max(GPUManager::gpus.size(), 1lu);
+
+	int gpuID = 0;
+	
 	bool debug = false, safe = true;
 	int replicas = 1;
 	unsigned int imd_port;
@@ -59,6 +63,16 @@ int main(int argc, char* argv[]) {
 		} else if (strcmp(arg, "-d") == 0 || strcmp(arg, "--debug") == 0) {
 			debug = true;
 			num_flags++;
+
+		} else if (strcmp(arg, "-g") == 0 || strcmp(arg, "--gpu") == 0) {
+			int arg_val = atoi(argv[pos + 1]);
+			gpuID = arg_val;
+			num_flags += 2;
+			if (arg_val < 0 || arg_val > n_gpus) {
+				printf("Invalid argument given to %s\n", arg);
+				return 1;
+			}
+			
 		} else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--replicas") == 0) {
 			int arg_val = atoi(argv[pos + 1]);
 			if (arg_val <= 0) {
@@ -104,7 +118,7 @@ int main(int argc, char* argv[]) {
 	GPUManager::safe(safe);
 	Configuration config(configFile, replicas, debug);
 	// GPUManager::set(0);
-	GPUManager::set(1);
+	GPUManager::set(gpuID);
 	config.copyToCUDA();
 
 	GrandBrownTown brown(config, outArg, randomSeed,