diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 7fca72cf7db6506cc4d1905702a793167b54bb27..69c192986617af9def3fe51a9ea7541616acab54 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -386,7 +386,6 @@ void ComputeForce::decompose() {
 	// Update pairlists using cell decomposition (not sure this is really needed or good)
 	//RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d);
 
-
 	/* size_t free, total; */
 	/* { */
 	/*   cuMemGetInfo(&free,&total); */
@@ -399,8 +398,7 @@
 	if (newDecomp) {
 		// RBTODO: free memory elsewhere
 		// allocate device data
-		// initializePairlistArrays<<< 1, 32 >>>(10*nCells*blocksPerCell);
-		const int maxPairs = 1<<25;
+		const int maxPairs = 1<<20;
 		gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int)));
 		gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs));
 
@@ -431,12 +429,14 @@
 
 	float pairlistdist2 = (sqrt(cutoff2) + 2.0f);
 	pairlistdist2 = pairlistdist2*pairlistdist2;
 
+#if __CUDA_ARCH__ >= 300
 	createPairlists<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2);
-	/* createPairlistsOld<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */
-	/* 		sys_d, decomp_d, nCells, blocksPerCell, */
-	/* 		numPairs_d, pairLists_d, */
-	/* 		numParts, type, pairTabPotType_d, pairlistdist2); */
+#else
+	// Use shared memory for warp_bcast function
+	createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2);
+#endif
+
 	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
 
 	// if (false) { // sort pairlist
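NOTE on the #if above: __CUDA_ARCH__ is defined only while nvcc compiles device
code, so a preprocessor test wrapped around a kernel launch (host code) always
sees it as undefined and compiles the #else branch, whatever GPU is present at
runtime. If the intent is to supply dynamic shared memory only on pre-Kepler
hardware, a runtime capability check is the usual pattern. A minimal sketch,
assuming device 0 and the surrounding decompose() scope; shMem is a name
introduced here, not part of the patch:

    cudaDeviceProp prop;
    gpuErrchk(cudaGetDeviceProperties(&prop, 0));
    if (prop.major >= 3) {
        // Kepler and newer: warp_bcast uses __shfl, no shared memory needed
        createPairlists<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d,
                nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d,
                excludes_d, excludeMap_d, numExcludes, pairlistdist2);
    } else {
        // pre-Kepler: one int of dynamic shared memory per warp for warp_bcast,
        // sized in bytes for the 64-thread blocks used here
        const size_t shMem = (64/WARPSIZE)*sizeof(int);
        createPairlists<<< 2048, 64, shMem >>>(pos_d, num, numReplicas, sys_d, decomp_d,
                nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d,
                excludes_d, excludeMap_d, numExcludes, pairlistdist2);
    }

Incidentally, the 2048/WARPSIZE third launch argument in the patch divides the
grid size rather than the block size; it over-allocates harmlessly, since the
two warps of a 64-thread block only need (64/WARPSIZE)*sizeof(int) = 8 bytes.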
diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh
index 73ec7766664ca9c4db40e62bf25d196247bc83ea..bd524dc23d8420bcf534fc90931f67a9478a72d8 100644
--- a/src/ComputeForce.cuh
+++ b/src/ComputeForce.cuh
@@ -213,78 +213,6 @@ void computeElecFullKernel(Vector3 force[], Vector3 pos[], int type[],
 /* 	} */
 /* } */
 
-__global__
-void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas,
-		BaseGrid* sys, CellDecomposition* __restrict__ decomp,
-		const int nCells, const int blocksPerCell,
-		int* g_numPairs, int2* g_pair,
-		int numParts, int type[], int* __restrict__ g_pairTabPotType,
-		float pairlistdist2) {
-	// Loop over threads searching for atom pairs
-	//   Each thread has designated values in shared memory as a buffer
-	//   A sync operation periodically moves data from shared to global
-	const int tid = threadIdx.x;
-	const int bid = blockIdx.x;
-
-	const int cID = bid / blocksPerCell; /* cellID */
-
-	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
-	// const int wid = tid/WARPSIZE;
-	const int blockLane = bid % blocksPerCell;
-
-	if (cID >= nCells) return;
-	int count = 0; /* debug */
-	const CellDecomposition::cell_t* pairs = decomp->getCells();
-
-	for (int repID = 0; repID < numReplicas; repID++) {
-		const CellDecomposition::range_t rangeI = decomp->getRange(cID, repID);
-		/* if (tid == 0) printf(" Cell%d: Working on %d atoms for repID %d\n", */
-		/* 		cID, rangeI.last - rangeI.first, repID); */
-
-		for (int ci = rangeI.first + blockLane; ci < rangeI.last; ci+=blocksPerCell) {
-			// ai - index of the particle in the original, unsorted array
-			const int ai = pairs[ci].particle;
-			// const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai);
-			const CellDecomposition::cell_t celli = pairs[ci];
-			const Vector3 posi = pos[ai];
-
-			for (int x = -1; x <= 1; ++x) {
-				for (int y = -1; y <= 1; ++y) {
-					for (int z = -1; z <= 1; ++z) {
-
-						const int nID = decomp->getNeighborID(celli, x, y, z);
-						if (nID < 0) continue;
-						const CellDecomposition::range_t range = decomp->getRange(nID, repID);
-						const int last = range.last;
-						// for (int n = 0; n < last; n++) {}
-						for (int n = range.first + tid; n < last; n+=blockDim.x) {
-							count++;
-							const int aj = pairs[n].particle;
-							if (aj <= ai) continue;
-
-							// skip ones that are too far away
-							float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2();
-							if (dr > pairlistdist2) continue;
-
-							int gid = atomicAggInc( g_numPairs, warpLane );
-
-							// RBTODO: skip exclusions, non-interacting types
-							int pairType = type[ai] + type[aj] * numParts;
-							/* assert( ai >= 0 ); */
-							/* assert( aj >= 0 ); */
-
-							g_pair[gid] = make_int2(ai,aj);
-							g_pairTabPotType[gid] = pairType;
-							// g_pairDists[gid] = dr;
-						} // atoms J
-					} // z
-				} // y
-			} // x
-		} // atoms I
-	} // replicas
-	/* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */
-}
-
 __global__
 void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas,
 		const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
@@ -294,8 +222,8 @@ void createPairlists(Vector3* __restrict__ pos, const int num, const int numRepl
 		const Exclude* __restrict__ excludes, const int2* __restrict__ excludeMap,
 		const int numExcludes, float pairlistdist2) {
 	// Loop over threads searching for atom pairs
-	//   Each thread has designated values in shared memory as a buffer
-	//   A sync operation periodically moves data from shared to global
+	// Each thread has designated values in shared memory as a buffer
+	// A sync operation periodically moves data from shared to global
 	const int tid = threadIdx.x;
 	const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */
 	const int split = 32;	/* numblocks should be divisible by split */
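NOTE: maxPairs shrinks from 1<<25 to 1<<20 in ComputeForce.cu, yet
createPairlists writes g_pair[gid] through whatever index atomicAggInc hands
back, so a large system can now silently overrun pairLists_d. A minimal guard
sketch; the maxPairs bound is hypothetical here, since the patch does not pass
it into the kernel:

    int gid = atomicAggInc( g_numPairs, warpLane );
    if (gid < maxPairs) {  // hypothetical: maxPairs would need to become a kernel argument
        g_pair[gid] = make_int2(ai,aj);
        g_pairTabPotType[gid] = pairType;
    }

The host could then copy numPairs_d back after the launch and reallocate (or
abort) whenever the count exceeds maxPairs.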
diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh
index 670041ed15164ff4b3a3cf9511a472087da9c9ec..6150ae6726405e84c4cc49538ef6c8802d8d8dfd 100644
--- a/src/ComputeGridGrid.cuh
+++ b/src/ComputeGridGrid.cuh
@@ -2,6 +2,7 @@
 #pragma once
 #include "useful.h"
 #define NUMTHREADS 96
+#define WARPSIZE 32
 
 class RigidBodyGrid;
 
diff --git a/src/CudaUtil.cu b/src/CudaUtil.cu
index 40f8ad2d7182d6dc3d20876fb429f462e9ea5d87..c7bb55a1ad75d2d8f6e891b2207fa7dbed3ef0be 100644
--- a/src/CudaUtil.cu
+++ b/src/CudaUtil.cu
@@ -1,6 +1,19 @@
 #include "CudaUtil.cuh"
 
-__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
+#if __CUDA_ARCH__ >= 300
+__device__ int warp_bcast(int v, int leader) { return __shfl(v, leader); }
+#else
+volatile extern __shared__ int sh[];
+__device__ int warp_bcast(int v, int leader) {
+	// WARNING: might not be safe to call in divergent branches
+	const int tid = threadIdx.x;
+	const int warpLane = tid % WARPSIZE;
+	if (warpLane == leader)
+		sh[tid/WARPSIZE] = v;
+	return sh[tid/WARPSIZE];
+}
+#endif
+
 __device__ int atomicAggInc(int *ctr, int warpLane) {
 	// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
 	int mask = __ballot(1);
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index 873e42973ff849a1a277c1ba89d3901fe2367e15..54d1b17aa80bbb71232d3bc480c941a50bbd3269 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -103,10 +103,15 @@ void RigidBody::updateParticleList(Vector3* pos_d) {
 		gpuErrchk(cudaMemcpy( tmp_d, &numParticles[i], sizeof(int), cudaMemcpyHostToDevice ));
 
 		int nb = floor(tnp/NUMTHREADS) + 1;
+#if __CUDA_ARCH__ >= 300
 		createPartlist<<<NUMTHREADS,nb>>>(pos_d, tnp, t->particles_d[i],
 				tmp_d, particles_d[i],
 				gridCenter + position, cutoff*cutoff);
-
+#else
+		createPartlist<<<NUMTHREADS,nb,(NUMTHREADS/WARPSIZE)*sizeof(int)>>>(pos_d, tnp, t->particles_d[i],
+				tmp_d, particles_d[i],
+				gridCenter + position, cutoff*cutoff);
+#endif
 		gpuErrchk(cudaMemcpy(&numParticles[i], tmp_d, sizeof(int), cudaMemcpyDeviceToHost ));
 	}
 }
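NOTE: both warp_bcast variants exist to serve the warp-aggregated increment in
atomicAggInc, whose body the hunk context above cuts off after the __ballot
call. For reference, a sketch of the full pattern from the devblog post linked
in CudaUtil.cu, written with the pre-Volta intrinsics this codebase uses;
everything after the mask line is reconstructed from the post, not copied from
the file:

    __device__ int atomicAggInc(int *ctr, int warpLane) {
        int mask = __ballot(1);                 // bitmask of the warp's active lanes
        int leader = __ffs(mask) - 1;           // lowest-numbered active lane
        int res;
        if (warpLane == leader)
            res = atomicAdd(ctr, __popc(mask)); // one atomic per warp, not per thread
        res = warp_bcast(res, leader);          // leader shares the base index
        // each lane claims base + its rank among lower-numbered active lanes
        return res + __popc(mask & ((1 << warpLane) - 1));
    }

On devices below compute capability 3.0, the shared-memory warp_bcast makes
every launch site responsible for passing enough dynamic shared memory (one
int per warp per block, in bytes); that is why the launches in ComputeForce.cu
and RigidBody.cu gain a third <<< >>> argument, and why the RigidBody.cu size
is written as (NUMTHREADS/WARPSIZE)*sizeof(int) rather than a bare warp count.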