diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index b9fede90d607c4e48df1e83129e0f2e395cb6807..664732ab7785a0e0a5e4e1169668f8d6bed166ee 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -34,27 +34,24 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 	/* return; */
 
 	// RBTODO: maybe organize data into compact regions for grid data for better use of shared memory...
-	Vector3 r_ijk = rho->getPosition(r_id); /* i,j,k value of voxel */
-	Vector3 r_pos = basis_rho.transform( r_ijk ) + origin_rho;
+	const Vector3 r_ijk = rho->getPosition(r_id); /* i,j,k value of voxel */
+	const Vector3 r_pos = basis_rho.transform( r_ijk ) + origin_rho;
 	// Vector3 u_ijk_float = basis_u.transform( r_pos - origin_u );
-	Matrix3 basis_u_inv = basis_u.inverse();
-	Vector3 u_ijk_float = basis_u_inv.transform( r_pos - origin_u );
+	const Matrix3 basis_u_inv = basis_u.inverse();
+	const Vector3 u_ijk_float = basis_u_inv.transform( r_pos - origin_u );
 	
-	float r_val = rho->val[r_id];
+	const float r_val = rho->val[r_id];
 	float energy = r_val * u->interpolatePotential( u_ijk_float ); 
 
 	// RBTODO What about non-unit delta?
 	// RBTODO combine interp methods and reduce repetition! 
-	Vector3 f = r_val*u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
+	const Vector3 f = r_val*u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
 	// f = basis_u.inverse().transpose().transform( f ); /* transform to lab frame */
-	f = basis_u_inv.transpose().transform( f ); /* transform to lab frame */
+	force[tid] = basis_u_inv.transpose().transform( f ); /* transform to lab frame */
 
 	// Calculate torque about lab-frame origin 
-	Vector3 t = r_pos.cross(f);				// RBTODO: test if sign is correct!
+	torque[tid] = r_pos.cross(f);				// RBTODO: test if sign is correct!
 	
-	force[tid] = f;
-	torque[tid] = t;
-	__syncthreads();
 
 	/*/ debug forces
 	float cutoff = 1e-3;
@@ -65,6 +62,7 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 	// Reduce force and torques
 	// http://www.cuvilib.com/Reduction.pdf
 	// RBTODO optimize
+	__syncthreads();
 	for (unsigned int offset = blockDim.x/2; offset > 0; offset >>= 1) {
 		if (tid < offset) {
 			unsigned int oid = tid + offset;
diff --git a/RigidBody.cu b/RigidBody.cu
index 10a6aa45d2285c5fda4c558f72dcf83324f0f901..9cfe33ea0573c2344b91026fd7cbb79878eb60a5 100644
--- a/RigidBody.cu
+++ b/RigidBody.cu
@@ -11,8 +11,8 @@
 #include "Debug.h"
 
 
-RigidBody::RigidBody(const Configuration& cref, RigidBodyType& tref)
-	: c(&cref), t(&tref), impulse_to_momentum(4.184e8f) {
+RigidBody::RigidBody(String name, const Configuration& cref, RigidBodyType& tref)
+	: name(name), c(&cref), t(&tref), impulse_to_momentum(4.184e8f) {
 	// units "(kcal_mol/AA) * ns" "amu AA/ns" * 4.184e+08
 	
 	timestep = c->timestep;
diff --git a/RigidBody.h b/RigidBody.h
index b1eddccb62915352ab2616c972d49ee9979fa312..dd71ebed4f61e50b7d21a31a59be4f26c0adc78a 100644
--- a/RigidBody.h
+++ b/RigidBody.h
@@ -27,7 +27,7 @@ class RigidBody { // host side representation of rigid bodies
 	| splitting methods for rigid body molecular dynamics". J Chem Phys. (1997) |
 	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
 		public:
-	HOST DEVICE RigidBody(const Configuration& c, RigidBodyType& t);
+	HOST DEVICE RigidBody(String name, const Configuration& c, RigidBodyType& t);
 	/* HOST DEVICE RigidBody(RigidBodyType t); */
 	HOST DEVICE ~RigidBody();
 
@@ -43,8 +43,9 @@ class RigidBody { // host side representation of rigid bodies
 	HOST DEVICE void integrate(int startFinishAll);
 	
 	// HOST DEVICE inline String getKey() const { return key; }
-	HOST DEVICE inline String getKey() const { return t->name; }
-
+	// HOST DEVICE inline String getKey() const { return t->name; }
+	HOST DEVICE inline String getKey() const { return name; }
+	
 	HOST DEVICE inline Vector3 getPosition() const { return position; }
 	HOST DEVICE inline Matrix3 getOrientation() const { return orientation; }
 	// HOST DEVICE inline Matrix3 getBasis() const { return orientation; }
@@ -59,7 +60,8 @@ class RigidBody { // host side representation of rigid bodies
 	Vector3 torque; // lab frame (except in integrate())
     
 private:
-	String key;
+	// String key;
+	String name;
 	/* static const SimParameters * simParams; */
 	Vector3 position;
 	// Q = orientation.transpose(); in Dullweber et al
diff --git a/RigidBodyController.cu b/RigidBodyController.cu
index 0142f4c3d92c661da123ae0525fe20c82ee37676..adf6d479db0cdcdcabe2cd2d0615ded590f84422 100644
--- a/RigidBodyController.cu
+++ b/RigidBodyController.cu
@@ -39,16 +39,24 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* out
 	for (int i = 0; i < conf.numRigidTypes; i++) {			
 		numRB += conf.rigidBody[i].num;
 		std::vector<RigidBody> tmp;
-		for (int j = 0; j < conf.rigidBody[i].num; j++) {
-			RigidBody r(conf, conf.rigidBody[i]);
+		// RBTODO: change conf.rigidBody to conf.rigidBodyType
+		const int jmax = conf.rigidBody[i].num;
+		for (int j = 0; j < jmax; j++) {
+			String name = conf.rigidBody[i].name;
+			if (jmax > 1) {
+				char tmp[128];
+				snprintf(tmp, 128, "#%d", j);
+				name.add( tmp );
+			}
+			RigidBody r(name, conf, conf.rigidBody[i]);
 			tmp.push_back( r );
-		}
-		rigidBodyByType.push_back(tmp);
 	}
+		rigidBodyByType.push_back(tmp);
+}
 
 	random = new RandomCPU(conf.seed + 1); /* +1 to avoid using same seed as RandomCUDA */
 	
-	initializeForcePairs();
+initializeForcePairs();
 }
 RigidBodyController::~RigidBodyController() {
 	for (int i = 0; i < rigidBodyByType.size(); i++)
@@ -60,6 +68,7 @@ RigidBodyController::~RigidBodyController() {
 void RigidBodyController::initializeForcePairs() {
 	// Loop over all pairs of rigid body types
 	//   the references here make the code more readable, but they may incur a performance loss
+	RigidBodyForcePair::createStreams();
 	printf("Initializing force pairs\n");
 	for (int ti = 0; ti < conf.numRigidTypes; ti++) {
 		RigidBodyType& t1 = conf.rigidBody[ti];
@@ -178,9 +187,11 @@ void RigidBodyController::updateForces(int s) {
 		}
 	}
 			
-	for (int i=0; i < forcePairs.size(); i++) {
-		forcePairs[i].updateForces(i,s);
-	}	
+	for (int i=0; i < forcePairs.size(); i++)
+		forcePairs[i].callGridForceKernel(i,s);
+
+	for (int i=0; i < forcePairs.size(); i++)
+		forcePairs[i].retrieveForces();
 
 	// RBTODO: see if there is a better way to sync
 	gpuErrchk(cudaDeviceSynchronize());
@@ -257,21 +268,32 @@ void RigidBodyController::integrate(int step) {
 		}
 	}
 }
-	
+
+// allocate and initialize an array of stream handles
+cudaStream_t *RigidBodyForcePair::stream = (cudaStream_t *) malloc(NUMSTREAMS * sizeof(cudaStream_t));
+// new cudaStream_t[NUMSTREAMS];
+int RigidBodyForcePair::nextStreamID = 0;
+void RigidBodyForcePair::createStreams() {
+	for (int i = 0; i < NUMSTREAMS; i++)
+		gpuErrchk( cudaStreamCreate( &(stream[i]) ) );
+		// gpuErrchk( cudaStreamCreateWithFlags( &(stream[i]) , cudaStreamNonBlocking ) );
+}
+
 // RBTODO: bundle several rigidbodypair evaluations in single kernel call
-void RigidBodyForcePair::updateForces(int pairId, int s) {
+void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 	// get the force/torque between a pair of rigid bodies
 	/* printf("  Updating rbPair forces\n"); */
 	const int numGrids = gridKeyId1.size();
 
-	if (s%10 != 0)
-		pairId = -1000;
+	/* if (s%10 != 0) */
+	/* 	pairId = -1000; */
 
 	// RBTODO: precompute certain common transformations and pass in kernel call
 	for (int i = 0; i < numGrids; i++) {
 		const int nb = numBlocks[i];
 		const int k1 = gridKeyId1[i];
 		const int k2 = gridKeyId2[i];
+		const cudaStream_t &s = stream[streamID[i]];
 
 		/*
 			ijk: index of grid value
@@ -301,7 +323,7 @@ void RigidBodyForcePair::updateForces(int pairId, int s) {
 			c2 = rb2->getOrientation()*type2->potentialGrids[k2].getOrigin() + rb2->getPosition();
 			
 			// RBTODO: get energy
-			computeGridGridForce<<< nb, numThreads >>>
+			computeGridGridForce<<< nb, numThreads, 0, s >>>
 				(type1->rawDensityGrids_d[k1], type2->rawPotentialGrids_d[k2],
 				 B1, c1, B2, c2,
 				 forces_d[i], torques_d[i], pairId+i);
@@ -311,37 +333,39 @@ void RigidBodyForcePair::updateForces(int pairId, int s) {
 			B2 = type2->rawPmfs[k2].getBasis();
 			c2 = type2->rawPmfs[k2].getOrigin();
 
-			computeGridGridForce<<< nb, numThreads >>>
+			computeGridGridForce<<< nb, numThreads, 0, s >>>
 				(type1->rawDensityGrids_d[k1], type2->rawPmfs_d[k2],
 				 B1, c1, B2, c2,
 				 forces_d[i], torques_d[i], pairId+i);
 		}
-			
-	}
 
-	// RBTODO better way to sync?
-	gpuErrchk(cudaDeviceSynchronize());
-	for (int i = 0; i < numGrids; i++) {
-		const int nb = numBlocks[i];
-		gpuErrchk(cudaMemcpy(forces[i], forces_d[i], sizeof(Vector3)*nb,
-												 cudaMemcpyDeviceToHost));
-		gpuErrchk(cudaMemcpy(torques[i], torques_d[i], sizeof(Vector3)*nb,
-												 cudaMemcpyDeviceToHost));
 	}
-	gpuErrchk(cudaDeviceSynchronize());
+}
 
+void RigidBodyForcePair::retrieveForces() {
 	// sum forces + torques
+	const int numGrids = gridKeyId1.size();
 	Vector3 f = Vector3(0.0f);
 	Vector3 t = Vector3(0.0f);
 
+	// RBTODO better way to sync?
 	for (int i = 0; i < numGrids; i++) {
+		const cudaStream_t &s = stream[streamID[i]];
 		const int nb = numBlocks[i];
+
+		gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb,
+															cudaMemcpyDeviceToHost, s));
+		gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb,
+															cudaMemcpyDeviceToHost, s));
+
+		gpuErrchk(cudaStreamSynchronize( s ));
+		
 		for (int j = 0; j < nb; j++) {
 			f = f + forces[i][j];
 			t = t + torques[i][j];
 		}
 	}
-
+	
 	// transform torque from lab-frame origin to rb centers
 	// add forces to rbs
 	/* Vector3 tmp; */
@@ -350,7 +374,7 @@ void RigidBodyForcePair::updateForces(int pairId, int s) {
 	/* tmp = rb1->getPosition(); */
 	/* printf("rb1->getPosition(): (%f,%f,%f)\n", tmp.x, tmp.y, tmp.z); */
 	Vector3 t1 = t - rb1->getPosition().cross( f );
-	rb1->addForce( f);
+	rb1->addForce( f );
 	rb1->addTorque(t1);
 
 	if (!isPmf) {
@@ -775,7 +799,7 @@ RigidBodyParams* RigidBodyParamsList::find_key(const char* key) {
 /* 	printf("    Done constructing RB force pair\n"); */
 
 /* } */
-void RigidBodyForcePair::initialize() {
+int RigidBodyForcePair::initialize() {
 	printf("    Initializing (memory for) RB force pair...\n");
 
 	const int numGrids = gridKeyId1.size();
@@ -786,6 +810,8 @@ void RigidBodyForcePair::initialize() {
 		const int k1 = gridKeyId1[i];
 		const int sz = type1->rawDensityGrids[k1].getSize();
 		const int nb = sz / numThreads + ((sz % numThreads == 0) ? 0:1 );
+		streamID.push_back( nextStreamID % NUMSTREAMS );
+		nextStreamID++;
 
 		numBlocks.push_back(nb);
 		forces.push_back( new Vector3[nb] );
@@ -801,7 +827,7 @@ void RigidBodyForcePair::initialize() {
 	}
 	gpuErrchk(cudaDeviceSynchronize());
 	// printf("    Done initializing RB force pair\n");
-
+	return nextStreamID;
 }
 
 /* RigidBodyForcePair::RigidBodyForcePair(const RigidBodyForcePair& orig ) : */
diff --git a/RigidBodyController.h b/RigidBodyController.h
index 4acfa5f4ff1b8ff1f6da4f6ee1714e17733a7cc6..d54d48c14fea33d41961e79f4272459ab491c9c6 100644
--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -5,7 +5,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-#define NUMTHREADS 256
+#define NUMTHREADS 256					/* try with 64, every 32+ */
+#define NUMSTREAMS 8
 
 class Configuration;
 class RandomCPU;
@@ -39,11 +40,11 @@ public:
 	~RigidBodyForcePair();
 
 private:
-	void initialize();
+	int initialize();
 	void swap(RigidBodyForcePair& a, RigidBodyForcePair& b);
 	
 	static const int numThreads = NUMTHREADS;
-
+	
 	bool isPmf;
 	
 	RigidBodyType* type1;
@@ -54,13 +55,18 @@ private:
 	std::vector<int> gridKeyId1;
 	std::vector<int> gridKeyId2;
 	std::vector<int> numBlocks;
-
+	
 	std::vector<Vector3*> forces;
 	std::vector<Vector3*> forces_d;
 	std::vector<Vector3*> torques;
 	std::vector<Vector3*> torques_d;
-	
-	void updateForces(int pairId, int s);
+
+	static int nextStreamID; 
+	std::vector<int> streamID;
+	static cudaStream_t* stream;
+	static void createStreams();
+	void callGridForceKernel(int pairId, int s);
+	void retrieveForces();
 };
 
 class RigidBodyController {
diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu
index c43fb364b3fb6c0ee33b35dbb3817a22a92159ad..4544d10540f250b8eee3b9521f5f86d3e694fb02 100644
--- a/RigidBodyGrid.cu
+++ b/RigidBodyGrid.cu
@@ -343,32 +343,29 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const {
 	home[1] = homeY;
 	home[2] = homeZ;
 
-	// Get the grid dimensions.
-	int g[3];
-	g[0] = nx;
-	g[1] = ny;
-	g[2] = nz;
-  
 	// Get the interpolation coordinates.
-	float w[3];
-	w[0] = l.x - homeX;
-	w[1] = l.y - homeY;
-	w[2] = l.z - homeZ;
+	const float wx = l.x - homeX;
+	const float wy = l.y - homeY;
+	const float wz = l.z - homeZ;
+
+	// RBTODO: remove wrap
+	// RBTODO:
 
 	// Find the values at the neighbors.
 	float g1[4][4][4];
 	for (int ix = 0; ix < 4; ix++) {
 		int jx = ix-1 + home[0];
-		jx = wrap(jx, g[0]);
+		jx = wrap(jx, nx);
 		for (int iy = 0; iy < 4; iy++) {
 			int jy = iy-1 + home[1];
-			jy = wrap(jy, g[1]);
+			jy = wrap(jy, ny);
 			for (int iz = 0; iz < 4; iz++) {
 				// Wrap around the periodic boundaries. 
 				int jz = iz-1 + home[2];
-				jz = wrap(jz, g[2]);
-	  
+				jz = wrap(jz, nz);
+
 				int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+				// g1[ix][iy][iz] =  jz < 0 || jz > home[2] || ... ? val[ind] : 0;
 				g1[ix][iy][iz] = val[ind];
 			}
 		}
@@ -384,7 +381,7 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const {
 			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
 			a0 = g1[1][iy][iz];
 
-			g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+			g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0;
 		}
 	}
 
@@ -396,7 +393,7 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const {
 		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
 		a0 = g2[1][iz];
    
-		g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+		g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0;
 	}
 
 	// Mix along z.
@@ -405,11 +402,11 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const {
 	a1 = 0.5f*(-g3[0] + g3[2]);
 	a0 = g3[1];
 
-	return a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0;
+	return a3*wz*wz*wz + a2*wz*wz + a1*wz + a0;
 }
 
 /** interpolateForce() to be used on CUDA Device **/
-HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(Vector3 l) const {
+HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	Vector3 f;
 	// Vector3 l = basisInv.transform(pos - origin);
 	int homeX = int(floor(l.x));
@@ -426,17 +423,11 @@ HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(Vector3 l) const {
 	home[1] = homeY;
 	home[2] = homeZ;
 
-	// Shift the indices in the grid dimensions.
-	int g[3];
-	g[0] = nx;
-	g[1] = ny;
-	g[2] = nz;
-
 	// Get the interpolation coordinates.
-	float w[3];
-	w[0] = l.x - homeX;
-	w[1] = l.y - homeY;
-	w[2] = l.z - homeZ;
+	const float wx = l.x - homeX;
+	const float wy = l.y - homeY;
+	const float wz = l.z - homeZ;
+
 	// Find the values at the neighbors.
 	float g1[4][4][4];						/* neighborhood */
 	for (int ix = 0; ix < 4; ix++) {
@@ -445,78 +436,25 @@ HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(Vector3 l) const {
 				// RBTODO: probably don't wrap!
 				// Wrap around the periodic boundaries. 
 				int jx = ix-1 + home[0];
-				jx = wrap(jx, g[0]);
+				jx = wrap(jx, nx);
 				int jy = iy-1 + home[1];
-				jy = wrap(jy, g[1]);
+				jy = wrap(jy, ny);
 				int jz = iz-1 + home[2];
-				jz = wrap(jz, g[2]);
+				jz = wrap(jz, nz);
 				int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
 				g1[ix][iy][iz] = val[ind];
 			}
 		}
 	}  
-	f.x = interpolateDiffX(w, g1);
-	f.y = interpolateDiffY(w, g1);
-	f.z = interpolateDiffZ(w, g1);
+	f.x = interpolateDiffX(wx,wy,wz, g1);
+	f.y = interpolateDiffY(wx,wy,wz, g1);
+	f.z = interpolateDiffZ(wx,wy,wz, g1);
 	// Vector3 f1 = basisInv.transpose().transform(f);
 	// return f1;
 	return f;
 }
 
-/* inline virtual Vector3 interpolateForce(Vector3 pos) const { */
-/* 	Vector3 f; */
-/* 	Vector3 l = basisInv.transform(pos - origin); */
-/* 	int homeX = int(floor(l.x)); */
-/* 	int homeY = int(floor(l.y)); */
-/* 	int homeZ = int(floor(l.z)); */
-/* 	// Get the array jumps with shifted indices. */
-/* 	int jump[3]; */
-/* 	jump[0] = nz*ny; */
-/* 	jump[1] = nz; */
-/* 	jump[2] = 1; */
-/* 	// Shift the indices in the home array. */
-/* 	int home[3]; */
-/* 	home[0] = homeX; */
-/* 	home[1] = homeY; */
-/* 	home[2] = homeZ; */
-
-/* 	// Shift the indices in the grid dimensions. */
-/* 	int g[3]; */
-/* 	g[0] = nx; */
-/* 	g[1] = ny; */
-/* 	g[2] = nz; */
-
-/* 	// Get the interpolation coordinates. */
-/* 	float w[3]; */
-/* 	w[0] = l.x - homeX; */
-/* 	w[1] = l.y - homeY; */
-/* 	w[2] = l.z - homeZ; */
-/* 	// Find the values at the neighbors. */
-/* 	float g1[4][4][4]; */
-/* 	//RBTODO parallelize? */
-/* 	for (int ix = 0; ix < 4; ix++) { */
-/* 		for (int iy = 0; iy < 4; iy++) { */
-/* 			for (int iz = 0; iz < 4; iz++) { */
-/* 				// Wrap around the periodic boundaries.  */
-/* 				int jx = ix-1 + home[0]; */
-/* 				jx = wrap(jx, g[0]); */
-/* 				int jy = iy-1 + home[1]; */
-/* 				jy = wrap(jy, g[1]); */
-/* 				int jz = iz-1 + home[2]; */
-/* 				jz = wrap(jz, g[2]); */
-/* 				int ind = jz*jump[2] + jy*jump[1] + jx*jump[0]; */
-/* 				g1[ix][iy][iz] = val[ind]; */
-/* 			} */
-/* 		} */
-/* 	}   */
-/* 	f.x = interpolateDiffX(pos, w, g1); */
-/* 	f.y = interpolateDiffY(pos, w, g1); */
-/* 	f.z = interpolateDiffZ(pos, w, g1); */
-/* 	Vector3 f1 = basisInv.transpose().transform(f); */
-/* 	return f1; */
-/* } */
-
-HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4][4][4]) const {
+HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(const float wx, const float wy, const float wz, float g1[4][4][4]) const {
 	float a0, a1, a2, a3;
 	// RBTODO further parallelize loops? unlikely?
 		
@@ -529,8 +467,8 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4]
 			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
 			a0 = g1[1][iy][iz];
 
-			//g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
-			g2[iy][iz] = 3.0f*a3*w[0]*w[0] + 2.0f*a2*w[0] + a1;
+			//g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0;
+			g2[iy][iz] = 3.0f*a3*wx*wx + 2.0f*a2*wx + a1;
 		}
 	}
 
@@ -542,7 +480,7 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4]
 		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
 		a0 = g2[1][iz];
    
-		g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+		g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0;
 	}
 
 	// Mix along z.
@@ -551,11 +489,11 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4]
 	a1 = 0.5f*(-g3[0] + g3[2]);
 	a0 = g3[1];
  
-	float retval = -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
+	float retval = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0);
 	return retval;
 }
 
-HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4][4][4]) const {
+HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(const float wx, const float wy, const float wz, float g1[4][4][4]) const {
 	float a0, a1, a2, a3;
   
 	// Mix along x, taking the derivative.
@@ -567,7 +505,7 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4]
 			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
 			a0 = g1[1][iy][iz];
 
-			g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+			g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0;
 		}
 	}
 
@@ -579,8 +517,8 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4]
 		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
 		a0 = g2[1][iz];
    
-		//g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
-		g3[iz] = 3.0f*a3*w[1]*w[1] + 2.0f*a2*w[1] + a1;
+		//g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0;
+		g3[iz] = 3.0f*a3*wy*wy + 2.0f*a2*wy + a1;
 	}
 
 	// Mix along z.
@@ -589,10 +527,10 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4]
 	a1 = 0.5f*(-g3[0] + g3[2]);
 	a0 = g3[1];
 
-	return -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
+	return -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0);
 }
 
-HOST DEVICE inline float  RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4][4][4]) const {
+HOST DEVICE inline float  RigidBodyGrid::interpolateDiffZ(const float wx, const float wy, const float wz, float g1[4][4][4]) const {
 	float a0, a1, a2, a3;
   
 	// Mix along x, taking the derivative.
@@ -604,7 +542,7 @@ HOST DEVICE inline float  RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4
 			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
 			a0 = g1[1][iy][iz];
 
-			g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+			g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0;
 		}
 	}
 
@@ -616,7 +554,7 @@ HOST DEVICE inline float  RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4
 		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
 		a0 = g2[1][iz];
    
-		g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+		g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0;
 	}
 
 	// Mix along z.
@@ -625,7 +563,7 @@ HOST DEVICE inline float  RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4
 	a1 = 0.5f*(-g3[0] + g3[2]);
 	a0 = g3[1];
 
-	return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1);
+	return -(3.0f*a3*wz*wz + 2.0f*a2*wz + a1);
 }
 
 
diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h
index b456af80c6673d02d783da97a0c0cbdb58a79cac..518e496f693b52b8846435df3565573488c9dc2c 100644
--- a/RigidBodyGrid.h
+++ b/RigidBodyGrid.h
@@ -142,9 +142,9 @@ public:
 	// Added by Rogan for times when simpler calculations are required.
   virtual float interpolatePotentialLinearly(Vector3 pos) const;
 
-	HOST DEVICE float interpolateDiffX(float w[3], float g1[4][4][4]) const;
-  HOST DEVICE float interpolateDiffY(float w[3], float g1[4][4][4]) const;
-	HOST DEVICE float interpolateDiffZ(float w[3], float g1[4][4][4]) const;
+	HOST DEVICE float interpolateDiffX(const float wx, const float wy, const float wz, float g1[4][4][4]) const;
+  HOST DEVICE float interpolateDiffY(const float wx, const float wy, const float wz, float g1[4][4][4]) const;
+	HOST DEVICE float interpolateDiffZ(const float wx, const float wy, const float wz, float g1[4][4][4]) const;
 
   HOST DEVICE float interpolatePotential(Vector3 l) const;
 
@@ -180,17 +180,11 @@ public:
 		home[1] = homeY;
 		home[2] = homeZ;
 
-		// Shift the indices in the grid dimensions.
-		int g[3];
-		g[0] = nx;
-		g[1] = ny;
-		g[2] = nz;
-
 		// Get the interpolation coordinates.
-		float w[3];
-		w[0] = l.x - homeX;
-		w[1] = l.y - homeY;
-		w[2] = l.z - homeZ;
+		const float wx = l.x - homeX;
+		const float wy = l.y - homeY;
+		const float wz = l.z - homeZ;
+
 		// Find the values at the neighbors.
 		float g1[4][4][4];
 		//RBTODO parallelize?
@@ -199,19 +193,19 @@ public:
 				for (int iz = 0; iz < 4; iz++) {
 	  			// Wrap around the periodic boundaries. 
 					int jx = ix-1 + home[0];
-					jx = wrap(jx, g[0]);
+					jx = wrap(jx, nx);
 					int jy = iy-1 + home[1];
-					jy = wrap(jy, g[1]);
+					jy = wrap(jy, ny);
 					int jz = iz-1 + home[2];
-					jz = wrap(jz, g[2]);
+					jz = wrap(jz, nz);
 					int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
 					g1[ix][iy][iz] = val[ind];
 				}
 			}
 		}  
-		f.x = interpolateDiffX( w, g1);
-		f.y = interpolateDiffY( w, g1);
-		f.z = interpolateDiffZ( w, g1);
+		f.x = interpolateDiffX( wx, wy, wz, g1 );
+		f.y = interpolateDiffY( wx, wy, wz, g1 );
+		f.z = interpolateDiffZ( wx, wy, wz, g1 );
 		// Vector3 f1 = basisInv.transpose().transform(f);
 		// return f1;
 		return f;
diff --git a/makefile b/makefile
index 247504ddd39e38a5537a99b8fb0bddd4f64c0cdc..add8f91bba193406fd792f3e231e029d669d3fc6 100644
--- a/makefile
+++ b/makefile
@@ -37,14 +37,16 @@ endif
 #CODE_20 := -gencode arch=compute_20,code=sm_20
 CODE_20 := -arch=sm_20
 #CODE_30 := -gencode arch=compute_30,code=sm_30
-#CODE_35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
+# CODE_35 := -gencode arch=compute_35,code=\"sm_35,compute_35\"
+# CODE_35 := -arch=compute_35
 
-NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35)
+# NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35)
+NV_FLAGS += -arch=sm_35
 
 NVLD_FLAGS := $(NV_FLAGS) --device-link
-NV_FLAGS += -rdc=true
+# NV_FLAGS += -rdc=true
 
-LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -Wl,-rpath,$(LIBRARY)
+LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY)
 
 
 ### Sources
@@ -65,7 +67,8 @@ all: $(TARGET)
 	@echo "Done ->" $(TARGET)
 
 $(TARGET): $(CU_OBJ) $(CC_OBJ) runBrownTown.cpp vmdsock.c imd.c imd.h
-	$(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
+#	$(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
+	$(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o
 	$(EXEC) $(CC) $(CC_FLAGS) $(EX_FLAGS) runBrownTown.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS)  -o $(TARGET)
 
 # $(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o
@@ -73,7 +76,7 @@ $(TARGET): $(CU_OBJ) $(CC_OBJ) runBrownTown.cpp vmdsock.c imd.c imd.h
 
 .SECONDEXPANSION:
 $(CU_OBJ): %.o: %.cu $$(wildcard %.h) $$(wildcard %.cuh)
-	$(EXEC) $(NVCC) $(NV_FLAGS) $(EX_FLAGS) -c $< -o $@
+	$(EXEC) $(NVCC) $(NV_FLAGS) $(EX_FLAGS) -dc $< -o $@
 
 $(CC_OBJ): %.o: %.cpp %.h 
 	$(EXEC) $(CC) $(CC_FLAGS) $(EX_FLAGS) -c $< -o $@