diff --git a/.gitignore b/.gitignore
index 99c1cd7b962d7d04c3f32359f72b9ad0e3a6d472..23e5af8024fb69f70ca20558365dc130e0dbf1bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,4 @@ TAGS
 TAGS.sh
 .dir-locals.el
 BD_example*
-runBrownCUDA
\ No newline at end of file
+runBrownCUDA
diff --git a/BaseGrid.h b/BaseGrid.h
index e3032d5d88648ea42e19b87987fe8b7aa910f3f1..fe4c127cf1d838dad4e1cfc4aec23e630142260f 100644
--- a/BaseGrid.h
+++ b/BaseGrid.h
@@ -308,8 +308,6 @@ public:
     return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1);
   }
 
-	// RBTODO overload with optimized algorithm
-	//  skip transforms (assume identity basis)
   HOST DEVICE inline float interpolatePotential(Vector3 pos) const {
     // Find the home node.
     Vector3 l = basisInv.transform(pos - origin);
diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index 173e7bb32cada796ca76baa7fc9b1b413b346e1c..337b455c6ba531036929d1ef0bde6082a634c490 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -3,32 +3,64 @@
 /* #include "useful.h" */
 
 __global__
-void computeGridGridForce(const RigidBodyGrid& rho, const RigidBodyGrid& u,
+void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 													Matrix3 basis_rho, Vector3 origin_rho,
-													Matrix3 basis_u,   Vector3 origin_u) {
+													Matrix3 basis_u,   Vector3 origin_u,
+													Vector3 * retForce, Vector3 * retTorque) {
+
+	printf("ComputeGridGridForce called\n");
+	extern __shared__ Vector3 force [];
+	extern __shared__ Vector3 torque [];
+	
   // RBTODO http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+	const unsigned int tid = threadIdx.x;
 	const unsigned int r_id = blockIdx.x * blockDim.x + threadIdx.x;
 	
 	// RBTODO parallelize transform
-	if (r_id > rho.size)					// skip threads with no data 
+	if (r_id > rho->size) {				// skip threads with no data
+		force[tid] = Vector3(0.0f);
+		torque[tid] = Vector3(0.0f);
 		return;
-	
+	}
 	// Maybe: Tile grid data into shared memory
-	//   RBTODO: think about localizing regions of grid data
-	Vector3 p = rho.getPosition(r_id, basis_rho, origin_rho);
-	float val = rho.val[r_id];
+	//   RBTODO: think about localizing regions of grid datas
+	Vector3 r_ijk = rho->getPosition(r_id); /* i,j,k value of voxel */
+	Vector3 r_pos = basis_rho.transform( r_ijk ) + origin_rho;
+	Vector3 u_ijk_float = basis_u.transform( r_pos - origin_u );
 
-	// RBTODO potential basis and rho!
+	
+	float r_val = rho->val[r_id];
+	float energy = r_val * u->interpolatePotential( u_ijk_float ); 
 	
 	// RBTODO combine interp methods and reduce repetition! 
-	float energy = val*u.interpolatePotential(p); 
-	Vector3 f = val*u.interpolateForceD(p);
-	Vector3 t = p.cross(f);				// test if sign is correct!
+	Vector3 f = r_val*u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
+	f = basis_u.inverse().transpose().transform( f ); /* transform to lab frame */
 
-	// RBTODO reduce forces and torques
+	// Calculate torque about lab-frame origin 
+	Vector3 t = r_pos.cross(f);				// RBTODO: test if sign is correct!
+	
+
+	force[tid] = f;
+	torque[tid] = t;
+	__syncthreads();
+
+	// Reduce force and torques
 	// http://www.cuvilib.com/Reduction.pdf
+	// RBTODO optimize
+	for (unsigned int offset = blockDim.x/2; offset > 0; offset >>= 1) {
+		if (tid < offset) {
+			unsigned int oid = tid + offset;
+			force[tid] = force[tid] + force[oid];
+			torque[tid] = torque[tid] + torque[oid];
+		}
+		__syncthreads();
+	}
 
+	if (tid == 0) {
+		retForce[blockIdx.x] = force[0];
+		retTorque[blockIdx.x] = force[0];
+	}
+	
 	// RBTODO 3rd-law forces + torques (maybe outside kernel)
-
 	// must reference rigid bodies in some way
 }
diff --git a/Configuration.cpp b/Configuration.cpp
index d23362505c47a91074c80e631d173d415b6937cb..01278da1d399ee3d94badc2d5b69cd67f5462d1b 100644
--- a/Configuration.cpp
+++ b/Configuration.cpp
@@ -775,8 +775,8 @@ int Configuration::readParameters(const char * config_file) {
 		else if (param == String("rotDamping"))
 			rigidBody[currRB].rotDamping = stringToVector3( value );
 
-		else if (param == String("potentialGrid"))
-			rigidBody[currRB].addPotentialGrid(value);
+		else if (param == String("densityGrid"))
+			rigidBody[currRB].addDensityGrid(value);
 		else if (param == String("potentialGrid"))
 			rigidBody[currRB].addPotentialGrid(value);
 
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index fa54f7e228686c1025b2254a275ced13b03be2c9..9f2dcc4f9f0df6585c97da96f2a6a1bd3418dfef 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -413,7 +413,7 @@ void GrandBrownTown::run() {
 		printf("Position Update Time: %f ms\n", dt2 * 1000);
 		*/
 
-		
+		RBC.updateForces();
 		/* 	for (int j = 0; j < t->num; j++) { */
 		/* computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d); */
 		
diff --git a/RigidBody.h b/RigidBody.h
index e0fe66ef1a2d4056ba70f523e79d528e49a037d7..6e8af6d2a2206c0ddec2186ed49f2bc67122a13d 100644
--- a/RigidBody.h
+++ b/RigidBody.h
@@ -41,6 +41,7 @@ class RigidBody { // host side representation of rigid bodies
 	HOST DEVICE void integrate(Vector3& old_trans, Matrix3& old_rot, int startFinishAll);
 	
 	HOST DEVICE inline String getKey() { return key; }
+
 	HOST DEVICE inline Vector3 getPosition() const { return position; }
 	HOST DEVICE inline Matrix3 getOrientation() const { return orientation; }
 	HOST DEVICE inline Matrix3 getBasis() const { return orientation; }
diff --git a/RigidBodyController.cu b/RigidBodyController.cu
index fea8194218db91debb1d97e5cbcf49e02ea088f5..bba8d815af8ffd99e80a047ebc11150c1c32068e 100644
--- a/RigidBodyController.cu
+++ b/RigidBodyController.cu
@@ -10,6 +10,8 @@
 #include "RigidBodyType.h"
 #include "ComputeGridGrid.cuh"
 
+// #include <vector>
+
 /* #include "Random.h" */
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
@@ -24,7 +26,6 @@ inline void gpuAssert(cudaError_t code, String file, int line, bool abort=true)
 /* #include <cuda_runtime.h> */
 /* #include <curand_kernel.h> */
 
-
 RigidBodyController::RigidBodyController(const Configuration& c) :
 conf(c) {
 
@@ -42,7 +43,9 @@ conf(c) {
 			tmp.push_back( r );
 		}
 		rigidBodyByType.push_back(tmp);
-	}	
+	}
+
+	initializeForcePairs();
 }
 RigidBodyController::~RigidBodyController() {
 	for (int i = 0; i < rigidBodyByType.size(); i++)
@@ -50,6 +53,65 @@ RigidBodyController::~RigidBodyController() {
 	rigidBodyByType.clear();
 }
 
+void RigidBodyController::initializeForcePairs() {
+	// Loop over all pairs of rigid body types
+	//   the references here make the code more readable, but they may incur a performance loss
+	printf("Initializing force pairs\n");
+	for (int ti = 0; ti < conf.numRigidTypes; ti++) {
+		RigidBodyType& t1 = conf.rigidBody[ti];
+		for (int tj = ti; tj < conf.numRigidTypes; tj++) {
+			RigidBodyType& t2 = conf.rigidBody[tj];
+
+
+			const std::vector<String>& keys1 = t1.densityGridKeys; 
+			const std::vector<String>& keys2 = t2.potentialGridKeys;
+
+			printf("  Working on type pair ");
+			t1.name.printInline(); printf(":"); t2.name.print();
+			
+			// Loop over all pairs of grid keys (e.g. "Elec")
+			std::vector<int> gridKeyId1;
+			std::vector<int> gridKeyId2;
+			
+			printf("  Grid keys %d:%d\n",keys1.size(),keys2.size());
+
+			bool paired = false;
+			for(int k1 = 0; k1 < keys1.size(); k1++) {
+				for(int k2 = 0; k2 < keys2.size(); k2++) {
+					printf("    checking grid keys ");
+					keys1[k1].printInline(); printf(":"); keys2[k2].print();
+					
+					if ( keys1[k1] == keys2[k2] ) {
+						gridKeyId1.push_back(k1);
+						gridKeyId2.push_back(k2);
+						paired = true;
+					}
+				}
+			}
+			
+			if (paired) {
+				// found matching keys => calculate force between all grid pairs
+				std::vector<RigidBody>& rbs1 = rigidBodyByType[ti];
+				std::vector<RigidBody>& rbs2 = rigidBodyByType[tj];
+
+				// Loop over rigid bodies of these types
+				for (int i = 0; i < rbs1.size(); i++) {
+					for (int j = (ti==tj ? i+1 : 0); j < rbs2.size(); j++) {
+						RigidBody* rb1 = &(rbs1[i]);
+						RigidBody* rb2 = &(rbs2[j]);
+
+						printf("    pushing RB force pair for %d:%d\n",i,j);
+						RigidBodyForcePair fp = RigidBodyForcePair(&(t1),&(t2),rb1,rb2,gridKeyId1,gridKeyId2);
+						gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
+						forcePairs.push_back( fp ); 
+						printf("    done pushing RB force pair for %d:%d\n",i,j);
+					}
+				}
+			}
+		}
+	}
+}
+	
 void RigidBodyController::updateForces() {
 	/*â€“â€“{ RBTODO }â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
 	| probably coalesce kernel calls, or move this to a device kernel caller   |
@@ -67,49 +129,74 @@ void RigidBodyController::updateForces() {
 	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
 	// int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
 	// int numBlocks = 1;
-	int numThreads = 256;
-
-	
-	// Loop over all pairs of rigid body types
-	//   the references here make the code more readable, but they may incur a performance loss
-	for (int ti = 0; ti < conf.numRigidTypes; ti++) {
-		const RigidBodyType& t1 = conf.rigidBody[ti];
-		for (int tj = ti; tj < conf.numRigidTypes; tj++) {
-			const RigidBodyType& t2 = conf.rigidBody[tj];
-				
-			const std::vector<String>& keys1 = t1.densityGridKeys; 
-			const std::vector<String>& keys2 = t2.potentialGridKeys;
+	/* int numThreads = 256; */
 
-			// Loop over all pairs of grid keys (e.g. "Elec")
-			for(int k1 = 0; k1 < keys1.size(); k1++) {
-				for(int k2 = 0; k2 < keys2.size(); k2++) {
-					if ( keys1[k1] == keys2[k2] ) {
-						// found matching keys => calculate force between all grid pairs
-						//   loop over rigid bodies of this type
-						const std::vector<RigidBody>& rbs1 = rigidBodyByType[ti];
-						const std::vector<RigidBody>& rbs2 = rigidBodyByType[tj];
-
-						for (int i = 0; i < rbs1.size(); i++) {
-							for (int j = (ti==tj ? 0 : i); j < rbs2.size(); j++) {
-								const RigidBody& rb1 = rbs1[i];
-								const RigidBody& rb2 = rbs2[j];
-
-								const int sz = t1.rawDensityGrids[k1].getSize();
-								const int numBlocks = sz / numThreads + ((sz % numThreads == 0) ? 0:1 );
-								computeGridGridForce<<< numBlocks, numThreads >>>
-									(t1.rawDensityGrids[k1], t2.rawPotentialGrids[k2],
-									 rb1.getBasis(), rb1.getPosition(),
-									 rb2.getBasis(), rb2.getPosition());
-							}
-						}
-					}
-				}
-			}
-		}
-	}
+	for (int i=0; i < forcePairs.size(); i++) {
+		forcePairs[i].updateForces();
+	}	
+	// get 3rd law forces and torques
 		
 	// RBTODO: see if there is a better way to sync
 	gpuErrchk(cudaDeviceSynchronize());
+
+}
+
+void RigidBodyForcePair::updateForces() {
+	// get the force/torque between a pair of rigid bodies
+	printf("  Updating rbPair forces\n");
+	const int numGrids = gridKeyId1.size();
+
+	// RBTODO: precompute certain common transformations and pass in kernel call
+	for (int i = 0; i < numGrids; i++) {
+		const int nb = numBlocks[i];
+		const int k1 = gridKeyId1[i];
+		const int k2 = gridKeyId2[i];
+
+		printf("  Calculating grid forces\n");
+
+		computeGridGridForce<<< nb, numThreads >>>
+		(type1->rawDensityGrids_d[k1], type2->rawPotentialGrids_d[k2],
+		 rb1->getBasis(), rb1->getPosition(), /* RBTODO: include offset from grid */
+		 rb2->getBasis(), rb2->getPosition(),
+		 forces_d[i], torques_d[i]);
+		
+		// RBTODO: ASYNCHRONOUSLY retrieve forces
+	}
+	gpuErrchk(cudaDeviceSynchronize());
+	for (int i = 0; i < numGrids; i++) {
+		const int nb = numBlocks[i];
+		gpuErrchk(cudaMemcpy(forces[i], &(forces_d[i]), sizeof(Vector3)*nb,
+												 cudaMemcpyDeviceToHost));
+		gpuErrchk(cudaMemcpy(torques[i], &(torques_d[i]), sizeof(Vector3)*nb,
+												 cudaMemcpyDeviceToHost));
+	}
+
+	gpuErrchk(cudaDeviceSynchronize());
+
+	// sum forces + torques
+	Vector3 f = Vector3(0.0f);
+	Vector3 t = Vector3(0.0f);
+
+	for (int i = 0; i < numGrids; i++) {
+		const int nb = numBlocks[i];
+		for (int j = 0; j < nb; j++) {
+			f = f + forces[i][j];
+			t = t + forces[i][j];
+		}
+	}
+
+	// transform torque from lab-frame origin to rb centers
+	Vector3 t1 = t - rb1->getPosition().cross( f );
+	Vector3 t2 = -t - rb2->getPosition().cross( -f );
+
+	// add forces to rbs
+	rb1->addForce( f);
+	rb1->addTorque(t1);
+	rb2->addForce(-f);
+	rb2->addTorque(t2);
+	
+	// integrate
+	
 }
 
 void RigidBodyController::copyGridsToDevice() {
@@ -138,7 +225,8 @@ void RigidBodyController::copyGridsToDevice() {
 		RigidBodyGrid * gtmp;
 		// gtmp = new RigidBodyGrid[ng];
 		size_t sz = sizeof(RigidBodyGrid)*ng;
-		
+
+		printf("Copying potential grids!\n");
 		// allocate grids on device
 		// copy temporary host pointer to device pointer
 		// copy grids to device through temporary host poin
@@ -160,10 +248,10 @@ void RigidBodyController::copyGridsToDevice() {
 			float **tmpData;
 			tmpData = new float*[len];
 
-			printf("  RigidBodyType %d: potGrid[%d] size: %d\n", i, gid, len);
-			for (int k = 0; k < len; k++)
-				printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n",
-							 i, gid, k, g->val[k]);
+			/* printf("  RigidBodyType %d: potGrid[%d] size: %d\n", i, gid, len); */
+			/* for (int k = 0; k < len; k++) */
+			/* 	printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n", */
+			/* 				 i, gid, k, g->val[k]); */
 			
       // allocate grid data on device
 			// copy temporary host pointer to device pointer
@@ -183,18 +271,27 @@ void RigidBodyController::copyGridsToDevice() {
 
 	// density grids
  	for (int i = 0; i < conf.numRigidTypes; i++) {
-		printf("working on RB %d\n",i);
+		printf("Copying density grids of RB type %d\n",i);
 		RigidBodyType& rb = conf.rigidBody[i];
 
 		int ng = rb.numDenGrids;
 		RigidBodyGrid * gtmp;
 		size_t sz = sizeof(RigidBodyGrid)*ng;
+
+		printf("  copying %d grids\n",ng);
+		for (int gid = 0; gid < ng; gid++) {
+			int gs = rb.rawDensityGrids[gid].getSize();
+			printf("    grid %d contains %d values\n",gid, gs);
+			for (int idx = 0; idx < gs; idx++) {
+				printf("      val[%d] = %g\n",idx, rb.rawDensityGrids[gid].val[idx] );
+			}
+		}
 		
 		// allocate grids on device
 		// copy temporary host pointer to device pointer
 		// copy grids to device through temporary host poin
 		gpuErrchk(cudaMalloc((void **) &gtmp, sz));
-		gpuErrchk(cudaMemcpy(&(rb_addr[i]->rawDensityGrids), &gtmp, 
+		gpuErrchk(cudaMemcpy(&(rb_addr[i]->rawDensityGrids_d), &gtmp, 
 												 sizeof(RigidBodyGrid*) * ng, cudaMemcpyHostToDevice ));
 		gpuErrchk(cudaMemcpy(gtmp, &(rb.rawDensityGrids),
 												 sizeof(RigidBodyGrid)  * ng, cudaMemcpyHostToDevice ));
@@ -237,324 +334,94 @@ void RigidBodyController::copyGridsToDevice() {
 }
 
 
+/* RigidBodyForcePair::RigidBodyForcePair(RigidBodyType* t1, RigidBodyType* t2, */
+/* 																			 RigidBody* rb1, RigidBody* rb2, */
+/* 																			 std::vector<int> gridKeyId1, std::vector<int> gridKeyId2) : */
+/* 	type1(t1), type2(t2), rb1(rb1), rb2(rb2), gridKeyId1(gridKeyId1), gridKeyId2(gridKeyId2) { */
 
-/*
-void RigidBodyController::print(int step) {
-    // modeled after outputExtendedData() in Controller.C
-    if ( step >= 0 ) {
-	// Write RIGID BODY trajectory file
-      if ( (step % simParams->rigidBodyOutputFrequency) == 0 ) {
-	  if ( ! trajFile.rdbuf()->is_open() ) {
-	      // open file
-	      iout << "OPENING RIGID BODY TRAJECTORY FILE\n" << endi;
-	      NAMD_backup_file(simParams->rigidBodyTrajectoryFile);
-	      trajFile.open(simParams->rigidBodyTrajectoryFile);
-	      while (!trajFile) {
-		  if ( errno == EINTR ) {
-		      CkPrintf("Warning: Interrupted system call opening RIGIDBODY trajectory file, retrying.\n");
-		      trajFile.clear();
-		      trajFile.open(simParams->rigidBodyTrajectoryFile);
-		      continue;
-		  }
-		  char err_msg[257];
-		  sprintf(err_msg, "Error opening RigidBody trajectory file %s",simParams->rigidBodyTrajectoryFile);
-		  NAMD_err(err_msg);
-	      }
-	      trajFile << "# NAMD RigidBody trajectory file" << std::endl;
-	      printLegend(trajFile);
-	  }
-	  printData(step,trajFile);
-	  trajFile.flush();    
-      }
-    
-      // Write restart File
-      if ( simParams->restartFrequency &&
-	   ((step % simParams->restartFrequency) == 0) &&
-	   (step != simParams->firstTimestep) )
-      {
-	  iout << "RIGID BODY: WRITING RESTART FILE AT STEP " << step << "\n" << endi;
-	  char fname[140];
-	  strcpy(fname, simParams->restartFilename);
-	
-	  strcat(fname, ".rigid");
-	  NAMD_backup_file(fname,".old");
-	  std::ofstream restartFile(fname);
-	  while (!restartFile) {
-	      if ( errno == EINTR ) {
-		  CkPrintf("Warning: Interrupted system call opening rigid body restart file, retrying.\n");
-		  restartFile.clear();
-		  restartFile.open(fname);
-		  continue;
-	      }
-	      char err_msg[257];
-	      sprintf(err_msg, "Error opening rigid body restart file %s",fname);
-	      NAMD_err(err_msg);
-	  }
-	  restartFile << "# NAMD rigid body restart file" << std::endl;
-	  printLegend(restartFile);
-	  printData(step,restartFile);
-	  if (!restartFile) {
-	      char err_msg[257];
-	      sprintf(err_msg, "Error writing rigid body restart file %s",fname);
-	      NAMD_err(err_msg);
-	  } 
-      }
-    }
-    //  Output final coordinates
-    if (step == FILE_OUTPUT || step == END_OF_RUN) {
-	int realstep = ( step == FILE_OUTPUT ?
-			 simParams->firstTimestep : simParams->N );
-	iout << "WRITING RIGID BODY OUTPUT FILE AT STEP " << realstep << "\n" << endi;
-	static char fname[140];
-	strcpy(fname, simParams->outputFilename);
-	strcat(fname, ".rigid");
-	NAMD_backup_file(fname);
-	std::ofstream outputFile(fname);
-	while (!outputFile) {
-	    if ( errno == EINTR ) {
-		CkPrintf("Warning: Interrupted system call opening rigid body output file, retrying.\n");
-		outputFile.clear();
-		outputFile.open(fname);
-		continue;
-	    }
-	    char err_msg[257];
-	    sprintf(err_msg, "Error opening rigid body output file %s",fname);
-	    NAMD_err(err_msg);
-	} 
-	outputFile << "# NAMD rigid body output file" << std::endl;
-	printLegend(outputFile);
-	printData(realstep,outputFile);
-	if (!outputFile) {
-	    char err_msg[257];
-	    sprintf(err_msg, "Error writing rigid body output file %s",fname);
-	    NAMD_err(err_msg);
-	} 
-    }
-
-    //  Close trajectory file
-    if (step == END_OF_RUN) {
-	if ( trajFile.rdbuf()->is_open() ) {
-	    trajFile.close();
-	    iout << "CLOSING RIGID BODY TRAJECTORY FILE\n" << endi;
+/* 	printf("    Constructing RB force pair...\n"); */
+/* 	allocateMem(); */
+/* 	printf("    Done constructing RB force pair\n"); */
+
+/* } */
+void RigidBodyForcePair::initialize() {
+	printf("    Initializing (memory for) RB force pair...\n");
+
+	const int numGrids = gridKeyId1.size();
+	// RBTODO assert gridKeysIds are same size 
+
+	// allocate memory for forces/torques
+	for (int i = 0; i < numGrids; i++) {
+		const int k1 = gridKeyId1[i];
+		const int sz = type1->rawDensityGrids[k1].getSize();
+		const int nb = sz / numThreads + ((sz % numThreads == 0) ? 0:1 );
+
+		numBlocks.push_back(nb);
+		forces.push_back( new Vector3[nb] );
+		torques.push_back( new Vector3[nb] );
+
+		forces_d.push_back( new Vector3[nb] ); // RBTODO: correct?
+		torques_d.push_back( new Vector3[nb] );
+
+		// allocate device memory for numBlocks of torque, etc.
+    // printf("      Allocating device memory for forces/torques\n");
+		gpuErrchk(cudaMalloc(&(forces_d[i]), sizeof(Vector3) * nb));
+		gpuErrchk(cudaMalloc(&(torques_d[i]), sizeof(Vector3) * nb));
 	}
-    }
+	gpuErrchk(cudaDeviceSynchronize());
+	// printf("    Done initializing RB force pair\n");
 
 }
-void RigidBodyController::printLegend(std::ofstream &file) {
-        file << "#$LABELS step RigidBodyKey"
-		 << " posX  posY  posZ"
-		 << " rotXX rotXY rotXZ"
-		 << " rotYX rotYY rotYZ"
-		 << " rotZX rotZY rotZZ"
-		 << " velX  velY  velZ"
-		 << " angVelX angVelY angVelZ" << std::endl;
-}
-void RigidBodyController::printData(int step,std::ofstream &file) {
-    iout << "WRITING RIGID BODY COORDINATES AT STEP "<< step << "\n" << endi;
-    for (int i; i < rigidBodyList.size(); i++) {
-	Vector v =  rigidBodyList[i]->getPosition();
-	Tensor t =  rigidBodyList[i]->getOrientation();
-	file << step <<" "<< rigidBodyList[i]->getKey()
-		 <<" "<< v.x <<" "<< v.y <<" "<< v.z;
-	file <<" "<< t.xx <<" "<< t.xy <<" "<< t.xz
-		 <<" "<< t.yx <<" "<< t.yy <<" "<< t.yz
-		 <<" "<< t.zx <<" "<< t.zy <<" "<< t.zz;
-	v = rigidBodyList[i]->getVelocity();
-	file <<" "<< v.x <<" "<< v.y <<" "<< v.z;
-	v = rigidBodyList[i]->getAngularVelocity();
-	file <<" "<< v.x <<" "<< v.y <<" "<< v.z
-		 << std::endl;
-    }
+
+/* RigidBodyForcePair::RigidBodyForcePair(const RigidBodyForcePair& orig ) : */
+	
+/* } */
+
+void RigidBodyForcePair::swap(RigidBodyForcePair& a, RigidBodyForcePair& b) {
+	using std::swap;
+	swap(a.type1, b.type1);
+	swap(a.type2, b.type2);
+	swap(a.rb1, b.rb1);
+	swap(a.rb2, b.rb2);
+
+	swap(a.gridKeyId1, b.gridKeyId1);
+	swap(a.gridKeyId2, b.gridKeyId2);
+
+	swap(a.numBlocks, b.numBlocks);
+
+	swap(a.forces,    b.forces);
+	swap(a.forces_d,  b.forces_d);
+	swap(a.torques,   b.torques);
+	swap(a.torques_d, b.torques_d);
 }
 
-void RigidBodyController::integrate(int step) {
-    DebugM(3, "RBC::integrate: step  "<< step << "\n" << endi);
-    
-    DebugM(1, "RBC::integrate: Waiting for grid reduction\n" << endi);
-    gridReduction->require();
-  
-    const Molecule * mol = Node::Object()->molecule;
-
-    // pass reduction force and torque to each grid
-    // DebugM(3, "Summing forces on rigid bodies" << "\n" << endi);
-    for (int i=0; i < mol->rbReductionIdToRigidBody.size(); i++) {
-	Force f;
-	Force t;
-	for (int k = 0; k < 3; k++) {
-	    f[k] = gridReduction->item(6*i + k);
-	    t[k] = gridReduction->item(6*i + k + 3);
-	}
 
-	if (step % 100 == 1)
-	    DebugM(4, "RBC::integrate: reduction/rb " << i <<":"
-		   << "\n\tposition: "
-		   << rigidBodyList[mol->rbReductionIdToRigidBody[i]]->getPosition()
-		   <<"\n\torientation: "
-		   << rigidBodyList[mol->rbReductionIdToRigidBody[i]]->getOrientation()
-		   << "\n" << endi);
-
-	DebugM(4, "RBC::integrate: reduction/rb " << i <<": "
-	       << "force " << f <<": "<< "torque: " << t << "\n" << endi);
-	rigidBodyList[mol->rbReductionIdToRigidBody[i]]->addForce(f);
-	rigidBodyList[mol->rbReductionIdToRigidBody[i]]->addTorque(t);
-    }
-    
-    // Langevin 
-    for (int i=0; i<rigidBodyList.size(); i++) {
-	// continue;  // debug
-	if (rigidBodyList[i]->langevin) {
-	    DebugM(1, "RBC::integrate: reduction/rb " << i
-		   <<": calling langevin" << "\n" << endi);
-	    rigidBodyList[i]->addLangevin(
-		random->gaussian_vector(), random->gaussian_vector() );
-	    // DebugM(4, "RBC::integrate: reduction/rb " << i
-	    // 	   << " after langevin: force " << rigidBodyList[i]f <<": "<< "torque: " << t << "\n" << endi);
-	    // <<": calling langevin" << "\n" << endi);
-	}
-    }
-    
-    if ( step >= 0 && simParams->rigidBodyOutputFrequency &&
-	 (step % simParams->rigidBodyOutputFrequency) == 0 ) {
-	DebugM(1, "RBC::integrate:integrating for before printing output" << "\n" << endi);
-	// PRINT
-	if ( step == simParams->firstTimestep ) {
-	    print(step);
-	    // first step so only start this cycle
-	    for (int i=0; i<rigidBodyList.size(); i++)  {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": starting integration cycle of step "
-		       << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],0);
-	    }
-	} else {
-	    // finish last cycle
-	    // DebugM(1, "RBC::integrate: reduction/rb " << i
-	    // 	   <<": firststep: calling rb->integrate" << "\n" << endi);
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": finishing integration cycle of step "
-		       << step-1 << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],1);
-	    }
-	    print(step);
-	    // start this cycle
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": starting integration cycle of step "
-		       << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],0);
-	    }
-	}
-    } else {
-	DebugM(1, "RBC::integrate: trans[0] before: " << trans[0] << "\n" << endi);
-	if ( step == simParams->firstTimestep ) {
-	    // integrate the start of this cycle
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": starting integration cycle of (first) step "
-		       << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],0);
-	    }
-	} else {
-	    // integrate end of last ts and start of this one 
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		   <<": ending / starting integration cycle of step "
-		   << step-1 << "-" << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],2);
-	    }
+RigidBodyForcePair::~RigidBodyForcePair() {
+	printf("    Destructing RB force pair\n");
+	const int numGrids = gridKeyId1.size();
+
+	// printf("      numGrids = %d\n",numGrids);
+
+	// RBTODO assert gridKeysIds are same size 
+
+	// allocate memory for forces/torques
+	for (int i = 0; i < numGrids; i++) {
+		const int k1 = gridKeyId1[i];
+		const int nb = numBlocks[i];
+
+		// free device memory for numBlocks of torque, etc.
+		// printf("      Freeing device memory for forces/torques\n");
+		gpuErrchk(cudaFree( forces_d[i] ));	
+		gpuErrchk(cudaFree( torques_d[i] ));
 	}
-	DebugM(1, "RBC::integrate: trans[0] after: " << trans[0] << "\n" << endi);
-    }
-    
-    DebugM(3, "sendRigidBodyUpdate on step: " << step << "\n" << endi);
-
-    if (trans.size() != rot.size())
-	NAMD_die("failed sanity check\n");    
-    RigidBodyMsg *msg = new RigidBodyMsg;
-    msg->trans.copy(trans);	// perhaps .swap() would cause problems
-    msg->rot.copy(rot);
-    computeMgr->sendRigidBodyUpdate(msg);
+	gpuErrchk(cudaDeviceSynchronize());
+	
+	numBlocks.clear();
+	forces.clear();
+	forces_d.clear();
+	torques.clear();
+	torques_d.clear();
 }
 
 
-RigidBodyParams* RigidBodyParamsList::find_key(const char* key)
-{
-    RBElem* cur = head;
-    RBElem* found = NULL;
-    RigidBodyParams* result = NULL;
-    
-    while (found == NULL && cur != NULL) {
-       if (!strcasecmp((cur->elem).rigidBodyKey,key)) {
-        found = cur;
-      } else {
-        cur = cur->nxt;
-      }
-    }
-    if (found != NULL) {
-      result = &(found->elem);
-    }
-    return result;
-}
-  
-int RigidBodyParamsList::index_for_key(const char* key)
-{
-    RBElem* cur = head;
-    RBElem* found = NULL;
-    int result = -1;
-    
-    int idx = 0;
-    while (found == NULL && cur != NULL) {
-       if (!strcasecmp((cur->elem).rigidBodyKey,key)) {
-        found = cur;
-      } else {
-        cur = cur->nxt;
-	idx++;
-      }
-    }
-    if (found != NULL) {
-	result = idx;
-    }
-    return result;
-}
-  
-RigidBodyParams* RigidBodyParamsList::add(const char* key) 
-{
-    // If the key is already in the list, we can't add it
-    if (find_key(key)!=NULL) {
-      return NULL;
-    }
-    
-    RBElem* new_elem = new RBElem();
-    int len = strlen(key);
-    RigidBodyParams* elem = &(new_elem->elem);
-    elem->rigidBodyKey = new char[len+1];
-    strncpy(elem->rigidBodyKey,key,len+1);
-    elem->mass = NULL;
-    elem->inertia = Vector(NULL);
-    elem->langevin = NULL;
-    elem->temperature = NULL;
-    elem->transDampingCoeff = Vector(NULL);
-    elem->rotDampingCoeff = Vector(NULL);
-    elem->gridList.clear();
-    elem->position = Vector(NULL);
-    elem->velocity = Vector(NULL);
-    elem->orientation = Tensor();
-    elem->orientationalVelocity = Vector(NULL);
-    
-    elem->next = NULL;
-    new_elem->nxt = NULL;
-    if (head == NULL) {
-      head = new_elem;
-    }
-    if (tail != NULL) {
-      tail->nxt = new_elem;
-      tail->elem.next = elem;
-    }
-    tail = new_elem;
-    n_elements++;
-    
-    return elem;
-}  
-
-*/
+
diff --git a/RigidBodyController.h b/RigidBodyController.h
index 528741008b1dc49257c9363631bcaed99396a58e..c6e162d750f87911eb3efa832fd70416f299c205 100644
--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -6,8 +6,55 @@
 
 class Configuration;
 
-struct gridInteractionList {
+class RigidBodyForcePair  {
+	friend class RigidBodyController;
+
+public:
+	RigidBodyForcePair(RigidBodyType* t1, RigidBodyType* t2,
+										 RigidBody* rb1, RigidBody* rb2,
+										 std::vector<int> gridKeyId1, std::vector<int> gridKeyId2) :
+		type1(t1), type2(t2), rb1(rb1), rb2(rb2),
+		gridKeyId1(gridKeyId1), gridKeyId2(gridKeyId2)
+		{
+			printf("    Constructing RB force pair...\n");
+			initialize();
+			// printf("    done constructing RB force pair\n");
+		}
+	RigidBodyForcePair(const RigidBodyForcePair& o) :
+		type1(o.type1), type2(o.type2), rb1(o.rb1), rb2(o.rb2),
+		gridKeyId1(o.gridKeyId1), gridKeyId2(o.gridKeyId2) {
+		printf("    Copying RB force pair...\n");
+		initialize();
+	}
+	RigidBodyForcePair& operator=(RigidBodyForcePair& o) {
+		printf("    Copying assigning RB force pair...\n");
+		swap(*this,o);
+		return *this;
+	}
+	
+	~RigidBodyForcePair();
+
+private:
+	void initialize();
+	void swap(RigidBodyForcePair& a, RigidBodyForcePair& b);
+	
+	static const int numThreads = 256;
+
+	RigidBodyType* type1;
+	RigidBodyType* type2;
+	RigidBody* rb1;
+	RigidBody* rb2;
+	
+	std::vector<int> gridKeyId1;
+	std::vector<int> gridKeyId2;
+	std::vector<int> numBlocks;
+
+	std::vector<Vector3*> forces;
+	std::vector<Vector3*> forces_d;
+	std::vector<Vector3*> torques;
+	std::vector<Vector3*> torques_d;
 	
+	void updateForces();
 };
 
 class RigidBodyController {
@@ -17,13 +64,15 @@ public:
   ~RigidBodyController();
 	RigidBodyController(const Configuration& c);
 
-	DEVICE void integrate(int step);
+	void integrate(int step);
 	DEVICE void print(int step);
+
+	void updateForces();
     
 private:
 	void copyGridsToDevice();
-	void updateForces();
-
+	void initializeForcePairs();
+	
 	/* void printLegend(std::ofstream &file); */
 	/* void printData(int step, std::ofstream &file); */
 public:
@@ -41,5 +90,5 @@ private:
 	Matrix3* rot;  	// there are errors on rigidBody->integrate
 	std::vector< std::vector<RigidBody> > rigidBodyByType;
 	
-	
+	std::vector< RigidBodyForcePair > forcePairs;
 };
diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu
index 7075071b332bcb31a6e203c5e6ea1df2aa9b573a..c43fb364b3fb6c0ee33b35dbb3817a22a92159ad 100644
--- a/RigidBodyGrid.cu
+++ b/RigidBodyGrid.cu
@@ -319,6 +319,315 @@ float RigidBodyGrid::mean() const {
 // 	return val[j];
 // }
 
+	// RBTODO overload with optimized algorithm
+	//  skip transforms (assume identity basis)
+HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const {
+	// Find the home node.
+	// Vector3 l = basisInv.transform(pos - origin);
+	int homeX = int(floor(l.x));
+	int homeY = int(floor(l.y));
+	int homeZ = int(floor(l.z));
+
+	// out of grid? return 0
+	// RBTODO
+			
+	// Get the array jumps.
+	int jump[3];
+	jump[0] = nz*ny;
+	jump[1] = nz;
+	jump[2] = 1;
+
+	// Shift the indices in the home array.
+	int home[3];
+	home[0] = homeX;
+	home[1] = homeY;
+	home[2] = homeZ;
+
+	// Get the grid dimensions.
+	int g[3];
+	g[0] = nx;
+	g[1] = ny;
+	g[2] = nz;
+  
+	// Get the interpolation coordinates.
+	float w[3];
+	w[0] = l.x - homeX;
+	w[1] = l.y - homeY;
+	w[2] = l.z - homeZ;
+
+	// Find the values at the neighbors.
+	float g1[4][4][4];
+	for (int ix = 0; ix < 4; ix++) {
+		int jx = ix-1 + home[0];
+		jx = wrap(jx, g[0]);
+		for (int iy = 0; iy < 4; iy++) {
+			int jy = iy-1 + home[1];
+			jy = wrap(jy, g[1]);
+			for (int iz = 0; iz < 4; iz++) {
+				// Wrap around the periodic boundaries. 
+				int jz = iz-1 + home[2];
+				jz = wrap(jz, g[2]);
+	  
+				int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+				g1[ix][iy][iz] = val[ind];
+			}
+		}
+	}
+	float a0, a1, a2, a3;
+  
+	// Mix along x.
+	float g2[4][4];
+	for (int iy = 0; iy < 4; iy++) {
+		for (int iz = 0; iz < 4; iz++) {
+			a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+			a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+			a0 = g1[1][iy][iz];
+
+			g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+		}
+	}
+
+	// Mix along y.
+	float g3[4];
+	for (int iz = 0; iz < 4; iz++) {
+		a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+		a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+		a0 = g2[1][iz];
+   
+		g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+	}
+
+	// Mix along z.
+	a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+	a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+	a1 = 0.5f*(-g3[0] + g3[2]);
+	a0 = g3[1];
+
+	return a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0;
+}
+
+/** interpolateForce() to be used on CUDA Device **/
+HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(Vector3 l) const {
+	Vector3 f;
+	// Vector3 l = basisInv.transform(pos - origin);
+	int homeX = int(floor(l.x));
+	int homeY = int(floor(l.y));
+	int homeZ = int(floor(l.z));
+	// Get the array jumps with shifted indices.
+	int jump[3];
+	jump[0] = nz*ny;
+	jump[1] = nz;
+	jump[2] = 1;
+	// Shift the indices in the home array.
+	int home[3];
+	home[0] = homeX;
+	home[1] = homeY;
+	home[2] = homeZ;
+
+	// Shift the indices in the grid dimensions.
+	int g[3];
+	g[0] = nx;
+	g[1] = ny;
+	g[2] = nz;
+
+	// Get the interpolation coordinates.
+	float w[3];
+	w[0] = l.x - homeX;
+	w[1] = l.y - homeY;
+	w[2] = l.z - homeZ;
+	// Find the values at the neighbors.
+	float g1[4][4][4];						/* neighborhood */
+	for (int ix = 0; ix < 4; ix++) {
+		for (int iy = 0; iy < 4; iy++) {
+			for (int iz = 0; iz < 4; iz++) {
+				// RBTODO: probably don't wrap!
+				// Wrap around the periodic boundaries. 
+				int jx = ix-1 + home[0];
+				jx = wrap(jx, g[0]);
+				int jy = iy-1 + home[1];
+				jy = wrap(jy, g[1]);
+				int jz = iz-1 + home[2];
+				jz = wrap(jz, g[2]);
+				int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+				g1[ix][iy][iz] = val[ind];
+			}
+		}
+	}  
+	f.x = interpolateDiffX(w, g1);
+	f.y = interpolateDiffY(w, g1);
+	f.z = interpolateDiffZ(w, g1);
+	// Vector3 f1 = basisInv.transpose().transform(f);
+	// return f1;
+	return f;
+}
+
+/* inline virtual Vector3 interpolateForce(Vector3 pos) const { */
+/* 	Vector3 f; */
+/* 	Vector3 l = basisInv.transform(pos - origin); */
+/* 	int homeX = int(floor(l.x)); */
+/* 	int homeY = int(floor(l.y)); */
+/* 	int homeZ = int(floor(l.z)); */
+/* 	// Get the array jumps with shifted indices. */
+/* 	int jump[3]; */
+/* 	jump[0] = nz*ny; */
+/* 	jump[1] = nz; */
+/* 	jump[2] = 1; */
+/* 	// Shift the indices in the home array. */
+/* 	int home[3]; */
+/* 	home[0] = homeX; */
+/* 	home[1] = homeY; */
+/* 	home[2] = homeZ; */
+
+/* 	// Shift the indices in the grid dimensions. */
+/* 	int g[3]; */
+/* 	g[0] = nx; */
+/* 	g[1] = ny; */
+/* 	g[2] = nz; */
+
+/* 	// Get the interpolation coordinates. */
+/* 	float w[3]; */
+/* 	w[0] = l.x - homeX; */
+/* 	w[1] = l.y - homeY; */
+/* 	w[2] = l.z - homeZ; */
+/* 	// Find the values at the neighbors. */
+/* 	float g1[4][4][4]; */
+/* 	//RBTODO parallelize? */
+/* 	for (int ix = 0; ix < 4; ix++) { */
+/* 		for (int iy = 0; iy < 4; iy++) { */
+/* 			for (int iz = 0; iz < 4; iz++) { */
+/* 				// Wrap around the periodic boundaries.  */
+/* 				int jx = ix-1 + home[0]; */
+/* 				jx = wrap(jx, g[0]); */
+/* 				int jy = iy-1 + home[1]; */
+/* 				jy = wrap(jy, g[1]); */
+/* 				int jz = iz-1 + home[2]; */
+/* 				jz = wrap(jz, g[2]); */
+/* 				int ind = jz*jump[2] + jy*jump[1] + jx*jump[0]; */
+/* 				g1[ix][iy][iz] = val[ind]; */
+/* 			} */
+/* 		} */
+/* 	}   */
+/* 	f.x = interpolateDiffX(pos, w, g1); */
+/* 	f.y = interpolateDiffY(pos, w, g1); */
+/* 	f.z = interpolateDiffZ(pos, w, g1); */
+/* 	Vector3 f1 = basisInv.transpose().transform(f); */
+/* 	return f1; */
+/* } */
+
+HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4][4][4]) const {
+	float a0, a1, a2, a3;
+	// RBTODO further parallelize loops? unlikely?
+		
+	// Mix along x, taking the derivative.
+	float g2[4][4];
+	for (int iy = 0; iy < 4; iy++) {
+		for (int iz = 0; iz < 4; iz++) {
+			a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+			a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+			a0 = g1[1][iy][iz];
+
+			//g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+			g2[iy][iz] = 3.0f*a3*w[0]*w[0] + 2.0f*a2*w[0] + a1;
+		}
+	}
+
+	// Mix along y.
+	float g3[4];
+	for (int iz = 0; iz < 4; iz++) {
+		a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+		a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+		a0 = g2[1][iz];
+   
+		g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+	}
+
+	// Mix along z.
+	a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+	a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+	a1 = 0.5f*(-g3[0] + g3[2]);
+	a0 = g3[1];
+ 
+	float retval = -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
+	return retval;
+}
+
+HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4][4][4]) const {
+	float a0, a1, a2, a3;
+  
+	// Mix along x, taking the derivative.
+	float g2[4][4];
+	for (int iy = 0; iy < 4; iy++) {
+		for (int iz = 0; iz < 4; iz++) {
+			a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+			a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+			a0 = g1[1][iy][iz];
+
+			g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+		}
+	}
+
+	// Mix along y.
+	float g3[4];
+	for (int iz = 0; iz < 4; iz++) {
+		a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+		a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+		a0 = g2[1][iz];
+   
+		//g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+		g3[iz] = 3.0f*a3*w[1]*w[1] + 2.0f*a2*w[1] + a1;
+	}
+
+	// Mix along z.
+	a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+	a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+	a1 = 0.5f*(-g3[0] + g3[2]);
+	a0 = g3[1];
+
+	return -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
+}
+
+HOST DEVICE inline float  RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4][4][4]) const {
+	float a0, a1, a2, a3;
+  
+	// Mix along x, taking the derivative.
+	float g2[4][4];
+	for (int iy = 0; iy < 4; iy++) {
+		for (int iz = 0; iz < 4; iz++) {
+			a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+			a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+			a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+			a0 = g1[1][iy][iz];
+
+			g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+		}
+	}
+
+	// Mix along y.
+	float g3[4];
+	for (int iz = 0; iz < 4; iz++) {
+		a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+		a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+		a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+		a0 = g2[1][iz];
+   
+		g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+	}
+
+	// Mix along z.
+	a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+	a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+	a1 = 0.5f*(-g3[0] + g3[2]);
+	a0 = g3[1];
+
+	return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1);
+}
+
 
 // Added by Rogan for times when simpler calculations are required.
 float RigidBodyGrid::interpolatePotentialLinearly(Vector3 pos) const {
diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h
index 1aa49c1da08b91d69f10b9106ebbc2c33f08e9fa..b456af80c6673d02d783da97a0c0cbdb58a79cac 100644
--- a/RigidBodyGrid.h
+++ b/RigidBodyGrid.h
@@ -142,205 +142,11 @@ public:
 	// Added by Rogan for times when simpler calculations are required.
   virtual float interpolatePotentialLinearly(Vector3 pos) const;
 
-	HOST DEVICE inline float interpolateDiffX(Vector3 pos, float w[3], float g1[4][4][4]) const {
-    float a0, a1, a2, a3;
-		
-		// RBTODO further parallelize loops? unlikely?
-		
-    // Mix along x, taking the derivative.
-    float g2[4][4];
-    for (int iy = 0; iy < 4; iy++) {
-      for (int iz = 0; iz < 4; iz++) {
-				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
-				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
-				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
-				a0 = g1[1][iy][iz];
-
-				//g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
-				g2[iy][iz] = 3.0f*a3*w[0]*w[0] + 2.0f*a2*w[0] + a1;
-      }
-    }
-
-
-    // Mix along y.
-    float g3[4];
-    for (int iz = 0; iz < 4; iz++) {
-      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
-      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
-      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
-      a0 = g2[1][iz];
-   
-      g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
-    }
-
-    // Mix along z.
-    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
-    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
-    a1 = 0.5f*(-g3[0] + g3[2]);
-    a0 = g3[1];
- 
-    return -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
-  }
-
-  HOST DEVICE inline float interpolateDiffY(Vector3 pos, float w[3], float g1[4][4][4]) const {
-    float a0, a1, a2, a3;
-  
-    // Mix along x, taking the derivative.
-    float g2[4][4];
-    for (int iy = 0; iy < 4; iy++) {
-      for (int iz = 0; iz < 4; iz++) {
-				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
-				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
-				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
-				a0 = g1[1][iy][iz];
-
-				g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
-      }
-    }
-
-    // Mix along y.
-    float g3[4];
-    for (int iz = 0; iz < 4; iz++) {
-      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
-      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
-      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
-      a0 = g2[1][iz];
-   
-      //g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
-      g3[iz] = 3.0f*a3*w[1]*w[1] + 2.0f*a2*w[1] + a1;
-    }
-
-    // Mix along z.
-    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
-    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
-    a1 = 0.5f*(-g3[0] + g3[2]);
-    a0 = g3[1];
-
-    return -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
-  }
-
-  HOST DEVICE inline float interpolateDiffZ(Vector3 pos, float w[3], float g1[4][4][4]) const {
-    float a0, a1, a2, a3;
-  
-    // Mix along x, taking the derivative.
-    float g2[4][4];
-    for (int iy = 0; iy < 4; iy++) {
-      for (int iz = 0; iz < 4; iz++) {
-				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
-				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
-				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
-				a0 = g1[1][iy][iz];
-
-				g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
-      }
-    }
-
-    // Mix along y.
-    float g3[4];
-    for (int iz = 0; iz < 4; iz++) {
-      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
-      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
-      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
-      a0 = g2[1][iz];
-   
-      g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
-    }
-
-    // Mix along z.
-    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
-    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
-    a1 = 0.5f*(-g3[0] + g3[2]);
-    a0 = g3[1];
-
-    return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1);
-  }
+	HOST DEVICE float interpolateDiffX(float w[3], float g1[4][4][4]) const;
+  HOST DEVICE float interpolateDiffY(float w[3], float g1[4][4][4]) const;
+	HOST DEVICE float interpolateDiffZ(float w[3], float g1[4][4][4]) const;
 
-  HOST DEVICE inline float interpolatePotential(Vector3 pos) const {
-    // Find the home node.
-    Vector3 l = pos;
-    int homeX = int(floor(l.x));
-    int homeY = int(floor(l.y));
-    int homeZ = int(floor(l.z));
-
-		// out of grid? return 0
-		// RBTODO
-			
-    // Get the array jumps.
-    int jump[3];
-    jump[0] = nz*ny;
-    jump[1] = nz;
-    jump[2] = 1;
-
-    // Shift the indices in the home array.
-    int home[3];
-    home[0] = homeX;
-    home[1] = homeY;
-    home[2] = homeZ;
-
-    // Get the grid dimensions.
-    int g[3];
-    g[0] = nx;
-    g[1] = ny;
-    g[2] = nz;
-  
-    // Get the interpolation coordinates.
-    float w[3];
-    w[0] = l.x - homeX;
-    w[1] = l.y - homeY;
-    w[2] = l.z - homeZ;
-
-    // Find the values at the neighbors.
-    float g1[4][4][4];
-    for (int ix = 0; ix < 4; ix++) {
-	  	int jx = ix-1 + home[0];
-      for (int iy = 0; iy < 4; iy++) {
-	  		int jy = iy-1 + home[1];
-				for (int iz = 0; iz < 4; iz++) {
-	  			int jz = iz-1 + home[2];
-					if (jx <  0  ||  jy < 0  ||  jz < 0  ||
-							jx >= nx || jz >= nz || jz >= nz) {
-						g1[ix][iy][iz] = 0;
-					} else {
-						int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
-						g1[ix][iy][iz] = val[ind];
-					}
-				}
-      }
-    }
-    float a0, a1, a2, a3;
-  
-    // Mix along x.
-    float g2[4][4];
-    for (int iy = 0; iy < 4; iy++) {
-      for (int iz = 0; iz < 4; iz++) {
-				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
-				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
-				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
-				a0 = g1[1][iy][iz];
-
-				g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
-      }
-    }
-
-    // Mix along y.
-    float g3[4];
-    for (int iz = 0; iz < 4; iz++) {
-      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
-      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
-      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
-      a0 = g2[1][iz];
-   
-      g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
-    }
-
-    // Mix along z.
-    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
-    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
-    a1 = 0.5f*(-g3[0] + g3[2]);
-    a0 = g3[1];
-
-    return a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0;
-  }
+  HOST DEVICE float interpolatePotential(Vector3 l) const;
 
   HOST DEVICE inline static int wrap(int i, int n) {
 		if (i < 0) {
@@ -355,61 +161,7 @@ public:
 	}
 
 	/** interpolateForce() to be used on CUDA Device **/
-	DEVICE inline Vector3 interpolateForceD(Vector3 pos) const {
-		Vector3 f;
- 		Vector3 l = pos;
-		int homeX = int(floor(l.x));
-		int homeY = int(floor(l.y));
-		int homeZ = int(floor(l.z));
-		// Get the array jumps with shifted indices.
-		int jump[3];
-		jump[0] = nz*ny;
-		jump[1] = nz;
-		jump[2] = 1;
-		// Shift the indices in the home array.
-		int home[3];
-		home[0] = homeX;
-		home[1] = homeY;
-		home[2] = homeZ;
-
-		// Shift the indices in the grid dimensions.
-		int g[3];
-		g[0] = nx;
-		g[1] = ny;
-		g[2] = nz;
-
-		// Get the interpolation coordinates.
-		float w[3];
-		w[0] = l.x - homeX;
-		w[1] = l.y - homeY;
-		w[2] = l.z - homeZ;
-		// Find the values at the neighbors.
-		float g1[4][4][4];					/* RBTODO: inefficient for my algorithm? */
-		for (int ix = 0; ix < 4; ix++) {
-			int jx = ix-1 + home[0];
-			for (int iy = 0; iy < 4; iy++) {
-				int jy = iy-1 + home[1];
-				for (int iz = 0; iz < 4; iz++) {
-	  			// Assume zero value at edges
-					int jz = iz-1 + home[2];
-					// RBTODO: possible branch divergence in warp?
-					if (jx <  0  ||  jy < 0  ||  jz < 0  ||
-							jx >= nx || jz >= nz || jz >= nz) {
-						g1[ix][iy][iz] = 0;
-					} else {
-						int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
-						g1[ix][iy][iz] = val[ind];
-					}
-				}
-			}
-		}  
-		f.x = interpolateDiffX(pos, w, g1);
-		f.y = interpolateDiffY(pos, w, g1);
-		f.z = interpolateDiffZ(pos, w, g1);
-		// Vector3 f1 = basisInv.transpose().transform(f);
-		// return f1;
-		return f;
-	}
+	HOST DEVICE Vector3 interpolateForceD(Vector3 l) const;
 
   inline virtual Vector3 interpolateForce(Vector3 pos) const {
 		Vector3 f;
@@ -457,9 +209,9 @@ public:
 				}
 			}
 		}  
-		f.x = interpolateDiffX(pos, w, g1);
-		f.y = interpolateDiffY(pos, w, g1);
-		f.z = interpolateDiffZ(pos, w, g1);
+		f.x = interpolateDiffX( w, g1);
+		f.y = interpolateDiffY( w, g1);
+		f.z = interpolateDiffZ( w, g1);
 		// Vector3 f1 = basisInv.transpose().transform(f);
 		// return f1;
 		return f;
@@ -482,7 +234,7 @@ public:
   }
 
   // Wrap vector, 0 <= x < lx  &&  0 <= y < ly  &&  0 <= z < lz
-  HOST DEVICE inline virtual Vector3 wrap(Vector3 l) const {
+  HOST DEVICE inline Vector3 wrap(Vector3 l) const {
     l.x = wrapFloat(l.x, nx);
     l.y = wrapFloat(l.y, ny);
     l.z = wrapFloat(l.z, nz);
diff --git a/makefile b/makefile
index 6f9a87576267ceea2020f279fd54a7ba0c9699b5..942255dcb3d6aa6a51a6d511ca1c59cbf22349fb 100644
--- a/makefile
+++ b/makefile
@@ -12,7 +12,7 @@ INCLUDE = $(CUDA_PATH)/include
 
 DEBUG = -g
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
-NV_FLAGS = $(DEBUG)
+NV_FLAGS = -g -G								#debug
 EX_FLAGS = -O3 -m$(OS_SIZE)
 
 ifneq ($(MAVERICKS),)
diff --git a/notes.org b/notes.org
index 25d583185188dfd1fb34d5546d133a0fd4ca0482..28b02e96b2e1f176e60d6a094632ee7d275448dd 100644
--- a/notes.org
+++ b/notes.org
@@ -1,6 +1,6 @@
 * TODO active
 ** make rigid body object device pointer
-
+** branch rb-device: have RBType objects hold device pointers for raw RigidBodyGrids
 
 
 * organization
diff --git a/useful.cu b/useful.cu
index 443a3544a9b762e51dc88802457347eb79f7b404..01de12fd8a05800d247fc8aba23b93bdb0d9c8b3 100644
--- a/useful.cu
+++ b/useful.cu
@@ -35,10 +35,13 @@ int firstSpace(const char* s, int max) {
 
 // A classic growable string class.
 
-void String::print() {
+void String::print() const {
+	printInline();
+	printf("\n");
+}
+void String::printInline() const {
 	for (int i = 0; i < len; i++)
 		printf("%c", c[i]);
-	printf("\n");
 }
 
 String& String::operator=(const String& s) {
diff --git a/useful.h b/useful.h
index 327cd7a7d9f0ab4962d9baa0c28613a7982635f9..19b88dc09f03232e5d97fa7a5d3d09b5192c9b47 100644
--- a/useful.h
+++ b/useful.h
@@ -73,7 +73,8 @@ public:
 	void add(char c0);
 	void add(const char* s);
 	void add(String& s);
-	void print();
+	void printInline() const;
+	void print() const;
 	int length() const;
 
 	// Negative indices go from the end.