diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index 82bc0ea22f3298f2a326f76f0088e82f7a930861..f78b705de7233f3609aa7cb602be14d7211a0047 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -1,40 +1,46 @@
 // Included in RigidBodyController.cu
 #pragma once
 
+
 __global__
 void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
-													const Matrix3 basis_rho, const Vector3 origin_rho,
-													const Matrix3 basis_u_inv, const Vector3 origin_u,
-													Vector3 * retForce, Vector3 * retTorque, int gridNum) {
+													const Matrix3 basis_rho, const Matrix3 basis_u_inv,
+													const Vector3 origin_rho_minus_origin_u,
+													Vector3 * retForce, Vector3 * retTorque) {
 
-	__shared__ Vector3 force [NUMTHREADS];
-	__shared__ Vector3 torque [NUMTHREADS];
-	
-  // RBTODO http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+	extern __shared__ Vector3 s[];
+	Vector3 *force = s;
+	Vector3 *torque = &s[NUMTHREADS];
+
+  // RBTODO: http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
 	const int tid = threadIdx.x;
 	const int r_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-	if (r_id >= rho->getSize()) {				// skip threads with no data
+	if (r_id >= rho->getSize()) { // skip threads with nothing to do
 		force[tid] = Vector3(0.0f);
 		torque[tid] = Vector3(0.0f);
 		return;
 	}
 
-	// RBTODO: reduce registers used; commenting out interpolatePotential / interpolateForceD still uses ~40 registers, otherwise 236!!!
-	// RBTODO: maybe organize data into compact regions for grid data for better use of shared memory...
+	// RBTODO: reduce registers used;
+	//   commenting out interpolateForceD still uses ~40 registers
+	//   -- the innocuous-looking fn below is responsible; consumes ~17 registers!
+	Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */
 
-	Vector3 r_pos = rho->getPosition(r_id); /* i,j,k value of voxel */
-	r_pos = basis_rho.transform( r_pos ) + origin_rho; /* real space */
-	const Vector3 u_ijk_float = basis_u_inv.transform( r_pos - origin_u );
+	r_pos = basis_rho.transform( r_pos ) + origin_rho_minus_origin_u; /* real space */
+	const Vector3 u_ijk_float = basis_u_inv.transform( r_pos );
 
 	// RBTODO What about non-unit delta?
+	/* Vector3 tmpf  = Vector3(0.0f); */
+	/* float tmpe = 0.0f; */
+	/* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */
 	const ForceEnergy fe = u->interpolateForceD( u_ijk_float ); /* in coord frame of u */
 	force[tid] = fe.f;
 
 	const float r_val = rho->val[r_id]; /* maybe move to beginning of function?  */
 	force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */
 
-	// Calculate torque about lab-frame origin 
+	// Calculate torque about origin_u in the lab frame
 	torque[tid] = r_pos.cross(force[tid]);				// RBTODO: test if sign is correct!
 	
 
diff --git a/RigidBodyController.cu b/RigidBodyController.cu
index ffadd63a078864bd46d9fa82a2229043856a9c82..6e9c449ad670d28356f4df909b660b114dd9d19f 100644
--- a/RigidBodyController.cu
+++ b/RigidBodyController.cu
@@ -278,6 +278,28 @@ void RigidBodyForcePair::createStreams() {
 		gpuErrchk( cudaStreamCreate( &(stream[i]) ) );
 		// gpuErrchk( cudaStreamCreateWithFlags( &(stream[i]) , cudaStreamNonBlocking ) );
 }
+Vector3 RigidBodyForcePair::getOrigin1(const int i) {
+	const int k1 = gridKeyId1[i];
+	return rb1->getOrientation()*type1->densityGrids[k1].getOrigin() + rb1->getPosition();
+}
+Vector3 RigidBodyForcePair::getOrigin2(const int i) {
+	const int k2 = gridKeyId2[i];
+	if (!isPmf)
+		return rb2->getOrientation()*type2->potentialGrids[k2].getOrigin() + rb2->getPosition();
+	else
+		return type2->rawPmfs[k2].getOrigin();
+}		
+Matrix3 RigidBodyForcePair::getBasis1(const int i) {
+	const int k1 = gridKeyId1[i];
+	return rb1->getOrientation()*type1->densityGrids[k1].getBasis();
+}
+Matrix3 RigidBodyForcePair::getBasis2(const int i) {
+	const int k2 = gridKeyId2[i];
+	if (!isPmf)
+		return rb2->getOrientation()*type2->potentialGrids[k2].getBasis();
+	else
+		return type2->rawPmfs[k2].getBasis();
+}
 
 // RBTODO: bundle several rigidbodypair evaluations in single kernel call
 void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
@@ -294,7 +316,6 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 		const int k1 = gridKeyId1[i];
 		const int k2 = gridKeyId2[i];
 		const cudaStream_t &s = stream[streamID[i]];
-
 		/*
 			ijk: index of grid value
 			r: postion of point ijk in real space
@@ -308,39 +329,26 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 
   		/.––––––––––––––––––.
 	  	| r = R.(B.ijk+o)+c |
-	  	| r = B'.ijk + c'    |
+	  	| r = B'.ijk + c'   |
 	  	`––––––––––––––––––./
 		*/
-		Matrix3 B1 = rb1->getOrientation()*type1->densityGrids[k1].getBasis();
-		Vector3 c1 = rb1->getOrientation()*type1->densityGrids[k1].getOrigin() + rb1->getPosition();
-		
-		Matrix3 B2;
-		Vector3 c2;
+		Matrix3 B1 = getBasis1(i);
+		Vector3 c = getOrigin1(i) - getOrigin2(i);
 		
-		/* printf("  Calculating grid forces\n"); */
-		if (!isPmf) {								/* pair of RBs */
-			B2 = rb2->getOrientation()*type2->potentialGrids[k2].getBasis();
-			c2 = rb2->getOrientation()*type2->potentialGrids[k2].getOrigin() + rb2->getPosition();
-			B2 = B2.inverse();
+		Matrix3 B2 = getBasis2(i).inverse();
 
-			// RBTODO: get energy
-			computeGridGridForce<<< nb, numThreads, 0, s >>>
+		// RBTODO: get energy
+		if (!isPmf) {								/* pair of RBs */
+			computeGridGridForce<<< nb, numThreads, NUMTHREADS*2*sizeof(Vector3), s >>>
 				(type1->rawDensityGrids_d[k1], type2->rawPotentialGrids_d[k2],
-				 B1, c1, B2, c2,
-				 forces_d[i], torques_d[i], pairId+i);
+				 B1, B2, c,
+				 forces_d[i], torques_d[i]);
 		} else {										/* RB with a PMF */
-			// B2 = type2->rawPmfs[i].getBasis(); // not 100% certain k2 should be used rather than i
-			/// c2 = type2->rawPmfs[i].getOrigin();
-			B2 = type2->rawPmfs[k2].getBasis();
-			c2 = type2->rawPmfs[k2].getOrigin();
-			B2 = B2.inverse();
-
-			computeGridGridForce<<< nb, numThreads, 0, s >>>
+			computeGridGridForce<<< nb, numThreads, NUMTHREADS*2*sizeof(Vector3), s >>>
 				(type1->rawDensityGrids_d[k1], type2->rawPmfs_d[k2],
-				 B1, c1, B2, c2,
-				 forces_d[i], torques_d[i], pairId+i);
+				 B1, B2, c,
+				 forces_d[i], torques_d[i]);
 		}
-
 	}
 }
 
@@ -361,11 +369,23 @@ void RigidBodyForcePair::retrieveForces() {
 															cudaMemcpyDeviceToHost, s));
 
 		gpuErrchk(cudaStreamSynchronize( s ));
-		
+
+		Vector3 tmpF = Vector3(0.0f);
+		Vector3 tmpT = Vector3(0.0f);
+			
 		for (int j = 0; j < nb; j++) {
-			f = f + forces[i][j];
-			t = t + torques[i][j];
+			tmpF = tmpF + forces[i][j];
+			tmpT = tmpT + torques[i][j];
 		}
+		
+		// tmpT is the torque calculated about the origin of grid k2 (e.g. c2)
+		//   so here we transform torque to be about rb1
+		Vector3 c2 = getOrigin2(i);
+		tmpT = tmpT - (rb1->getPosition() - c2).cross( tmpF ); 
+
+		// sum forces and torques
+		f = f + tmpF;
+		t = t + tmpT;
 	}
 	
 	// transform torque from lab-frame origin to rb centers
@@ -375,14 +395,13 @@ void RigidBodyForcePair::retrieveForces() {
 	/* /\* printf("rb1->position: (%f,%f,%f)\n", tmp.x, tmp.y, tmp.z); *\/ */
 	/* tmp = rb1->getPosition(); */
 	/* printf("rb1->getPosition(): (%f,%f,%f)\n", tmp.x, tmp.y, tmp.z); */
-	Vector3 t1 = t - rb1->getPosition().cross( f );
 	rb1->addForce( f );
-	rb1->addTorque(t1);
+	rb1->addTorque( t );
 
 	if (!isPmf) {
-		Vector3 t2 = -t - rb2->getPosition().cross( -f );
-		rb2->addForce(-f);
-		rb2->addTorque(t2);
+		const Vector3 t2 = -t + (rb2->getPosition()-rb1->getPosition()).cross( f );
+		rb2->addForce( -f );
+		rb2->addTorque( t2 );
 	}
 		
 	/* printf("force: (%f,%f,%f)\n",f.x,f.y,f.z); */
@@ -590,46 +609,6 @@ void RigidBodyController::print(int step) {
 			} 
 		}
 	}
-	/*
-	//  Output final coordinates
-	if (step == FILE_OUTPUT || step == END_OF_RUN) {
-		int realstep = ( step == FILE_OUTPUT ?
-										 simParams->firstTimestep : simParams->N );
-		iout << "WRITING RIGID BODY OUTPUT FILE AT STEP " << realstep << "\n" << endi;
-		static char fname[140];
-		strcpy(fname, simParams->outputFilename);
-		strcat(fname, ".rigid");
-		NAMD_backup_file(fname);
-		std::ofstream outputFile(fname);
-		while (!outputFile) {
-	    if ( errno == EINTR ) {
-				CkPrintf("Warning: Interrupted system call opening rigid body output file, retrying.\n");
-				outputFile.clear();
-				outputFile.open(fname);
-				continue;
-	    }
-	    char err_msg[257];
-	    sprintf(err_msg, "Error opening rigid body output file %s",fname);
-	    NAMD_err(err_msg);
-		} 
-		outputFile << "# NAMD rigid body output file" << std::endl;
-		printLegend(outputFile);
-		printData(realstep,outputFile);
-		if (!outputFile) {
-	    char err_msg[257];
-	    sprintf(err_msg, "Error writing rigid body output file %s",fname);
-	    NAMD_err(err_msg);
-		} 
-	}
-
-	//  Close trajectory file
-	if (step == END_OF_RUN) {
-		if ( trajFile.rdbuf()->is_open() ) {
-	    trajFile.close();
-	    iout << "CLOSING RIGID BODY TRAJECTORY FILE\n" << endi;
-		}
-	}
-	*/
 }
 void RigidBodyController::printLegend(std::ofstream &file) {
         file << "#$LABELS step RigidBodyKey"
@@ -661,146 +640,7 @@ void RigidBodyController::printData(int step,std::ofstream &file) {
 		}
 	}
 }
-/*
-void RigidBodyController::integrate(int step) {
-    DebugM(3, "RBC::integrate: step  "<< step << "\n" << endi);
-    
-    DebugM(1, "RBC::integrate: Waiting for grid reduction\n" << endi);
-    gridReduction->require();
-  
-    const Molecule * mol = Node::Object()->molecule;
-
-    // pass reduction force and torque to each grid
-    // DebugM(3, "Summing forces on rigid bodies" << "\n" << endi);
-    for (int i=0; i < mol->rbReductionIdToRigidBody.size(); i++) {
-	Force f;
-	Force t;
-	for (int k = 0; k < 3; k++) {
-	    f[k] = gridReduction->item(6*i + k);
-	    t[k] = gridReduction->item(6*i + k + 3);
-	}
-
-	if (step % 100 == 1)
-	    DebugM(4, "RBC::integrate: reduction/rb " << i <<":"
-		   << "\n\tposition: "
-		   << rigidBodyList[mol->rbReductionIdToRigidBody[i]]->getPosition()
-		   <<"\n\torientation: "
-		   << rigidBodyList[mol->rbReductionIdToRigidBody[i]]->getOrientation()
-		   << "\n" << endi);
-
-	DebugM(4, "RBC::integrate: reduction/rb " << i <<": "
-	       << "force " << f <<": "<< "torque: " << t << "\n" << endi);
-	rigidBodyList[mol->rbReductionIdToRigidBody[i]]->addForce(f);
-	rigidBodyList[mol->rbReductionIdToRigidBody[i]]->addTorque(t);
-    }
-    
-    // Langevin 
-    for (int i=0; i<rigidBodyList.size(); i++) {
-	// continue;  // debug
-	if (rigidBodyList[i]->langevin) {
-	    DebugM(1, "RBC::integrate: reduction/rb " << i
-		   <<": calling langevin" << "\n" << endi);
-	    rigidBodyList[i]->addLangevin(
-		random->gaussian_vector(), random->gaussian_vector() );
-	    // DebugM(4, "RBC::integrate: reduction/rb " << i
-	    // 	   << " after langevin: force " << rigidBodyList[i]f <<": "<< "torque: " << t << "\n" << endi);
-	    // <<": calling langevin" << "\n" << endi);
-	}
-    }
-    
-    if ( step >= 0 && simParams->rigidBodyOutputFrequency &&
-	 (step % simParams->rigidBodyOutputFrequency) == 0 ) {
-	DebugM(1, "RBC::integrate:integrating for before printing output" << "\n" << endi);
-	// PRINT
-	if ( step == simParams->firstTimestep ) {
-	    print(step);
-	    // first step so only start this cycle
-	    for (int i=0; i<rigidBodyList.size(); i++)  {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": starting integration cycle of step "
-		       << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],0);
-	    }
-	} else {
-	    // finish last cycle
-	    // DebugM(1, "RBC::integrate: reduction/rb " << i
-	    // 	   <<": firststep: calling rb->integrate" << "\n" << endi);
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": finishing integration cycle of step "
-		       << step-1 << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],1);
-	    }
-	    print(step);
-	    // start this cycle
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": starting integration cycle of step "
-		       << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],0);
-	    }
-	}
-    } else {
-	DebugM(1, "RBC::integrate: trans[0] before: " << trans[0] << "\n" << endi);
-	if ( step == simParams->firstTimestep ) {
-	    // integrate the start of this cycle
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		       <<": starting integration cycle of (first) step "
-		       << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],0);
-	    }
-	} else {
-	    // integrate end of last ts and start of this one 
-	    for (int i=0; i<rigidBodyList.size(); i++) {
-		DebugM(2, "RBC::integrate: reduction/rb " << i
-		   <<": ending / starting integration cycle of step "
-		   << step-1 << "-" << step << "\n" << endi);
-		rigidBodyList[i]->integrate(&trans[i],&rot[i],2);
-	    }
-	}
-	DebugM(1, "RBC::integrate: trans[0] after: " << trans[0] << "\n" << endi);
-    }
-    
-    DebugM(3, "sendRigidBodyUpdate on step: " << step << "\n" << endi);
-    if (trans.size() != rot.size())
-	NAMD_die("failed sanity check\n");    
-    RigidBodyMsg *msg = new RigidBodyMsg;
-    msg->trans.copy(trans);	// perhaps .swap() would cause problems
-    msg->rot.copy(rot);
-    computeMgr->sendRigidBodyUpdate(msg);
-}
-
-
-RigidBodyParams* RigidBodyParamsList::find_key(const char* key) {
-    RBElem* cur = head;
-    RBElem* found = NULL;
-    RigidBodyParams* result = NULL;
-    
-    while (found == NULL && cur != NULL) {
-       if (!strcasecmp((cur->elem).rigidBodyKey,key)) {
-        found = cur;
-      } else {
-        cur = cur->nxt;
-      }
-    }
-    if (found != NULL) {
-      result = &(found->elem);
-    }
-    return result;
-}
-*/
 
-/* RigidBodyForcePair::RigidBodyForcePair(RigidBodyType* t1, RigidBodyType* t2, */
-/* 																			 RigidBody* rb1, RigidBody* rb2, */
-/* 																			 std::vector<int> gridKeyId1, std::vector<int> gridKeyId2) : */
-/* 	type1(t1), type2(t2), rb1(rb1), rb2(rb2), gridKeyId1(gridKeyId1), gridKeyId2(gridKeyId2) { */
-
-/* 	printf("    Constructing RB force pair...\n"); */
-/* 	allocateMem(); */
-/* 	printf("    Done constructing RB force pair\n"); */
-
-/* } */
 int RigidBodyForcePair::initialize() {
 	printf("    Initializing (memory for) RB force pair...\n");
 
@@ -832,10 +672,6 @@ int RigidBodyForcePair::initialize() {
 	return nextStreamID;
 }
 
-/* RigidBodyForcePair::RigidBodyForcePair(const RigidBodyForcePair& orig ) : */
-	
-/* } */
-
 void RigidBodyForcePair::swap(RigidBodyForcePair& a, RigidBodyForcePair& b) {
 	using std::swap;
 	swap(a.type1, b.type1);
diff --git a/RigidBodyController.h b/RigidBodyController.h
index b214722274aeafbb64e51ad65eb7ad5df251193d..76b270c35415d41c141665973d85716c3643e062 100644
--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -68,6 +68,10 @@ private:
 	static void createStreams();
 	void callGridForceKernel(int pairId, int s);
 	void retrieveForces();
+	Matrix3 getBasis1(const int i);
+	Matrix3 getBasis2(const int i);
+	Vector3 getOrigin1(const int i);
+	Vector3 getOrigin2(const int i);
 };
 
 class RigidBodyController {
diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu
index 0649c39c40f879dbc655c69c88e0556476588218..d3467324368cf60720d0cddd305d5733d6edcda1 100644
--- a/RigidBodyGrid.cu
+++ b/RigidBodyGrid.cu
@@ -7,11 +7,10 @@
 
 #define STRLEN 512
 
-// Initialize the variables that get used a lot.
-// Also, allocate the main value array.
-void RigidBodyGrid::init() {
-	val = new float[nx*ny*nz];
-}
+	/*                               \
+	| CONSTRUCTORS, DESTRUCTORS, I/O |
+	\===============================*/
+
 RigidBodyGrid::RigidBodyGrid() {
 	RigidBodyGrid tmp(1,1,1);
 	val = new float[1];
@@ -24,7 +23,7 @@ RigidBodyGrid::RigidBodyGrid(int nx0, int ny0, int nz0) {
 	ny = abs(ny0);
 	nz = abs(nz0);
 	
-	init();
+	val = new float[nx*ny*nz];
 	zero();
 }
 
@@ -44,7 +43,7 @@ RigidBodyGrid::RigidBodyGrid(Vector3 box, float dx) {
 	if (ny <= 0) ny = 1;
 	if (nz <= 0) nz = 1;
 	
-	init();
+	val = new float[nx*ny*nz];
 	zero();
 }
 
@@ -60,7 +59,7 @@ RigidBodyGrid::RigidBodyGrid(Matrix3 box, int nx0, int ny0, int nz0) {
 	if (ny <= 0) ny = 1;
 	if (nz <= 0) nz = 1;
 
-	init();
+	val = new float[nx*ny*nz];
 	zero();
 }
 
@@ -79,7 +78,7 @@ RigidBodyGrid::RigidBodyGrid(Matrix3 box, Vector3 origin0, float dx) {
 	if (ny <= 0) ny = 1;
 	if (nz <= 0) nz = 1;
 
-	init();
+	val = new float[nx*ny*nz];
 	zero();
 }
 
@@ -98,7 +97,7 @@ RigidBodyGrid::RigidBodyGrid(Matrix3 box, float dx) {
 	if (ny <= 0) ny = 1;
 	if (nz <= 0) nz = 1;
 
-	init();
+	val = new float[nx*ny*nz];
 	zero();
 }
 
@@ -107,7 +106,7 @@ RigidBodyGrid::RigidBodyGrid(const BaseGrid& g) {
 	ny = g.ny;
 	nz = g.nz;
 	
-	init();
+	val = new float[nx*ny*nz];
 	for (int i = 0; i < nx*ny*nz; i++) val[i] = g.val[i];
 }
 
@@ -117,7 +116,7 @@ RigidBodyGrid::RigidBodyGrid(const RigidBodyGrid& g) {
 	ny = g.ny;
 	nz = g.nz;
 	
-	init();
+	val = new float[nx*ny*nz];
 	for (int i = 0; i < nx*ny*nz; i++) val[i] = g.val[i];
 }
 
@@ -133,31 +132,12 @@ RigidBodyGrid& RigidBodyGrid::operator=(const RigidBodyGrid& g) {
 	ny = g.ny;
 	nz = g.nz;
 	
-	init();
+	val = new float[nx*ny*nz];
 	for (int i = 0; i < nx*ny*nz; i++) val[i] = g.val[i];
 
 	return *this;
 }
 
-
-// Make a copy of a grid, but at a different resolution.
-RigidBodyGrid::RigidBodyGrid(const RigidBodyGrid& g, int nx0, int ny0, int nz0) : nx(nx0),  ny(ny0), nz(nz0) {
-	if (nx <= 0) nx = 1;
-	if (ny <= 0) ny = 1;
-	if (nz <= 0) nz = 1;
-
-	// // Tile the grid into the box of the template grid.
-	// Matrix3 box = g.getBox();
-
-	init();
-
-	// Do an interpolation to obtain the values.
-	for (int i = 0; i < nx*ny*nz; i++) {
-		Vector3 r = getPosition(i);
-		val[i] = g.interpolatePotential(r);
-	}
-}
-
 RigidBodyGrid::~RigidBodyGrid() {
 	if (val != NULL)
 		delete[] val;
@@ -197,10 +177,10 @@ float RigidBodyGrid::getValue(int ix, int iy, int iz) const {
 	return val[j];
 }
 
-Vector3 RigidBodyGrid::getPosition(int j) const {
-	int iz = j%nz;
-	int iy = (j/nz)%ny;
-	int ix = j/(nz*ny);
+Vector3 RigidBodyGrid::getPosition(const int j) const {
+	const int iz = j%nz;
+	const int iy = (j/nz)%ny;
+	const int ix = j/(nz*ny);
 
 	return Vector3(ix,iy,iz);
 }
@@ -213,33 +193,6 @@ Vector3 RigidBodyGrid::getPosition(int j, Matrix3 basis, Vector3 origin) const {
 	return basis.transform(Vector3(ix, iy, iz)) + origin;
 }
 
-// // Does the point r fall in the grid?
-// // Obviously this is without periodic boundary conditions.
-// bool RigidBodyGrid::inGrid(Vector3 r,) const {
-// 	Vector3 l = basisInv.transform(r-origin);
-
-// 	if (l.x < 0.0f || l.x >= nx) return false;
-// 	if (l.y < 0.0f || l.y >= ny) return false;
-// 	if (l.z < 0.0f || l.z >= nz) return false;
-// 	return true;
-// }
-
-// bool RigidBodyGrid::inGridInterp(Vector3 r) const {
-// 	Vector3 l = basisInv.transform(r-origin);
-
-// 	if (l.x < 2.0f || l.x >= nx-3.0f) return false;
-// 	if (l.y < 2.0f || l.y >= ny-3.0f) return false;
-// 	if (l.z < 2.0f || l.z >= nz-3.0f) return false;
-// 	return true;
-// }
-
-// Vector3 RigidBodyGrid::transformTo(Vector3 r) const {
-// 	return basisInv.transform(r-origin);
-// }
-// Vector3 RigidBodyGrid::transformFrom(Vector3 l) const {
-// 	return basis.transform(l) + origin;
-// }
-
 IndexList RigidBodyGrid::index(int j) const {
 	int iz = j%nz;
 	int iy = (j/nz)%ny;
@@ -255,35 +208,6 @@ int RigidBodyGrid::indexY(int j) const { return (j/nz)%ny; }
 int RigidBodyGrid::indexZ(int j) const { return j%nz; }
 int RigidBodyGrid::index(int ix, int iy, int iz) const { return iz + iy*nz + ix*ny*nz; }
 
-// int RigidBodyGrid::index(Vector3 r) const {
-// 	Vector3 l = basisInv.transform(r-origin);
-	
-// 	int ix = int(floor(l.x));
-// 	int iy = int(floor(l.y));
-// 	int iz = int(floor(l.z));
-
-// 	ix = wrap(ix, nx);
-// 	iy = wrap(iy, ny);
-// 	iz = wrap(iz, nz);
-	
-// 	return iz + iy*nz + ix*ny*nz;
-// }
-
-// int RigidBodyGrid::nearestIndex(Vector3 r) const {
-// 	Vector3 l = basisInv.transform(r-origin);
-	
-// 	int ix = int(floorf(l.x + 0.5f));
-// 	int iy = int(floorf(l.y + 0.5f));
-// 	int iz = int(floorf(l.z + 0.5f));
-
-// 	ix = wrap(ix, nx);
-// 	iy = wrap(iy, ny);
-// 	iz = wrap(iz, nz);
-	
-// 	return iz + iy*nz + ix*ny*nz;
-// }
-
-
 // Add a fixed value to the grid.
 void RigidBodyGrid::shift(float s) {
 	for (int i = 0; i < nx*ny*nz; i++) val[i] += s;
@@ -294,66 +218,6 @@ void RigidBodyGrid::scale(float s) {
 	for (int i = 0; i < nx*ny*nz; i++) val[i] *= s;
 }
 
-// Get the mean of the entire grid.
-float RigidBodyGrid::mean() const {
-	float sum = 0.0f;
-	for (int i = 0; i < nx*ny*nz; i++) sum += val[i];
-	return sum/nx*ny*nz;
-}
-
-// Get the potential at the closest node.
-// float RigidBodyGrid::getPotential(Vector3 pos) const {
-// 	// Find the nearest node.
-// 	int j = nearestIndex(pos);
-
-// 	return val[j];
-// }
-
-HOST DEVICE float RigidBodyGrid::interpolatePotential(const Vector3& l) const {
-	// Find the home node.
-	const int homeX = int(floor(l.x));
-	const int homeY = int(floor(l.y));
-	const int homeZ = int(floor(l.z));
-	
-	float g3[4];
-	for (int iz = 0; iz < 4; iz++) {
-		float g2[4];
-		for (int iy = 0; iy < 4; iy++) {
-			// Fetch values from nearby
-			float g1[4];
-			for (volatile  int ix = 0; ix < 4; ix++) {
-				volatile int jx = (ix-1 + homeX);
-				volatile int jy = (iy-1 + homeY);
-				volatile int jz = (iz-1 + homeZ);
-				const int ind = jz + jy*nz + jx*nz*ny;
-				g1[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ?
-					0 : val[ind];
-			}
-			// Mix along x.
-			const float a3 = 0.5f*(-g1[0] + 3.0f*g1[1] - 3.0f*g1[2] + g1[3]);
-			const float a2 = 0.5f*(2.0f*g1[0] - 5.0f*g1[1] + 4.0f*g1[2] - g1[3]);
-			const float a1 = 0.5f*(-g1[0] + g1[2]);
-			const float a0 = g1[1];
-			const float wx = l.x - homeX;
-			g2[iy] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0;
-		}
-		// Mix along y.
-		const float a3 = 0.5f*(-g2[0] + 3.0f*g2[1] - 3.0f*g2[2] + g2[3]);
-		const float a2 = 0.5f*(2.0f*g2[0] - 5.0f*g2[1] + 4.0f*g2[2] - g2[3]);
-		const float a1 = 0.5f*(-g2[0] + g2[2]);
-		const float a0 = g2[1];
-		const float wy = l.y - homeY;
-		g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0;
-	}
-	// Mix along z.
-	const float a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
-	const float a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
-	const float a1 = 0.5f*(-g3[0] + g3[2]);
-	const float a0 = g3[1];
-	const float wz = l.z - homeZ;
-	return a3*wz*wz*wz + a2*wz*wz + a1*wz + a0;
-}
-
 /** interpolateForce() to be used on CUDA Device **/
 DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	Vector3 f;
@@ -366,9 +230,6 @@ DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	const float wz = l.z - homeZ;
 	const float wx2 = wx*wx;
 
-	// RBTODO: test against cpu algorithm; also see if its the same algorithm used by NAMD
-	// RBTODO: test NAMD alg. for speed
-	
 	/* f.x */
 	float g3[3][4];
 	for (int iz = 0; iz < 4; iz++) {
@@ -408,23 +269,17 @@ DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	}
 
 	// Mix along z.
-	{
-		f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
-			-0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
-			-0.5f*(-g3[0][0] + g3[0][2])                                    *wz -
-			g3[0][1];
-	}
-	{
-		f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
-			-0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
-			-0.5f*(-g3[1][0] + g3[1][2])                                    *wz -
-			g3[1][1];
-	}
-	{
-		f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
-			(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])      *wz -
-			0.5f*(-g3[2][0] + g3[2][2]);
-	}
+	f.x = -0.5f*(-g3[0][0] + 3.0f*g3[0][1] - 3.0f*g3[0][2] + g3[0][3])*wz*wz*wz +
+		-0.5f*(2.0f*g3[0][0] - 5.0f*g3[0][1] + 4.0f*g3[0][2] - g3[0][3])*wz*wz +
+		-0.5f*(-g3[0][0] + g3[0][2])                                    *wz -
+		g3[0][1];
+	f.y = -0.5f*(-g3[1][0] + 3.0f*g3[1][1] - 3.0f*g3[1][2] + g3[1][3])*wz*wz*wz +
+		-0.5f*(2.0f*g3[1][0] - 5.0f*g3[1][1] + 4.0f*g3[1][2] - g3[1][3])*wz*wz +
+		-0.5f*(-g3[1][0] + g3[1][2])                                    *wz -
+		g3[1][1];
+	f.z = -1.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz -
+		(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])      *wz -
+		0.5f*(-g3[2][0] + g3[2][2]);
 	float e = 0.5f*(-g3[2][0] + 3.0f*g3[2][1] - 3.0f*g3[2][2] + g3[2][3])*wz*wz*wz +
 		0.5f*(2.0f*g3[2][0] - 5.0f*g3[2][1] + 4.0f*g3[2][2] - g3[2][3])    *wz*wz +
 		0.5f*(-g3[2][0] + g3[2][2])                                        *wz +
@@ -433,39 +288,6 @@ DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	return ForceEnergy(f,e);
 }
 
-
-DEVICE float RigidBodyGrid::interpolatePotentialLinearly(const Vector3& l) const {
-	// Find the home node.
-	const int homeX = int(floor(l.x));
-	const int homeY = int(floor(l.y));
-	const int homeZ = int(floor(l.z));
-
-	float g3[2];
-	for (int iz = 0; iz < 2; iz++) {
-		float g2[2];
-		for (int iy = 0; iy < 2; iy++) {
-			// Fetch values from nearby
-			float g1[2];
-			for (volatile int ix = 0; ix < 2; ix++) {
-				volatile int jx = (ix + homeX);
-				volatile int jy = (iy + homeY);
-				volatile int jz = (iz + homeZ);
-				const int ind = jz + jy*nz + jx*nz*ny;
-				g1[ix] = jz < 0 || jz >= nz || jy < 0 || jy >= ny || jx < 0 || jx >= nx ?
-					0 : val[ind];
-			}
-			// Mix along x.
-			const float wx = l.x - homeX;
-			g2[iy] = wx * (g1[1] - g1[0]) + g1[0];
-		}
-		// Mix along y.
-		const float wy = l.y - homeY;
-		g3[iz] = wy * (g2[1] - g2[0]) + g2[0];
-	}
-	// Mix along z.
-	const float wz = l.z - homeZ;
-	return wz * (g3[1] - g3[0]) + g3[0];
-}
 DEVICE ForceEnergy RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const {
 	// Find the home node.
 	const int homeX = int(floor(l.x));
@@ -512,64 +334,3 @@ DEVICE ForceEnergy RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) co
 	return ForceEnergy(f,e);
 }
 
-
-
-// Vector3 RigidBodyGrid::wrapDiffNearest(Vector3 r) const {
-// 	Vector3 l = r;
-// 	l.x = wrapDiff(l.x, nx);
-// 	l.y = wrapDiff(l.y, ny);
-// 	l.z = wrapDiff(l.z, nz);
-
-// 	float length2 = l.length2();
-
-// 	for (int dx = -1; dx <= 1; dx++) {
-// 		for (int dy = -1; dy <= 1; dy++) {
-// 			for (int dz = -1; dz <= 1; dz++) {
-// 				//if (dx == 0 && dy == 0 && dz == 0) continue;
-// 				Vector3 tmp = Vector3(l.x+dx*nx, l.y+dy*ny, l.z+dz*nz);
-// 				if (tmp.length2() < length2) {
-// 					l = tmp;
-// 					length2 = l.length2();
-// 				}
-// 			}
-// 		}
-// 	}
-
-// 	return l;
-// 	// return basis.transform(l);
-// }
-
-
-// Includes the home node.
-// indexBuffer must have a size of at least 27.
-void RigidBodyGrid::getNeighbors(int j, int* indexBuffer) const {
-	int jx = indexX(j);
-	int jy = indexY(j);
-	int jz = indexZ(j);
-
-	int k = 0;
-	for (int ix = -1; ix <= 1; ix++) {
-		for (int iy = -1; iy <= 1; iy++) {
-			for (int iz = -1; iz <= 1; iz++) {
-				int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + ny*nz*wrap(jx+ix,nx);
-				indexBuffer[k] = ind;
-				k++;
-			}
-		}
-	}
-}
-
-
-// Get the values at the neighbors of a node.
-// Note that homeX, homeY, and homeZ do not need to be wrapped,
-// since we do it here.
-void RigidBodyGrid::getNeighborValues(NeighborList* neigh, int homeX, int homeY, int homeZ) const {
-	for (int ix = -1; ix <= 1; ix++) {
-		for (int iy = -1; iy <= 1; iy++) {
-			for (int iz = -1; iz <= 1; iz++) {
-				int ind = wrap(homeZ+iz,nz) + nz*wrap(homeY+iy,ny) + ny*nz*wrap(homeX+ix,nx);
-				neigh->v[ix+1][iy+1][iz+1] = val[ind];
-			}
-		}
-	}
-}  
diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h
index d74f0d7142f0c4fba3cdcbd8f0ef69cd5f10a3ef..6206b9dc27e460acac7d688cdceee52e291ba005 100644
--- a/RigidBodyGrid.h
+++ b/RigidBodyGrid.h
@@ -28,7 +28,7 @@ using namespace std;
 
 class ForceEnergy {
 public:
-	DEVICE ForceEnergy(Vector3 f, float e) :
+	DEVICE ForceEnergy(Vector3 &f, float &e) :
 		f(f), e(e) {};
 	Vector3 f;
 	float e;
@@ -36,13 +36,8 @@ public:
 
 class RigidBodyGrid { 
 	friend class SparseGrid;
-private:
-  // Initialize the variables that get used a lot.
-  // Also, allocate the main value array.
-  void init();
-
+	
 public:
-
 	/*                               \
 	| CONSTRUCTORS, DESTRUCTORS, I/O |
 	\===============================*/
@@ -78,9 +73,6 @@ public:
   RigidBodyGrid mult(const RigidBodyGrid& g);
 
   RigidBodyGrid& operator=(const RigidBodyGrid& g);
-
-  // Make a copy of a grid, but at a different resolution.
-  RigidBodyGrid(const RigidBodyGrid& g, int nx0, int ny0, int nz0);
   
 	virtual ~RigidBodyGrid();
 
@@ -98,20 +90,9 @@ public:
 
   virtual float getValue(int ix, int iy, int iz) const;
 
-  // Vector3 getPosition(int ix, int iy, int iz) const;
   HOST DEVICE Vector3 getPosition(int j) const;
 	HOST DEVICE Vector3 getPosition(int j, Matrix3 basis, Vector3 origin) const;
 		
-  /* // Does the point r fall in the grid? */
-  /* // Obviously this is without periodic boundary conditions. */
-  /* bool inGrid(Vector3 r) const; */
-
-  /* bool inGridInterp(Vector3 r) const; */
-
-  /* Vector3 transformTo(Vector3 r) const; */
-
-  /* Vector3 transformFrom(Vector3 l) const; */
-
   IndexList index(int j) const;
   int indexX(int j) const;
   int indexY(int j) const;
@@ -134,81 +115,10 @@ public:
 
   // Multiply the grid by a fixed value.
   void scale(float s);
-
-	/*         \
-	| COMPUTED |
-	\=========*/
-	
-  // Get the mean of the entire grid.
-  float mean() const;
 	
-  // Get the potential at the closest node.
-  /* virtual float getPotential(Vector3 pos) const; */
-
-	DEVICE float interpolatePotentialLinearly(const Vector3& l) const;
 	DEVICE ForceEnergy interpolateForceDLinearly(const Vector3& l) const;
-
-  HOST DEVICE float interpolatePotential(const Vector3& l) const;
-
-  HOST DEVICE inline static int wrap(int i, int n) {
-		if (i < 0) {
-			i %= n;
-			i += n;
-		}
-		// The portion above allows i == n, so no else keyword.
-		if (i >= n) {
-			i %= n;
-		} 
-		return i;
-	}
-
-	/** interpolateForce() to be used on CUDA Device **/
 	DEVICE ForceEnergy interpolateForceD(Vector3 l) const;
-
-  // Wrap coordinate: 0 <= x < l
-  HOST DEVICE inline float wrapFloat(float x, float l) const {
-		int image = int(floor(x/l));
-		x -= image*l;
-		return x;
-  }
-  
-  // Wrap distance: -0.5*l <= x < 0.5*l
-  HOST DEVICE static inline float wrapDiff(float x, float l) {
-		int image = int(floor(x/l));
-		x -= image*l;
-		if (x >= 0.5f * l)
-			x -= l;
-		return x;
-  }
-
-  // Wrap vector, 0 <= x < lx  &&  0 <= y < ly  &&  0 <= z < lz
-  HOST DEVICE inline Vector3 wrap(Vector3 l) const {
-    l.x = wrapFloat(l.x, nx);
-    l.y = wrapFloat(l.y, ny);
-    l.z = wrapFloat(l.z, nz);
-    return l;
-  }
-
-  // Wrap vector distance, -0.5*l <= x < 0.5*l  && ...
-  HOST DEVICE inline Vector3 wrapDiff(Vector3 l) const {
-    l.x = wrapDiff(l.x, nx);
-    l.y = wrapDiff(l.y, ny);
-    l.z = wrapDiff(l.z, nz);
-		return l;
-  }
-
-  /* Vector3 wrapDiffNearest(Vector3 r) const; */
-
-  // Includes the home node.
-  // indexBuffer must have a size of at least 27.
-  void getNeighbors(int j, int* indexBuffer) const;
   
-  // Get the values at the neighbors of a node.
-  // Note that homeX, homeY, and homeZ do not need to be wrapped,
-  // since we do it here.
-  void getNeighborValues(NeighborList* neigh, int homeX, int homeY, int homeZ) const;
-  inline void setVal(float* v) { val = v; }
-	
 public:
   int nx, ny, nz;
   int size;