diff --git a/ComputeGridGrid.cu b/ComputeGridGrid.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e98c2f16389f15375bc32226b27d6933cc158336
--- /dev/null
+++ b/ComputeGridGrid.cu
@@ -0,0 +1,119 @@
+// Included in RigidBodyController.cu
+#include "ComputeGridGrid.cuh"
+#include "RigidBodyGrid.h"
+#include "CudaUtil.cuh"
+
+//RBTODO handle periodic boundaries
+//RBTODO: add __restrict__, benchmark (Q: how to restrict member data?)
+__global__
+void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
+													const Matrix3 basis_rho, const Matrix3 basis_u_inv,
+													const Vector3 origin_rho_minus_origin_u,
+													Vector3 * retForce, Vector3 * retTorque) {
+
+	extern __shared__ Vector3 s[];
+	Vector3 *force = s;
+	Vector3 *torque = &s[NUMTHREADS];
+
+  // RBTODO: http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
+	const int tid = threadIdx.x;
+	const int r_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+	force[tid] = Vector3(0.0f);
+	torque[tid] = Vector3(0.0f);
+	if (r_id < rho->getSize()) { // skip threads with nothing to do
+		// RBTODO: reduce registers used;
+		//   commenting out interpolateForceD still uses ~40 registers
+		//   -- the innocuous-looking fn below is responsible; consumes ~17 registers!
+		Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */
+
+		r_pos = basis_rho.transform( r_pos ) + origin_rho_minus_origin_u; /* real space */
+		const Vector3 u_ijk_float = basis_u_inv.transform( r_pos );
+
+		// RBTODO What about non-unit delta?
+		/* Vector3 tmpf  = Vector3(0.0f); */
+		/* float tmpe = 0.0f; */
+		/* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */
+		const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
+		force[tid] = fe.f;
+
+		const float r_val = rho->val[r_id]; /* maybe move to beginning of function?  */
+		force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */
+
+		// Calculate torque about origin_u in the lab frame
+		torque[tid] = r_pos.cross(force[tid]);				// RBTODO: test if sign is correct!
+	}
+
+	// Reduce force and torques
+	// http://www.cuvilib.com/Reduction.pdf
+	// RBTODO optimize further, perhaps
+	__syncthreads();
+	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
+		if (tid < offset) {
+			int oid = tid + offset;
+			force[tid] = force[tid] + force[oid];
+			torque[tid] = torque[tid] + torque[oid];
+		}
+		__syncthreads();
+	}
+
+	if (tid == 0) {
+		retForce[blockIdx.x] = force[0];
+		retTorque[blockIdx.x] = torque[0];
+	}
+}
+
+__global__
+void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForce,
+				const int num, const int* __restrict__ particleIds, 
+				const RigidBodyGrid* __restrict__ u,
+				const Matrix3 basis_u_inv, const Vector3 origin_u,
+				Vector3* __restrict__ retForce, Vector3* __restrict__ retTorque) {
+
+	extern __shared__ Vector3 s[];
+	Vector3 *force = s;
+	Vector3 *torque = &s[NUMTHREADS];
+  
+	const int tid = threadIdx.x;
+	const int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+	force[tid] = Vector3(0.0f);
+	torque[tid] = Vector3(0.0f);
+	if (i < num) {
+		const int& id = particleIds[i];
+		Vector3 p = pos[id] - origin_u;
+		// TODO: wrap to center of u
+		const Vector3 u_ijk_float = basis_u_inv.transform( p );
+		const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
+		force[tid] = fe.f;
+		force[tid] = basis_u_inv.transpose().transform( force[tid] ); /* transform to lab frame */
+		atomicAdd( &particleForce[id], force[tid] ); // apply force to particle
+		
+		// Calculate torque about origin_u in the lab frame
+		torque[tid] = p.cross(force[tid]);				// RBTODO: test if sign is correct!
+	}
+
+	// Reduce force and torques
+	__syncthreads();
+	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
+		if (tid < offset) {
+			int oid = tid + offset;
+			force[tid] = force[tid] + force[oid];
+			torque[tid] = torque[tid] + torque[oid];
+		}
+		__syncthreads();
+	}
+
+	if (tid == 0) {
+		retForce[blockIdx.x] = force[0];
+		retTorque[blockIdx.x] = torque[0];
+	}
+}
+
+
+__global__
+void printRigidBodyGrid(const RigidBodyGrid* rho) {
+  printf("Printing an RB of size %d\n",rho->size);
+  for (int i=0; i < rho->size; i++)
+	printf("  val[%d] = %f\n", i, rho->val[i]);
+}
diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index 066680dff4697656c13e89ab0825500a94b03dfd..20067121dc23b853cdf362ac0a650445062e0bd3 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -1,68 +1,22 @@
 // Included in RigidBodyController.cu
 #pragma once
+#include "useful.h"
+#define NUMTHREADS 96
 
-//RBTODO: add __restrict__, benchmark (Q: how to restrict member data?)
-__global__
-void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
-													const Matrix3 basis_rho, const Matrix3 basis_u_inv,
-													const Vector3 origin_rho_minus_origin_u,
-													Vector3 * retForce, Vector3 * retTorque) {
-
-	extern __shared__ Vector3 s[];
-	Vector3 *force = s;
-	Vector3 *torque = &s[NUMTHREADS];
-
-  // RBTODO: http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
-	const int tid = threadIdx.x;
-	const int r_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-	force[tid] = Vector3(0.0f);
-	torque[tid] = Vector3(0.0f);
-	if (r_id < rho->getSize()) { // skip threads with nothing to do
-		// RBTODO: reduce registers used;
-		//   commenting out interpolateForceD still uses ~40 registers
-		//   -- the innocuous-looking fn below is responsible; consumes ~17 registers!
-		Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */
-
-		r_pos = basis_rho.transform( r_pos ) + origin_rho_minus_origin_u; /* real space */
-		const Vector3 u_ijk_float = basis_u_inv.transform( r_pos );
-
-		// RBTODO What about non-unit delta?
-		/* Vector3 tmpf  = Vector3(0.0f); */
-		/* float tmpe = 0.0f; */
-		/* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */
-		const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
-		force[tid] = fe.f;
+class RigidBodyGrid;
 
-		const float r_val = rho->val[r_id]; /* maybe move to beginning of function?  */
-		force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */
-
-		// Calculate torque about origin_u in the lab frame
-		torque[tid] = r_pos.cross(force[tid]);				// RBTODO: test if sign is correct!
-	}
-
-	// Reduce force and torques
-	// http://www.cuvilib.com/Reduction.pdf
-	// RBTODO optimize further, perhaps
-	__syncthreads();
-	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
-		if (tid < offset) {
-			int oid = tid + offset;
-			force[tid] = force[tid] + force[oid];
-			torque[tid] = torque[tid] + torque[oid];
-		}
-		__syncthreads();
-	}
-
-	if (tid == 0) {
-		retForce[blockIdx.x] = force[0];
-		retTorque[blockIdx.x] = torque[0];
-	}
-}
-
-__global__
-void printRigidBodyGrid(const RigidBodyGrid* rho) {
-	printf("Printing an RB of size %d\n",rho->size);
-	for (int i=0; i < rho->size; i++)
-		printf("  val[%d] = %f\n", i, rho->val[i]);
-}
+extern __global__
+void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
+				const Matrix3 basis_rho, const Matrix3 basis_u_inv,
+				const Vector3 origin_rho_minus_origin_u,
+				Vector3 * retForce, Vector3 * retTorque);
+
+extern __global__
+void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForce,
+				const int num, const int* __restrict__ particleIds,
+				const RigidBodyGrid* __restrict__ u,
+				const Matrix3 basis_u_inv, const Vector3 origin_u,
+				Vector3* __restrict__ retForce, Vector3* __restrict__ retTorque);
+
+extern __global__
+void printRigidBodyGrid(const RigidBodyGrid* rho);
diff --git a/GrandBrownTown.cuh b/GrandBrownTown.cuh
index da0ffd97221b237b38bc0f1c33566c8fbb3e85c1..736970e4876bb08c3bacee56131c94d09701203a 100644
--- a/GrandBrownTown.cuh
+++ b/GrandBrownTown.cuh
@@ -4,6 +4,8 @@
 #pragma once
 
 #include "CudaUtil.cuh"
+#include "RigidBodyType.h"
+#include "RigidBodyGrid.h"
 
 __device__
 Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
@@ -32,7 +34,7 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 
 
 	// TODO: Make this a grid-stride loop to make efficient reuse of RNG states 
-  // Loop over ALL particles in ALL replicas
+	// Loop over ALL particles in ALL replicas
 	if (idx < num * numReplicas) {
 		const int t = type[idx];
 		Vector3& p = pos[idx];
diff --git a/RigidBody.h b/RigidBody.h
index b6df0612475857b783c89f93c6a8f05b2671e1c5..59703c5f2e0f7094e787fe7bc79355dbdfe7e5d9 100644
--- a/RigidBody.h
+++ b/RigidBody.h
@@ -15,7 +15,7 @@
     #define DEVICE 
 #endif
 
-class Configuration;						/* forward decleration */
+class Configuration;
 
 typedef float BigReal;					/* strip this out later */
 typedef Vector3 Force;
diff --git a/RigidBodyController.cu b/RigidBodyController.cu
index 9db8d2d7261d19d5b9a45a868068e662c65208bf..e3744a2db8583ac62e8b59064db37f5ae3df110e 100644
--- a/RigidBodyController.cu
+++ b/RigidBodyController.cu
@@ -8,6 +8,7 @@
 #include "RigidBodyController.h"
 #include "Configuration.h"
 #include "RigidBodyType.h"
+#include "RigidBodyGrid.h"
 #include "ComputeGridGrid.cuh"
 
 #include <cuda_profiler_api.h>
diff --git a/RigidBodyController.h b/RigidBodyController.h
index ec31395e1021444b1fa923075adce64000cd8720..f04251f606ea6350abeffa54f2c10a9c2d4898cc 100644
--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -1,14 +1,20 @@
+#pragma once
+
 #include <vector>
 #include <fstream>
-#include "RigidBody.h"
 /* #include "RandomCUDA.h" */
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include "useful.h"
 
 // #define NUMTHREADS 128					/* try with 64, every 32+ */
 #define NUMTHREADS 96
 #define NUMSTREAMS 8
 
+// #include "RigidBody.h"
+
+class RigidBodyType;
+class RigidBody;
 class Configuration;
 class RandomCPU;
 
diff --git a/RigidBodyType.cu b/RigidBodyType.cu
index 2c3965adc8bb3c43de9a3fb70ed0240ab0307ba8..b8ea109855c6dc2cdfc80ddf98b4287acbcb232b 100644
--- a/RigidBodyType.cu
+++ b/RigidBodyType.cu
@@ -1,4 +1,7 @@
 #include "RigidBodyType.h"
+#include "Reservoir.h"
+#include "BaseGrid.h"
+#include "RigidBodyGrid.h"
 
 void RigidBodyType::clear() {
 	num = 0;											// RBTODO: not 100% sure about this
diff --git a/RigidBodyType.h b/RigidBodyType.h
index 4b793234afd950cb016dbd09da80d2451dda2551..44a813ded4ee4f6112a91f898b7ef88658a4eef7 100644
--- a/RigidBodyType.h
+++ b/RigidBodyType.h
@@ -5,13 +5,17 @@
 #include <vector>
 /* #include <thrust/host_vector.h> */
 /* #include <thrust/device_vector.h> */
-#include "Reservoir.h"
-#include "BaseGrid.h"
-#include "RigidBodyGrid.h"
+/* #include "Reservoir.h" */
+/* #include "BaseGrid.h" */
+/* #include "RigidBodyGrid.h" */
 #include "useful.h"
 
 #include <cstdio>
 
+class Reservoir;
+class BaseGrid;
+class RigidBodyGrid;
+
 class RigidBodyType {
 private:
 	// Deletes all members