From 9c9cbc1e50efcfb066af9f7c9b74e100ce50f50d Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Fri, 13 Nov 2015 11:22:50 -0600
Subject: [PATCH] starting work on grid-grid kernel

---
 .gitignore          |   1 -
 BaseGrid.h          |  37 +++-
 ComputeGridGrid.cuh |  34 +++
 Configuration.cpp   | 155 ++++---------
 GrandBrownTown.cu   |  13 +-
 GrandBrownTown.cuh  |   2 +
 RigidBody.h         | 312 +++++++++++++-------------
 RigidBodyGrid.cu    | 470 +++++++++++++++++++++++++++++++++++++++
 RigidBodyGrid.h     | 519 ++++++++++++++++++++++++++++++++++++++++++++
 RigidBodyType.cu    |   8 +-
 RigidBodyType.h     |   3 +
 notes.org           |  25 ++-
 runBrownTown.cpp    |   2 -
 useful.h            |   2 +-
 14 files changed, 1304 insertions(+), 279 deletions(-)
 create mode 100644 ComputeGridGrid.cuh
 create mode 100644 RigidBodyGrid.cu
 create mode 100644 RigidBodyGrid.h

diff --git a/.gitignore b/.gitignore
index 56e5b99..99c1cd7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-*.org
 *.o
 .svn
 *~
diff --git a/BaseGrid.h b/BaseGrid.h
index 9156f95..bcdda0b 100644
--- a/BaseGrid.h
+++ b/BaseGrid.h
@@ -40,8 +40,13 @@ private:
   void init();
 
 public:
-		BaseGrid(); // cmaffeo2 (2015) moved this out of protected, cause I wanted BaseGrid in a struct
 
+	/*                               \
+	| CONSTRUCTORS, DESTRUCTORS, I/O |
+	\===============================*/
+	
+	// RBTODO Fix?
+	BaseGrid(); // cmaffeo2 (2015) moved this out of protected, cause I wanted BaseGrid in a struct
   // The most obvious of constructors.
 		BaseGrid(Matrix3 basis0, Vector3 origin0, int nx0, int ny0, int nz0);
 
@@ -90,7 +95,11 @@ public:
   
 	virtual ~BaseGrid();
 
-  void zero();
+	/*             \
+	| DATA METHODS |
+	\=============*/
+		
+	void zero();
   
   bool setValue(int j, float v);
 
@@ -161,9 +170,13 @@ public:
   // Multiply the grid by a fixed value.
   void scale(float s);
 
+	/*         \
+	| COMPUTED |
+	\=========*/
+	
   // Get the mean of the entire grid.
-  float mean() const;
-
+  float mean() const
+	
   // Compute the average profile along an axis.
   // Assumes that the grid axis with index "axis" is aligned with the world axis of index "axis".
   void averageProfile(const char* fileName, int axis);
@@ -183,7 +196,9 @@ public:
 
   HOST DEVICE inline float interpolateDiffX(Vector3 pos, float w[3], float g1[4][4][4]) const {
     float a0, a1, a2, a3;
-    
+
+		// RBTODO parallelize loops?
+		
     // Mix along x, taking the derivative.
     float g2[4][4];
     for (int iy = 0; iy < 4; iy++) {
@@ -292,14 +307,19 @@ public:
 
     return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1);
   }
-  
+
+	// RBTODO overload with optimized algorithm
+	//  skip transforms (assume identity basis)
   HOST DEVICE inline float interpolatePotential(Vector3 pos) const {
     // Find the home node.
     Vector3 l = basisInv.transform(pos - origin);
     int homeX = int(floor(l.x));
     int homeY = int(floor(l.y));
     int homeZ = int(floor(l.z));
-    
+
+		// out of grid? return 0
+		// RBTODO
+			
     // Get the array jumps.
     int jump[3];
     jump[0] = nz*ny;
@@ -317,7 +337,7 @@ public:
     g[0] = nx;
     g[1] = ny;
     g[2] = nz;
-
+  
     // Get the interpolation coordinates.
     float w[3];
     w[0] = l.x - homeX;
@@ -472,6 +492,7 @@ public:
 		w[2] = l.z - homeZ;
 		// Find the values at the neighbors.
 		float g1[4][4][4];
+		//RBTODO parallelize?
 		for (int ix = 0; ix < 4; ix++) {
 			for (int iy = 0; iy < 4; iy++) {
 				for (int iz = 0; iz < 4; iz++) {
diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
new file mode 100644
index 0000000..11e2f04
--- /dev/null
+++ b/ComputeGridGrid.cuh
@@ -0,0 +1,34 @@
+#pragma once
+#include "RigidBodyGrid.h"
+
+__global__
+void computeGridGridForce(RigidBodyGrid* rho, RigidBodyGrid* u) {
+	unsigned int bidx = blockIdx.x;
+	unsigned int tidx = threadIdx.x;
+
+	// GTX 980 has ~ 4 x 10x10x10 floats available for shared memory
+	//    but code should work without access to this
+	
+	// RBTODO rewrite to use shared memory, break problem into subgrids
+	// will lookups of 
+
+// http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+
+	const unsigned int r_id = bidx * blockDim.x + threadIdx.x;
+	
+	// RBTODO parallelize transform
+	if (r_id > rho->size)					// skip threads with no data 
+		return;
+
+	// Tile grid data into shared memory
+	//   RBTODO: think about localizing regions of grid data
+	Vector3 p = rho->getPosition(r_id);
+	float val = rho->val[r_id];
+
+	// RBTODO reduce these
+	// http://www.cuvilib.com/Reduction.pdf
+	float energy = ;
+	Vector3 f = u->interpolateForceD(p);
+	Vector3 t = f * 
+		
+}
diff --git a/Configuration.cpp b/Configuration.cpp
index 028b2fc..8e7328e 100644
--- a/Configuration.cpp
+++ b/Configuration.cpp
@@ -482,12 +482,8 @@ void Configuration::copyToCUDA() {
 			gpuErrchk(cudaMemcpy(&(gtmp[gid]), &(rb->rawPotentialGrids[gid]),
 													 sizeof(BaseGrid), cudaMemcpyHostToDevice ));
 		}
-		// RBTODO: segfault when gtmp is deleted --> why?
-		//    delete[] gtmp;
 
-		printf("  RigidBodyType %d: numGrids = %d\n", i, ng);
-
-		
+		printf("  RigidBodyType %d: numGrids = %d\n", i, ng);		
 		// copy grid data to device
 		for (int gid = 0; gid < ng; gid++) { 
 			BaseGrid *g = &(rb->rawPotentialGrids[gid]); // convenience
@@ -499,7 +495,6 @@ void Configuration::copyToCUDA() {
 			for (int k = 0; k < len; k++)
 				printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n",
 							 i, gid, k, g->val[k]);
-
 			
       // allocate grid data on device
 			// copy temporary host pointer to device pointer
@@ -515,118 +510,56 @@ void Configuration::copyToCUDA() {
 			// RBTODO: why can't this be deleted? 
 			// delete[] tmpData;
 		}
+	}
 
+	// density grids
+ 	for (int i = 0; i < numRigidTypes; i++) {
+		printf("working on RB %d\n",i);
+		RigidBodyType *rb = &(rigidBody[i]); // temporary for convenience
 
-
-		// // gpuErrchk(cudaMemcpy(&(rb_d->potentialGrids[i]), &gtmp, 
-		// // 											sz, cudaMemcpyHostToDevice ));
-				
-		// // size_t sz;
-		// // for (int gid = 0; gid < ng; gid++) { 
-		// // 	gpuErrchk(cudaMalloc((void**) &(gtmp[i]), sizeof(BaseGrid)));
-		// // 	gpuErrchk(cudaMemcpy(&(gtmp[i]->potentialGrids[i]), &(gtmp[i]),
-		// // 												sizeof(BaseGrid*), cudaMemcpyHostToDevice));
-		// // }
-
+		int ng = rb->numDenGrids;
+		BaseGrid * gtmp;
+		size_t sz = sizeof(BaseGrid)*ng;
 		
-		// {
-		// 	BaseGrid *tmpData;
-		// 	size_t sz = sizeof(BaseGrid)*ng;
-
-		// 	gpuErrchk(cudaMalloc((void **) &tmpData, sz));
-		// 	gpuErrchk(cudaMemcpy(&(rb_d->potentialGrids[i]), &tmpData, 
-		// 											 sizeof(BaseGrid*), cudaMemcpyHostToDevice ));
-		// 	sz = sizeof(float) * ng
-		// 	gpuErrchk(cudaMemcpy(tmpData, rbg->val, sz, cudaMemcpyHostToDevice));
-		// }
-
-
-			// size_t sz = sizeof(float) * rb->potentialGrids[gid]->getSize();
-			// // gpuErrchk(cudaMalloc(&g_d, sizeof(BaseGrid)));
-
-			// gpuErrchk(cudaMalloc(&tmpData, sz));
-			// gpuErrchk(cudaMemcpyAsync(g_d, rb->potentialGrids[gid], 
-			// 													sizeof(BaseGrid), cudaMemcpyHostToDevice));
-
-		  // gpuErrchk(cudaMemcpy(val_d, rb->potentialGrids[gid], sz, cudaMemcpyHostToDevice));
-			
-			// set rb pointers appropriately
-			// rb_d->potentialGrids[gid] = g_d;
-			// g_d->val = val_d; // hopefully?
-				
-
-
-	// 		BaseGrid *g_d;
-	// 		float *val_d;
-	// 		size_t sz = sizeof(float) * rb.potentialGrids[gid]->getSize();
-	// 		gpuErrchk(cudaMalloc(&g_d, sizeof(BaseGrid)));
-	// 		gpuErrchk(cudaMalloc(&val_d, sz));
-	// 		gpuErrchk(cudaMemcpyAsync(g_d, rb->potentialGrids[gid], 
-	// 															sizeof(BaseGrid), cudaMemcpyHostToDevice));
-	// 	  gpuErrchk(cudaMemcpy(val_d, rb->potentialGrids[gid], sz, cudaMemcpyHostToDevice));
-			
-	// 		// set rb pointers appropriately
-	// 		rb_d->potentialGrids[gid] = g_d;
-	// 		g_d->val = val_d; // hopefully?
-
-	// 	}
-
-
-	// // Copy rigidbody types 
- 	// for (int i = 0; i < numRigidTypes; i++) {
-	// 	RigidBodyType *rb = rigidBody[i]; // temporary for convenience
-	// 	// gpuErrchk(cudaMemcpy(&rb, sizeof(RigidBodyType)));
+		// allocate grids on device
+		// copy temporary host pointer to device pointer
+		// copy grids to device through temporary host poin
+		gpuErrchk(cudaMalloc((void **) &gtmp, sz));
+		gpuErrchk(cudaMemcpy(&(rb_addr[i]->rawDensityGrids), &gtmp, 
+												 sizeof(BaseGrid*) * ng, cudaMemcpyHostToDevice ));
+		gpuErrchk(cudaMemcpy(gtmp, &(rb->rawDensityGrids),
+												 sizeof(BaseGrid)  * ng, cudaMemcpyHostToDevice ));
+		for (int gid = 0; gid < ng; gid++) {
+			gpuErrchk(cudaMemcpy(&(gtmp[gid]), &(rb->rawDensityGrids[gid]),
+													 sizeof(BaseGrid), cudaMemcpyHostToDevice ));
+		}
 
-	// 	RigidBodyType *rb_d;
+		printf("  RigidBodyType %d: numGrids = %d\n", i, ng);		
+		// copy grid data to device
+		for (int gid = 0; gid < ng; gid++) { 
+			BaseGrid *g = &(rb->rawDensityGrids[gid]); // convenience
+			int len = g->getSize();
+			float **tmpData;
+			tmpData = new float*[len];
 
-		
-	// 	for (int gid = 0; i < rb.numPotGrids) { 
-	// 		BaseGrid *g_d;
-	// 		float *val_d;
-	// 		size_t sz = sizeof(float) * rb.potentialGrids[gid]->getSize();
-	// 		gpuErrchk(cudaMalloc(&g_d, sizeof(BaseGrid)));
-	// 		gpuErrchk(cudaMalloc(&val_d, sz));
-	// 		gpuErrchk(cudaMemcpyAsync(g_d, rb->potentialGrids[gid], 
-	// 															sizeof(BaseGrid), cudaMemcpyHostToDevice));
-	// 	  gpuErrchk(cudaMemcpy(val_d, rb->potentialGrids[gid], sz, cudaMemcpyHostToDevice));
-			
-	// 		// set rb pointers appropriately
-	// 		rb_d->potentialGrids[gid] = g_d;
-	// 		g_d->val = val_d; // hopefully?
-
-	// 	}
-	// 	for (int gid = 0; i < rb.numDenGrids) { 
-	// 		BaseGrid *g_d;
-	// 		float *val_d;
-	// 		size_t sz = sizeof(float) * rb->densityGrids[gid]->getSize();
-	// 		gpuErrchk(cudaMalloc(&g_d, sizeof(BaseGrid)));
-	// 		gpuErrchk(cudaMalloc(&val_d, sz));
-	// 		gpuErrchk(cudaMemcpyAsync(g_d, rb->densityGrids[gid], 
-	// 															sizeof(BaseGrid), cudaMemcpyHostToDevice));
-	// 	  gpuErrchk(cudaMemcpy(val_d, rb->densityGrids[gid], sz, cudaMemcpyHostToDevice));
+			printf("  RigidBodyType %d: potGrid[%d] size: %d\n", i, gid, len);
+			for (int k = 0; k < len; k++)
+				printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n",
+							 i, gid, k, g->val[k]);
 			
-	// 	}
-		
-		
-	// 	// RBTODO
-	// 	// b->pmf = pmf;
-	// 	// b->diffusionGrid = diffusionGrid;
-	// 	gpuErrchk(cudaMalloc(&rb_addr[i], sizeof(BrownianParticleType)));
-	// 	gpuErrchk(cudaMemcpyAsync( rb_addr[i], rb_d, 
-	// 														sizeof(BrownianParticleType), cudaMemcpyHostToDevice));
-		
-
-	// 	gpuErrchk(cudaMalloc(&rbType_d, sizeof(RigidBodyType*) * numRigidTypes));
-	// 	rigidBody[i].potentialGrids_D = rigidBody[i].potentialGrids;
-	// 	rigidBody[i].densityGrids_D = rigidBody[i].densityGrids;
-		
-	// 	gpuErrchk(cudaMemcpyAsync(rb_addr[i], rb, sizeof(RigidBodyType),
-	// 														cudaMemcpyHostToDevice));
+      // allocate grid data on device
+			// copy temporary host pointer to device pointer
+			// copy data to device through temporary host pointer
+			sz = sizeof(float*) * len;
+			gpuErrchk(cudaMalloc((void **) &tmpData, sz)); 
+			gpuErrchk(cudaMemcpy( &(gtmp[gid].val), &tmpData,
+														sizeof(float*), cudaMemcpyHostToDevice));
+			sz = sizeof(float) * len;
+			gpuErrchk(cudaMemcpy( tmpData, g->val, sz, cudaMemcpyHostToDevice));
+			// RBTODO: why can't this be deleted? 
+			// delete[] tmpData;
+		}
 		
-	// 	gpuErrchk(cudaMemcpyAsync(part_d, part_addr, sizeof(BrownianParticleType*) * numParts,
-	// 														cudaMemcpyHostToDevice));
-
-	// 	}
   }
 	gpuErrchk(cudaMemcpy(rbType_d, rb_addr, sizeof(RigidBodyType*) * numRigidTypes,
 				cudaMemcpyHostToDevice));
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index fdf5036..cd1650b 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -56,6 +56,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	type   = new     int[num * numReplicas];  // [HOST] array of particles' types.
 	serial = new     int[num * numReplicas];  // [HOST] array of particles' serial numbers.
 
+	
 	// Replicate identical initial conditions across all replicas
 	// TODO: add an option to generate random initial conditions for all replicas
 	for (int r = 0; r < numReplicas; ++r) {
@@ -403,13 +404,23 @@ void GrandBrownTown::run() {
 																							 electricField, tl, timestep, num,
 																							 sys_d, randoGen_d, numReplicas);
 		//gpuErrchk(cudaPeekAtLastError()); // Does not work on old GPUs (like mine). TODO: write a better wrapper around Peek
-
+		
+		
 		/* Time position computations.
 		rt_timer_stop(cputimer);
 		float dt2 = rt_timer_time(cputimer);
 		printf("Position Update Time: %f ms\n", dt2 * 1000);
 		// */
 
+		// calculateRigidBodyForce<<< numBlocks, NUM_THREADS >>>(rb_d, rbType_d,
+		// 																						 kT, kTGrid_d,
+		// 																						 tl, timestep,
+		// 																					 sys_d, randoGen_d, numReplicas);
+		computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d);
+		
+		// int numBlocks = (numRB ) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
+		
+		
 		Vector3 force0(0.0f);
 
 		if (imd_on && clientsock && s % outputPeriod == 0) {
diff --git a/GrandBrownTown.cuh b/GrandBrownTown.cuh
index 5f8c1a7..7b447e5 100644
--- a/GrandBrownTown.cuh
+++ b/GrandBrownTown.cuh
@@ -16,6 +16,8 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 __global__
 void updateKernel(Vector3 pos[], Vector3 forceInternal[],
 									int type[], BrownianParticleType* part[],
+									RigidBody* rb[],
+									RigidBodyType* rbType[],
 									float kT, BaseGrid* kTGrid,
 									float electricField, int tGridLength,
 									float timestep, int num, BaseGrid* sys,
diff --git a/RigidBody.h b/RigidBody.h
index ef3a788..0f7c047 100644
--- a/RigidBody.h
+++ b/RigidBody.h
@@ -6,11 +6,12 @@
 
 #pragma once
 
-#include <set>
-#include <vector>
+/* #include <set> */
+/* #include <vector> */
 
-#include "Vector.h"
-#include "Tensor.h"
+#include "useful.h"
+/* #include "Vector.h" */
+/* #include "Tensor.h" */
 
 
 /* #include "strlib.h" */
@@ -25,203 +26,210 @@
 /* #include "NamdTypes.h" */
 
 typedef BigReal float;					/* strip this out later */
+typedef Force Vector3;
 
-class ComputeMgr;
-class Random;
+#define DEVICE __device__
+
+/* class ComputeMgr; */
+/* class Random; */
 
 class RigidBody {
 	/*=====================================================================\
-	|   See Appendix A of: Dullweber, Leimkuhler and McLaclan. "Symplectic |
-	|   splitting methods for rigid body molecular dynamics". J Chem       |
-	|   Phys. (1997)                                                       |
-	\=====================================================================*/
+		|   See Appendix A of: Dullweber, Leimkuhler and McLaclan. "Symplectic |
+		|   splitting methods for rigid body molecular dynamics". J Chem       |
+		|   Phys. (1997)                                                       |
+		\=====================================================================*/
 public:
-	RigidBody(SimParameters *simParams, RigidBodyParams *rbParams);
-	~RigidBody();
+	DEVICE RigidBody(SimParameters *simParams, RigidBodyParams *rbParams);
+	DEVICE ~RigidBody();
 
-	void addForce(Force f); 
-	void addTorque(Force t);
-	void addLangevin(Vector w1, Vector w2);
+	DEVICE void addForce(Force f); 
+	DEVICE void addTorque(Force t);
+	DEVICE void addLangevin(Vector3 w1, Vector3 w2);
 
-	inline void clearForce() { force = Force(0); }
-	inline void clearTorque() { torque = Force(0); }
+	DEVICE inline void clearForce() { force = Force(0); }
+	DEVICE inline void clearTorque() { torque = Force(0); }
 
-	void integrate(Vector *p_trans, Tensor *p_rot, int startFinishAll);
+	DEVICE void integrate(Vector3 *p_trans, Matrix3 *p_rot, int startFinishAll);
 
-	inline char* getKey() { return key; }
-	inline Vector getPosition() { return position; }
-	inline Tensor getOrientation() { return orientation; }
-	inline BigReal getMass() { return mass; }
-	inline Vector getVelocity() { return momentum/mass; }
-	inline Vector getAngularVelocity() { 
-		return Vector( angularMomentum.x / inertia.x,
+	DEVICE inline char* getKey() { return key; }
+	DEVICE inline Vector3 getPosition() { return position; }
+	DEVICE inline Matrix3 getOrientation() { return orientation; }
+	DEVICE inline BigReal getMass() { return mass; }
+	DEVICE inline Vector3 getVelocity() { return momentum/mass; }
+	DEVICE inline Vector3 getAngularVelocity() { 
+		return Vector3( angularMomentum.x / inertia.x,
 									 angularMomentum.y / inertia.y,
 									 angularMomentum.z / inertia.z );
 	}
 	bool langevin;
     
 private:
-	char* key;
-	static const SimParameters * simParams;
+	String key;
+	/* static const SimParameters * simParams; */
 	BigReal mass;
 
-	Vector position;
-	Tensor orientation;
+	Vector3 position;
+	Matrix3 orientation;
 
-	Vector inertia; // diagonal elements of inertia tensor
-	Vector momentum;
-	Vector angularMomentum; // angular momentum along corresponding principal axes
+	Vector3 inertia; // diagonal elements of inertia tensor
+	Vector3 momentum;
+	Vector3 angularMomentum; // angular momentum along corresponding principal axes
     
 	// Langevin
-	Vector langevinTransFriction;
-	Vector langevinRotFriction;
+	Vector3 langevinTransFriction;
+	Vector3 langevinRotFriction;
 	BigReal Temp;
 
-	Vector transDampingCoeff;
-	Vector transForceCoeff;
-	Vector rotDampingCoeff;
-	Vector rotTorqueCoeff;    
+	Vector3 transDampingCoeff;
+	Vector3 transForceCoeff;
+	Vector3 rotDampingCoeff;
+	Vector3 rotTorqueCoeff;    
 
 	// integration
 	int timestep;
-	Vector force;  // lab frame
-	Vector torque; // lab frame (except in integrate())
+	Vector3 force;  // lab frame
+	Vector3 torque; // lab frame (except in integrate())
 
 	bool isFirstStep; 
 
 	// units "kcal_mol/AA * fs"  "(AA/fs) * amu"
 	const BigReal impulse_to_momentum;
 
-	inline Tensor Rx(BigReal t);
-	inline Tensor Ry(BigReal t);
-	inline Tensor Rz(BigReal t);
-	inline Tensor eulerToMatrix(const Vector e);
+	DEVICE inline Matrix3 Rx(BigReal t);
+	DEVICE inline Matrix3 Ry(BigReal t);
+	DEVICE inline Matrix3 Rz(BigReal t);
+	DEVICE inline Matrix3 eulerToMatrix(const Vector3 e);
 };
 
 class RigidBodyController {
 public:
-	RigidBodyController(const NamdState *s, int reductionTag, SimParameters *sp);
-	~RigidBodyController();
-	void integrate(int step);
-	void print(int step);
+	/* DEVICE RigidBodyController(const NamdState *s, int reductionTag, SimParameters *sp); */
+	DEVICE RigidBodyController(const NamdState *s, int reductionTag, SimParameters *sp);
+	
+	DEVICE ~RigidBodyController();
+	DEVICE void integrate(int step);
+	DEVICE void print(int step);
     
 private:
-	void printLegend(std::ofstream &file);
-	void printData(int step, std::ofstream &file);
-
-	SimParameters *simParams;
-	const NamdState * state;		
-    
-	std::ofstream trajFile;
-
-	Random *random;
-	ComputeMgr *computeMgr;
-	RequireReduction *gridReduction;
-
-	ResizeArray<Vector> trans; // would have made these static, but
-	ResizeArray<Tensor> rot;	// there are errors on rigidBody->integrate
-	std::vector<RigidBody*> rigidBodyList;
+	/* void printLegend(std::ofstream &file); */
+	/* void printData(int step, std::ofstream &file); */
+
+	/* SimParameters* simParams; */
+	/* const NamdState* state;		 */    
+	/* std::ofstream trajFile; */
+
+	Random* random;
+	/* RequireReduction *gridReduction; */
+	
+	Vector3* trans; // would have made these static, but
+	Matrix3* rot;  	// there are errors on rigidBody->integrate
+	RigidBody* rigidBodyList;
+	
 };
 
 class RigidBodyParams {
 public:
-    RigidBodyParams() {
-	rigidBodyKey = 0;
-	mass = 0;
-	inertia = Vector(0);
-	langevin = FALSE;
-	temperature = 0;
-	transDampingCoeff = Vector(0);
-	rotDampingCoeff = Vector(0);
-	gridList;
-	position = Vector(0);
-	velocity = Vector(0);
-	orientation = Tensor();
-	orientationalVelocity = Vector(0);	   
-    }    
-    char *rigidBodyKey;
-    BigReal mass;
-    zVector inertia;
-    Bool langevin;
-    BigReal temperature;
-    zVector transDampingCoeff;
-    zVector rotDampingCoeff;
-    std::vector<std::string> gridList;
+	RigidBodyParams() {
+		rigidBodyKey = 0;
+		mass = 0;
+		inertia = Vector3();
+		langevin = FALSE;
+		temperature = 0;
+		transDampingCoeff = Vector3();
+		rotDampingCoeff = Vector3();
+		gridList;
+		position = Vector3();
+		velocity = Vector3();
+		orientation = Matrix3();
+		orientationalVelocity = Vector3();	   
+	}    
+	int typeID;
+
+	String *rigidBodyKey;
+	BigReal mass;
+	Vector3 inertia;
+	Bool langevin;
+	BigReal temperature;
+	Vector3 transDampingCoeff;
+	Vector3 rotDampingCoeff;
+	String *gridList;
+	
     
-    zVector position;
-    zVector velocity;
-    Tensor orientation;
-    zVector orientationalVelocity;
+	Vector3 position;
+	Vector3 velocity;
+	Matrix3 orientation;
+	Vector3 orientationalVelocity;
 
-    RigidBodyParams *next;
+	RigidBodyParams *next;
 
-    const void print();
+	const void print();
 };
 
 
-class RigidBodyParamsList {
-public:
-  RigidBodyParamsList() {
-    clear();
-  }
+/* class RigidBodyParamsList { */
+/* public: */
+/*   RigidBodyParamsList() { */
+/*     clear(); */
+/*   } */
   
-  ~RigidBodyParamsList() 
-  {
-    RBElem* cur;
-    while (head != NULL) {
-      cur = head;
-      head = cur->nxt;
-      delete cur;
-    }
-    clear();
-  }
-  const void print(char *s);
-  const void print();
-
-  // The SimParameters bit copy overwrites these values with illegal pointers,
-  // So thise throws away the garbage and lets everything be reinitialized
-  // from scratch
-  void clear() {
-    head = tail = NULL;
-    n_elements = 0;
-  }
+/*   ~RigidBodyParamsList()  */
+/*   { */
+/*     RBElem* cur; */
+/*     while (head != NULL) { */
+/*       cur = head; */
+/*       head = cur->nxt; */
+/*       delete cur; */
+/*     } */
+/*     clear(); */
+/*   } */
+/*   const void print(char *s); */
+/*   const void print(); */
+
+/*   // The SimParameters bit copy overwrites these values with illegal pointers, */
+/*   // So thise throws away the garbage and lets everything be reinitialized */
+/*   // from scratch */
+/*   void clear() { */
+/*     head = tail = NULL; */
+/*     n_elements = 0; */
+/*   } */
   
-  RigidBodyParams* find_key(const char* key);  
-  int index_for_key(const char* key);
-  RigidBodyParams* add(const char* key);
+/*   RigidBodyParams* find_key(const char* key);   */
+/*   int index_for_key(const char* key); */
+/*   RigidBodyParams* add(const char* key); */
   
-  RigidBodyParams *get_first() {
-    if (head == NULL) {
-      return NULL;
-    } else return &(head->elem);
-  }
+/*   RigidBodyParams *get_first() { */
+/*     if (head == NULL) { */
+/*       return NULL; */
+/*     } else return &(head->elem); */
+/*   } */
   
-  void pack_data(MOStream *msg);  
-  void unpack_data(MIStream *msg);
+/*   void pack_data(MOStream *msg);   */
+/*   void unpack_data(MIStream *msg); */
   
-  // convert from a string to Bool; returns 1(TRUE) 0(FALSE) or -1(if unknown)
-  static int atoBool(const char *s)
-  {
-    if (!strcasecmp(s, "on")) return 1;
-    if (!strcasecmp(s, "off")) return 0;
-    if (!strcasecmp(s, "true")) return 1;
-    if (!strcasecmp(s, "false")) return 0;
-    if (!strcasecmp(s, "yes")) return 1;
-    if (!strcasecmp(s, "no")) return 0;
-    if (!strcasecmp(s, "1")) return 1;
-    if (!strcasecmp(s, "0")) return 0;
-    return -1;
-  }
-
-
-private:
-  class RBElem {
-  public:
-    RigidBodyParams elem;
-    RBElem* nxt;
-  };
-  RBElem* head;
-  RBElem* tail;
-  int n_elements;
-
-};
+/*   // convert from a string to Bool; returns 1(TRUE) 0(FALSE) or -1(if unknown) */
+/*   static int atoBool(const char *s) */
+/*   { */
+/*     if (!strcasecmp(s, "on")) return 1; */
+/*     if (!strcasecmp(s, "off")) return 0; */
+/*     if (!strcasecmp(s, "true")) return 1; */
+/*     if (!strcasecmp(s, "false")) return 0; */
+/*     if (!strcasecmp(s, "yes")) return 1; */
+/*     if (!strcasecmp(s, "no")) return 0; */
+/*     if (!strcasecmp(s, "1")) return 1; */
+/*     if (!strcasecmp(s, "0")) return 0; */
+/*     return -1; */
+/*   } */
+
+
+/* private: */
+/*   class RBElem { */
+/*   public: */
+/*     RigidBodyParams elem; */
+/*     RBElem* nxt; */
+/*   }; */
+/*   RBElem* head; */
+/*   RBElem* tail; */
+/*   int n_elements; */
+
+/* }; */
diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu
new file mode 100644
index 0000000..f893867
--- /dev/null
+++ b/RigidBodyGrid.cu
@@ -0,0 +1,470 @@
+
+//////////////////////////////////////////////////////////////////////
+// Grid base class that does just the basics.
+// Author: Jeff Comer <jcomer2@illinois.edu>
+
+#include "RigidBodyGrid.h"
+#include <cuda.h>
+
+
+#define STRLEN 512
+
+// Initialize the variables that get used a lot.
+// Also, allocate the main value array.
+void RigidBodyGrid::init() {
+	nynz = ny*nz;
+	size = nx*ny*nz;
+	val = new float[size];
+}
+RigidBodyGrid::RigidBodyGrid() {
+	RigidBodyGrid tmp(Matrix3(),Vector3(),1,1,1);
+	val = new float[1];
+	*this = tmp;									// TODO: verify that this is OK
+	
+	// origin = Vector3();
+	// nx = 1;
+	// ny = 1;
+	// nz = 1;
+	
+	// init();
+	// zero();
+}
+
+// The most obvious of constructors.
+RigidBodyGrid::RigidBodyGrid(int nx0, int ny0, int nz0) {
+	nx = abs(nx0);
+	ny = abs(ny0);
+	nz = abs(nz0);
+	
+	init();
+	zero();
+}
+
+// Make an orthogonal grid given the box dimensions and resolution.
+RigidBodyGrid::RigidBodyGrid(Vector3 box, float dx) {
+	dx = fabsf(dx);
+	box.x = fabsf(box.x);
+	box.y = fabsf(box.y);
+	box.z = fabsf(box.z);
+
+	// Tile the grid into the system box.
+	// The grid spacing is always a bit smaller than dx.
+	nx = int(ceilf(box.x/dx));
+	ny = int(ceilf(box.y/dx));
+	nz = int(ceilf(box.z/dx));
+	if (nx <= 0) nx = 1;
+	if (ny <= 0) ny = 1;
+	if (nz <= 0) nz = 1;
+	
+	init();
+	zero();
+}
+
+// The box gives the system geometry.
+// The grid point numbers define the resolution.
+RigidBodyGrid::RigidBodyGrid(Matrix3 box, int nx0, int ny0, int nz0) {
+	nx = nx0;
+	ny = ny0;
+	nz = nz0;
+
+	// Tile the grid into the system box.
+	if (nx <= 0) nx = 1;
+	if (ny <= 0) ny = 1;
+	if (nz <= 0) nz = 1;
+
+	init();
+	zero();
+}
+
+// The box gives the system geometry.
+// dx is the approx. resolution.
+// The grid spacing is always a bit larger than dx.
+RigidBodyGrid::RigidBodyGrid(Matrix3 box, Vector3 origin0, float dx) {
+	dx = fabs(dx);
+	
+	// Tile the grid into the system box.
+	// The grid spacing is always a bit larger than dx.
+	nx = int(floor(box.ex().length()/dx))-1;
+	ny = int(floor(box.ey().length()/dx))-1;
+	nz = int(floor(box.ez().length()/dx))-1;
+	if (nx <= 0) nx = 1;
+	if (ny <= 0) ny = 1;
+	if (nz <= 0) nz = 1;
+
+	init();
+	zero();
+}
+
+// The box gives the system geometry.
+// dx is the approx. resolution.
+// The grid spacing is always a bit smaller than dx.
+RigidBodyGrid::RigidBodyGrid(Matrix3 box, float dx) {
+	dx = fabs(dx);
+	
+	// Tile the grid into the system box.
+	// The grid spacing is always a bit smaller than dx.
+	nx = int(ceilf(box.ex().length()/dx));
+	ny = int(ceilf(box.ey().length()/dx));
+	nz = int(ceilf(box.ez().length()/dx));
+	if (nx <= 0) nx = 1;
+	if (ny <= 0) ny = 1;
+	if (nz <= 0) nz = 1;
+
+	init();
+	zero();
+}
+
+// Make an exact copy of a grid.
+RigidBodyGrid::RigidBodyGrid(const RigidBodyGrid& g) {
+	nx = g.nx;
+	ny = g.ny;
+	nz = g.nz;
+	
+	init();
+	for (int i = 0; i < size; i++) val[i] = g.val[i];
+}
+
+RigidBodyGrid RigidBodyGrid::mult(const RigidBodyGrid& g) {
+	for (int i = 0; i < size; i++) val[i] *= g.val[i];
+	return *this;
+}
+
+RigidBodyGrid& RigidBodyGrid::operator=(const RigidBodyGrid& g) {
+	delete[] val;
+	val = NULL;
+	nx = g.nx;
+	ny = g.ny;
+	nz = g.nz;
+	
+	init();
+	for (int i = 0; i < size; i++) val[i] = g.val[i];
+
+	return *this;
+}
+
+
+// Make a copy of a grid, but at a different resolution.
+RigidBodyGrid::RigidBodyGrid(const RigidBodyGrid& g, int nx0, int ny0, int nz0) : nx(nx0),  ny(ny0), nz(nz0) {
+	if (nx <= 0) nx = 1;
+	if (ny <= 0) ny = 1;
+	if (nz <= 0) nz = 1;
+
+	// Tile the grid into the box of the template grid.
+	Matrix3 box = g.getBox();
+
+	init();
+
+	// Do an interpolation to obtain the values.
+	for (int i = 0; i < size; i++) {
+		Vector3 r = getPosition(i);
+		val[i] = g.interpolatePotential(r);
+	}
+}
+
+RigidBodyGrid::~RigidBodyGrid() {
+	if (val != NULL)
+		delete[] val;
+}
+
+void RigidBodyGrid::zero() {
+	for (int i = 0; i < size; i++) val[i] = 0.0f;
+}
+
+bool RigidBodyGrid::setValue(int j, float v) {
+	if (j < 0 || j >= size) return false;
+	val[j] = v;
+	return true;
+}
+
+bool RigidBodyGrid::setValue(int ix, int iy, int iz, float v) {
+	if (ix < 0 || ix >= nx) return false;
+	if (iy < 0 || iy >= ny) return false;
+	if (iz < 0 || iz >= nz) return false;
+	int j = iz + iy*nz + ix*ny*nz;
+
+	val[j] = v;
+	return true;
+}
+
+float RigidBodyGrid::getValue(int j) const {
+	if (j < 0 || j >= size) return 0.0f;
+	return val[j];
+}
+
+float RigidBodyGrid::getValue(int ix, int iy, int iz) const {
+	if (ix < 0 || ix >= nx) return 0.0f;
+	if (iy < 0 || iy >= ny) return 0.0f;
+	if (iz < 0 || iz >= nz) return 0.0f;
+	
+	int j = iz + iy*nz + ix*ny*nz;
+	return val[j];
+}
+
+Vector3 RigidBodyGrid::getPosition(int j) const {
+	int iz = j%nz;
+	int iy = (j/nz)%ny;
+	int ix = j/(nz*ny);
+
+	return Vector3(ix,iy,iz);
+}
+Vector3 RigidBodyGrid::getPosition(int j, Matrix3 basis, Vector3 origin) const {
+	int iz = j%nz;
+	int iy = (j/nz)%ny;
+	int ix = j/(nz*ny);
+
+	return basis.transform(Vector3(ix, iy, iz)) + origin;
+}
+
+// // Does the point r fall in the grid?
+// // Obviously this is without periodic boundary conditions.
+// bool RigidBodyGrid::inGrid(Vector3 r,) const {
+// 	Vector3 l = basisInv.transform(r-origin);
+
+// 	if (l.x < 0.0f || l.x >= nx) return false;
+// 	if (l.y < 0.0f || l.y >= ny) return false;
+// 	if (l.z < 0.0f || l.z >= nz) return false;
+// 	return true;
+// }
+
+// bool RigidBodyGrid::inGridInterp(Vector3 r) const {
+// 	Vector3 l = basisInv.transform(r-origin);
+
+// 	if (l.x < 2.0f || l.x >= nx-3.0f) return false;
+// 	if (l.y < 2.0f || l.y >= ny-3.0f) return false;
+// 	if (l.z < 2.0f || l.z >= nz-3.0f) return false;
+// 	return true;
+// }
+
+// Vector3 RigidBodyGrid::transformTo(Vector3 r) const {
+// 	return basisInv.transform(r-origin);
+// }
+// Vector3 RigidBodyGrid::transformFrom(Vector3 l) const {
+// 	return basis.transform(l) + origin;
+// }
+
+IndexList RigidBodyGrid::index(int j) const {
+	int iz = j%nz;
+	int iy = (j/nz)%ny;
+	int ix = j/(nz*ny);
+	IndexList ret;
+	ret.add(ix);
+	ret.add(iy);
+	ret.add(iz);
+	return ret;
+}
+int RigidBodyGrid::indexX(int j) const { return j/(nz*ny); }
+int RigidBodyGrid::indexY(int j) const { return (j/nz)%ny; }
+int RigidBodyGrid::indexZ(int j) const { return j%nz; }
+int RigidBodyGrid::index(int ix, int iy, int iz) const { return iz + iy*nz + ix*ny*nz; }
+
+// int RigidBodyGrid::index(Vector3 r) const {
+// 	Vector3 l = basisInv.transform(r-origin);
+	
+// 	int ix = int(floor(l.x));
+// 	int iy = int(floor(l.y));
+// 	int iz = int(floor(l.z));
+
+// 	ix = wrap(ix, nx);
+// 	iy = wrap(iy, ny);
+// 	iz = wrap(iz, nz);
+	
+// 	return iz + iy*nz + ix*ny*nz;
+// }
+
+// int RigidBodyGrid::nearestIndex(Vector3 r) const {
+// 	Vector3 l = basisInv.transform(r-origin);
+	
+// 	int ix = int(floorf(l.x + 0.5f));
+// 	int iy = int(floorf(l.y + 0.5f));
+// 	int iz = int(floorf(l.z + 0.5f));
+
+// 	ix = wrap(ix, nx);
+// 	iy = wrap(iy, ny);
+// 	iz = wrap(iz, nz);
+	
+// 	return iz + iy*nz + ix*ny*nz;
+// }
+
+}
+
+// Add a fixed value to the grid.
+void RigidBodyGrid::shift(float s) {
+	for (int i = 0; i < size; i++) val[i] += s;
+}
+
+// Multiply the grid by a fixed value.
+void RigidBodyGrid::scale(float s) {
+	for (int i = 0; i < size; i++) val[i] *= s;
+}
+
+// Get the mean of the entire grid.
+float RigidBodyGrid::mean() const {
+	float sum = 0.0f;
+	for (int i = 0; i < size; i++) sum += val[i];
+	return sum/size;
+}
+
+// Get the potential at the closest node.
+float RigidBodyGrid::getPotential(Vector3 pos) const {
+	// Find the nearest node.
+	int j = nearestIndex(pos);
+
+	return val[j];
+}
+
+
+// Added by Rogan for times when simpler calculations are required.
+float RigidBodyGrid::interpolatePotentialLinearly(Vector3 pos) const {
+	// Find the home node.
+	Vector3 l = pos;
+	int homeX = int(floorf(l.x));
+	int homeY = int(floorf(l.y));
+	int homeZ = int(floorf(l.z));
+	
+	// Get the array jumps.
+	int jump[3];
+	jump[0] = nz*ny;
+	jump[1] = nz;
+	jump[2] = 1;
+
+	// Shift the indices in the home array.
+	int home[3];
+	home[0] = homeX;
+	home[1] = homeY;
+	home[2] = homeZ;
+
+	// Get the grid dimensions.
+	int g[3];
+	g[0] = nx;
+	g[1] = ny;
+	g[2] = nz;
+
+	// Get the interpolation coordinates.
+	float w[3];
+	w[0] = l.x - homeX;
+	w[1] = l.y - homeY;
+	w[2] = l.z - homeZ;
+
+	// Find the values at the neighbors.
+	float g1[2][2][2];
+	for (int ix = 0; ix < 2; ix++) {
+		for (int iy = 0; iy < 2; iy++) {
+			for (int iz = 0; iz < 2; iz++) {
+				// Wrap around the periodic boundaries. 
+				int jx = ix + home[0];
+				jx = wrap(jx, g[0]);
+				int jy = iy + home[1];
+				jy = wrap(jy, g[1]);
+				int jz = iz + home[2];
+				jz = wrap(jz, g[2]);
+				
+				int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+				g1[ix][iy][iz] = val[ind];
+			}
+		}
+	}
+
+	// Mix along x.
+	float g2[2][2];
+	for (int iy = 0; iy < 2; iy++) {
+		for (int iz = 0; iz < 2; iz++) {
+			// p = w[0] * g[0][iy][iz] + (1-w[0]) * g[1][iy][iz]
+			g2[iy][iz] = (1.0f-w[0])*g1[0][iy][iz] + w[0]*g1[1][iy][iz];
+		}
+	}
+
+	// Mix along y.
+	float g3[2];
+	for (int iz = 0; iz < 2; iz++) {
+		g3[iz] = (1.0f-w[1])*g2[0][iz] + w[1]*g2[1][iz];
+	}
+
+	// DEBUG
+	//printf("(0,0,0)=%.1f (0,0,1)=%.1f (0,1,0)=%.1f (0,1,1)=%.1f (1,0,0)=%.1f (1,0,1)=%.1f (1,1,0)=%.1f (1,1,1)=%.1f ",
+	//   g1[0][0][0], g1[0][0][1], g1[0][1][0], g1[0][1][1], g1[1][0][0], g1[1][0][1], g1[1][1][0], g1[1][1][1] );
+	//printf ("%.2f\n",(1.0-w[2])*g3[0] + w[2]*g3[1]);
+
+	// Mix along z
+	return (1.0f-w[2])*g3[0] + w[2]*g3[1];
+}
+
+
+
+// Vector3 RigidBodyGrid::wrapDiffNearest(Vector3 r) const {
+// 	Vector3 l = r;
+// 	l.x = wrapDiff(l.x, nx);
+// 	l.y = wrapDiff(l.y, ny);
+// 	l.z = wrapDiff(l.z, nz);
+
+// 	float length2 = l.length2();
+
+// 	for (int dx = -1; dx <= 1; dx++) {
+// 		for (int dy = -1; dy <= 1; dy++) {
+// 			for (int dz = -1; dz <= 1; dz++) {
+// 				//if (dx == 0 && dy == 0 && dz == 0) continue;
+// 				Vector3 tmp = Vector3(l.x+dx*nx, l.y+dy*ny, l.z+dz*nz);
+// 				if (tmp.length2() < length2) {
+// 					l = tmp;
+// 					length2 = l.length2();
+// 				}
+// 			}
+// 		}
+// 	}
+
+// 	return l;
+// 	// return basis.transform(l);
+// }
+
+
+// Includes the home node.
+// indexBuffer must have a size of at least 27.
+void RigidBodyGrid::getNeighbors(int j, int* indexBuffer) const {
+	int jx = indexX(j);
+	int jy = indexY(j);
+	int jz = indexZ(j);
+
+	int k = 0;
+	for (int ix = -1; ix <= 1; ix++) {
+		for (int iy = -1; iy <= 1; iy++) {
+			for (int iz = -1; iz <= 1; iz++) {
+				int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + nynz*wrap(jx+ix,nx);
+				indexBuffer[k] = ind;
+				k++;
+			}
+		}
+	}
+}
+
+// Includes the home node.
+// indexBuffer must have a size of at least 27.
+void RigidBodyGrid::getNeighbors(int j, int* indexBuffer) const {
+	int jx = indexX(j);
+	int jy = indexY(j);
+	int jz = indexZ(j);
+
+	int k = 0;
+	for (int ix = -1; ix <= 1; ix++) {
+		for (int iy = -1; iy <= 1; iy++) {
+			for (int iz = -1; iz <= 1; iz++) {
+				int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + nynz*wrap(jx+ix,nx);
+				indexBuffer[k] = ind;
+				k++;
+			}
+		}
+	}
+}
+
+
+// Get the values at the neighbors of a node.
+// Note that homeX, homeY, and homeZ do not need to be wrapped,
+// since we do it here.
+void RigidBodyGrid::getNeighborValues(NeighborList* neigh, int homeX, int homeY, int homeZ) const {
+	for (int ix = -1; ix <= 1; ix++) {
+		for (int iy = -1; iy <= 1; iy++) {
+			for (int iz = -1; iz <= 1; iz++) {
+				int ind = wrap(homeZ+iz,nz) + nz*wrap(homeY+iy,ny) + nynz*wrap(homeX+ix,nx);
+				neigh->v[ix+1][iy+1][iz+1] = val[ind];
+			}
+		}
+	}
+}  
diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h
new file mode 100644
index 0000000..8453332
--- /dev/null
+++ b/RigidBodyGrid.h
@@ -0,0 +1,519 @@
+//////////////////////////////////////////////////////////////////////
+// Copy of BaseGrid with some modificaitons
+// 
+
+#pragma once
+
+#ifdef __CUDACC__
+    #define HOST __host__
+    #define DEVICE __device__
+#else
+    #define HOST 
+    #define DEVICE 
+#endif
+
+#include "useful.h"
+#include <cmath>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <cuda.h>
+
+
+using namespace std;
+
+#define STRLEN 512
+
+class NeighborList {
+public:
+  float v[3][3][3];
+};
+
+class RigidBodyGrid { 
+private:
+  // Initialize the variables that get used a lot.
+  // Also, allocate the main value array.
+  void init();
+
+public:
+
+	/*                               \
+	| CONSTRUCTORS, DESTRUCTORS, I/O |
+	\===============================*/
+	
+	// RBTODO Fix?
+	RigidBodyGrid(); // cmaffeo2 (2015) moved this out of protected, cause I wanted RigidBodyGrid in a struct
+  // The most obvious of constructors.
+		RigidBodyGrid(int nx0, int ny0, int nz0);
+
+  // Make an orthogonal grid given the box dimensions and resolution.
+  RigidBodyGrid(Vector3 box, float dx);
+
+  // The box gives the system geometry.
+  // The grid point numbers define the resolution.
+  RigidBodyGrid(Matrix3 box, int nx0, int ny0, int nz0);
+
+  // The box gives the system geometry.
+  // dx is the approx. resolution.
+  // The grid spacing is always a bit larger than dx.
+  RigidBodyGrid(Matrix3 box, Vector3 origin0, float dx);
+
+  // The box gives the system geometry.
+  // dx is the approx. resolution.
+  // The grid spacing is always a bit smaller than dx.
+  RigidBodyGrid(Matrix3 box, float dx);
+
+  // Make an exact copy of a grid.
+  RigidBodyGrid(const RigidBodyGrid& g);
+
+  RigidBodyGrid mult(const RigidBodyGrid& g);
+
+  RigidBodyGrid& operator=(const RigidBodyGrid& g);
+
+  // Make a copy of a grid, but at a different resolution.
+  RigidBodyGrid(const RigidBodyGrid& g, int nx0, int ny0, int nz0);
+  
+	virtual ~RigidBodyGrid();
+
+	/*             \
+	| DATA METHODS |
+	\=============*/
+		
+	void zero();
+  
+  bool setValue(int j, float v);
+
+  bool setValue(int ix, int iy, int iz, float v);
+
+  virtual float getValue(int j) const;
+
+  virtual float getValue(int ix, int iy, int iz) const;
+
+  // Vector3 getPosition(int ix, int iy, int iz) const;
+  Vector3 getPosition(int j) const;
+	Vector3 getPosition(int j, Matrix3 basis, Vector3 origin) const {
+
+  /* // Does the point r fall in the grid? */
+  /* // Obviously this is without periodic boundary conditions. */
+  /* bool inGrid(Vector3 r) const; */
+
+  /* bool inGridInterp(Vector3 r) const; */
+
+  /* Vector3 transformTo(Vector3 r) const; */
+
+  /* Vector3 transformFrom(Vector3 l) const; */
+
+  IndexList index(int j) const;
+  int indexX(int j) const;
+  int indexY(int j) const;
+  int indexZ(int j) const;
+  int index(int ix, int iy, int iz) const;
+  
+  /* int index(Vector3 r) const; */
+  /* int nearestIndex(Vector3 r) const; */
+
+  HOST DEVICE inline int length() const {
+		return size;
+	}
+
+  HOST DEVICE inline int getNx() const {return nx;}
+  HOST DEVICE inline int getNy() const {return ny;}
+  HOST DEVICE inline int getNz() const {return nz;}
+  HOST DEVICE inline int getSize() const {return size;}
+
+  
+  // Add a fixed value to the grid.
+  void shift(float s);
+
+  // Multiply the grid by a fixed value.
+  void scale(float s);
+
+	/*         \
+	| COMPUTED |
+	\=========*/
+	
+  // Get the mean of the entire grid.
+  float mean() const
+	
+  // Get the potential at the closest node.
+  virtual float getPotential(Vector3 pos) const;
+
+	// Added by Rogan for times when simpler calculations are required.
+  virtual float interpolatePotentialLinearly(Vector3 pos) const;
+
+  HOST DEVICE inline float interpolateDiffX(Vector3 pos, float w[3], float g1[4][4][4]) const {
+    float a0, a1, a2, a3;
+
+		// RBTODO parallelize loops?
+		
+    // Mix along x, taking the derivative.
+    float g2[4][4];
+    for (int iy = 0; iy < 4; iy++) {
+      for (int iz = 0; iz < 4; iz++) {
+				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+				a0 = g1[1][iy][iz];
+
+				//g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+				g2[iy][iz] = 3.0f*a3*w[0]*w[0] + 2.0f*a2*w[0] + a1;
+      }
+    }
+
+
+    // Mix along y.
+    float g3[4];
+    for (int iz = 0; iz < 4; iz++) {
+      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+      a0 = g2[1][iz];
+   
+      g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+    }
+
+    // Mix along z.
+    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+    a1 = 0.5f*(-g3[0] + g3[2]);
+    a0 = g3[1];
+ 
+    float retval = -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
+    return retval;
+  }
+
+  HOST DEVICE inline float interpolateDiffY(Vector3 pos, float w[3], float g1[4][4][4]) const {
+    float a0, a1, a2, a3;
+  
+    // Mix along x, taking the derivative.
+    float g2[4][4];
+    for (int iy = 0; iy < 4; iy++) {
+      for (int iz = 0; iz < 4; iz++) {
+				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+				a0 = g1[1][iy][iz];
+
+				g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+      }
+    }
+
+    // Mix along y.
+    float g3[4];
+    for (int iz = 0; iz < 4; iz++) {
+      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+      a0 = g2[1][iz];
+   
+      //g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+      g3[iz] = 3.0f*a3*w[1]*w[1] + 2.0f*a2*w[1] + a1;
+    }
+
+    // Mix along z.
+    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+    a1 = 0.5f*(-g3[0] + g3[2]);
+    a0 = g3[1];
+
+    return -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
+  }
+
+  HOST DEVICE inline float interpolateDiffZ(Vector3 pos, float w[3], float g1[4][4][4]) const {
+    float a0, a1, a2, a3;
+  
+    // Mix along x, taking the derivative.
+    float g2[4][4];
+    for (int iy = 0; iy < 4; iy++) {
+      for (int iz = 0; iz < 4; iz++) {
+				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+				a0 = g1[1][iy][iz];
+
+				g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+      }
+    }
+
+    // Mix along y.
+    float g3[4];
+    for (int iz = 0; iz < 4; iz++) {
+      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+      a0 = g2[1][iz];
+   
+      g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+    }
+
+    // Mix along z.
+    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+    a1 = 0.5f*(-g3[0] + g3[2]);
+    a0 = g3[1];
+
+    return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1);
+  }
+
+	// RBTODO overload with optimized algorithm
+	//  skip transforms (assume identity basis)
+  HOST DEVICE inline float interpolatePotential(Vector3 pos) const {
+    // Find the home node.
+    Vector3 l = pos;
+    int homeX = int(floor(l.x));
+    int homeY = int(floor(l.y));
+    int homeZ = int(floor(l.z));
+
+		// out of grid? return 0
+		// RBTODO
+			
+    // Get the array jumps.
+    int jump[3];
+    jump[0] = nz*ny;
+    jump[1] = nz;
+    jump[2] = 1;
+
+    // Shift the indices in the home array.
+    int home[3];
+    home[0] = homeX;
+    home[1] = homeY;
+    home[2] = homeZ;
+
+    // Get the grid dimensions.
+    int g[3];
+    g[0] = nx;
+    g[1] = ny;
+    g[2] = nz;
+  
+    // Get the interpolation coordinates.
+    float w[3];
+    w[0] = l.x - homeX;
+    w[1] = l.y - homeY;
+    w[2] = l.z - homeZ;
+
+    // Find the values at the neighbors.
+    float g1[4][4][4];
+    for (int ix = 0; ix < 4; ix++) {
+	  	int jx = ix-1 + home[0];
+	  	jx = wrap(jx, g[0]);
+      for (int iy = 0; iy < 4; iy++) {
+	  		int jy = iy-1 + home[1];
+	  		jy = wrap(jy, g[1]);
+				for (int iz = 0; iz < 4; iz++) {
+	  			// Wrap around the periodic boundaries. 
+	  			int jz = iz-1 + home[2];
+	  			jz = wrap(jz, g[2]);
+	  
+	  			int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+	  			g1[ix][iy][iz] = val[ind];
+				}
+      }
+    }
+    float a0, a1, a2, a3;
+  
+    // Mix along x.
+    float g2[4][4];
+    for (int iy = 0; iy < 4; iy++) {
+      for (int iz = 0; iz < 4; iz++) {
+				a3 = 0.5f*(-g1[0][iy][iz] + 3.0f*g1[1][iy][iz] - 3.0f*g1[2][iy][iz] + g1[3][iy][iz]);
+				a2 = 0.5f*(2.0f*g1[0][iy][iz] - 5.0f*g1[1][iy][iz] + 4.0f*g1[2][iy][iz] - g1[3][iy][iz]);
+				a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]);
+				a0 = g1[1][iy][iz];
+
+				g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0;
+      }
+    }
+
+    // Mix along y.
+    float g3[4];
+    for (int iz = 0; iz < 4; iz++) {
+      a3 = 0.5f*(-g2[0][iz] + 3.0f*g2[1][iz] - 3.0f*g2[2][iz] + g2[3][iz]);
+      a2 = 0.5f*(2.0f*g2[0][iz] - 5.0f*g2[1][iz] + 4.0f*g2[2][iz] - g2[3][iz]);
+      a1 = 0.5f*(-g2[0][iz] + g2[2][iz]);
+      a0 = g2[1][iz];
+   
+      g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0;
+    }
+
+    // Mix along z.
+    a3 = 0.5f*(-g3[0] + 3.0f*g3[1] - 3.0f*g3[2] + g3[3]);
+    a2 = 0.5f*(2.0f*g3[0] - 5.0f*g3[1] + 4.0f*g3[2] - g3[3]);
+    a1 = 0.5f*(-g3[0] + g3[2]);
+    a0 = g3[1];
+
+    return a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0;
+  }
+
+  HOST DEVICE inline static int wrap(int i, int n) {
+		if (i < 0) {
+			i %= n;
+			i += n;
+		}
+		// The portion above allows i == n, so no else keyword.
+		if (i >= n) {
+			i %= n;
+		} 
+		return i;
+	}
+
+	/** interpolateForce() to be used on CUDA Device **/
+	DEVICE inline Vector3 interpolateForceD(Vector3 pos) const {
+		Vector3 f;
+ 		Vector3 l = pos;
+		int homeX = int(floor(l.x));
+		int homeY = int(floor(l.y));
+		int homeZ = int(floor(l.z));
+		// Get the array jumps with shifted indices.
+		int jump[3];
+		jump[0] = nz*ny;
+		jump[1] = nz;
+		jump[2] = 1;
+		// Shift the indices in the home array.
+		int home[3];
+		home[0] = homeX;
+		home[1] = homeY;
+		home[2] = homeZ;
+
+		// Shift the indices in the grid dimensions.
+		int g[3];
+		g[0] = nx;
+		g[1] = ny;
+		g[2] = nz;
+
+		// Get the interpolation coordinates.
+		float w[3];
+		w[0] = l.x - homeX;
+		w[1] = l.y - homeY;
+		w[2] = l.z - homeZ;
+		// Find the values at the neighbors.
+		float g1[4][4][4];
+		for (int ix = 0; ix < 4; ix++) {
+			for (int iy = 0; iy < 4; iy++) {
+				for (int iz = 0; iz < 4; iz++) {
+	  			// Wrap around the periodic boundaries. 
+					int jx = ix-1 + home[0];
+					jx = wrap(jx, g[0]);
+					int jy = iy-1 + home[1];
+					jy = wrap(jy, g[1]);
+					int jz = iz-1 + home[2];
+					jz = wrap(jz, g[2]);
+					int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+					g1[ix][iy][iz] = val[ind];
+				}
+			}
+		}  
+		f.x = interpolateDiffX(pos, w, g1);
+		f.y = interpolateDiffY(pos, w, g1);
+		f.z = interpolateDiffZ(pos, w, g1);
+		// Vector3 f1 = basisInv.transpose().transform(f);
+		// return f1;
+		return f;
+	}
+
+  inline virtual Vector3 interpolateForce(Vector3 pos) const {
+		Vector3 f;
+ 		Vector3 l = pos;
+		int homeX = int(floor(l.x));
+		int homeY = int(floor(l.y));
+		int homeZ = int(floor(l.z));
+		// Get the array jumps with shifted indices.
+		int jump[3];
+		jump[0] = nz*ny;
+		jump[1] = nz;
+		jump[2] = 1;
+		// Shift the indices in the home array.
+		int home[3];
+		home[0] = homeX;
+		home[1] = homeY;
+		home[2] = homeZ;
+
+		// Shift the indices in the grid dimensions.
+		int g[3];
+		g[0] = nx;
+		g[1] = ny;
+		g[2] = nz;
+
+		// Get the interpolation coordinates.
+		float w[3];
+		w[0] = l.x - homeX;
+		w[1] = l.y - homeY;
+		w[2] = l.z - homeZ;
+		// Find the values at the neighbors.
+		float g1[4][4][4];
+		//RBTODO parallelize?
+		for (int ix = 0; ix < 4; ix++) {
+			for (int iy = 0; iy < 4; iy++) {
+				for (int iz = 0; iz < 4; iz++) {
+	  			// Wrap around the periodic boundaries. 
+					int jx = ix-1 + home[0];
+					jx = wrap(jx, g[0]);
+					int jy = iy-1 + home[1];
+					jy = wrap(jy, g[1]);
+					int jz = iz-1 + home[2];
+					jz = wrap(jz, g[2]);
+					int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+					g1[ix][iy][iz] = val[ind];
+				}
+			}
+		}  
+		f.x = interpolateDiffX(pos, w, g1);
+		f.y = interpolateDiffY(pos, w, g1);
+		f.z = interpolateDiffZ(pos, w, g1);
+		// Vector3 f1 = basisInv.transpose().transform(f);
+		// return f1;
+		return f;
+	}
+
+  // Wrap coordinate: 0 <= x < l
+  HOST DEVICE inline float wrapFloat(float x, float l) const {
+		int image = int(floor(x/l));
+		x -= image*l;
+		return x;
+  }
+  
+  // Wrap distance: -0.5*l <= x < 0.5*l
+  HOST DEVICE static inline float wrapDiff(float x, float l) {
+		int image = int(floor(x/l));
+		x -= image*l;
+		if (x >= 0.5f * l)
+			x -= l;
+		return x;
+  }
+
+  // Wrap vector, 0 <= x < lx  &&  0 <= y < ly  &&  0 <= z < lz
+  HOST DEVICE inline virtual Vector3 wrap(Vector3 l) const {
+    l.x = wrapFloat(l.x, nx);
+    l.y = wrapFloat(l.y, ny);
+    l.z = wrapFloat(l.z, nz);
+    return l;
+  }
+
+  // Wrap vector distance, -0.5*l <= x < 0.5*l  && ...
+  HOST DEVICE inline Vector3 wrapDiff(Vector3 l) const {
+    l.x = wrapDiff(l.x, nx);
+    l.y = wrapDiff(l.y, ny);
+    l.z = wrapDiff(l.z, nz);
+		return l;
+  }
+
+  /* Vector3 wrapDiffNearest(Vector3 r) const; */
+
+  // Includes the home node.
+  // indexBuffer must have a size of at least 27.
+  void getNeighbors(int j, int* indexBuffer) const;
+  
+  // Get the values at the neighbors of a node.
+  // Note that homeX, homeY, and homeZ do not need to be wrapped,
+  // since we do it here.
+  void getNeighborValues(NeighborList* neigh, int homeX, int homeY, int homeZ) const;
+  inline void setVal(float* v) { val = v; }
+
+public:
+  int nx, ny, nz;
+  int nynz;
+  int size;
+public:
+  float* val;
+};
+
diff --git a/RigidBodyType.cu b/RigidBodyType.cu
index b587174..30322be 100644
--- a/RigidBodyType.cu
+++ b/RigidBodyType.cu
@@ -11,6 +11,9 @@ void RigidBodyType::clear() {
 	potentialGrids.clear();
 	densityGrids.clear();
 
+	potentialGridKeys.clear();
+	densityGridKeys.clear();
+
 	if (numPotGrids > 0) delete[] rawPotentialGrids;
 	if (numDenGrids > 0) delete[] rawDensityGrids;
 	rawPotentialGrids = NULL;
@@ -67,8 +70,7 @@ void RigidBodyType::addPotentialGrid(String s) {
 	BaseGrid g(token[1]);
 	
 	potentialGrids.push_back( g );
-	// 	[numPotGrids] = g;
-	// numPotGrids++;
+	potentialGridKeys.push_back( key );
 }
 void RigidBodyType::addDensityGrid(String s) {
 	// tokenize and return
@@ -82,7 +84,9 @@ void RigidBodyType::addDensityGrid(String s) {
 	String key = token[0];
 	BaseGrid g(token[1]);
 	
+
 	densityGrids.push_back( g );
+	densityGridKeys.push_back( key );
 }
 
 void RigidBodyType::updateRaw() {
diff --git a/RigidBodyType.h b/RigidBodyType.h
index d20f1e6..4c334dd 100644
--- a/RigidBodyType.h
+++ b/RigidBodyType.h
@@ -50,6 +50,9 @@ public:
 	Vector3 transDamping;
 	Vector3 rotDamping;
 
+	std::vector<String> potentialGridKeys;
+	std::vector<String> densityGridKeys;
+
 	std::vector<BaseGrid> potentialGrids;
 	std::vector<BaseGrid> densityGrids;
 	
diff --git a/notes.org b/notes.org
index 9cf5d48..c6ee171 100644
--- a/notes.org
+++ b/notes.org
@@ -1 +1,24 @@
-* Q: vector classes in thrust look like they could simplify code; bad idea?
+* grid
+** Q: overhead of dynamic parallelism?
+** Q: overhead of classes? 
+	 Member data can't be shared
+
+** TODO read up on:
+*** dynamic parallelism
+*** registers
+* TODO decide whether to write a monolithic kernel or to break problem into subkernel launches
+
+
+
+* optimize algorithms for cuda (balance floating point calculation with memory transfers)
+** parallize loops
+** subthreads
+** new algorithms may be improved through new data structures; introduce 
+*** RigidBodyGrid: methods pulled from BaseGrid, but transforms passed to functions 
+**** no wrapping
+
+* pairlists for rigid bodies 
+** maybe for grids, depending on parallel structure of code
+
+* other ideas
+** interpolate density grids?
diff --git a/runBrownTown.cpp b/runBrownTown.cpp
index 4cd7714..5de068f 100644
--- a/runBrownTown.cpp
+++ b/runBrownTown.cpp
@@ -103,9 +103,7 @@ int main(int argc, char* argv[]) {
 	GPUManager::init();
 	GPUManager::safe(safe);
 	Configuration config(configFile, replicas, debug);
-	printf("Done with config\n");
 	GPUManager::set(0);
-	printf("Done with gpumanager\n");
 	config.copyToCUDA();
 
 	GrandBrownTown brown(config, outArg, randomSeed,
diff --git a/useful.h b/useful.h
index 78b6efc..72070e8 100644
--- a/useful.h
+++ b/useful.h
@@ -244,7 +244,7 @@ class Matrix3 {
 public:
 	HOST DEVICE inline Matrix3() {}
 	Matrix3(float s);
-	Matrix3(float xx, float xy, float xz, float yx, float yy, float yz, float zx, float zy, float zz);
+	HOST DEVICE Matrix3(float xx, float xy, float xz, float yx, float yy, float yz, float zx, float zy, float zz);
 	Matrix3(float x, float y, float z);
 	Matrix3(const Vector3& ex, const Vector3& ey, const Vector3& ez);
 	Matrix3(const float* d);
-- 
GitLab