From bba585bba7cba1ce2a8f69df98035a72f9569219 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 17 Nov 2015 16:09:34 -0600
Subject: [PATCH] worked on rigidbody classes

---
 BaseGrid.h                                    |   2 +-
 ComputeGridGrid.cuh                           |  46 +--
 Configuration.cpp                             |  31 +-
 Configuration.h                               |   8 +-
 Debug.h                                       |   2 +-
 GrandBrownTown.cu                             |  18 +-
 GrandBrownTown.cuh                            |   2 -
 GrandBrownTown.h                              |  10 +-
 RigidBody.c                                   | 265 ------------------
 RigidBody.cu                                  | 226 +++++++++++++++
 RigidBody.h                                   | 252 ++++-------------
 ...BodyController.c => RigidBodyController.cu | 132 +++++++--
 RigidBodyController.h                         |  47 ++++
 RigidBodyGrid.cu                              |  39 +--
 RigidBodyGrid.h                               |  81 +++---
 RigidBodyType.cu                              |  67 ++++-
 RigidBodyType.h                               |  19 +-
 makefile                                      |   2 +-
 notes.org                                     | 191 ++++++++++++-
 useful.h                                      |  22 +-
 20 files changed, 820 insertions(+), 642 deletions(-)
 delete mode 100644 RigidBody.c
 create mode 100644 RigidBody.cu
 rename RigidBodyController.c => RigidBodyController.cu (78%)
 create mode 100644 RigidBodyController.h

diff --git a/BaseGrid.h b/BaseGrid.h
index bcdda0b..e3032d5 100644
--- a/BaseGrid.h
+++ b/BaseGrid.h
@@ -175,7 +175,7 @@ public:
 	\=========*/
 	
   // Get the mean of the entire grid.
-  float mean() const
+  float mean() const;
 	
   // Compute the average profile along an axis.
   // Assumes that the grid axis with index "axis" is aligned with the world axis of index "axis".
diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index 11e2f04..50a6ba6 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -1,34 +1,34 @@
 #pragma once
 #include "RigidBodyGrid.h"
+#include "useful.h"
 
 __global__
-void computeGridGridForce(RigidBodyGrid* rho, RigidBodyGrid* u) {
-	unsigned int bidx = blockIdx.x;
-	unsigned int tidx = threadIdx.x;
-
-	// GTX 980 has ~ 4 x 10x10x10 floats available for shared memory
-	//    but code should work without access to this
-	
-	// RBTODO rewrite to use shared memory, break problem into subgrids
-	// will lookups of 
-
-// http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
-
-	const unsigned int r_id = bidx * blockDim.x + threadIdx.x;
+void computeGridGridForce(const RigidBodyGrid& rho, const RigidBodyGrid& u,
+													Matrix3 basis_rho, Vector3 origin_rho,
+													Matrix3 basis_u,   Vector3 origin_u) {
+  // RBTODO http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+	const unsigned int r_id = blockIdx.x * blockDim.x + threadIdx.x;
 	
 	// RBTODO parallelize transform
-	if (r_id > rho->size)					// skip threads with no data 
+	if (r_id > rho.size)					// skip threads with no data 
 		return;
-
-	// Tile grid data into shared memory
+	
+	// Maybe: Tile grid data into shared memory
 	//   RBTODO: think about localizing regions of grid data
-	Vector3 p = rho->getPosition(r_id);
-	float val = rho->val[r_id];
+	Vector3 p = rho.getPosition(r_id, basis_rho, origin_rho);
+	float val = rho.val[r_id];
+
+	// RBTODO potential basis and rho!
+	
+	// RBTODO combine interp methods and reduce repetition! 
+	float energy = val*u.interpolatePotential(p); 
+	Vector3 f = val*u.interpolateForceD(p);
+	Vector3 t = p.cross(f);				// test if sign is correct!
 
-	// RBTODO reduce these
+	// RBTODO reduce forces and torques
 	// http://www.cuvilib.com/Reduction.pdf
-	float energy = ;
-	Vector3 f = u->interpolateForceD(p);
-	Vector3 t = f * 
-		
+
+	// RBTODO 3rd-law forces + torques (maybe outside kernel)
+
+	// must reference rigid bodies in some way
 }
diff --git a/Configuration.cpp b/Configuration.cpp
index 8e7328e..89691e8 100644
--- a/Configuration.cpp
+++ b/Configuration.cpp
@@ -174,18 +174,29 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) :
 	name = new String[num * simNum];
 	currSerial = 0;
 
-	// RigidBodies...
-	if (numRigidTypes > 0) {
-		printf("\nCounting rigid bodies specified in the configuration file.\n");
-		numRB = 0;
-		for (int i = 0; i < numRigidTypes; i++) numRB += rigidBody[i].num;
-
+// RBTODO: clean this mess up
+	/* // RigidBodies... */
+	/* if (numRigidTypes > 0) { */
+	/* 	printf("\nCounting rigid bodies specified in the configuration file.\n"); */
+	/* 	numRB = 0; */
+
+	/* 	// grow list of rbs */
+	/* 	for (int i = 0; i < numRigidTypes; i++) {			 */
+	/* 		numRB += rigidBody[i].num; */
+
+	/* 		std::vector<RigidBody> tmp; */
+	/* 		for (int j = 0; j < rigidBody[i].num; j++) { */
+	/* 			tmp.push_back( new RigidBody( this, rigidBody[i] ) ); */
+	/* 		} */
+
+	/* 		rbs.push_back(tmp); */
+	/* 	} */
 		// // state data
 		// rbPos = new Vector3[numRB * simNum];
 		// type = new int[numRB * simNum];
 		
-	}
-	printf("Initial RigidBodies: %d\n", numRB);
+	/* } */
+	/* printf("Initial RigidBodies: %d\n", numRB); */
 	
 
 	// Create exclusions from the exclude rule, if it was specified in the config file
@@ -449,11 +460,11 @@ void Configuration::copyToCUDA() {
 				cudaMemcpyHostToDevice));
 
 
-	printf("copying RBs\n");
+	printf("Copying RBs\n");
 	// Copy rigidbody types 
 	// http://stackoverflow.com/questions/16024087/copy-an-object-to-device
  	for (int i = 0; i < numRigidTypes; i++) {
-		printf("working on RB %d\n",i);
+		printf("Working on RB %d\n",i);
 		RigidBodyType *rb = &(rigidBody[i]); // temporary for convenience
 		rb->updateRaw();
 
diff --git a/Configuration.h b/Configuration.h
index 0734a33..68df323 100644
--- a/Configuration.h
+++ b/Configuration.h
@@ -10,6 +10,7 @@
 #define CONFIGURATION_H
 
 #include <algorithm> // sort
+#include <vector>
 
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -27,6 +28,7 @@
 #include "GPUManager.h"
 #include "Dihedral.h"
 #include "RigidBodyType.h"
+#include "RigidBody.h"
 
 #include <cuda.h> 
 #include <cuda_runtime.h>
@@ -109,8 +111,9 @@ public:
 	float minimumSep; // minimum separation allowed with placing new particles
 
 	// RigidBody variables
-	int numRB;
-
+	/* int numRB; */
+	/* std::vector< std::vector<RigidBody> > rbs; */
+	
 	// System parameters
 	String outputName;
 	float timestep;
@@ -194,7 +197,6 @@ public:
 	RigidBodyType* rigidBody;
 	int numRigidTypes;
 
-
 };
 
 #endif
diff --git a/Debug.h b/Debug.h
index c3bee20..a77f2b8 100644
--- a/Debug.h
+++ b/Debug.h
@@ -36,7 +36,7 @@
 
 #ifdef DEBUGM
 
-#include "InfoStream.h"
+/* #include "InfoStream.h" */
 
   #define Debug(x) (x)
   #define DebugM(level,format) \
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index cd1650b..57b271b 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -1,5 +1,6 @@
 #include "GrandBrownTown.h"
 #include "GrandBrownTown.cuh"
+/* #include "ComputeGridGrid.cuh" */
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
@@ -9,16 +10,13 @@ inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) {
 	}
 }
 
-#include "RigidBodyType.h" //temporary
-
-
 bool GrandBrownTown::DEBUG;
 
 cudaEvent_t START, STOP;
 
 GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 		const long int randomSeed, bool debug, bool imd_on, unsigned int imd_port, int numReplicas) :
-		imd_on(imd_on), imd_port(imd_port), numReplicas(numReplicas) {
+	imd_on(imd_on), imd_port(imd_port), numReplicas(numReplicas), conf(c) {
 
 	for (int i = 0; i < numReplicas; i++) {
 		std::stringstream curr_file, restart_file, out_prefix;
@@ -56,6 +54,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	type   = new     int[num * numReplicas];  // [HOST] array of particles' types.
 	serial = new     int[num * numReplicas];  // [HOST] array of particles' serial numbers.
 
+	// Allocate things for rigid body
+	// RBC = RigidBodyController(c);
 	
 	// Replicate identical initial conditions across all replicas
 	// TODO: add an option to generate random initial conditions for all replicas
@@ -410,13 +410,11 @@ void GrandBrownTown::run() {
 		rt_timer_stop(cputimer);
 		float dt2 = rt_timer_time(cputimer);
 		printf("Position Update Time: %f ms\n", dt2 * 1000);
-		// */
+		*/
 
-		// calculateRigidBodyForce<<< numBlocks, NUM_THREADS >>>(rb_d, rbType_d,
-		// 																						 kT, kTGrid_d,
-		// 																						 tl, timestep,
-		// 																					 sys_d, randoGen_d, numReplicas);
-		computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d);
+		
+		/* 	for (int j = 0; j < t->num; j++) { */
+		/* computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d); */
 		
 		// int numBlocks = (numRB ) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
 		
diff --git a/GrandBrownTown.cuh b/GrandBrownTown.cuh
index 7b447e5..5f8c1a7 100644
--- a/GrandBrownTown.cuh
+++ b/GrandBrownTown.cuh
@@ -16,8 +16,6 @@ Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
 __global__
 void updateKernel(Vector3 pos[], Vector3 forceInternal[],
 									int type[], BrownianParticleType* part[],
-									RigidBody* rb[],
-									RigidBodyType* rbType[],
 									float kT, BaseGrid* kTGrid,
 									float electricField, int tGridLength,
 									float timestep, int num, BaseGrid* sys,
diff --git a/GrandBrownTown.h b/GrandBrownTown.h
index 5155fe4..43ba4d7 100644
--- a/GrandBrownTown.h
+++ b/GrandBrownTown.h
@@ -1,4 +1,3 @@
-// // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // // /
 // Author: Jeff Comer <jcomer2@illinois.edu>
 #ifndef GRANDBROWNTOWN_H
 #define GRANDBROWNTOWN_H
@@ -37,6 +36,10 @@
 #include "Angle.h"
 #include "Configuration.h"
 #include "Dihedral.h"
+/* #include "RigidBody.h" */
+/* #include "RigidBodyType.h" */
+/* #include "RigidBodyGrid.h" */
+#include "RigidBodyController.h"
 #include "util.h"
 
 // IMD
@@ -111,6 +114,7 @@ public:
 	Vector3 freePosition(Vector3 r0, Vector3 r1, float minDist);
 
 private:
+	const Configuration& conf;
 	int numReplicas;
 	
 	// IMD variables
@@ -146,6 +150,10 @@ private:
 	float timeLast; 	// used with posLast
 	float minimumSep; 	// minimum separation allowed with placing new particles
 
+	RigidBodyController RBC;
+	Vector3* rbPos; 		// rigid body positions
+	
+	
 	// CUDA device variables
 	Vector3 *pos_d, *forceInternal_d, *force_d;
 	int *type_d;
diff --git a/RigidBody.c b/RigidBody.c
deleted file mode 100644
index 36bd3f8..0000000
--- a/RigidBody.c
+++ /dev/null
@@ -1,265 +0,0 @@
-/**
-***  Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by
-***  The Board of Trustees of the University of Illinois.
-***  All rights reserved.
-**/
-
-#ifndef MIN_DEBUG_LEVEL
-#define MIN_DEBUG_LEVEL 5
-#endif
-#define DEBUGM
-#include "Debug.h"
-
-#include <iostream>
-#include <typeinfo>
-
-#include "RigidBody.h"
-#include "Vector.h"
-#include "RigidBodyParams.h"
-
-
-RigidBody::RigidBody(SimParameters *simParams, RigidBodyParams *rbParams) 
-: impulse_to_momentum(0.0004184) {
-
-	int len = strlen(rbParams->rigidBodyKey);
-	key = new char[len+1];
-	strncpy(key,rbParams->rigidBodyKey,len+1);
-
-	DebugM(4, "Initializing Rigid Body " << key << "\n" << endi);
-    
-	timestep = simParams->dt;
-	Temp = simParams->langevinTemp;
-
-	mass = rbParams->mass;
-	inertia = rbParams->inertia;
-
-	position = rbParams->position;
-	orientation = rbParams->orientation; // orientation a matrix that brings a vector from the RB frame to the lab frame
-	
-	momentum = rbParams->velocity * mass; // lab frame
-	DebugM(4, "velocity " << rbParams->velocity << "\n" << endi);
-	DebugM(4, "momentum " << momentum << "\n" << endi);
-
-	angularMomentum = rbParams->orientationalVelocity; // rigid body frame
-	angularMomentum.x *= rbParams->inertia.x;
-	angularMomentum.y *= rbParams->inertia.y;
-	angularMomentum.z *= rbParams->inertia.z;
-
-	isFirstStep = true; // this might not work flawlessly...
-
-	clearForce();
-	clearTorque();
-    
-	DebugM(4, "RigidBody initial Force: " << force << "\n" << endi);
-
-	// setup for langevin
-	langevin = rbParams->langevin;
-	if (langevin) {
-		  /*=======================================================\
-			| DiffCoeff = kT / dampingCoeff mass                     |
-			|                                                        |
-			| rbParams->DampingCoeff has units of (1/ps)             |
-			|                                                        |
-			| f = - dampingCoeff * momentum                          |
-			|                                                        |
-			| units "(1/ps) * (amu AA/fs)" "kcal_mol/AA" * 2.3900574 |
-			\=======================================================*/
-		transDampingCoeff = 2.3900574 * rbParams->transDampingCoeff;
-
-  		/*================================================================\
-			| < f(t) f(t') > = 2 kT dampingCoeff mass delta(t-t')             |
-			|                                                                 |
-			| units "sqrt( k K (1/ps) amu / fs )" "kcal_mol/AA" * 0.068916889 |
-			\================================================================*/
-		transForceCoeff = 0.068916889 * element_sqrt( 2*Temp*mass*rbParams->transDampingCoeff/timestep );
-     
-		// T = - dampingCoeff * angularMomentum
-		rotDampingCoeff = 2.3900574 * rbParams->rotDampingCoeff;
-		// < f(t) f(t') > = 2 kT dampingCoeff inertia delta(t-t')
-		rotTorqueCoeff = 0.068916889 * element_sqrt( 2*Temp*element_mult(inertia, rbParams->rotDampingCoeff) / timestep );
-	}
-}
-
-void RigidBody::addForce(Force f) { 
-	DebugM(1, "RigidBody "<<key<<" adding f ("<<f<<") to Force " << force << "\n" << endi);    
-	force += f; 
-} 
-void RigidBody::addTorque(Force t) {
-	DebugM(1, "RigidBody adding t ("<<t<<") to torque " << torque << "\n" << endi);   
-	torque += t; 
-}
-RigidBody::~RigidBody() {}
-
-/*===========================================================================\
-| Following "Algorithm for rigid-body Brownian dynamics" Dan Gordon, Matthew |
-|   Hoyles, and Shin-Ho Chung                                                |
-|   http://langevin.anu.edu.au/publications/PhysRevE_80_066703.pdf           |
-|                                                                            |
-|                                                                            |
-| BUT: assume diagonal friction tensor and no Wiener process / stochastic    |
-|   calculus then this is just the same as for translation                   |
-|                                                                            |
-|   < T_i(t) T_i(t) > = 2 kT friction inertia                                |
-|                                                                            |
-|   friction / kt = Diff                                                     |
-\===========================================================================*/
-void RigidBody::addLangevin(Vector w1, Vector w2) {
-	// w1 and w2 should be standard normal distributions
-
-	// in RB frame     
-	Force f = element_mult(transForceCoeff,w1) -
-		element_mult(transDampingCoeff, transpose(orientation)*momentum); 
-    
-	Force t = element_mult(rotTorqueCoeff,w2) -
-		element_mult(rotDampingCoeff, angularMomentum);
-
-	f = orientation * f; // return to lab frame
-	t = orientation * t;
-    
-	addForce(f);
-	addTorque(t);
-}
-
-/*==========================================================================\
-| from: Dullweber, Leimkuhler, Maclachlan. Symplectic splitting methods for |
-| rigid body molecular dynamics. JCP 107. (1997)                            |
-| http://jcp.aip.org/resource/1/jcpsa6/v107/i15/p5840_s1                    |
-\==========================================================================*/
-void RigidBody::integrate(Vector *p_trans, Tensor *p_rot, int startFinishAll) {
-	Vector trans; // = *p_trans;
-	Tensor rot = Tensor::identity(); // = *p_rot;
-
-#ifdef DEBUGM
-	switch (startFinishAll) {
-	case 0: // start
-		DebugM(2, "Rigid Body integrating start of cycle" << "\n" << endi);
-	case 1: // finish
-		DebugM(2, "Rigid Body integrating finish of cycle" << "\n" << endi);
-	case 2: // finish and start
-		DebugM(2, "Rigid Body integrating finishing last cycle, starting this one" << "\n" << endi);
-	}    
-#endif
-
-	if ( isnan(force.x) || isnan(torque.x) ) // NaN check
-		NAMD_die("Rigid Body force was NaN!\n");
-
-	// torque = Vector(0,0,10); // debug
-	Force tmpTorque = transpose(orientation)*torque; // bring to rigid body frame
-
-	DebugM(3, "integrate" <<": force "<<force <<": velocity "<<getVelocity() << "\n" << endi);
-	DebugM(3, "integrate" <<": torque "<<tmpTorque <<": orientationalVelocity "<<getAngularVelocity() << "\n" << endi);
-
-	if (startFinishAll == 0 || startFinishAll == 1) {
-		// propogate momenta by half step
-		momentum += 0.5 * timestep * force * impulse_to_momentum;
-		angularMomentum += 0.5 * timestep * tmpTorque * impulse_to_momentum;
-	} else {
-		// propogate momenta by a full timestep
-		momentum += timestep * force * impulse_to_momentum;
-		angularMomentum += timestep * tmpTorque * impulse_to_momentum;
-	}
-
-	DebugM(3, "  position before: " << position << "\n" << endi);
-
-	if (startFinishAll == 0 || startFinishAll == 2) {
-		// update positions
-		// trans = Vector(0); if (false) {
-		trans = timestep * momentum / mass;
-		position += trans; // update CoM a full timestep
-		// }
-
-		// update orientations a full timestep
-		Tensor R; // represents a rotation about a principle axis
-		R = Rx(0.5*timestep * angularMomentum.x / inertia.x ); // R1
-		angularMomentum = R * angularMomentum;
-		rot = transpose(R);
-		DebugM(1, "R: " << R << "\n" << endi);
-		DebugM(1, "Rot 1: " << rot << "\n" << endi);
-
-		R = Ry(0.5*timestep * angularMomentum.y / inertia.y ); // R2
-		angularMomentum = R * angularMomentum;
-		rot = rot * transpose(R);
-		DebugM(1, "R: " << R << "\n" << endi);
-		DebugM(1, "Rot 2: " << rot << "\n" << endi);
-
-		R = Rz(    timestep * angularMomentum.z / inertia.z ); // R3
-		angularMomentum = R * angularMomentum;
-		rot = rot * transpose(R);
-		DebugM(1, "R: " << R << "\n" << endi);
-		DebugM(1, "Rot 3: " << rot << "\n" << endi);
-
-		R = Ry(0.5*timestep * angularMomentum.y / inertia.y ); // R4
-		angularMomentum = R * angularMomentum;
-		rot = rot * transpose(R);
-		DebugM(1, "R: " << R << "\n" << endi);
-		DebugM(1, "Rot 4: " << rot << "\n" << endi);
-
-		R = Rx(0.5*timestep * angularMomentum.x / inertia.x ); // R5
-		angularMomentum = R * angularMomentum;
-		rot = rot * transpose(R);
-		DebugM(1, "R: " << R << "\n" << endi);
-		DebugM(1, "Rot 5: " << rot << "\n" << endi);
-
-		// DebugM(3,"TEST: " << Ry(0.01) <<"\n" << endi); // DEBUG
- 
-		// update actual orientation
-		Tensor newOrientation = orientation*rot; // not 100% sure; rot could be in rb frame
-		orientation = newOrientation;
-		rot = transpose(rot);
-
-		DebugM(2, "trans during: " << trans
-					 << "\n" << endi);
-		DebugM(2, "rot during: " << rot
-					 << "\n" << endi);
-    
-		clearForce();
-		clearTorque();
-	
-		*p_trans = trans;
-		*p_rot = rot;
-	}
-	DebugM(3, "  position after: " << position << "\n" << endi);
-}    
-
-// Rotations about axes
-
-// for very small angles 10^-8, cos^2+sin^2 != 1 
-// concerned about the accumulation of errors in non-unitary transformations!
-Tensor RigidBody::Rx(BigReal t) {
-	BigReal qt = 0.25*t*t;  // for approximate calculations of sin(t) and cos(t)
-	BigReal cos = (1-qt)/(1+qt);
-	BigReal sin = t/(1+qt);
-
-	Tensor tmp;
-	tmp.xx = 1; tmp.xy =   0; tmp.xz =    0;
-	tmp.yx = 0; tmp.yy = cos; tmp.yz = -sin;
-	tmp.zx = 0; tmp.zy = sin; tmp.zz =  cos;
-	return tmp;
-}
-Tensor RigidBody::Ry(BigReal t) {
-	BigReal qt = 0.25*t*t;  // for approximate calculations of sin(t) and cos(t)
-	BigReal cos = (1-qt)/(1+qt);
-	BigReal sin = t/(1+qt);
-
-	Tensor tmp;
-	tmp.xx =  cos; tmp.xy = 0; tmp.xz = sin;
-	tmp.yx =    0; tmp.yy = 1; tmp.yz =   0;
-	tmp.zx = -sin; tmp.zy = 0; tmp.zz = cos;
-	return tmp;
-}
-Tensor RigidBody::Rz(BigReal t) {
-	BigReal qt = 0.25*t*t;  // for approximate calculations of sin(t) and cos(t)
-	BigReal cos = (1-qt)/(1+qt);
-	BigReal sin = t/(1+qt);
-
-	Tensor tmp;
-	tmp.xx = cos; tmp.xy = -sin; tmp.xz = 0;
-	tmp.yx = sin; tmp.yy =  cos; tmp.yz = 0;
-	tmp.zx =   0; tmp.zy =    0; tmp.zz = 1;
-	return tmp;
-}
-Tensor RigidBody::eulerToMatrix(const Vector e) {
-	// convert euler angle input to rotation matrix
-	// http://en.wikipedia.org/wiki/Rotation_formalisms_in_three_dimensions#Conversion_formulae_between_formalisms
-	return Rz(e.z) * Ry(e.y) * Rx(e.x);
-}
diff --git a/RigidBody.cu b/RigidBody.cu
new file mode 100644
index 0000000..bfbbb90
--- /dev/null
+++ b/RigidBody.cu
@@ -0,0 +1,226 @@
+/* #ifndef MIN_DEBUG_LEVEL */
+/* #define MIN_DEBUG_LEVEL 5 */
+/* #endif */
+/* #include "Debug.h" */
+
+#include <iostream>
+#include <typeinfo>
+#include "RigidBody.h"
+#include "Configuration.h"
+
+#include "Debug.h"
+
+RigidBody::RigidBody(const Configuration& cref, RigidBodyType& tref)
+	: c(&cref), t(&tref) {
+	timestep = c->timestep;
+	// RBTODO: fix this
+	Temp = 295;
+	// tempgrid = c->temperatureGrid;
+
+	position = Vector3();
+
+	// Orientation matrix that brings vector from the RB frame to the lab frame
+	orientation = Matrix3();
+	
+	momentum = Vector3() * t->mass; // lab frame
+	/* DebugM(4, "velocity " << rbParams->velocity << "\n" << endi); */
+	DebugM(4, "momentum " << momentum << "\n" << endi);
+
+	angularMomentum = Vector3(); // rigid body frame
+	angularMomentum.x *= t->inertia.x;
+	angularMomentum.y *= t->inertia.y;
+	angularMomentum.z *= t->inertia.z;
+
+	/* isFirstStep = true; // this might not work flawlessly... */
+
+	/* clearForce(); */
+	/* clearTorque(); */
+    
+	/* DebugM(4, "RigidBody initial Force: " << force << "\n" << endi); */
+}
+
+void RigidBody::addForce(Force f) { 
+	// DebugM(1, "RigidBody "<<key<<" adding f ("<<f<<") to Force " << force << "\n" << endi);    
+	force += f; 
+} 
+void RigidBody::addTorque(Force torq) {
+	// DebugM(1, "RigidBody adding t ("<<t<<") to torque " << torque << "\n" << endi);   
+	torque += torq; 
+}
+RigidBody::~RigidBody() {}
+
+/*===========================================================================\
+	| Following "Algorithm for rigid-body Brownian dynamics" Dan Gordon, Matthew |
+	|   Hoyles, and Shin-Ho Chung                                                |
+	|   http://langevin.anu.edu.au/publications/PhysRevE_80_066703.pdf           |
+	|                                                                            |
+	|                                                                            |
+	| BUT: assume diagonal friction tensor and no Wiener process / stochastic    |
+	|   calculus then this is just the same as for translation                   |
+	|                                                                            |
+	|   < T_i(t) T_i(t) > = 2 kT friction inertia                                |
+	|                                                                            |
+	|   friction / kt = Diff                                                     |
+	\===========================================================================*/
+void RigidBody::addLangevin(Vector3 w1, Vector3 w2) {
+	// w1 and w2 should be standard normal distributions
+
+	// in RB frame     
+	Vector3 tmp = orientation.transpose()*momentum;
+	Force f = Vector3::element_mult(t->transForceCoeff,w1) -
+		Vector3::element_mult(t->transDamping, orientation.transpose()*momentum); 
+    
+	Force torq = Vector3::element_mult(t->rotTorqueCoeff,w2) -
+		Vector3::element_mult(t->rotDamping, angularMomentum);
+
+	f = orientation * f; // return to lab frame
+	torq = orientation * torq;
+    
+	addForce(f);
+	addTorque(torq);
+}
+
+/*==========================================================================\
+	| from: Dullweber, Leimkuhler, Maclachlan. Symplectic splitting methods for |
+	| rigid body molecular dynamics. JCP 107. (1997)                            |
+	| http://jcp.aip.org/resource/1/jcpsa6/v107/i15/p5840_s1                    |
+	\==========================================================================*/
+void RigidBody::integrate(Vector3& old_trans, Matrix3& old_rot, int startFinishAll) {
+	Vector3 trans; // = *p_trans;
+	Matrix3 rot = Matrix3(1); // = *p_rot;
+
+#ifdef DEBUGM
+	switch (startFinishAll) {
+	case 0: // start
+		DebugM(2, "Rigid Body integrating start of cycle" << "\n" << endi);
+	case 1: // finish
+		DebugM(2, "Rigid Body integrating finish of cycle" << "\n" << endi);
+	case 2: // finish and start
+		DebugM(2, "Rigid Body integrating finishing last cycle, starting this one" << "\n" << endi);
+	}    
+#endif
+
+	if ( isnan(force.x) || isnan(torque.x) ) { // NaN check
+		printf("Rigid Body force was NaN!\n");
+		exit(-1);
+	}
+
+	// torque = Vector(0,0,10); // debug
+	Force tmpTorque = orientation.transpose()*torque; // bring to rigid body frame
+
+	DebugM(3, "integrate" <<": force "<<force <<": velocity "<<getVelocity() << "\n" << endi);
+	DebugM(3, "integrate" <<": torque "<<tmpTorque <<": orientationalVelocity "<<getAngularVelocity() << "\n" << endi);
+
+	if (startFinishAll == 0 || startFinishAll == 1) {
+		// propogate momenta by half step
+		momentum += 0.5 * timestep * force * impulse_to_momentum;
+		angularMomentum += 0.5 * timestep * tmpTorque * impulse_to_momentum;
+	} else {
+		// propogate momenta by a full timestep
+		momentum += timestep * force * impulse_to_momentum;
+		angularMomentum += timestep * tmpTorque * impulse_to_momentum;
+	}
+
+	DebugM(3, "  position before: " << position << "\n" << endi);
+
+	if (startFinishAll == 0 || startFinishAll == 2) {
+		// update positions
+		// trans = Vector(0); if (false) {
+		trans = timestep * momentum / t->mass;
+		position += trans; // update CoM a full timestep
+		// }
+
+		// update orientations a full timestep
+		Matrix3 R; // represents a rotation about a principle axis
+		R = Rx(0.5*timestep * angularMomentum.x / t->inertia.x ); // R1
+		angularMomentum = R * angularMomentum;
+		rot = R.transpose();
+		DebugM(1, "R: " << R << "\n" << endi);
+		DebugM(1, "Rot 1: " << rot << "\n" << endi);
+
+		R = Ry(0.5*timestep * angularMomentum.y / t->inertia.y ); // R2
+		angularMomentum = R * angularMomentum;
+		rot = rot * R.transpose();
+		DebugM(1, "R: " << R << "\n" << endi);
+		DebugM(1, "Rot 2: " << rot << "\n" << endi);
+
+		R = Rz(    timestep * angularMomentum.z / t->inertia.z ); // R3
+		angularMomentum = R * angularMomentum;
+		rot = rot * R.transpose();
+		DebugM(1, "R: " << R << "\n" << endi);
+		DebugM(1, "Rot 3: " << rot << "\n" << endi);
+
+		R = Ry(0.5*timestep * angularMomentum.y / t->inertia.y ); // R4
+		angularMomentum = R * angularMomentum;
+		rot = rot * R.transpose();
+		DebugM(1, "R: " << R << "\n" << endi);
+		DebugM(1, "Rot 4: " << rot << "\n" << endi);
+
+		R = Rx(0.5*timestep * angularMomentum.x / t->inertia.x ); // R5
+		angularMomentum = R * angularMomentum;
+		rot = rot * R.transpose();
+		DebugM(1, "R: " << R << "\n" << endi);
+		DebugM(1, "Rot 5: " << rot << "\n" << endi);
+
+		// DebugM(3,"TEST: " << Ry(0.01) <<"\n" << endi); // DEBUG
+ 
+		// update actual orientation
+		Matrix3 newOrientation = orientation*rot; // not 100% sure; rot could be in rb frame
+		orientation = newOrientation;
+		rot = rot.transpose();
+
+		DebugM(2, "trans during: " << trans
+					 << "\n" << endi);
+		DebugM(2, "rot during: " << rot
+					 << "\n" << endi);
+    
+		clearForce();
+		clearTorque();
+	
+		old_trans = trans;
+		old_rot = rot;
+	}
+	DebugM(3, "  position after: " << position << "\n" << endi);
+}    
+
+// Rotations about axes
+// for very small angles 10^-8, cos^2+sin^2 != 1 
+// concerned about the accumulation of errors in non-unitary transformations!
+Matrix3 RigidBody::Rx(BigReal t) {
+	BigReal qt = 0.25*t*t;  // for approximate calculations of sin(t) and cos(t)
+	BigReal cos = (1-qt)/(1+qt);
+	BigReal sin = t/(1+qt);
+
+	Matrix3 tmp;
+	tmp.exx = 1; tmp.exy =   0; tmp.exz =    0;
+	tmp.eyx = 0; tmp.eyy = cos; tmp.eyz = -sin;
+	tmp.ezx = 0; tmp.ezy = sin; tmp.ezz =  cos;
+	return tmp;
+}
+Matrix3 RigidBody::Ry(BigReal t) {
+	BigReal qt = 0.25*t*t;  // for approximate calculations of sin(t) and cos(t)
+	BigReal cos = (1-qt)/(1+qt);
+	BigReal sin = t/(1+qt);
+
+	Matrix3 tmp;
+	tmp.exx =  cos; tmp.exy = 0; tmp.exz = sin;
+	tmp.eyx =    0; tmp.eyy = 1; tmp.eyz =   0;
+	tmp.ezx = -sin; tmp.ezy = 0; tmp.ezz = cos;
+	return tmp;
+}
+Matrix3 RigidBody::Rz(BigReal t) {
+	BigReal qt = 0.25*t*t;  // for approximate calculations of sin(t) and cos(t)
+	BigReal cos = (1-qt)/(1+qt);
+	BigReal sin = t/(1+qt);
+
+	Matrix3 tmp;
+	tmp.exx = cos; tmp.exy = -sin; tmp.exz = 0;
+	tmp.eyx = sin; tmp.eyy =  cos; tmp.eyz = 0;
+	tmp.ezx =   0; tmp.ezy =    0; tmp.ezz = 1;
+	return tmp;
+}
+Matrix3 RigidBody::eulerToMatrix(const Vector3 e) {
+	// convert euler angle input to rotation matrix
+	// http://en.wikipedia.org/wiki/Rotation_formalisms_in_three_dimensions#Conversion_formulae_between_formalisms
+	return Rz(e.z) * Ry(e.y) * Rx(e.x);
+}
diff --git a/RigidBody.h b/RigidBody.h
index 0f7c047..0dc6a9c 100644
--- a/RigidBody.h
+++ b/RigidBody.h
@@ -1,235 +1,95 @@
-/**
-***  Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by
-***  The Board of Trustees of the University of Illinois.
-***  All rights reserved.
-**/
-
+/*===========================\
+| RigidBody Class for device |
+\===========================*/
 #pragma once
 
-/* #include <set> */
-/* #include <vector> */
-
 #include "useful.h"
-/* #include "Vector.h" */
-/* #include "Tensor.h" */
-
-
-/* #include "strlib.h" */
-/* #include "common.h" */
-/* #include "Vector.h" */
-/* #include "Tensor.h" */
-/* #include "InfoStream.h" */
-/* #include "MStream.h" */
+#include "RigidBodyType.h"
 
 
-/* #include "SimParameters.h" */
-/* #include "NamdTypes.h" */
+#ifdef __CUDACC__
+    #define HOST __host__
+    #define DEVICE __device__
+#else
+    #define HOST 
+    #define DEVICE 
+#endif
 
-typedef BigReal float;					/* strip this out later */
-typedef Force Vector3;
+class Configuration;						/* forward decleration */
 
-#define DEVICE __device__
+typedef float BigReal;					/* strip this out later */
+typedef Vector3 Force;
 
-/* class ComputeMgr; */
-/* class Random; */
 
-class RigidBody {
-	/*=====================================================================\
-		|   See Appendix A of: Dullweber, Leimkuhler and McLaclan. "Symplectic |
-		|   splitting methods for rigid body molecular dynamics". J Chem       |
-		|   Phys. (1997)                                                       |
-		\=====================================================================*/
-public:
-	DEVICE RigidBody(SimParameters *simParams, RigidBodyParams *rbParams);
-	DEVICE ~RigidBody();
+class RigidBody { // host side representation of rigid bodies
+	/*â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
+	| See Appendix A of: Dullweber, Leimkuhler and McLaclan. "Symplectic        |
+	| splitting methods for rigid body molecular dynamics". J Chem Phys. (1997) |
+	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
+		public:
+	HOST DEVICE RigidBody(const Configuration& c, RigidBodyType& t);
+	/* HOST DEVICE RigidBody(RigidBodyType t); */
+	HOST DEVICE ~RigidBody();
 
-	DEVICE void addForce(Force f); 
-	DEVICE void addTorque(Force t);
-	DEVICE void addLangevin(Vector3 w1, Vector3 w2);
-
-	DEVICE inline void clearForce() { force = Force(0); }
-	DEVICE inline void clearTorque() { torque = Force(0); }
-
-	DEVICE void integrate(Vector3 *p_trans, Matrix3 *p_rot, int startFinishAll);
+	HOST DEVICE void addForce(Force f); 
+	HOST DEVICE void addTorque(Force t);
+	HOST DEVICE void addLangevin(Vector3 w1, Vector3 w2);
+	
+	HOST DEVICE inline void clearForce() { force = Force(0.0f); }
+	HOST DEVICE inline void clearTorque() { torque = Force(0.0f); }
 
-	DEVICE inline char* getKey() { return key; }
-	DEVICE inline Vector3 getPosition() { return position; }
-	DEVICE inline Matrix3 getOrientation() { return orientation; }
-	DEVICE inline BigReal getMass() { return mass; }
-	DEVICE inline Vector3 getVelocity() { return momentum/mass; }
-	DEVICE inline Vector3 getAngularVelocity() { 
-		return Vector3( angularMomentum.x / inertia.x,
-									 angularMomentum.y / inertia.y,
-									 angularMomentum.z / inertia.z );
+	HOST DEVICE void integrate(Vector3& old_trans, Matrix3& old_rot, int startFinishAll);
+	
+	HOST DEVICE inline String getKey() { return key; }
+	HOST DEVICE inline Vector3 getPosition() const { return position; }
+	HOST DEVICE inline Matrix3 getOrientation() const { return orientation; }
+	HOST DEVICE inline Matrix3 getBasis() const { return orientation; }
+	HOST DEVICE inline BigReal getMass() const { return t->mass; }
+	HOST DEVICE inline Vector3 getVelocity() const { return momentum/t->mass; }
+	HOST DEVICE inline Vector3 getAngularVelocity() const { 
+		return Vector3( angularMomentum.x / t->inertia.x,
+									 angularMomentum.y / t->inertia.y,
+									 angularMomentum.z / t->inertia.z );
 	}
 	bool langevin;
     
 private:
 	String key;
 	/* static const SimParameters * simParams; */
-	BigReal mass;
-
 	Vector3 position;
 	Matrix3 orientation;
 
-	Vector3 inertia; // diagonal elements of inertia tensor
 	Vector3 momentum;
 	Vector3 angularMomentum; // angular momentum along corresponding principal axes
     
 	// Langevin
-	Vector3 langevinTransFriction;
+	Vector3 langevinTransFriction; /* RBTODO: make this work with a grid */
 	Vector3 langevinRotFriction;
 	BigReal Temp;
 
-	Vector3 transDampingCoeff;
-	Vector3 transForceCoeff;
-	Vector3 rotDampingCoeff;
-	Vector3 rotTorqueCoeff;    
+	/* Vector3 transDampingCoeff; */
+	/* Vector3 transForceCoeff; */
+	/* Vector3 rotDampingCoeff; */
+	/* Vector3 rotTorqueCoeff;     */
 
 	// integration
+	const Configuration* c;
+	RigidBodyType* t;					/* RBTODO: const? */
 	int timestep;
 	Vector3 force;  // lab frame
 	Vector3 torque; // lab frame (except in integrate())
 
 	bool isFirstStep; 
 
-	// units "kcal_mol/AA * fs"  "(AA/fs) * amu"
-	const BigReal impulse_to_momentum;
-
-	DEVICE inline Matrix3 Rx(BigReal t);
-	DEVICE inline Matrix3 Ry(BigReal t);
-	DEVICE inline Matrix3 Rz(BigReal t);
-	DEVICE inline Matrix3 eulerToMatrix(const Vector3 e);
-};
-
-class RigidBodyController {
-public:
-	/* DEVICE RigidBodyController(const NamdState *s, int reductionTag, SimParameters *sp); */
-	DEVICE RigidBodyController(const NamdState *s, int reductionTag, SimParameters *sp);
-	
-	DEVICE ~RigidBodyController();
-	DEVICE void integrate(int step);
-	DEVICE void print(int step);
-    
-private:
-	/* void printLegend(std::ofstream &file); */
-	/* void printData(int step, std::ofstream &file); */
-
-	/* SimParameters* simParams; */
-	/* const NamdState* state;		 */    
-	/* std::ofstream trajFile; */
+	/*â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
+	| units "kcal_mol/AA * fs" "(AA/fs) * amu" |
+	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
+	const BigReal impulse_to_momentum = 0.0004184; /* should be static, but fails */
 
-	Random* random;
-	/* RequireReduction *gridReduction; */
-	
-	Vector3* trans; // would have made these static, but
-	Matrix3* rot;  	// there are errors on rigidBody->integrate
-	RigidBody* rigidBodyList;
-	
-};
 
-class RigidBodyParams {
-public:
-	RigidBodyParams() {
-		rigidBodyKey = 0;
-		mass = 0;
-		inertia = Vector3();
-		langevin = FALSE;
-		temperature = 0;
-		transDampingCoeff = Vector3();
-		rotDampingCoeff = Vector3();
-		gridList;
-		position = Vector3();
-		velocity = Vector3();
-		orientation = Matrix3();
-		orientationalVelocity = Vector3();	   
-	}    
-	int typeID;
-
-	String *rigidBodyKey;
-	BigReal mass;
-	Vector3 inertia;
-	Bool langevin;
-	BigReal temperature;
-	Vector3 transDampingCoeff;
-	Vector3 rotDampingCoeff;
-	String *gridList;
-	
-    
-	Vector3 position;
-	Vector3 velocity;
-	Matrix3 orientation;
-	Vector3 orientationalVelocity;
-
-	RigidBodyParams *next;
-
-	const void print();
+	HOST DEVICE inline Matrix3 Rx(BigReal t);
+	HOST DEVICE inline Matrix3 Ry(BigReal t);
+	HOST DEVICE inline Matrix3 Rz(BigReal t);
+	HOST DEVICE inline Matrix3 eulerToMatrix(const Vector3 e);
 };
 
-
-/* class RigidBodyParamsList { */
-/* public: */
-/*   RigidBodyParamsList() { */
-/*     clear(); */
-/*   } */
-  
-/*   ~RigidBodyParamsList()  */
-/*   { */
-/*     RBElem* cur; */
-/*     while (head != NULL) { */
-/*       cur = head; */
-/*       head = cur->nxt; */
-/*       delete cur; */
-/*     } */
-/*     clear(); */
-/*   } */
-/*   const void print(char *s); */
-/*   const void print(); */
-
-/*   // The SimParameters bit copy overwrites these values with illegal pointers, */
-/*   // So thise throws away the garbage and lets everything be reinitialized */
-/*   // from scratch */
-/*   void clear() { */
-/*     head = tail = NULL; */
-/*     n_elements = 0; */
-/*   } */
-  
-/*   RigidBodyParams* find_key(const char* key);   */
-/*   int index_for_key(const char* key); */
-/*   RigidBodyParams* add(const char* key); */
-  
-/*   RigidBodyParams *get_first() { */
-/*     if (head == NULL) { */
-/*       return NULL; */
-/*     } else return &(head->elem); */
-/*   } */
-  
-/*   void pack_data(MOStream *msg);   */
-/*   void unpack_data(MIStream *msg); */
-  
-/*   // convert from a string to Bool; returns 1(TRUE) 0(FALSE) or -1(if unknown) */
-/*   static int atoBool(const char *s) */
-/*   { */
-/*     if (!strcasecmp(s, "on")) return 1; */
-/*     if (!strcasecmp(s, "off")) return 0; */
-/*     if (!strcasecmp(s, "true")) return 1; */
-/*     if (!strcasecmp(s, "false")) return 0; */
-/*     if (!strcasecmp(s, "yes")) return 1; */
-/*     if (!strcasecmp(s, "no")) return 0; */
-/*     if (!strcasecmp(s, "1")) return 1; */
-/*     if (!strcasecmp(s, "0")) return 0; */
-/*     return -1; */
-/*   } */
-
-
-/* private: */
-/*   class RBElem { */
-/*   public: */
-/*     RigidBodyParams elem; */
-/*     RBElem* nxt; */
-/*   }; */
-/*   RBElem* head; */
-/*   RBElem* tail; */
-/*   int n_elements; */
-
-/* }; */
diff --git a/RigidBodyController.c b/RigidBodyController.cu
similarity index 78%
rename from RigidBodyController.c
rename to RigidBodyController.cu
index ad6dcc1..feb471e 100644
--- a/RigidBodyController.c
+++ b/RigidBodyController.cu
@@ -1,44 +1,113 @@
-#ifndef MIN_DEBUG_LEVEL
-#define MIN_DEBUG_LEVEL 5
-#endif
-#define DEBUGM
-#include "Debug.h"
+/* #ifndef MIN_DEBUG_LEVEL */
+/* #define MIN_DEBUG_LEVEL 5 */
+/* #endif */
+/* #define DEBUGM */
+/* #include "Debug.h" */
 
-#include <iostream>
-#include <typeinfo>
-
-#include "RigidBody.h"
+/* #include "RigidBody.h" */
 #include "RigidBodyController.h"
-#include "RigidBodyMsgs.h"
-// #include "Vector.h"
-#include "Node.h"
-#include "ReductionMgr.h"
-//#include "Broadcasts.h"
-#include "ComputeMgr.h"
-#include "Random.h"
-#include "Output.h"
+#include "Configuration.h"
 
-#include "SimParameters.h"
-#include "RigidBodyParams.h"
-#include "Molecule.h"
-// #include "InfoStream.h"
-#include "common.h"
-#include <errno.h>
+#include "RigidBodyType.h"
+/* #include "Random.h" */
 
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, String file, int line, bool abort=true) {
+   if (code != cudaSuccess) {
+      fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), __FILE__, line);
+      if (abort) exit(code);
+   }
+}
 
-/*============================\
-| include "RigidBodyParams.h" |
-\============================*/
+/* #include <cuda.h> */
+/* #include <cuda_runtime.h> */
+/* #include <curand_kernel.h> */
 
-#include "MStream.h"
 
-#include "DataStream.h"
-#include "InfoStream.h"
+RigidBodyController::RigidBodyController(const Configuration& c) :
+conf(c) {
 
-#include "Debug.h"
-#include <stdio.h>
+	int numRB = 0;
+	// grow list of rbs
+	for (int i = 0; i < conf.numRigidTypes; i++) {			
+		numRB += conf.rigidBody[i].num;
+		std::vector<RigidBody> tmp;
+		for (int j = 0; j < conf.rigidBody[i].num; j++) {
+			RigidBody r(conf, conf.rigidBody[i]);
+			tmp.push_back( r );
+		}
+		rigidBodyByType.push_back(tmp);
+	}
 
+}
+RigidBodyController::~RigidBodyController() {
+	for (int i = 0; i < rigidBodyByType.size(); i++)
+		rigidBodyByType[i].clear();
+	rigidBodyByType.clear();
+}
+
+void RigidBodyController::updateForces() {
+	/*â€“â€“{ RBTODO }â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
+	| probably coalesce kernel calls, or move this to a device kernel caller   |
+	|                                                                          |
+	| - consider removing references (unless they are optimized out!) ---      |
+	| - caclulate numthreads && numblocks                                      |
+	|                                                                          |
+	| all threads in a block should: ----------------------------------------  |
+	|   (1) apply the same transformations to get the data point position in a |
+	|   destination grid ----------------------------------------------------- |
+	|   (2) reduce forces and torques to the same location ------------------- |
+	|   (3) ???  ------------------------------------------------------------- |
+	|                                                                          |
+	| Opportunities for memory bandwidth savings:                              |
+	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
+	// int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
+	int numBlocks = 1;
+	int numThreads = 32;
+
+	
+	// Loop over all pairs of rigid body types
+	//   the references here make the code more readable, but they may incur a performance loss
+	for (int ti = 0; ti < conf.numRigidTypes; ti++) {
+		const RigidBodyType& t1 = conf.rigidBody[ti];
+		for (int tj = ti; tj < conf.numRigidTypes; tj++) {
+			const RigidBodyType& t2 = conf.rigidBody[tj];
+				
+			const std::vector<String>& keys1 = t1.densityGridKeys; 
+			const std::vector<String>& keys2 = t2.potentialGridKeys;
+
+			// Loop over all pairs of grid keys (e.g. "Elec")
+			for(int k1 = 0; k1 < keys1.size(); k1++) {
+				for(int k2 = 0; k2 < keys2.size(); k2++) {
+					if ( keys1[k1] == keys2[k2] ) {
+						// found matching keys => calculate force between all grid pairs
+						//   loop over rigid bodies of this type
+						const std::vector<RigidBody>& rbs1 = rigidBodyByType[ti];
+						const std::vector<RigidBody>& rbs2 = rigidBodyByType[tj];
+
+						for (int i = 0; i < rbs1.size(); i++) {
+							for (int j = (ti==tj ? 0 : i); j < rbs2.size(); j++) {
+								const RigidBody& rb1 = rbs1[i];
+								const RigidBody& rb2 = rbs2[j];
+									
+								computeGridGridForce<<< numBlocks, numThreads >>>
+									(t1.rawDensityGrids[k1], t2.rawPotentialGrids[k2],
+									 rb1.getBasis(), rb1.getPosition(),
+									 rb2.getBasis(), rb2.getPosition());
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+		
+	// RBTODO: see if there is a better way to sync
+	gpuErrchk(cudaDeviceSynchronize());
+
+}
 
+/*
 RigidBodyController::RigidBodyController(const NamdState *s, const int reductionTag, SimParameters *sp) : state(s), simParams(sp)
 {
     DebugM(2, "Rigid Body Controller initializing" 
@@ -515,3 +584,4 @@ void RigidBodyParamsList::unpack_data(MIStream *msg) {
     DebugM(4, "Finished unpacking rigid body parameter list\n");
     print();
 }
+*/
diff --git a/RigidBodyController.h b/RigidBodyController.h
new file mode 100644
index 0000000..1e06609
--- /dev/null
+++ b/RigidBodyController.h
@@ -0,0 +1,47 @@
+#include <vector>
+#include "RigidBody.h"
+/* #include "RandomCUDA.h" */
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "ComputeGridGrid.cuh"
+
+class Configuration;
+
+struct gridInteractionList {
+	
+};
+
+class RigidBodyController {
+public:
+	/* DEVICE RigidBodyController(const NamdState *s, int reductionTag, SimParameters *sp); */
+	RigidBodyController();
+  ~RigidBodyController();
+	RigidBodyController(const Configuration& c);
+
+	DEVICE void integrate(int step);
+	DEVICE void print(int step);
+    
+private:
+	void updateForces();
+
+	/* void printLegend(std::ofstream &file); */
+	/* void printData(int step, std::ofstream &file); */
+
+	/* std::ofstream trajFile; */
+
+	const Configuration& conf;
+	
+	/* Random* random; */
+	/* RequireReduction *gridReduction; */
+	
+	Vector3* trans; // would have made these static, but
+	Matrix3* rot;  	// there are errors on rigidBody->integrate
+
+	std::vector< std::vector<RigidBody> > rigidBodyByType;
+	/* RigidBody* rigidBodyList; */
+
+	
+	
+};
+
diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu
index f893867..28bb6fe 100644
--- a/RigidBodyGrid.cu
+++ b/RigidBodyGrid.cu
@@ -1,4 +1,3 @@
-
 //////////////////////////////////////////////////////////////////////
 // Grid base class that does just the basics.
 // Author: Jeff Comer <jcomer2@illinois.edu>
@@ -6,7 +5,6 @@
 #include "RigidBodyGrid.h"
 #include <cuda.h>
 
-
 #define STRLEN 512
 
 // Initialize the variables that get used a lot.
@@ -17,7 +15,7 @@ void RigidBodyGrid::init() {
 	val = new float[size];
 }
 RigidBodyGrid::RigidBodyGrid() {
-	RigidBodyGrid tmp(Matrix3(),Vector3(),1,1,1);
+	RigidBodyGrid tmp(1,1,1);
 	val = new float[1];
 	*this = tmp;									// TODO: verify that this is OK
 	
@@ -149,8 +147,8 @@ RigidBodyGrid::RigidBodyGrid(const RigidBodyGrid& g, int nx0, int ny0, int nz0)
 	if (ny <= 0) ny = 1;
 	if (nz <= 0) nz = 1;
 
-	// Tile the grid into the box of the template grid.
-	Matrix3 box = g.getBox();
+	// // Tile the grid into the box of the template grid.
+	// Matrix3 box = g.getBox();
 
 	init();
 
@@ -207,6 +205,7 @@ Vector3 RigidBodyGrid::getPosition(int j) const {
 
 	return Vector3(ix,iy,iz);
 }
+
 Vector3 RigidBodyGrid::getPosition(int j, Matrix3 basis, Vector3 origin) const {
 	int iz = j%nz;
 	int iy = (j/nz)%ny;
@@ -285,7 +284,6 @@ int RigidBodyGrid::index(int ix, int iy, int iz) const { return iz + iy*nz + ix*
 // 	return iz + iy*nz + ix*ny*nz;
 // }
 
-}
 
 // Add a fixed value to the grid.
 void RigidBodyGrid::shift(float s) {
@@ -305,12 +303,12 @@ float RigidBodyGrid::mean() const {
 }
 
 // Get the potential at the closest node.
-float RigidBodyGrid::getPotential(Vector3 pos) const {
-	// Find the nearest node.
-	int j = nearestIndex(pos);
+// float RigidBodyGrid::getPotential(Vector3 pos) const {
+// 	// Find the nearest node.
+// 	int j = nearestIndex(pos);
 
-	return val[j];
-}
+// 	return val[j];
+// }
 
 
 // Added by Rogan for times when simpler calculations are required.
@@ -416,25 +414,6 @@ float RigidBodyGrid::interpolatePotentialLinearly(Vector3 pos) const {
 // }
 
 
-// Includes the home node.
-// indexBuffer must have a size of at least 27.
-void RigidBodyGrid::getNeighbors(int j, int* indexBuffer) const {
-	int jx = indexX(j);
-	int jy = indexY(j);
-	int jz = indexZ(j);
-
-	int k = 0;
-	for (int ix = -1; ix <= 1; ix++) {
-		for (int iy = -1; iy <= 1; iy++) {
-			for (int iz = -1; iz <= 1; iz++) {
-				int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + nynz*wrap(jx+ix,nx);
-				indexBuffer[k] = ind;
-				k++;
-			}
-		}
-	}
-}
-
 // Includes the home node.
 // indexBuffer must have a size of at least 27.
 void RigidBodyGrid::getNeighbors(int j, int* indexBuffer) const {
diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h
index 8453332..1aa49c1 100644
--- a/RigidBodyGrid.h
+++ b/RigidBodyGrid.h
@@ -1,8 +1,9 @@
 //////////////////////////////////////////////////////////////////////
 // Copy of BaseGrid with some modificaitons
 // 
-
-#pragma once
+#ifndef RBBASEGRID_H
+#define RBBASEGRID_H
+// #pragma once
 
 #ifdef __CUDACC__
     #define HOST __host__
@@ -12,6 +13,7 @@
     #define DEVICE 
 #endif
 
+#include "BaseGrid.h"
 #include "useful.h"
 #include <cmath>
 #include <cstring>
@@ -20,17 +22,14 @@
 #include <ctime>
 #include <cuda.h>
 
-
 using namespace std;
 
 #define STRLEN 512
 
-class NeighborList {
-public:
-  float v[3][3][3];
-};
+// RBTODO: integrate with existing grid code?
 
 class RigidBodyGrid { 
+	friend class SparseGrid;
 private:
   // Initialize the variables that get used a lot.
   // Also, allocate the main value array.
@@ -45,7 +44,7 @@ public:
 	// RBTODO Fix?
 	RigidBodyGrid(); // cmaffeo2 (2015) moved this out of protected, cause I wanted RigidBodyGrid in a struct
   // The most obvious of constructors.
-		RigidBodyGrid(int nx0, int ny0, int nz0);
+	RigidBodyGrid(int nx0, int ny0, int nz0);
 
   // Make an orthogonal grid given the box dimensions and resolution.
   RigidBodyGrid(Vector3 box, float dx);
@@ -64,6 +63,9 @@ public:
   // The grid spacing is always a bit smaller than dx.
   RigidBodyGrid(Matrix3 box, float dx);
 
+  // Make a copy of a BaseGrid grid.
+  RigidBodyGrid(const BaseGrid& g);
+
   // Make an exact copy of a grid.
   RigidBodyGrid(const RigidBodyGrid& g);
 
@@ -91,9 +93,9 @@ public:
   virtual float getValue(int ix, int iy, int iz) const;
 
   // Vector3 getPosition(int ix, int iy, int iz) const;
-  Vector3 getPosition(int j) const;
-	Vector3 getPosition(int j, Matrix3 basis, Vector3 origin) const {
-
+  HOST DEVICE Vector3 getPosition(int j) const;
+	HOST DEVICE Vector3 getPosition(int j, Matrix3 basis, Vector3 origin) const;
+		
   /* // Does the point r fall in the grid? */
   /* // Obviously this is without periodic boundary conditions. */
   /* bool inGrid(Vector3 r) const; */
@@ -113,9 +115,7 @@ public:
   /* int index(Vector3 r) const; */
   /* int nearestIndex(Vector3 r) const; */
 
-  HOST DEVICE inline int length() const {
-		return size;
-	}
+  HOST DEVICE inline int length() const { return size; }
 
   HOST DEVICE inline int getNx() const {return nx;}
   HOST DEVICE inline int getNy() const {return ny;}
@@ -134,18 +134,18 @@ public:
 	\=========*/
 	
   // Get the mean of the entire grid.
-  float mean() const
+  float mean() const;
 	
   // Get the potential at the closest node.
-  virtual float getPotential(Vector3 pos) const;
+  /* virtual float getPotential(Vector3 pos) const; */
 
 	// Added by Rogan for times when simpler calculations are required.
   virtual float interpolatePotentialLinearly(Vector3 pos) const;
 
-  HOST DEVICE inline float interpolateDiffX(Vector3 pos, float w[3], float g1[4][4][4]) const {
+	HOST DEVICE inline float interpolateDiffX(Vector3 pos, float w[3], float g1[4][4][4]) const {
     float a0, a1, a2, a3;
-
-		// RBTODO parallelize loops?
+		
+		// RBTODO further parallelize loops? unlikely?
 		
     // Mix along x, taking the derivative.
     float g2[4][4];
@@ -179,8 +179,7 @@ public:
     a1 = 0.5f*(-g3[0] + g3[2]);
     a0 = g3[1];
  
-    float retval = -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
-    return retval;
+    return -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0);
   }
 
   HOST DEVICE inline float interpolateDiffY(Vector3 pos, float w[3], float g1[4][4][4]) const {
@@ -256,8 +255,6 @@ public:
     return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1);
   }
 
-	// RBTODO overload with optimized algorithm
-	//  skip transforms (assume identity basis)
   HOST DEVICE inline float interpolatePotential(Vector3 pos) const {
     // Find the home node.
     Vector3 l = pos;
@@ -296,17 +293,17 @@ public:
     float g1[4][4][4];
     for (int ix = 0; ix < 4; ix++) {
 	  	int jx = ix-1 + home[0];
-	  	jx = wrap(jx, g[0]);
       for (int iy = 0; iy < 4; iy++) {
 	  		int jy = iy-1 + home[1];
-	  		jy = wrap(jy, g[1]);
 				for (int iz = 0; iz < 4; iz++) {
-	  			// Wrap around the periodic boundaries. 
 	  			int jz = iz-1 + home[2];
-	  			jz = wrap(jz, g[2]);
-	  
-	  			int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
-	  			g1[ix][iy][iz] = val[ind];
+					if (jx <  0  ||  jy < 0  ||  jz < 0  ||
+							jx >= nx || jz >= nz || jz >= nz) {
+						g1[ix][iy][iz] = 0;
+					} else {
+						int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+						g1[ix][iy][iz] = val[ind];
+					}
 				}
       }
     }
@@ -387,19 +384,22 @@ public:
 		w[1] = l.y - homeY;
 		w[2] = l.z - homeZ;
 		// Find the values at the neighbors.
-		float g1[4][4][4];
+		float g1[4][4][4];					/* RBTODO: inefficient for my algorithm? */
 		for (int ix = 0; ix < 4; ix++) {
+			int jx = ix-1 + home[0];
 			for (int iy = 0; iy < 4; iy++) {
+				int jy = iy-1 + home[1];
 				for (int iz = 0; iz < 4; iz++) {
-	  			// Wrap around the periodic boundaries. 
-					int jx = ix-1 + home[0];
-					jx = wrap(jx, g[0]);
-					int jy = iy-1 + home[1];
-					jy = wrap(jy, g[1]);
+	  			// Assume zero value at edges
 					int jz = iz-1 + home[2];
-					jz = wrap(jz, g[2]);
-					int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
-					g1[ix][iy][iz] = val[ind];
+					// RBTODO: possible branch divergence in warp?
+					if (jx <  0  ||  jy < 0  ||  jz < 0  ||
+							jx >= nx || jz >= nz || jz >= nz) {
+						g1[ix][iy][iz] = 0;
+					} else {
+						int ind = jz*jump[2] + jy*jump[1] + jx*jump[0];
+						g1[ix][iy][iz] = val[ind];
+					}
 				}
 			}
 		}  
@@ -508,7 +508,7 @@ public:
   // since we do it here.
   void getNeighborValues(NeighborList* neigh, int homeX, int homeY, int homeZ) const;
   inline void setVal(float* v) { val = v; }
-
+	
 public:
   int nx, ny, nz;
   int nynz;
@@ -517,3 +517,4 @@ public:
   float* val;
 };
 
+#endif
diff --git a/RigidBodyType.cu b/RigidBodyType.cu
index 30322be..eca2b46 100644
--- a/RigidBodyType.cu
+++ b/RigidBodyType.cu
@@ -57,6 +57,43 @@ void RigidBodyType::clear() {
 // 	g.grid = * new BaseGrid(token[1]);
 // 	return g;
 // }
+
+
+void RigidBodyType::setDampingCoeffs(float timestep, float tmp_mass, Vector3 tmp_inertia, float tmp_transDamping, float tmp_rotDamping) {
+	mass = tmp_mass;
+	inertia = tmp_inertia;
+	/*â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
+	| DiffCoeff = kT / dampingCoeff mass                     |
+	|                                                          |
+	| type->DampingCoeff has units of (1/ps)                 |
+	|                                                          |
+	| f[kcal/mol AA] = - dampingCoeff * momentum[amu AA/fs]  |
+	|                                                          |
+	| units "(1/ps) * (amu AA/fs)" "kcal_mol/AA" * 2.3900574 |
+	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
+	transDamping = 2.3900574 * tmp_transDamping;
+
+	/*â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
+	| < f(t) f(t') > = 2 kT dampingCoeff mass delta(t-t') |
+	|                                                     |
+	|  units "sqrt( k K (1/ps) amu / fs )" "kcal_mol/AA"  |
+	|    * 0.068916889                                    |
+	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
+	float Temp = 295;								/* RBTODO: Fix!!!! */
+	transForceCoeff = 0.068916889 * Vector3::element_sqrt( 2*Temp*mass*tmp_transDamping/timestep );
+
+	// setup for langevin
+	// langevin = rbParams->langevin;
+	// if (langevin) {
+	// T = - dampingCoeff * angularMomentum
+	rotDamping = 2.3900574 * tmp_rotDamping;
+
+	// < f(t) f(t') > = 2 kT dampingCoeff inertia delta(t-t')
+	rotTorqueCoeff = 0.068916889 *
+		Vector3::element_sqrt( 2*Temp* Vector3::element_mult(inertia,tmp_rotDamping) / timestep );
+	//  }
+}
+
 void RigidBodyType::addPotentialGrid(String s) {
 	// tokenize and return
 	int numTokens = s.tokenCount();
@@ -94,15 +131,27 @@ void RigidBodyType::updateRaw() {
 	if (numDenGrids > 0) delete[] rawDensityGrids;
 	numPotGrids = potentialGrids.size();
 	numDenGrids = densityGrids.size();
-	if (numPotGrids > 0)
-		rawPotentialGrids = new BaseGrid[numPotGrids];
-	if (numDenGrids > 0)
-		rawDensityGrids = new BaseGrid[numDenGrids];
-
-	for (int i=0; i < numPotGrids; i++)
-		rawPotentialGrids[i] = potentialGrids[i];
-	for (int i=0; i < numDenGrids; i++)
-		rawDensityGrids[i] = densityGrids[i];
+	if (numPotGrids > 0) {
+		rawPotentialGrids		= new RigidBodyGrid[numPotGrids];
+		rawPotentialBases		= new Matrix3[numPotGrids];
+		rawPotentialOrigins = new Vector3[numPotGrids];
+	}
+	if (numDenGrids > 0) {
+		rawDensityGrids			= new RigidBodyGrid[numDenGrids];
+		rawDensityBases			= new Matrix3[numDenGrids];
+		rawDensityOrigins		= new Vector3[numDenGrids];
+	}
+
+	for (int i=0; i < numPotGrids; i++) {
+		rawPotentialGrids[i]	 = potentialGrids[i];
+		rawPotentialBases[i]	 = potentialGrids[i].getBasis();
+		rawPotentialOrigins[i] = potentialGrids[i].getOrigin();
+	}
+	for (int i=0; i < numDenGrids; i++) {
+		rawDensityGrids[i]		 = densityGrids[i];
+		rawDensityBases[i]		 = densityGrids[i].getBasis();
+		rawDensityOrigins[i]	 = densityGrids[i].getOrigin();
+	}
 	
 }
 
diff --git a/RigidBodyType.h b/RigidBodyType.h
index 4c334dd..be452c3 100644
--- a/RigidBodyType.h
+++ b/RigidBodyType.h
@@ -6,8 +6,9 @@
 /* #include <thrust/host_vector.h> */
 /* #include <thrust/device_vector.h> */
 #include "Reservoir.h"
-#include "useful.h"
 #include "BaseGrid.h"
+#include "RigidBodyGrid.h"
+#include "useful.h"
 
 #include <cstdio>
 
@@ -37,9 +38,12 @@ RigidBodyType(const String& name = "") :
   void addPotentialGrid(String s);
 	void addDensityGrid(String s);
 	void updateRaw();
-
+	void setDampingCoeffs(float timestep, float tmp_mass, Vector3 tmp_inertia,
+												 float tmp_transDamping, float tmp_rotDamping);
+	
 public:
 
+	
 	String name;
 	int num; // number of particles of this type
 
@@ -49,6 +53,8 @@ public:
 	Vector3 inertia;
 	Vector3 transDamping;
 	Vector3 rotDamping;
+	Vector3 transForceCoeff;
+	Vector3 rotTorqueCoeff;
 
 	std::vector<String> potentialGridKeys;
 	std::vector<String> densityGridKeys;
@@ -59,8 +65,11 @@ public:
 	// for device
 	int numPotGrids;
 	int numDenGrids;
-	BaseGrid* rawPotentialGrids;
-	BaseGrid* rawDensityGrids;
-	
+	RigidBodyGrid* rawPotentialGrids;
+	RigidBodyGrid* rawDensityGrids;
+	Matrix3* rawPotentialBases;
+	Matrix3* rawDensityBases;
+	Vector3* rawPotentialOrigins;
+	Vector3* rawDensityOrigins;		
 };
 
diff --git a/makefile b/makefile
index c119d1b..b701a44 100644
--- a/makefile
+++ b/makefile
@@ -12,7 +12,7 @@ INCLUDE = $(CUDA_PATH)/include
 
 DEBUG = -g
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
-NV_FLAGS = $(DEBUG)
+NV_FLAGS = -rdc=true $(DEBUG)
 EX_FLAGS = -O3 -m$(OS_SIZE)
 
 ifneq ($(MAVERICKS),)
diff --git a/notes.org b/notes.org
index c6ee171..5a3e805 100644
--- a/notes.org
+++ b/notes.org
@@ -1,24 +1,189 @@
-* grid
-** Q: overhead of dynamic parallelism?
-** Q: overhead of classes? 
-	 Member data can't be shared
+* Opportunities for memory bandwidth savings
+** each block should contain a compact set of density grid points
+ cache (automatically!?) potential grid lookups and coefficients! 
+** each block should have same transformation matrices applied to each grid point?!
+** each block could have same inverse transformation matrix applied to each grid point
+*** how well this peforms will depend on the number 
+*** but also simplifies reductions
+
+** new data structure for grids?
+ each grid contains blocks of data of a size optmized for the device 
+** Each thread can operate on multiple data points if needed (any advantage?)
+
 
-** TODO read up on:
-*** dynamic parallelism
-*** registers
-* TODO decide whether to write a monolithic kernel or to break problem into subkernel launches
 
+* questions
+** Q: overhead of dynamic parallelism?
+Where does it make sense to have a kernel call subkernels?
+			A: At least: to sychronize blocks
+
+** Q: overhead for classes?
+** Q: could algorithm use shared memory through persistent threads?
+	 
+	 
 
+* bring in rigid body integrator
 
 * optimize algorithms for cuda (balance floating point calculation with memory transfers)
-** parallize loops
-** subthreads
-** new algorithms may be improved through new data structures; introduce 
-*** RigidBodyGrid: methods pulled from BaseGrid, but transforms passed to functions 
+** share pieces of memory that are used repeatedly
+
+** RigidBodyGrid: methods pulled from BaseGrid, but transforms passed to functions 
 **** no wrapping
 
+** where exactly to parallelize:
+	 Easy to break calculation for density grid, but each thread needs to perform
+	 it's own lookup of potential grid; only O(10) values per thread
+	 
+	 neighboring density grid points will need to lookup similar region of potential grid
+	 						 BUT the regions could be difficult to determine a priori
+							 
+	
+		
+	 possible to move potential grid data into shared memory tiles
+	 	 e.g. blocks of 10x10x10
+     this will be more important for global memory 
+	Better to use cudaCache?
+	 
+	 
+
+
 * pairlists for rigid bodies 
 ** maybe for grids, depending on parallel structure of code
 
 * other ideas
-** interpolate density grids?
+** interpolate density grid?
+
+BaseGrid.h:	// RBTODO Fix?
+BaseGrid.h-	BaseGrid(); // cmaffeo2 (2015) moved this out of protected, cause I wanted BaseGrid in a struct
+BaseGrid.h-  // The most obvious of constructors.
+BaseGrid.h-		BaseGrid(Matrix3 basis0, Vector3 origin0, int nx0, int ny0, int nz0);
+BaseGrid.h-
+--
+BaseGrid.h-    float a0, a1, a2, a3;
+BaseGrid.h-
+BaseGrid.h:		// RBTODO parallelize loops?
+BaseGrid.h-		
+BaseGrid.h-    // Mix along x, taking the derivative.
+BaseGrid.h-    float g2[4][4];
+BaseGrid.h-    for (int iy = 0; iy < 4; iy++) {
+--
+BaseGrid.h-  }
+BaseGrid.h-
+BaseGrid.h:	// RBTODO overload with optimized algorithm
+BaseGrid.h-	//  skip transforms (assume identity basis)
+BaseGrid.h-  HOST DEVICE inline float interpolatePotential(Vector3 pos) const {
+BaseGrid.h-    // Find the home node.
+BaseGrid.h-    Vector3 l = basisInv.transform(pos - origin);
+--
+BaseGrid.h-
+BaseGrid.h-		// out of grid? return 0
+BaseGrid.h:		// RBTODO
+BaseGrid.h-			
+BaseGrid.h-    // Get the array jumps.
+BaseGrid.h-    int jump[3];
+BaseGrid.h-    jump[0] = nz*ny;
+--
+BaseGrid.h-		// Find the values at the neighbors.
+BaseGrid.h-		float g1[4][4][4];
+BaseGrid.h:		//RBTODO parallelize?
+BaseGrid.h-		for (int ix = 0; ix < 4; ix++) {
+BaseGrid.h-			for (int iy = 0; iy < 4; iy++) {
+BaseGrid.h-				for (int iz = 0; iz < 4; iz++) {
+BaseGrid.h-	  			// Wrap around the periodic boundaries. 
+--
+ComputeGridGrid.cuh-													Matrix3 basis_rho, Vector3 origin_rho,
+ComputeGridGrid.cuh-													Matrix3 basis_u,   Vector3 origin_u) {
+ComputeGridGrid.cuh:  // RBTODO http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+ComputeGridGrid.cuh-	const unsigned int r_id = blockIdx.x * blockDim.x + threadIdx.x;
+ComputeGridGrid.cuh-	
+ComputeGridGrid.cuh:	// RBTODO parallelize transform
+ComputeGridGrid.cuh-	if (r_id > rho->size)					// skip threads with no data 
+ComputeGridGrid.cuh-		return;
+ComputeGridGrid.cuh-	
+ComputeGridGrid.cuh-	// Maybe: Tile grid data into shared memory
+ComputeGridGrid.cuh:	//   RBTODO: think about localizing regions of grid data
+ComputeGridGrid.cuh-	Vector3 p = rho->getPosition(r_id, basis, origin);
+ComputeGridGrid.cuh-	float val = rho->val[r_id];
+ComputeGridGrid.cuh-
+ComputeGridGrid.cuh:	// RBTODO reduce forces and torques
+ComputeGridGrid.cuh-	// http://www.cuvilib.com/Reduction.pdf
+ComputeGridGrid.cuh-
+ComputeGridGrid.cuh:	// RBTODO combine interp methods and reduce repetition! 
+ComputeGridGrid.cuh-	float energy = u->interpolatePotential(p); 
+ComputeGridGrid.cuh-	Vector3 f = u->interpolateForceD(p);
+ComputeGridGrid.cuh-	Vector3 t = cross(p,f);				// test if sign is correct!
+ComputeGridGrid.cuh-
+ComputeGridGrid.cuh:	// RBTODO 3rd-law forces + torques
+ComputeGridGrid.cuh-}
+--
+Configuration.cpp-				cudaMemcpyHostToDevice));
+Configuration.cpp-	}
+Configuration.cpp:	// RBTODO: moved this out of preceding loop; was that correct?
+Configuration.cpp-	gpuErrchk(cudaMemcpyAsync(part_d, part_addr, sizeof(BrownianParticleType*) * numParts,
+Configuration.cpp-				cudaMemcpyHostToDevice));
+Configuration.cpp-
+Configuration.cpp-
+--
+Configuration.cpp-			sz = sizeof(float) * len;
+Configuration.cpp-			gpuErrchk(cudaMemcpy( tmpData, g->val, sz, cudaMemcpyHostToDevice));
+Configuration.cpp:			// RBTODO: why can't this be deleted? 
+Configuration.cpp-			// delete[] tmpData;
+Configuration.cpp-		}
+Configuration.cpp-	}
+Configuration.cpp-
+--
+Configuration.cpp-			sz = sizeof(float) * len;
+Configuration.cpp-			gpuErrchk(cudaMemcpy( tmpData, g->val, sz, cudaMemcpyHostToDevice));
+Configuration.cpp:			// RBTODO: why can't this be deleted? 
+Configuration.cpp-			// delete[] tmpData;
+Configuration.cpp-		}
+Configuration.cpp-		
+Configuration.cpp-  }
+--
+RigidBodyGrid.h-	\===============================*/
+RigidBodyGrid.h-	
+RigidBodyGrid.h:	// RBTODO Fix?
+RigidBodyGrid.h-	RigidBodyGrid(); // cmaffeo2 (2015) moved this out of protected, cause I wanted RigidBodyGrid in a struct
+RigidBodyGrid.h-  // The most obvious of constructors.
+RigidBodyGrid.h-		RigidBodyGrid(int nx0, int ny0, int nz0);
+RigidBodyGrid.h-
+--
+RigidBodyGrid.h-    float a0, a1, a2, a3;
+RigidBodyGrid.h-
+RigidBodyGrid.h:		// RBTODO further parallelize loops? unlikely?
+RigidBodyGrid.h-		
+RigidBodyGrid.h-    // Mix along x, taking the derivative.
+RigidBodyGrid.h-    float g2[4][4];
+RigidBodyGrid.h-    for (int iy = 0; iy < 4; iy++) {
+--
+RigidBodyGrid.h-
+RigidBodyGrid.h-		// out of grid? return 0
+RigidBodyGrid.h:		// RBTODO
+RigidBodyGrid.h-			
+RigidBodyGrid.h-    // Get the array jumps.
+RigidBodyGrid.h-    int jump[3];
+RigidBodyGrid.h-    jump[0] = nz*ny;
+--
+RigidBodyGrid.h-		w[2] = l.z - homeZ;
+RigidBodyGrid.h-		// Find the values at the neighbors.
+RigidBodyGrid.h:		float g1[4][4][4];					/* RBTODO: inefficient for my algorithm? */
+RigidBodyGrid.h-		for (int ix = 0; ix < 4; ix++) {
+RigidBodyGrid.h-			int jx = ix-1 + home[0];
+RigidBodyGrid.h-			for (int iy = 0; iy < 4; iy++) {
+RigidBodyGrid.h-				int jy = iy-1 + home[1];
+--
+RigidBodyGrid.h-	  			// Assume zero value at edges
+RigidBodyGrid.h-					int jz = iz-1 + home[2];
+RigidBodyGrid.h:					// RBTODO: possible branch divergence in warp?
+RigidBodyGrid.h-					if (jx <  0  ||  jy < 0  ||  jz < 0  ||
+RigidBodyGrid.h-							jx >= nx || jz >= nz || jz >= nz) {
+RigidBodyGrid.h-						g1[ix][iy][iz] = 0;
+RigidBodyGrid.h-					} else {
+--
+RigidBodyGrid.h-		// Find the values at the neighbors.
+RigidBodyGrid.h-		float g1[4][4][4];
+RigidBodyGrid.h:		//RBTODO parallelize?
+RigidBodyGrid.h-		for (int ix = 0; ix < 4; ix++) {
+RigidBodyGrid.h-			for (int iy = 0; iy < 4; iy++) {
+RigidBodyGrid.h-				for (int iz = 0; iz < 4; iz++) {
+RigidBodyGrid.h-	  			// Wrap around the periodic boundaries. 
diff --git a/useful.h b/useful.h
index 72070e8..327cd7a 100644
--- a/useful.h
+++ b/useful.h
@@ -194,7 +194,6 @@ public:
 		return v;
 	}
 
-
 	HOST DEVICE inline Vector3 operator*(float s) const {
 		Vector3 v;
 		v.x = s*x;
@@ -220,6 +219,22 @@ public:
 		return 0.0f;
 	}
 
+	HOST DEVICE static inline Vector3 element_mult(const Vector3 v, const Vector3 w) {
+		Vector3 ret(
+			v.x*w.x,
+			v.y*w.y,
+			v.z*w.z);
+		return ret;
+	}
+	HOST DEVICE static inline Vector3 element_sqrt(const Vector3 w) {
+		Vector3 ret(
+			sqrt(w.x),
+			sqrt(w.y),
+			sqrt(w.z));
+		return ret;
+	}
+
+	
 	String toString() const;
 	float x, y, z;
 };
@@ -251,6 +266,8 @@ public:
 
 	const Matrix3 operator*(float s) const;
 
+	HOST DEVICE	const Vector3 operator*(const Vector3& v) const	{ return this->transform(v); }
+
 	const Matrix3 operator*(const Matrix3& m) const;
 
 	const Matrix3 operator-() const;
@@ -310,8 +327,11 @@ public:
 	float exx, exy, exz;
 	float eyx, eyy, eyz;
 	float ezx, ezy, ezz;
+
 };
 
+
+
 Matrix3 operator*(float s, Matrix3 m);
 Matrix3 operator/(Matrix3 m, float s);
 
-- 
GitLab