From a7b708511ea945bdf20e51bf6ca8bfa29be9daa0 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 17 Nov 2015 16:12:21 -0600
Subject: [PATCH] moved cuda copy for rigidbodies to controller

---
 ComputeGridGrid.cuh    |   4 +-
 Configuration.cpp      | 119 -----------------
 Configuration.h        |   1 -
 GrandBrownTown.cu      |  17 +--
 RigidBody.cu           |   4 +-
 RigidBody.h            |   2 +-
 RigidBodyController.cu | 293 +++++++++++++++++++----------------------
 RigidBodyController.h  |  12 +-
 RigidBodyGrid.cu       |   9 ++
 RigidBodyType.h        |  10 +-
 makefile               |  14 +-
 notes.org              |  12 +-
 12 files changed, 192 insertions(+), 305 deletions(-)

diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh
index 50a6ba6..173e7bb 100644
--- a/ComputeGridGrid.cuh
+++ b/ComputeGridGrid.cuh
@@ -1,6 +1,6 @@
 #pragma once
-#include "RigidBodyGrid.h"
-#include "useful.h"
+/* #include "RigidBodyGrid.h" */
+/* #include "useful.h" */
 
 __global__
 void computeGridGridForce(const RigidBodyGrid& rho, const RigidBodyGrid& u,
diff --git a/Configuration.cpp b/Configuration.cpp
index 89691e8..d233625 100644
--- a/Configuration.cpp
+++ b/Configuration.cpp
@@ -412,11 +412,9 @@ void Configuration::copyToCUDA() {
 	printf("Copying to GPU %d\n", GPUManager::current());
 
 	BrownianParticleType **part_addr = new BrownianParticleType*[numParts];
-	RigidBodyType **rb_addr = new RigidBodyType*[numRigidTypes];	
 
 	// Copy the BaseGrid objects and their member variables/objects
 	gpuErrchk(cudaMalloc(&part_d, sizeof(BrownianParticleType*) * numParts));
-	gpuErrchk(cudaMalloc(&rbType_d, sizeof(RigidBodyType*) * numRigidTypes));
 	// TODO: The above line fails when there is not enough memory. If it fails, stop.
 	
 	for (int i = 0; i < numParts; i++) {
@@ -459,123 +457,6 @@ void Configuration::copyToCUDA() {
 	gpuErrchk(cudaMemcpyAsync(part_d, part_addr, sizeof(BrownianParticleType*) * numParts,
 				cudaMemcpyHostToDevice));
 
-
-	printf("Copying RBs\n");
-	// Copy rigidbody types 
-	// http://stackoverflow.com/questions/16024087/copy-an-object-to-device
- 	for (int i = 0; i < numRigidTypes; i++) {
-		printf("Working on RB %d\n",i);
-		RigidBodyType *rb = &(rigidBody[i]); // temporary for convenience
-		rb->updateRaw();
-
-		int ng = rb->numPotGrids;
-
-		// copy rigidbody to device
-		// RigidBodyType *rb_d;
-    gpuErrchk(cudaMalloc(&rb_addr[i], sizeof(RigidBodyType)));
-		gpuErrchk(cudaMemcpy(rb_addr[i], rb, sizeof(RigidBodyType),
-												 cudaMemcpyHostToDevice));
-		
-		// copy rb->grid to device
-		BaseGrid * gtmp;
-		// gtmp = new BaseGrid[ng];
-		size_t sz = sizeof(BaseGrid)*ng;
-		
-		// allocate grids on device
-		// copy temporary host pointer to device pointer
-		// copy grids to device through temporary host poin
-		gpuErrchk(cudaMalloc((void **) &gtmp, sz));
-		gpuErrchk(cudaMemcpy(&(rb_addr[i]->rawPotentialGrids), &gtmp, 
-												 sizeof(BaseGrid*) * ng, cudaMemcpyHostToDevice ));
-		gpuErrchk(cudaMemcpy(gtmp, &(rb->rawPotentialGrids),
-												 sizeof(BaseGrid)  * ng, cudaMemcpyHostToDevice ));
-		for (int gid = 0; gid < ng; gid++) {
-			gpuErrchk(cudaMemcpy(&(gtmp[gid]), &(rb->rawPotentialGrids[gid]),
-													 sizeof(BaseGrid), cudaMemcpyHostToDevice ));
-		}
-
-		printf("  RigidBodyType %d: numGrids = %d\n", i, ng);		
-		// copy grid data to device
-		for (int gid = 0; gid < ng; gid++) { 
-			BaseGrid *g = &(rb->rawPotentialGrids[gid]); // convenience
-			int len = g->getSize();
-			float **tmpData;
-			tmpData = new float*[len];
-
-			printf("  RigidBodyType %d: potGrid[%d] size: %d\n", i, gid, len);
-			for (int k = 0; k < len; k++)
-				printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n",
-							 i, gid, k, g->val[k]);
-			
-      // allocate grid data on device
-			// copy temporary host pointer to device pointer
-			// copy data to device through temporary host pointer
-			sz = sizeof(float*) * len;
-			gpuErrchk(cudaMalloc((void **) &tmpData, sz)); 
-			// gpuErrchk(cudaMemcpy( &(rb_addr[i]->rawPotentialGrids[gid].val), &tmpData,
-			// 											sizeof(float*), cudaMemcpyHostToDevice));
-			gpuErrchk(cudaMemcpy( &(gtmp[gid].val), &tmpData,
-														sizeof(float*), cudaMemcpyHostToDevice));
-			sz = sizeof(float) * len;
-			gpuErrchk(cudaMemcpy( tmpData, g->val, sz, cudaMemcpyHostToDevice));
-			// RBTODO: why can't this be deleted? 
-			// delete[] tmpData;
-		}
-	}
-
-	// density grids
- 	for (int i = 0; i < numRigidTypes; i++) {
-		printf("working on RB %d\n",i);
-		RigidBodyType *rb = &(rigidBody[i]); // temporary for convenience
-
-		int ng = rb->numDenGrids;
-		BaseGrid * gtmp;
-		size_t sz = sizeof(BaseGrid)*ng;
-		
-		// allocate grids on device
-		// copy temporary host pointer to device pointer
-		// copy grids to device through temporary host poin
-		gpuErrchk(cudaMalloc((void **) &gtmp, sz));
-		gpuErrchk(cudaMemcpy(&(rb_addr[i]->rawDensityGrids), &gtmp, 
-												 sizeof(BaseGrid*) * ng, cudaMemcpyHostToDevice ));
-		gpuErrchk(cudaMemcpy(gtmp, &(rb->rawDensityGrids),
-												 sizeof(BaseGrid)  * ng, cudaMemcpyHostToDevice ));
-		for (int gid = 0; gid < ng; gid++) {
-			gpuErrchk(cudaMemcpy(&(gtmp[gid]), &(rb->rawDensityGrids[gid]),
-													 sizeof(BaseGrid), cudaMemcpyHostToDevice ));
-		}
-
-		printf("  RigidBodyType %d: numGrids = %d\n", i, ng);		
-		// copy grid data to device
-		for (int gid = 0; gid < ng; gid++) { 
-			BaseGrid *g = &(rb->rawDensityGrids[gid]); // convenience
-			int len = g->getSize();
-			float **tmpData;
-			tmpData = new float*[len];
-
-			printf("  RigidBodyType %d: potGrid[%d] size: %d\n", i, gid, len);
-			for (int k = 0; k < len; k++)
-				printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n",
-							 i, gid, k, g->val[k]);
-			
-      // allocate grid data on device
-			// copy temporary host pointer to device pointer
-			// copy data to device through temporary host pointer
-			sz = sizeof(float*) * len;
-			gpuErrchk(cudaMalloc((void **) &tmpData, sz)); 
-			gpuErrchk(cudaMemcpy( &(gtmp[gid].val), &tmpData,
-														sizeof(float*), cudaMemcpyHostToDevice));
-			sz = sizeof(float) * len;
-			gpuErrchk(cudaMemcpy( tmpData, g->val, sz, cudaMemcpyHostToDevice));
-			// RBTODO: why can't this be deleted? 
-			// delete[] tmpData;
-		}
-		
-  }
-	gpuErrchk(cudaMemcpy(rbType_d, rb_addr, sizeof(RigidBodyType*) * numRigidTypes,
-				cudaMemcpyHostToDevice));
-	printf("Done copying RBs\n");
-
 	// kTGrid_d
 	kTGrid_d = NULL;
 	if (temperatureGrid.length() > 0) {
diff --git a/Configuration.h b/Configuration.h
index 68df323..cb255b1 100644
--- a/Configuration.h
+++ b/Configuration.h
@@ -82,7 +82,6 @@ public:
 	// Device Variables
 	int *type_d;
 	BrownianParticleType **part_d;
-	RigidBodyType **rbType_d;
 	BaseGrid *sys_d, *kTGrid_d;
 	Bond* bonds_d;
 	int2* bondMap_d;
diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index 57b271b..fa54f7e 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -16,7 +16,8 @@ cudaEvent_t START, STOP;
 
 GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 		const long int randomSeed, bool debug, bool imd_on, unsigned int imd_port, int numReplicas) :
-	imd_on(imd_on), imd_port(imd_port), numReplicas(numReplicas), conf(c) {
+	imd_on(imd_on), imd_port(imd_port), numReplicas(numReplicas),
+	conf(c), RBC(RigidBodyController(c)) {
 
 	for (int i = 0; i < numReplicas; i++) {
 		std::stringstream curr_file, restart_file, out_prefix;
@@ -28,13 +29,6 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 		outCurrFiles.push_back(curr_file.str());
 		restartFiles.push_back(restart_file.str());
 		outFilePrefixes.push_back(out_prefix.str());
-		
-		printf("About to devicePrint\n");
-		// devicePrint<<<1,1>>>(&(c.rigidBody[0]));
-		devicePrint<<<1,1>>>(c.rbType_d);
-		cudaDeviceSynchronize();
-		printf("Done with devicePrint\n");
-
 	}
 
 	GrandBrownTown::DEBUG = debug;
@@ -56,6 +50,13 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 
 	// Allocate things for rigid body
 	// RBC = RigidBodyController(c);
+	printf("About to devicePrint\n");
+	// devicePrint<<<1,1>>>(&(c.rigidBody[0]));
+	devicePrint<<<1,1>>>(RBC.rbType_d);
+	cudaDeviceSynchronize();
+	printf("Done with devicePrint\n");
+
+
 	
 	// Replicate identical initial conditions across all replicas
 	// TODO: add an option to generate random initial conditions for all replicas
diff --git a/RigidBody.cu b/RigidBody.cu
index bfbbb90..d35d610 100644
--- a/RigidBody.cu
+++ b/RigidBody.cu
@@ -10,8 +10,10 @@
 
 #include "Debug.h"
 
+
 RigidBody::RigidBody(const Configuration& cref, RigidBodyType& tref)
-	: c(&cref), t(&tref) {
+	: c(&cref), t(&tref), impulse_to_momentum(0.0004184) {
+
 	timestep = c->timestep;
 	// RBTODO: fix this
 	Temp = 295;
diff --git a/RigidBody.h b/RigidBody.h
index 0dc6a9c..e0fe66e 100644
--- a/RigidBody.h
+++ b/RigidBody.h
@@ -84,7 +84,7 @@ private:
 	/*â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
 	| units "kcal_mol/AA * fs" "(AA/fs) * amu" |
 	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
-	const BigReal impulse_to_momentum = 0.0004184; /* should be static, but fails */
+	BigReal impulse_to_momentum; /* should be const, but copy constructor failed */
 
 
 	HOST DEVICE inline Matrix3 Rx(BigReal t);
diff --git a/RigidBodyController.cu b/RigidBodyController.cu
index feb471e..fea8194 100644
--- a/RigidBodyController.cu
+++ b/RigidBodyController.cu
@@ -7,8 +7,9 @@
 /* #include "RigidBody.h" */
 #include "RigidBodyController.h"
 #include "Configuration.h"
-
 #include "RigidBodyType.h"
+#include "ComputeGridGrid.cuh"
+
 /* #include "Random.h" */
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
@@ -27,6 +28,10 @@ inline void gpuAssert(cudaError_t code, String file, int line, bool abort=true)
 RigidBodyController::RigidBodyController(const Configuration& c) :
 conf(c) {
 
+	if (conf.numRigidTypes > 0) {
+		copyGridsToDevice();
+	}
+
 	int numRB = 0;
 	// grow list of rbs
 	for (int i = 0; i < conf.numRigidTypes; i++) {			
@@ -37,8 +42,7 @@ conf(c) {
 			tmp.push_back( r );
 		}
 		rigidBodyByType.push_back(tmp);
-	}
-
+	}	
 }
 RigidBodyController::~RigidBodyController() {
 	for (int i = 0; i < rigidBodyByType.size(); i++)
@@ -62,8 +66,8 @@ void RigidBodyController::updateForces() {
 	| Opportunities for memory bandwidth savings:                              |
 	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
 	// int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-	int numBlocks = 1;
-	int numThreads = 32;
+	// int numBlocks = 1;
+	int numThreads = 256;
 
 	
 	// Loop over all pairs of rigid body types
@@ -89,7 +93,9 @@ void RigidBodyController::updateForces() {
 							for (int j = (ti==tj ? 0 : i); j < rbs2.size(); j++) {
 								const RigidBody& rb1 = rbs1[i];
 								const RigidBody& rb2 = rbs2[j];
-									
+
+								const int sz = t1.rawDensityGrids[k1].getSize();
+								const int numBlocks = sz / numThreads + ((sz % numThreads == 0) ? 0:1 );
 								computeGridGridForce<<< numBlocks, numThreads >>>
 									(t1.rawDensityGrids[k1], t2.rawPotentialGrids[k2],
 									 rb1.getBasis(), rb1.getPosition(),
@@ -104,56 +110,135 @@ void RigidBodyController::updateForces() {
 		
 	// RBTODO: see if there is a better way to sync
 	gpuErrchk(cudaDeviceSynchronize());
-
 }
 
-/*
-RigidBodyController::RigidBodyController(const NamdState *s, const int reductionTag, SimParameters *sp) : state(s), simParams(sp)
-{
-    DebugM(2, "Rigid Body Controller initializing" 
-    	   << "\n" << endi);
-
-    // initialize each RigidBody
-    RigidBodyParams *params =  simParams->rigidBodyList.get_first();
-    while (params != NULL) {
-    	// check validity of params?
-    	RigidBody *rb = new RigidBody(simParams, params);
-    	rigidBodyList.push_back( rb );
-    	params = params->next;
-    }
+void RigidBodyController::copyGridsToDevice() {
+	RigidBodyType **rb_addr = new RigidBodyType*[conf.numRigidTypes];	/* temporary pointer to device pointer */
 
-    // initialize translation and rotation data
-    trans.resize( rigidBodyList.size() );
-    rot.resize( rigidBodyList.size() );
-    for (int i=0; i<rigidBodyList.size(); i++) {
-    	trans[i] = rigidBodyList[i]->getPosition();
-    	rot[i] = rigidBodyList[i]->getOrientation();
-    	// trans.insert( rigidBodyList[i]->getPosition(), i );
-    	// rot.insert( rigidBodyList[i]->getOrientation(), i ); 
-   }
+	gpuErrchk(cudaMalloc(&rbType_d, sizeof(RigidBodyType*) * conf.numRigidTypes));
+	// TODO: The above line fails when there is not enough memory. If it fails, stop.
 
-    random = new Random(simParams->randomSeed);
-    // random->split(0,PatchMap::Object()->numPatches()+1);
-        
-    // inbound communication
-    DebugM(3, "RBC::init: requiring reduction "<<reductionTag<<" with "<<6*rigidBodyList.size()<<" elements\n" << endi);
-    gridReduction = ReductionMgr::Object()->willRequire(reductionTag ,6*rigidBodyList.size() );
+	printf("Copying RBs\n");
+	// Copy rigidbody types 
+	// http://stackoverflow.com/questions/16024087/copy-an-object-to-device
+ 	for (int i = 0; i < conf.numRigidTypes; i++) {
+		printf("Working on RB %d\n",i);
+		RigidBodyType& rb = conf.rigidBody[i]; // temporary for convenience
+		rb.updateRaw();
 
-    // outbound communication
-    CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
-    computeMgr = cm.ckLocalBranch();
+		int ng = rb.numPotGrids;
 
-    if (trans.size() != rot.size())
-	NAMD_die("failed sanity check\n");    
-    RigidBodyMsg *msg = new RigidBodyMsg;
-    msg->trans.copy(trans);	// perhaps .swap() would cause problems
-    msg->rot.copy(rot);
-    computeMgr->sendRigidBodyUpdate(msg);
-}
-RigidBodyController::~RigidBodyController() {
-    delete gridReduction;
+		// copy rigidbody to device
+		// RigidBodyType *rb_d;
+    gpuErrchk(cudaMalloc(&rb_addr[i], sizeof(RigidBodyType)));
+		gpuErrchk(cudaMemcpy(rb_addr[i], &rb, sizeof(RigidBodyType),
+												 cudaMemcpyHostToDevice));
+		
+		// copy rb->grid to device
+		RigidBodyGrid * gtmp;
+		// gtmp = new RigidBodyGrid[ng];
+		size_t sz = sizeof(RigidBodyGrid)*ng;
+		
+		// allocate grids on device
+		// copy temporary host pointer to device pointer
+		// copy grids to device through temporary host poin
+		gpuErrchk(cudaMalloc((void **) &gtmp, sz));
+		gpuErrchk(cudaMemcpy(&(rb_addr[i]->rawPotentialGrids), &gtmp, 
+												 sizeof(RigidBodyGrid*) * ng, cudaMemcpyHostToDevice ));
+		gpuErrchk(cudaMemcpy(gtmp, &(rb.rawPotentialGrids),
+												 sizeof(RigidBodyGrid)  * ng, cudaMemcpyHostToDevice ));
+		for (int gid = 0; gid < ng; gid++) {
+			gpuErrchk(cudaMemcpy(&(gtmp[gid]), &(rb.rawPotentialGrids[gid]),
+													 sizeof(RigidBodyGrid), cudaMemcpyHostToDevice ));
+		}
+
+		printf("  RigidBodyType %d: numGrids = %d\n", i, ng);		
+		// copy potential grid data to device
+		for (int gid = 0; gid < ng; gid++) { 
+			RigidBodyGrid *g = &(rb.rawPotentialGrids[gid]); // convenience
+			int len = g->getSize();
+			float **tmpData;
+			tmpData = new float*[len];
+
+			printf("  RigidBodyType %d: potGrid[%d] size: %d\n", i, gid, len);
+			for (int k = 0; k < len; k++)
+				printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n",
+							 i, gid, k, g->val[k]);
+			
+      // allocate grid data on device
+			// copy temporary host pointer to device pointer
+			// copy data to device through temporary host pointer
+			sz = sizeof(float*) * len;
+			gpuErrchk(cudaMalloc((void **) &tmpData, sz)); 
+			// gpuErrchk(cudaMemcpy( &(rb_addr[i]->rawPotentialGrids[gid].val), &tmpData,
+			// 											sizeof(float*), cudaMemcpyHostToDevice));
+			gpuErrchk(cudaMemcpy( &(gtmp[gid].val), &tmpData,
+														sizeof(float*), cudaMemcpyHostToDevice));
+			sz = sizeof(float) * len;
+			gpuErrchk(cudaMemcpy( tmpData, g->val, sz, cudaMemcpyHostToDevice));
+			// RBTODO: why can't this be deleted? 
+			// delete[] tmpData;
+		}
+	}
+
+	// density grids
+ 	for (int i = 0; i < conf.numRigidTypes; i++) {
+		printf("working on RB %d\n",i);
+		RigidBodyType& rb = conf.rigidBody[i];
+
+		int ng = rb.numDenGrids;
+		RigidBodyGrid * gtmp;
+		size_t sz = sizeof(RigidBodyGrid)*ng;
+		
+		// allocate grids on device
+		// copy temporary host pointer to device pointer
+		// copy grids to device through temporary host poin
+		gpuErrchk(cudaMalloc((void **) &gtmp, sz));
+		gpuErrchk(cudaMemcpy(&(rb_addr[i]->rawDensityGrids), &gtmp, 
+												 sizeof(RigidBodyGrid*) * ng, cudaMemcpyHostToDevice ));
+		gpuErrchk(cudaMemcpy(gtmp, &(rb.rawDensityGrids),
+												 sizeof(RigidBodyGrid)  * ng, cudaMemcpyHostToDevice ));
+		for (int gid = 0; gid < ng; gid++) {
+			gpuErrchk(cudaMemcpy(&(gtmp[gid]), &(rb.rawDensityGrids[gid]),
+													 sizeof(RigidBodyGrid), cudaMemcpyHostToDevice ));
+		}
+
+		printf("  RigidBodyType %d: numGrids = %d\n", i, ng);		
+		// copy grid data to device
+		for (int gid = 0; gid < ng; gid++) { 
+			RigidBodyGrid& g = rb.rawDensityGrids[gid]; // convenience
+			int len = g.getSize();
+			float **tmpData;
+			tmpData = new float*[len];
+
+			printf("  RigidBodyType %d: potGrid[%d] size: %d\n", i, gid, len);
+			for (int k = 0; k < len; k++)
+				printf("    rbType_d[%d]->potGrid[%d].val[%d]: %g\n",
+							 i, gid, k, g.val[k]);
+			
+      // allocate grid data on device
+			// copy temporary host pointer to device pointer
+			// copy data to device through temporary host pointer
+			sz = sizeof(float*) * len;
+			gpuErrchk(cudaMalloc((void **) &tmpData, sz)); 
+			gpuErrchk(cudaMemcpy( &(gtmp[gid].val), &tmpData,
+														sizeof(float*), cudaMemcpyHostToDevice));
+			sz = sizeof(float) * len;
+			gpuErrchk(cudaMemcpy( tmpData, g.val, sz, cudaMemcpyHostToDevice));
+			
+			// RBTODO: why can't this be deleted? 
+			// delete[] tmpData;
+		}
+		
+  }
+	gpuErrchk(cudaMemcpy(rbType_d, rb_addr, sizeof(RigidBodyType*) * conf.numRigidTypes,
+				cudaMemcpyHostToDevice));
+	printf("Done copying RBs\n");
 }
 
+
+
+/*
 void RigidBodyController::print(int step) {
     // modeled after outputExtendedData() in Controller.C
     if ( step >= 0 ) {
@@ -472,116 +557,4 @@ RigidBodyParams* RigidBodyParamsList::add(const char* key)
     return elem;
 }  
 
-const void RigidBodyParams::print() {
-    iout << iINFO
-	 << "printing RigidBodyParams("<<rigidBodyKey<<"):"
-	 <<"\n\t" << "mass: " << mass
-	 <<"\n\t" << "inertia: " << inertia
-	 <<"\n\t" << "langevin: " << langevin
-	 <<"\n\t" << "temperature: " << temperature
-	 <<"\n\t" << "transDampingCoeff: " << transDampingCoeff
-	 <<"\n\t" << "position: " << position
-	 <<"\n\t" << "orientation: " << orientation
-	 <<"\n\t" << "orientationalVelocity: " << orientationalVelocity
-	 << "\n"  << endi;
-
-}
-const void RigidBodyParamsList::print() {
-    iout << iINFO << "Printing " << n_elements << " RigidBodyParams\n" << endi;
-	
-    RigidBodyParams *elem = get_first();
-    while (elem != NULL) {
-	elem->print();
-	elem = elem->next;
-    }
-}
-const void RigidBodyParamsList::print(char *s) {
-    iout << iINFO << "("<<s<<") Printing " << n_elements << " RigidBodyParams\n" << endi;
-	
-    RigidBodyParams *elem = get_first();
-    while (elem != NULL) {
-	elem->print();
-	elem = elem->next;
-    }
-}
-
-void RigidBodyParamsList::pack_data(MOStream *msg) {
-    DebugM(4, "Packing rigid body parameter list\n");
-    print();
-
-    int i = n_elements;
-    msg->put(n_elements);
-    
-    RigidBodyParams *elem = get_first();
-    while (elem != NULL) {
-    	DebugM(4, "Packing a new element\n");
-
-    	int len;
-	Vector v;
-	
-	len = strlen(elem->rigidBodyKey) + 1;
-    	msg->put(len);
-    	msg->put(len,elem->rigidBodyKey);
-	msg->put(elem->mass);
-	
-	// v = elem->
-	msg->put(&(elem->inertia));
-	msg->put( (elem->langevin?1:0) ); 
-	msg->put(elem->temperature);
-	msg->put(&(elem->transDampingCoeff));
-	msg->put(&(elem->rotDampingCoeff));
-    	
-	// elem->gridList.clear();
-	
-	msg->put(&(elem->position));
-	msg->put(&(elem->velocity));
-	// Tensor data = elem->orientation;
-	msg->put( & elem->orientation );
-	msg->put(&(elem->orientationalVelocity)) ;
-	
-	i--;
-	elem = elem->next;
-    }
-    if (i != 0)
-      NAMD_die("MGridforceParams message packing error\n");
-}
-void RigidBodyParamsList::unpack_data(MIStream *msg) {
-    DebugM(4, "Could be unpacking rigid body parameterlist (not used & not implemented)\n");
-
-    int elements;
-    msg->get(elements);
-
-    for(int i=0; i < elements; i++) {
-    	DebugM(4, "Unpacking a new element\n");
-
-	int len;
-	msg->get(len);
-	char *key = new char[len];
-	msg->get(len,key);
-	RigidBodyParams *elem = add(key);
-	delete [] key;
-	
-	msg->get(&(elem->inertia));
-
-	int j;
-	msg->get(j);
-	elem->langevin = (j != 0); 
-	
-	msg->get(elem->temperature);
-	msg->get(&(elem->transDampingCoeff));
-	msg->get(&(elem->rotDampingCoeff));
-    	
-	// elem->gridList.clear();
-	
-	msg->get(&(elem->position));
-	msg->get(&(elem->velocity));
-	msg->get( & elem->orientation );
-	msg->get(&(elem->orientationalVelocity)) ;
-	
-	elem = elem->next;
-    }
-
-    DebugM(4, "Finished unpacking rigid body parameter list\n");
-    print();
-}
 */
diff --git a/RigidBodyController.h b/RigidBodyController.h
index 1e06609..5287410 100644
--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -4,8 +4,6 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-#include "ComputeGridGrid.cuh"
-
 class Configuration;
 
 struct gridInteractionList {
@@ -23,11 +21,15 @@ public:
 	DEVICE void print(int step);
     
 private:
+	void copyGridsToDevice();
 	void updateForces();
 
 	/* void printLegend(std::ofstream &file); */
 	/* void printData(int step, std::ofstream &file); */
-
+public:
+		RigidBodyType** rbType_d;
+	
+private:
 	/* std::ofstream trajFile; */
 
 	const Configuration& conf;
@@ -37,11 +39,7 @@ private:
 	
 	Vector3* trans; // would have made these static, but
 	Matrix3* rot;  	// there are errors on rigidBody->integrate
-
 	std::vector< std::vector<RigidBody> > rigidBodyByType;
-	/* RigidBody* rigidBodyList; */
-
 	
 	
 };
-
diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu
index 28bb6fe..7075071 100644
--- a/RigidBodyGrid.cu
+++ b/RigidBodyGrid.cu
@@ -112,6 +112,15 @@ RigidBodyGrid::RigidBodyGrid(Matrix3 box, float dx) {
 	zero();
 }
 
+RigidBodyGrid::RigidBodyGrid(const BaseGrid& g) {
+	nx = g.nx;
+	ny = g.ny;
+	nz = g.nz;
+	
+	init();
+	for (int i = 0; i < size; i++) val[i] = g.val[i];
+}
+
 // Make an exact copy of a grid.
 RigidBodyGrid::RigidBodyGrid(const RigidBodyGrid& g) {
 	nx = g.nx;
diff --git a/RigidBodyType.h b/RigidBodyType.h
index be452c3..04eb55a 100644
--- a/RigidBodyType.h
+++ b/RigidBodyType.h
@@ -61,8 +61,9 @@ public:
 
 	std::vector<BaseGrid> potentialGrids;
 	std::vector<BaseGrid> densityGrids;
-	
-	// for device
+
+	// RBTODO: clear std::vectors after initialization 
+	// duplicates of std::vector grids for device
 	int numPotGrids;
 	int numDenGrids;
 	RigidBodyGrid* rawPotentialGrids;
@@ -71,5 +72,10 @@ public:
 	Matrix3* rawDensityBases;
 	Vector3* rawPotentialOrigins;
 	Vector3* rawDensityOrigins;		
+
+	// device pointers
+	RigidBodyGrid** rawPotentialGrids_d;
+	RigidBodyGrid** rawDensityGrids_d;
+	
 };
 
diff --git a/makefile b/makefile
index b701a44..6f9a875 100644
--- a/makefile
+++ b/makefile
@@ -12,7 +12,7 @@ INCLUDE = $(CUDA_PATH)/include
 
 DEBUG = -g
 CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
-NV_FLAGS = -rdc=true $(DEBUG)
+NV_FLAGS = $(DEBUG)
 EX_FLAGS = -O3 -m$(OS_SIZE)
 
 ifneq ($(MAVERICKS),)
@@ -29,7 +29,6 @@ else
     LIBRARY = $(CUDA_PATH)/lib64
 endif
 
-LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -Wl,-rpath,$(LIBRARY)
 
 #CODE_10 := -gencode arch=compute_10,code=sm_10
 #CODE_12 := -gencode arch=compute_12,code=sm_12
@@ -40,6 +39,11 @@ CODE_20 := -arch=sm_20
 
 NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35)
 
+NVLD_FLAGS := $(NV_FLAGS) --device-link
+NV_FLAGS += -rdc=true
+
+LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -Wl,-rpath,$(LIBRARY)
+
 
 ### Sources
 CC_SRC := $(wildcard *.cpp)
@@ -59,7 +63,11 @@ all: $(TARGET)
 	@echo "Done ->" $(TARGET)
 
 $(TARGET): $(CU_OBJ) $(CC_OBJ) runBrownTown.cpp vmdsock.c imd.c imd.h
-	$(EXEC) $(CC) $(CC_FLAGS) $(LD_FLAGS) $(EX_FLAGS) runBrownTown.cpp vmdsock.c imd.c $(CU_OBJ) $(CC_OBJ) -o $(TARGET)
+	$(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o
+	$(EXEC) $(CC) $(CC_FLAGS) $(EX_FLAGS) runBrownTown.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS)  -o $(TARGET)
+
+# $(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o
+# $(EXEC) $(CC) $(CC_FLAGS) $(LD_FLAGS) $(EX_FLAGS) runBrownTown.cpp vmdsock.c imd.c $(CU_OBJ) $(CC_OBJ) -o $(TARGET)
 
 .SECONDEXPANSION:
 $(CU_OBJ): %.o: %.cu $$(wildcard %.h) $$(wildcard %.cuh)
diff --git a/notes.org b/notes.org
index 5a3e805..25d5831 100644
--- a/notes.org
+++ b/notes.org
@@ -1,5 +1,15 @@
+* TODO active
+** make rigid body object device pointer
+
+
+
+* organization
+** RgidBodyController (RBC) holds device pointers, manages force evaluation and integration
+
+
+
 * Opportunities for memory bandwidth savings
-** each block should contain a compact set of density grid points
+** each block should (ideally) contain a compact set of density grid points
  cache (automatically!?) potential grid lookups and coefficients! 
 ** each block should have same transformation matrices applied to each grid point?!
 ** each block could have same inverse transformation matrix applied to each grid point
-- 
GitLab