From 8f5693f6d15d57c9e3ce63cfb8909b4fe1982d63 Mon Sep 17 00:00:00 2001
From: Han-Yi Chou <hchou10@illinois.edu>
Date: Sat, 27 Jan 2018 18:18:37 -0600
Subject: [PATCH] modify feature of computing energy and make interpolation
 scheme as an option

Conflicts:
	src/Configuration.cpp
	src/GrandBrownTown.cu
---
 src/BaseGrid.h             |  45 ++++++++--
 src/ComputeForce.cu        |   5 +-
 src/ComputeGridGrid.cu     |  76 ++++++++++------
 src/ComputeGridGrid.cuh    |   5 +-
 src/Configuration.cpp      |  11 ++-
 src/Configuration.h        |   3 +-
 src/GrandBrownTown.cu      | 107 ++++++++++++++--------
 src/GrandBrownTown.cuh     |  76 ++++++++++------
 src/GrandBrownTown.h       |   5 +-
 src/RigidBody.cu           |  72 ++++++++++-----
 src/RigidBody.h            |  19 ++--
 src/RigidBodyController.cu | 179 ++++++++++++++++++++++++-------------
 src/RigidBodyController.h  |  16 ++--
 src/RigidBodyGrid.cu       |  14 +--
 src/useful.h               |  12 +--
 15 files changed, 429 insertions(+), 216 deletions(-)

diff --git a/src/BaseGrid.h b/src/BaseGrid.h
index 5523284..29c4f7a 100644
--- a/src/BaseGrid.h
+++ b/src/BaseGrid.h
@@ -35,10 +35,39 @@ public:
 
 class ForceEnergy {
 public:
-	DEVICE ForceEnergy(Vector3 &f, float &e) :
+        HOST DEVICE ForceEnergy() : f(0.f), e(0.f) {};
+	HOST DEVICE ForceEnergy(Vector3 &f, float &e) :
 		f(f), e(e) {};
-        DEVICE ForceEnergy(float f, float e) :
+        HOST DEVICE explicit ForceEnergy(float e) : f(e), e(e) {};
+        HOST DEVICE ForceEnergy(float f, float e) :
         f(f), e(e) {};
+        HOST DEVICE ForceEnergy(const ForceEnergy& src)
+        {
+            f = src.f;
+            e = src.e;
+        }
+        HOST DEVICE ForceEnergy& operator=(const ForceEnergy& src)
+        {
+            if(&src != this)
+            {
+                this->f = src.f;
+                this->e = src.e;
+            }
+            return *this;
+        }
+        HOST DEVICE ForceEnergy operator+(const ForceEnergy& src)
+        {
+            ForceEnergy fe;
+            fe.f = this->f + src.f;
+            fe.e = this->e + src.e;
+            return fe;
+        }
+        HOST DEVICE ForceEnergy& operator+=(ForceEnergy& src)
+        {
+            this->f += src.f;
+            this->e += src.e;
+            return *this; 
+        }
 	Vector3 f;
 	float e;
 };
@@ -877,13 +906,13 @@ public:
 
         //#define cubic
 	DEVICE inline ForceEnergy interpolateForceDLinearlyPeriodic(const Vector3& pos) const {
-                #ifdef cubic
-                return interpolateForceD(pos);
-                #elif defined(cubic_namd)
-                return interpolateForceDnamd(pos);
-                #else
+                //#ifdef cubic
+                //return interpolateForceD(pos);
+                //#elif defined(cubic_namd)
+                //return interpolateForceDnamd(pos);
+                //#else
                 return interpolateForceDLinearly(pos); 
-                #endif
+                //#endif
                 #if 0
  		const Vector3 l = basisInv.transform(pos - origin);
 
diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index e5d212c..3610f8f 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -6,7 +6,6 @@
 #include "ComputeForce.cuh"
 #include "Configuration.h"
 #include <cuda_profiler_api.h>
-#include <thrust/device_ptr.h>
 #include <fstream>
 #include <iostream>
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
@@ -754,7 +753,7 @@ float ComputeForce::computeTabulated(bool get_energy) {
 
 	// Calculate the energy based on the array created by the kernel
 	// TODO: return energy
-	if (get_energy) 
+	/*if (get_energy) 
         {
             float e = 0.f;
 	    gpuErrchk(cudaDeviceSynchronize());
@@ -772,7 +771,7 @@ float ComputeForce::computeTabulated(bool get_energy) {
                 std::cout << "Error in opening energ files\n";
             }
             energy = e;
-        }
+        }*/
 	return energy;
 }
 
diff --git a/src/ComputeGridGrid.cu b/src/ComputeGridGrid.cu
index 6ba5791..7fae6ae 100644
--- a/src/ComputeGridGrid.cu
+++ b/src/ComputeGridGrid.cu
@@ -5,21 +5,21 @@
 //RBTODO handle periodic boundaries
 //RBTODO: add __restrict__, benchmark (Q: how to restrict member data?)
 __global__
-void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
-													const Matrix3 basis_rho, const Matrix3 basis_u_inv,
-													const Vector3 origin_rho_minus_origin_u,
-													Vector3 * retForce, Vector3 * retTorque) {
+void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, const Matrix3 basis_rho, const Matrix3 basis_u_inv, const Vector3 origin_rho_minus_origin_u,
+			ForceEnergy* retForce, Vector3 * retTorque, int scheme) 
+{
 
-	extern __shared__ Vector3 s[];
-	Vector3 *force = s;
-	Vector3 *torque = &s[NUMTHREADS];
+	extern __shared__ ForceEnergy s[];
+	ForceEnergy *force = s;
+	//Vector3 *torque = &s[NUMTHREADS];
+        ForceEnergy *torque = &s[NUMTHREADS];
 
   // RBTODO: http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
 	const int tid = threadIdx.x;
 	const int r_id = blockIdx.x * blockDim.x + threadIdx.x;
 
-	force[tid] = Vector3(0.0f);
-	torque[tid] = Vector3(0.0f);
+	force[tid] = ForceEnergy(0.f,0.f);
+	torque[tid] = ForceEnergy(0.f,0.f);
 	if (r_id < rho->getSize()) { // skip threads with nothing to do
 		// RBTODO: reduce registers used;
 		//   commenting out interpolateForceD still uses ~40 registers
@@ -32,12 +32,20 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 		/* Vector3 tmpf  = Vector3(0.0f); */
 		/* float tmpe = 0.0f; */
 		/* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */
-		const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
-		force[tid] = fe.f;
+
+                ForceEnergy fe;
+                if(!scheme)
+		    fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
+                else
+                    fe = u->interpolateForceD( u_ijk_float );
+
+		force[tid] = fe;
+                //force[tid].e = fe.e;
+
 		const float r_val = rho->val[r_id]; /* maybe move to beginning of function?  */
-		force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */
+		force[tid].f = basis_u_inv.transpose().transform( r_val*(force[tid].f) ); /* transform to lab frame, with correct scaling factor */
 		// Calculate torque about origin_u in the lab frame
-		torque[tid] = r_pos.cross(force[tid]);
+		torque[tid].f = r_pos.cross(force[tid].f);
 	}
 
 	// Reduce force and torques
@@ -48,6 +56,8 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
 		if (tid < offset) {
 			int oid = tid + offset;
+                        //if(get_energy)
+                            //force[tid].e = force[tid].e + force[oid].e;
 			force[tid] = force[tid] + force[oid];
 			torque[tid] = torque[tid] + torque[oid];
 		}
@@ -55,8 +65,9 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 	}
 
 	if (tid == 0) {
+                //retForce[blockIdx.x].e = force[0].e;
 		retForce[blockIdx.x] = force[0];
-		retTorque[blockIdx.x] = torque[0];
+		retTorque[blockIdx.x] = torque[0].f;
 	}
 }
 
@@ -65,29 +76,38 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 				const int num, const int* __restrict__ particleIds, 
 				const RigidBodyGrid* __restrict__ u,
 				const Matrix3 basis_u_inv, const Vector3 origin_u,
-				Vector3* __restrict__ retForce, Vector3* __restrict__ retTorque) {
+				ForceEnergy* __restrict__ retForce, Vector3* __restrict__ retTorque, float* __restrict__ energy, bool get_energy, int scheme) {
 
-	extern __shared__ Vector3 s[];
-	Vector3 *force = s;
-	Vector3 *torque = &s[NUMTHREADS];
+	extern __shared__ ForceEnergy s[];
+	ForceEnergy *force  = s;
+	ForceEnergy *torque = &s[NUMTHREADS];
   	
 	const int tid = threadIdx.x;
 	const int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-	force[tid] = Vector3(0.0f);
-	torque[tid] = Vector3(0.0f);
+	force[tid]  = ForceEnergy(0.f, 0.f);
+	torque[tid] = ForceEnergy(0.f,0.f);
 	if (i < num) {
-		const int& id = particleIds[i];
+		const int id = particleIds[i];
 		Vector3 p = pos[id] - origin_u;
 		// TODO: wrap to center of u
 		const Vector3 u_ijk_float = basis_u_inv.transform( p );
-		const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
-		force[tid] = fe.f;
-		force[tid] = basis_u_inv.transpose().transform( force[tid] ); /* transform to lab frame */
-		atomicAdd( &particleForce[id], force[tid] ); // apply force to particle
+
+                ForceEnergy fe;
+                if(!scheme)                       
+		    fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
+                else
+                    fe = u->interpolateForceD( u_ijk_float );
+                
+		force[tid] = fe;
+                //force[tid].e = fe.e;
+                if(get_energy)
+                    atomicAdd(&energy[id], fe.e);
+		force[tid].f = basis_u_inv.transpose().transform( force[tid].f ); /* transform to lab frame */
+		atomicAdd( &particleForce[id], force[tid].f ); // apply force to particle
 		
 		// Calculate torque about origin_u in the lab frame
-		torque[tid] = p.cross(force[tid]);				// RBTODO: test sign
+		torque[tid].f = p.cross(force[tid].f);				// RBTODO: test sign
 	}
 
 	// Reduce force and torques
@@ -96,6 +116,8 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 	for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
 		if (tid < offset) {
 			int oid = tid + offset;
+                        //if(get_energy)
+                            //force[tid].e = force[tid].e + force[oid].e;
 			force[tid] = force[tid] + force[oid];
 			torque[tid] = torque[tid] + torque[oid];
 		}
@@ -104,7 +126,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc
 	
 	if (tid == 0) {
 		retForce[blockIdx.x] = force[0];
-		retTorque[blockIdx.x] = torque[0];
+		retTorque[blockIdx.x] = torque[0].f;
 	}
 }
 
diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh
index 0140616..98275a1 100644
--- a/src/ComputeGridGrid.cuh
+++ b/src/ComputeGridGrid.cuh
@@ -5,19 +5,20 @@
 #define WARPSIZE 32
 
 class RigidBodyGrid;
+class ForceEnergy;
 
 extern __global__
 void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 				const Matrix3 basis_rho, const Matrix3 basis_u_inv,
 				const Vector3 origin_rho_minus_origin_u,
-				Vector3 * retForce, Vector3 * retTorque);
+				ForceEnergy* retForce, Vector3 * retTorque, int scheme);
 
 extern __global__
 void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForce,
 				const int num, const int* __restrict__ particleIds,
 				const RigidBodyGrid* __restrict__ u,
 				const Matrix3 basis_u_inv, const Vector3 origin_u,
-				Vector3* __restrict__ retForce, Vector3* __restrict__ retTorque);
+				ForceEnergy* __restrict__ retForce, Vector3* __restrict__ retTorque, float* energy, bool get_energy, int scheme);
 
 extern __global__
 void createPartlist(const Vector3* __restrict__ pos,
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index bbcdebd..61728da 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -6,6 +6,9 @@
 #include <cassert>
 #include <stdlib.h>     /* srand, rand */
 #include <time.h>       /* time */
+#include <iostream>
+using namespace std;
+
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
@@ -776,6 +779,9 @@ void Configuration::setDefaults() {
 	// Hidden parameters
 	// Might be parameters later
 	numCapFactor = 5;
+
+        ParticleInterpolationType = 0;
+        RigidBodyInterpolationType = 0;
 }
 
 int Configuration::readParameters(const char * config_file) {
@@ -925,7 +931,10 @@ int Configuration::readParameters(const char * config_file) {
                     RigidBodyDynamicType = value;
                 else if (param == String("ParticleLangevinIntegrator"))
                     ParticleLangevinIntegrator = value;
-
+                else if (param == String("ParticleInterpolationType"))
+                    ParticleInterpolationType = atoi(value.val());
+                else if (param == String("RigidBodyInterpolationType"))
+                    RigidBodyInterpolationType = atoi(value.val());
 		// PARTICLES
 		else if (param == String("particle")) {
 			part[++currPart] = BrownianParticleType(value);
diff --git a/src/Configuration.h b/src/Configuration.h
index 6bd6996..0895235 100644
--- a/src/Configuration.h
+++ b/src/Configuration.h
@@ -242,7 +242,8 @@ public:
 	// RigidBody parameters.
 	RigidBodyType* rigidBody;
 	int numRigidTypes;
-
+        int ParticleInterpolationType;
+        int RigidBodyInterpolationType;
 };
 
 #endif
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 6e67408..6c3a766 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -5,6 +5,8 @@
 #include "BrownParticlesKernel.h"
 #include <stdlib.h>     /* srand, rand */
 #include <time.h>       /* time */
+#include <thrust/device_ptr.h>
+#include <fstream>
 
 #ifndef gpuErrchk
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
@@ -16,6 +18,16 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
 }
 #endif
 
+static void checkEnergyFiles()
+{
+    std::ifstream part_ifile("energy_config.txt");
+    std::ifstream rb_ifile("rb_energy_config.txt");
+    if(part_ifile.good())
+        rename("energy_config.txt","energy_config.txt.BAK");
+    if(rb_ifile.good())
+        rename("rb_energy_config.txt","rb_energy_config.txt.BAK");
+}
+
 bool GrandBrownTown::DEBUG;
 
 cudaEvent_t START, STOP;
@@ -28,6 +40,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
         //Determine which dynamic. Han-Yi Chou
         particle_dynamic  = c.ParticleDynamicType;
         rigidbody_dynamic = c.RigidBodyDynamicType;
+        ParticleInterpolationType = c.ParticleInterpolationType;
+        RigidBodyInterpolationType = c.RigidBodyInterpolationType;
         //particle_langevin_integrator = c.ParticleLangevinIntegrator;
         printf("%d\n",__LINE__);
 	for (int i = 0; i < numReplicas; ++i) 
@@ -530,6 +544,8 @@ void GrandBrownTown::RunNoseHooverLangevin()
     Vector3 *force_d;
     gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
 
+    checkEnergyFiles();
+
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
     //float total_energy = 0.f;
     // Main loop over Brownian dynamics steps
@@ -541,8 +557,9 @@ void GrandBrownTown::RunNoseHooverLangevin()
         {
             // 'interparticleForce' - determines whether particles interact with each other
             gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
+            gpuErrchk(cudaMemset((void*)(internal->getEnergy()), 0, sizeof(float)*num*numReplicas));
             RBC.clearForceAndTorque(); //Han-Yi Chou
-
+            
             if (interparticleForce)
             {
                 if (tabulatedPotential)
@@ -593,7 +610,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 }
             }//if inter-particle force
             gpuErrchk(cudaDeviceSynchronize());
-            RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+            RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, internal->getEnergy(), get_energy, RigidBodyInterpolationType);
             if(rigidbody_dynamic == String("Langevin"))
             {
                 RBC.SetRandomTorques();
@@ -601,19 +618,20 @@ void GrandBrownTown::RunNoseHooverLangevin()
             }
         }//if step == 1
 
-        gpuErrchk(cudaDeviceSynchronize());
-
+        gpuErrchk(cudaMemset((void*)(internal->getEnergy()), 0, sizeof(float)*num*numReplicas));
+        
         if(particle_dynamic == String("Langevin"))
-            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
         else if(particle_dynamic == String("NoseHooverLangevin"))
             //kernel for Nose-Hoover Langevin dynamic
             updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), 
             internal -> getRan_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, 
-            randoGen_d, numReplicas);
-        //For Brownian motion
+            randoGen_d, numReplicas, ParticleInterpolationType);
+        ////For Brownian motion
         else
             updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(),
-                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy);
+                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, 
+                                                       internal->getEnergy(), get_energy, ParticleInterpolationType);
 
         if(rigidbody_dynamic == String("Langevin"))
         {
@@ -621,7 +639,10 @@ void GrandBrownTown::RunNoseHooverLangevin()
             RBC.integrateDLM(1);
         }
         else
+        {
             RBC.integrate(s);
+            //RBC.print(s);
+        }
         gpuErrchk(cudaDeviceSynchronize());
 
         if (s % outputPeriod == 0)
@@ -696,7 +717,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         }
         if (imd_on && clientsock)
             internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD
-
+         
         RBC.clearForceAndTorque();
         gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
         if (interparticleForce)
@@ -748,11 +769,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
         }
         gpuErrchk(cudaDeviceSynchronize());
         //compute the force for rigid bodies
-        RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+        RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, internal->getEnergy(), get_energy, RigidBodyInterpolationType);
         gpuErrchk(cudaDeviceSynchronize());
 
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
-            LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy);
+            LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), 
+            internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, 
+            ParticleInterpolationType);
             //gpuErrchk(cudaDeviceSynchronize());
 
         if(rigidbody_dynamic == String("Langevin"))
@@ -763,7 +786,6 @@ void GrandBrownTown::RunNoseHooverLangevin()
             RBC.print(s);
         }
 
-        gpuErrchk(cudaDeviceSynchronize());
         if (s % outputPeriod == 0)
         {
             if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
@@ -812,33 +834,46 @@ void GrandBrownTown::RunNoseHooverLangevin()
 
                 // Do the output
                 printf("\rStep %ld [%.2f%% complete | %.3f ms/step | %.3f ns/day]",s, percent, msPerStep, nsPerDay);
+        //}
+        //if (get_energy)
+        //{
 
                 // Copy positions from GPU to CPU.
-                gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                //gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                float e = 0.f;
+                float V = 0.f;
+                thrust::device_ptr<float> en_d(internal->getEnergy());
+                V = (thrust::reduce(en_d, en_d+num*numReplicas)) / numReplicas;
                 if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
                 {
-                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
                     gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                    float e = KineticEnergy();
-                    printf(" The kinetic energy is %f \n",e*(2.388458509e-1));
-                    std::fstream energy_file;
-                    energy_file.open("energy_config.txt", std::fstream::out | std::fstream::app);
-                    if(energy_file.is_open())
-                    {
-                        energy_file << "Kinetic Energy: " << e*num*0.5f*(2.388458509e-1) << " (kT) "<< std::endl;
-                        energy_file.close();
-                    }
-                    else
-                    {
-                        std::cout << "Error in opening energ files\n";
-                    }
+                    e = KineticEnergy();
+                }   
+                std::fstream energy_file;
+                energy_file.open("energy_config.txt", std::fstream::out | std::fstream::app);
+                if(energy_file.is_open())
+                {
+                    energy_file << "Kinetic Energy: " << e*num*0.5f*(2.388458509e-1) << " (kT) "<< std::endl;
+                    energy_file << "Potential Energy: " << V << " (kcal/mol) " << std::endl;
+                    energy_file.close();
+                }
+                else
+                {
+                    std::cout << "Error in opening energ files\n";
                 }
                 
                 if(rigidbody_dynamic == String("Langevin"))
+                    RBC.KineticEnergy();
+                std::fstream rb_energy_file;
+                rb_energy_file.open("rb_energy_config.txt", std::fstream::out | std::fstream::app);
+                if(rb_energy_file.is_open())
                 {
-                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                    float e = RotKineticEnergy();
-                    printf(" The Rotational kinetic energy is %f \n",e*(2.388458509e-1));
+                    RBC.printEnergyData(rb_energy_file);
+                    energy_file.close();
+                }
+                else
+                {
+                    std::cout << "Error in opening rb energy files\n"; 
                 }
 
                 // Write restart files for each replica.
@@ -1067,7 +1102,7 @@ void GrandBrownTown::run() {
 		}//if inter-particle force
 
 	        gpuErrchk(cudaDeviceSynchronize());
-		RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+		RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, RigidBodyInterpolationType);
                 if(rigidbody_dynamic == String("Langevin"))
                 {
                     RBC.SetRandomTorques();
@@ -1241,7 +1276,7 @@ void GrandBrownTown::run() {
             gpuErrchk(cudaDeviceSynchronize());
 
             //compute the force for rigid bodies
-            RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+            RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, RigidBodyInterpolationType);
             //Han-Yi Chou
             //For BAOAB, the last update is only to update the momentum
             gpuErrchk(cudaDeviceSynchronize());
@@ -1515,14 +1550,14 @@ float GrandBrownTown::KineticEnergy()
 
     return 2. * particle_energy / kT / num / numReplicas; //In the unit of 0.5kT
 }
-
-float GrandBrownTown::RotKineticEnergy()
+/*
+void GrandBrownTown::RotKineticEnergy()
 {
-    float e = RBC.KineticEnergy();
+    RBC.KineticEnergy();
 
     return 2. * e / numReplicas / kT; //In the unit of 0.5kT
 }
-
+*/
 void GrandBrownTown::InitNoseHooverBath(int N)
 {
     printf("Entering Nose-Hoover Langevin\n");
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index 9e8ef8b..f42764d 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -101,9 +101,9 @@ __global__ void
 updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__ momentum, float* random, 
                                Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], 
                                float kT, BaseGrid* kTGrid, float electricField, int tGridLength, float timestep, 
-                               int num, BaseGrid* sys, Random* randoGen, int numReplicas)
+                               int num, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx < num * numReplicas)
     {
         int t = type[idx];
@@ -117,7 +117,11 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
         ForceEnergy fe(0.f, 0.f);
         for(int i = 0; i < pt.numPartGridFiles; ++i)
         {
-            ForceEnergy tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(r0);
+            ForceEnergy tmp(0.f,0.f);
+            if(!scheme)
+                tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(r0);
+            else 
+                tmp = pt.pmf[i].interpolateForceD(r0);
             fe.f += tmp.f;
         }
         //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
@@ -193,9 +197,9 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
 __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
                                   int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
                                   float electricField,int tGridLength, float timestep, int num, BaseGrid* sys, 
-                                  Random* randoGen, int numReplicas)
+                                  Random* randoGen, int numReplicas, int scheme)
 {
-    const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (idx < num * numReplicas)
     {
@@ -211,7 +215,11 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
         ForceEnergy fe(0.f, 0.f);
         for(int i = 0; i < pt.numPartGridFiles; ++i)
         {
-            ForceEnergy tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(r0);
+            ForceEnergy tmp(0.f, 0.f);
+            if(!scheme)
+                tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(r0);
+            else
+                tmp = pt.pmf[i].interpolateForceD(r0);
             fe.f += tmp.f;
         }
 
@@ -224,7 +232,7 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
         Vector3 force = forceInternal[idx] + forceExternal + fe.f;
 #ifdef Debug
         forceInternal[idx] = -force;
-        #endif
+#endif
 
         // Get local kT value
         float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */
@@ -272,7 +280,7 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
 __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* __restrict__ forceInternal,
                                       int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
                                       float electricField, int tGridLength, float timestep, int num, 
-                                      BaseGrid* sys, Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy)
+                                      BaseGrid* sys, Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy,int scheme)
 {
     const int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
 
@@ -289,9 +297,14 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _
         ForceEnergy fe(0.f, 0.f);
         for(int i = 0; i < pt.numPartGridFiles; ++i)
         {
-            ForceEnergy tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(r0);
-            fe.f += tmp.f;
-            fe.e += tmp.e;
+            ForceEnergy tmp(0.f, 0.f);
+            if(!scheme)
+                tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(r0);
+            else
+                tmp = pt.pmf[i].interpolateForceD(r0);
+
+            fe += tmp;
+            //fe.e += tmp.e;
         }
         if(get_energy)
             energy[idx] += fe.e;
@@ -304,7 +317,7 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _
         Vector3 force = forceInternal[idx] + forceExternal + fe.f;
 #ifdef Debug
         forceInternal[idx] = -force;
-        #endif
+#endif
 
         #ifdef MDSTEP
         force = Vector3(-r0.x, -r0.y, -r0.z);
@@ -381,14 +394,15 @@ __global__
 void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], 
                   BrownianParticleType* part[],float kT, BaseGrid* kTGrid, float electricField, 
                   int tGridLength, float timestep, int num, BaseGrid* sys,
-		  Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy) 
+		  Random* randoGen, int numReplicas, float* energy, bool get_energy, int scheme) 
 {
 	// Calculate this thread's ID
 	const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
         
 	// TODO: Make this a grid-stride loop to make efficient reuse of RNG states 
 	// Loop over ALL particles in ALL replicas
-	if (idx < num * numReplicas) {
+	if (idx < num * numReplicas) 
+        {
 		const int t = type[idx];
 		Vector3   p = pos[idx];
 
@@ -405,12 +419,13 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[],
 		ForceEnergy fe(0.f, 0.f);
                 for(int i = 0; i < pt.numPartGridFiles; ++i)
                 {
-                    ForceEnergy tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(p);
-                    fe.f += tmp.f;
-                    fe.e += tmp.e;
+                    ForceEnergy tmp(0.f,0.f);
+                    if(!scheme)
+                        tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(p);
+                    else
+                        tmp = pt.pmf[i].interpolateForceD(p);
+                    fe += tmp;
                 }
-                if(get_energy)
-                    energy[idx]+=fe.e;
 #ifndef FORCEGRIDOFF
 		// Add a force defined via 3D FORCE maps (not 3D potential maps)
 		if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(p);
@@ -423,12 +438,10 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[],
 		//	  External:  electric field (now this is basically a constant vector)
 		//	  forceGrid: ADD force due to PMF or other potentials defined in 3D space
 		Vector3 force = forceInternal[idx] + forceExternal + fe.f;
-#ifdef Debug
-        forceInternal[idx] = -force;
-        #endif
+                #ifdef Debug
+                forceInternal[idx] = -force;
+                #endif
 
-		//if (idx == 0)
-			//forceInternal[idx] = force; // write it back out for force0 in run()
 
 		// Get local kT value
 		float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(p); /* periodic */
@@ -466,8 +479,19 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[],
                                    num * numReplicas);
 		// assert( tmp.length() < 10000.0f );
 		pos[idx] = tmp;
-		
-			
+
+                if(get_energy)
+                {
+                    float en_local = 0.f;
+                    for(int i = 0; i < pt.numPartGridFiles; ++i)
+                    {
+                        if(!scheme)
+                            en_local += pt.pmf[i].interpolatePotentialLinearly(tmp);
+                        else
+                            en_local += pt.pmf[i].interpolatePotential(tmp);
+                    }
+                    energy[idx] += en_local;
+                }		
 	}
 }
 /*
diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h
index 6965ca5..afbd008 100644
--- a/src/GrandBrownTown.h
+++ b/src/GrandBrownTown.h
@@ -96,7 +96,7 @@ private:
 
         //Compute the kinetic energy in general. Han-Yi Chou
         float KineticEnergy();
-        float RotKineticEnergy();
+        //float RotKineticEnergy();
 
         //Initialize the Nose-Hoover auxilliary variables
         void InitNoseHooverBath(int N);
@@ -269,7 +269,8 @@ private:
         String particle_dynamic;
         String rigidbody_dynamic;
         String particle_langevin_integrator;
-
+        int ParticleInterpolationType;
+        int RigidBodyInterpolationType;
 	void updateNameList();
 
 	void remember(float t);
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index bc22c29..c0db5e7 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -6,6 +6,10 @@
 #include "Configuration.h"
 #include "ComputeGridGrid.cuh"
 
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_randist.h>
+#include <gsl/gsl_math.h>
+
 #include "Debug.h"
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
@@ -39,9 +43,11 @@ void RigidBody::init() {
 	const int& numGrids = t->numPotGrids;
 	numParticles = new int[numGrids];
 	particles_d = new int*[numGrids];
-	particleForces = new Vector3*[numGrids];
+	//particleForces = new Vector3*[numGrids];
+	particleForces = new ForceEnergy*[numGrids];
 	particleTorques = new Vector3*[numGrids];
-	particleForces_d = new Vector3*[numGrids];
+	//particleForces_d = new Vector3*[numGrids];
+	particleForces_d = new ForceEnergy*[numGrids];
 	particleTorques_d = new Vector3*[numGrids];
 	for (int i = 0; i < numGrids; ++i) {
 	    numParticles[i] = -1;
@@ -50,9 +56,11 @@ void RigidBody::init() {
 		if (n > 0) {
 		    // gpuErrchk(cudaMalloc( &particles_d[i], 0.5*sizeof(int)*n )); // not sure why 0.5 was here; prolly bug
 		        gpuErrchk(cudaMalloc( &particles_d[i], sizeof(int)*n )); // TODO: dynamically allocate memory as needed
-			particleForces[i] = new Vector3[nb];
+			//particleForces[i] = new Vector3[nb];
+                        particleForces[i] = new ForceEnergy[nb];
 			particleTorques[i] = new Vector3[nb];
-			gpuErrchk(cudaMalloc( &particleForces_d[i], sizeof(Vector3)*nb ));
+			//gpuErrchk(cudaMalloc( &particleForces_d[i], sizeof(Vector3)*nb ));
+			gpuErrchk(cudaMalloc( &particleForces_d[i], sizeof(ForceEnergy)*nb ));
 			gpuErrchk(cudaMalloc( &particleTorques_d[i], sizeof(Vector3)*nb ));
 		}
 	}
@@ -62,13 +70,19 @@ void RigidBody::init() {
 void RigidBody::Boltzmann()
 {
 
-    Vector3 rando = getRandomGaussVector();
-    momentum = sqrt(t->mass*Temp) * 2.046167135 * rando;
-    rando = getRandomGaussVector();
-    angularMomentum.x = sqrt(t->inertia.x*Temp) * 2.046167135 * rando.x;
-    angularMomentum.y = sqrt(t->inertia.y*Temp) * 2.046167135 * rando.y;
-    angularMomentum.z = sqrt(t->inertia.z*Temp) * 2.046167135 * rando.z;
+    gsl_rng *gslcpp_rng = gsl_rng_alloc(gsl_rng_default);
+    std::srand(time(NULL));
+    gsl_rng_set (gslcpp_rng, rand() % 100000);
+
+    double sigma[4] = { sqrt(t->mass*Temp) * 2.046167135,sqrt(t->inertia.x*Temp) * 2.046167135, sqrt(t->inertia.y*Temp) * 2.046167135, sqrt(t->inertia.z*Temp) * 2.046167135 };
+
+    //Vector3 rando = getRandomGaussVector();
+    momentum = Vector3(gsl_ran_gaussian(gslcpp_rng,sigma[0]),gsl_ran_gaussian(gslcpp_rng,sigma[0]), gsl_ran_gaussian(gslcpp_rng,sigma[0]));
 
+    angularMomentum.x = gsl_ran_gaussian(gslcpp_rng,sigma[1]);
+    angularMomentum.y = gsl_ran_gaussian(gslcpp_rng,sigma[2]);
+    angularMomentum.z = gsl_ran_gaussian(gslcpp_rng,sigma[3]);
+    printf("%f\n", Temp);
     printf("%f\n", Temperature());
 }
 
@@ -100,7 +114,10 @@ void RigidBody::addForce(Force f) {
 void RigidBody::addTorque(Force torq) {
 	torque += torq; 
 }
-
+void RigidBody::addEnergy(float e)
+{
+    energy += e;
+}
 void RigidBody::updateParticleList(Vector3* pos_d) {
 	for (int i = 0; i < t->numPotGrids; ++i) {
 		numParticles[i] = 0;
@@ -131,7 +148,7 @@ void RigidBody::updateParticleList(Vector3* pos_d) {
 	}
 }
 
-void RigidBody::callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, int s) {
+void RigidBody::callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme) {
 	// Apply the force and torque on the rigid body, and forces on particles
 	
 	// RBTODO: performance: consolidate CUDA stream management
@@ -157,37 +174,44 @@ void RigidBody::callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, in
 		// RBTODO: performance: Improve parellism here (use streams/events; overlap with other computations)
 		const int nb = (numParticles[i]/NUMTHREADS)+1;
 		
-		Vector3* forces = particleForces[i];
+		//Vector3* forces = particleForces[i];
+		ForceEnergy* forces = particleForces[i];
 		Vector3* torques = particleTorques[i];
-		Vector3* forces_d = particleForces_d[i];
+		//Vector3* forces_d = particleForces_d[i];
+		ForceEnergy* forces_d = particleForces_d[i];
 		Vector3* torques_d = particleTorques_d[i];
 
 		for (int k=0; k < nb; ++k) {
-			forces[k] = Vector3(0.0f);
+			//forces[k] = Vector3(0.0f);
+			forces[k] = ForceEnergy(0.f);
 			torques[k] = Vector3(0.0f);
 		}
-		gpuErrchk(cudaMemcpy(forces_d, forces, sizeof(Vector3)*nb, cudaMemcpyHostToDevice));
+		//gpuErrchk(cudaMemcpy(forces_d, forces, sizeof(Vector3)*nb, cudaMemcpyHostToDevice));
+		gpuErrchk(cudaMemcpy(forces_d, forces, sizeof(ForceEnergy)*nb, cudaMemcpyHostToDevice));
 		gpuErrchk(cudaMemcpy(torques_d, torques, sizeof(Vector3)*nb, cudaMemcpyHostToDevice));
 		
-		computePartGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3) >>>(
+		computePartGridForce<<< nb, NUMTHREADS,2*NUMTHREADS*sizeof(ForceEnergy)*nb >>>(
 			pos_d, force_d, numParticles[i], particles_d[i],
 			t->rawPotentialGrids_d[i],
-			B, c, forces_d, torques_d);
+			B, c, forces_d, torques_d, energy, get_energy, scheme);
 		
-		gpuErrchk(cudaMemcpy(forces, forces_d, sizeof(Vector3)*nb, cudaMemcpyDeviceToHost));
+		//gpuErrchk(cudaMemcpy(forces, forces_d, sizeof(Vector3)*nb, cudaMemcpyDeviceToHost));
+		gpuErrchk(cudaMemcpy(forces, forces_d, sizeof(ForceEnergy)*nb, cudaMemcpyDeviceToHost));
 		gpuErrchk(cudaMemcpy(torques, torques_d, sizeof(Vector3)*nb, cudaMemcpyDeviceToHost));
 
 		// Sum and apply forces and torques
-		Vector3 f = Vector3(0.0f);
+		//Vector3 f = Vector3(0.0f);
+		ForceEnergy f = ForceEnergy(0.f,0.f);
 		Vector3 torq = Vector3(0.0f);
 		for (int k = 0; k < nb; ++k) {
 			f = f + forces[k];
 			torq = torq + torques[k];
 		}
 		
-		torq = -torq + (getPosition()-c).cross( f ); 
-		addForce( -f );
+		torq = -torq + (getPosition()-c).cross( f.f ); 
+		addForce( -f.f );
 		addTorque( torq );
+                addEnergy( f.e );
 	}
 }
 
@@ -282,7 +306,7 @@ void RigidBody::integrate(int startFinishAll)
 {
     //if (startFinishAll == 1) return;
 
-    Matrix3 rot = Matrix3(1); // = *p_rot;
+    //Matrix3 rot = Matrix3(1); // = *p_rot;
 
     if ( isnan(force.x) || isnan(torque.x) ) 
     {
@@ -316,7 +340,7 @@ float RigidBody::Temperature()
     return (momentum.length2() / t->mass + 
             angularMomentum.x * angularMomentum.x / t->inertia.x + 
             angularMomentum.y * angularMomentum.y / t->inertia.y + 
-            angularMomentum.z * angularMomentum.z / t->inertia.z) * 0.50;
+            angularMomentum.z * angularMomentum.z / t->inertia.z) * 0.50 / Temp * (2.388458509e-1);
 }
 
 void RigidBody::applyRotation(const Matrix3& R) {
diff --git a/src/RigidBody.h b/src/RigidBody.h
index 9eaf2b1..9974511 100644
--- a/src/RigidBody.h
+++ b/src/RigidBody.h
@@ -18,7 +18,7 @@
 #include "RigidBodyController.h"
 
 class Configuration;
-
+class ForceEnergy;
 
 typedef float BigReal;					/* strip this out later */
 typedef Vector3 Force;
@@ -40,9 +40,11 @@ class RigidBody { // host side representation of rigid bodies
 
 	HOST DEVICE void addForce(Force f); 
 	HOST DEVICE void addTorque(Force t);
+        HOST DEVICE void addEnergy(float e);
 	HOST DEVICE void addLangevin(Vector3 w1, Vector3 w2);
-	
-	HOST DEVICE inline void clearForce() { force = Force(0.0f); }
+        HOST inline void setKinetic(float e) { kinetic = e; };	
+	HOST DEVICE inline void clearForce() { force = Force(0.0f); energy = 0.f;}
+	//HOST DEVICE inline void clearForce() { force = ForceEnergy(0.f, 0.f); }
 	HOST DEVICE inline void clearTorque() { torque = Force(0.0f); }
 
 	// HOST DEVICE void integrate(Vector3& old_trans, Matrix3& old_rot, int startFinishAll);
@@ -61,6 +63,8 @@ class RigidBody { // host side representation of rigid bodies
 	HOST DEVICE inline BigReal getMass() const { return t->mass; }
 	//HOST DEVICE inline Vector3 getVelocity() const { return momentum/t->mass; }
 	HOST DEVICE inline Vector3 getVelocity() const { return momentum; }
+        HOST DEVICE inline float getEnergy() const { return energy; }
+        HOST DEVICE inline float getKinetic() const { return kinetic; }
 	//HOST DEVICE inline Vector3 getAngularVelocity() const { 
 	//	return Vector3( angularMomentum.x / t->inertia.x,
 	//								 angularMomentum.y / t->inertia.y,
@@ -71,7 +75,7 @@ class RigidBody { // host side representation of rigid bodies
         }
 
 	void updateParticleList(Vector3* pos_d);
-	void callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, int s);
+	void callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme);
 	
 	
 	bool langevin;
@@ -110,14 +114,15 @@ private:
 	const RigidBodyType* t;
 	float timestep;					
 	Vector3 force;  // lab frame
-
+        float energy; //potential energy
+        float kinetic; 
 	bool isFirstStep; 
 	
 	int* numParticles;		  /* particles affected by potential grids */
 	int** particles_d;		 	
-	Vector3** particleForces;
+	ForceEnergy** particleForces;
 	Vector3** particleTorques;
-	Vector3** particleForces_d;
+	ForceEnergy** particleForces_d;
 	Vector3** particleTorques_d;
 	
 	
diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu
index 16d3f13..bb49a5e 100644
--- a/src/RigidBodyController.cu
+++ b/src/RigidBodyController.cu
@@ -1,4 +1,5 @@
 /* #include "RigidBody.h" */
+#include <iomanip>
 #include "RigidBodyController.h"
 #include "Configuration.h"
 #include "RigidBodyType.h"
@@ -51,14 +52,6 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* out
 		rigidBodyByType.push_back(tmp);
 	}
 
-	if (conf.inputRBCoordinates.length() > 0)
-		loadRBCoordinates(conf.inputRBCoordinates.val());
-	
-	random = new RandomCPU(conf.seed + 1); /* +1 to avoid using same seed as RandomCUDA */
-	
-	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
-	initializeForcePairs();
-	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
         //Boltzmann distribution
         for (int i = 0; i < rigidBodyByType.size(); i++)
         {
@@ -69,6 +62,15 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* out
                 rb.Boltzmann();
             }
         }
+
+	if (conf.inputRBCoordinates.length() > 0)
+		loadRBCoordinates(conf.inputRBCoordinates.val());
+	
+	random = new RandomCPU(conf.seed + 1); /* +1 to avoid using same seed as RandomCUDA */
+	
+	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
+	initializeForcePairs();
+	gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */
 }
 
 RigidBodyController::~RigidBodyController() {
@@ -100,11 +102,15 @@ bool RigidBodyController::loadRBCoordinates(const char* fileName) {
 
 		String s(line);
 		int numTokens = s.tokenCount();
-		if (numTokens != 3+9) {
+		if (numTokens < 3+9) {
 			printf("GrandBrownTown: load RB coordinates: Invalid coordinate file line: %s\n", line);
 			fclose(inp);	
 			exit(-1);
 		}
+                if(conf.RigidBodyDynamicType == String("Langevin") && numTokens < 18)
+                {
+                    std::cout << "Warning the initial momentum set by random number" << std::endl;
+                }
 
 		String* tokenList = new String[numTokens];
 		s.tokenize(tokenList);
@@ -122,7 +128,12 @@ bool RigidBodyController::loadRBCoordinates(const char* fileName) {
 			(float) strtod(tokenList[6],NULL), (float) strtod(tokenList[7],NULL), (float) strtod(tokenList[8],NULL),
 			(float) strtod(tokenList[9],NULL), (float) strtod(tokenList[10],NULL), (float) strtod(tokenList[11],NULL));
 
-		
+	        if(conf.RigidBodyDynamicType == String("Langevin") && numTokens >= 18)
+                {
+                    rb.momentum = Vector3((float)strtod(tokenList[12],NULL), (float) strtod(tokenList[13],NULL), (float) strtod(tokenList[14],NULL));
+                    rb.angularMomentum = Vector3((float)strtod(tokenList[15],NULL), (float) strtod(tokenList[16],NULL), (float) strtod(tokenList[17],NULL));
+                }
+               
 		delete[] tokenList;
 
 		//i++;
@@ -268,7 +279,7 @@ void RigidBodyController::clearForceAndTorque()
     }
 }
 
-void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s) {
+void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme) {
 	if (s <= 1)
 		gpuErrchk( cudaProfilerStart() );
 	
@@ -276,7 +287,7 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s)
 	for (int i = 0; i < rigidBodyByType.size(); i++) {
 		for (int j = 0; j < rigidBodyByType[i].size(); j++) {
 			RigidBody& rb = rigidBodyByType[i][j];
-			rb.callGridParticleForceKernel( pos_d, force_d, s );
+			rb.callGridParticleForceKernel( pos_d, force_d, s, energy, get_energy, scheme );
 		}
 	}
 	
@@ -285,7 +296,7 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s)
 		for (int i=0; i < forcePairs.size(); i++) {
 			// TODO: performance: make this check occur less frequently
 			if (forcePairs[i].isWithinPairlistDist())
-				forcePairs[i].callGridForceKernel(i,s);
+				forcePairs[i].callGridForceKernel(i, s, scheme);
 		}
 		
 		// each kernel call is followed by async memcpy for previous; now get last
@@ -353,18 +364,26 @@ void RigidBodyController::integrateDLM(int step)
 void RigidBodyController::integrate(int step) 
 {
  	// tell RBs to integrate
-	if ( step % conf.outputPeriod == 0 ) { /* PRINT & INTEGRATE */
-		if (step == 0) {						// first step so only start this cycle
+	if ( step % conf.outputPeriod == 0 ) 
+        {       /* PRINT & INTEGRATE */
+		if (step == 0) 
+                {	// first step so only start this cycle
 			print(step);
-			for (int i = 0; i < rigidBodyByType.size(); i++) {
-				for (int j = 0; j < rigidBodyByType[i].size(); j++) {
+			for (int i = 0; i < rigidBodyByType.size(); i++)
+                        {
+				for (int j = 0; j < rigidBodyByType[i].size(); j++)
+                                {
 					RigidBody& rb = rigidBodyByType[i][j];
 					rb.integrate(0);	
 				}
 			}
-		} else {										// finish last cycle
-			for (int i = 0; i < rigidBodyByType.size(); i++) {
-				for (int j = 0; j < rigidBodyByType[i].size(); j++) {
+		} 
+                else 
+                {       // finish last cycle
+			for (int i = 0; i < rigidBodyByType.size(); i++)
+                        {
+				for (int j = 0; j < rigidBodyByType[i].size(); j++)
+                                {
 					RigidBody& rb = rigidBodyByType[i][j];
 					rb.integrate(1);	
 				}
@@ -372,25 +391,34 @@ void RigidBodyController::integrate(int step)
 			print(step);
 
 			// start this cycle
-			for (int i = 0; i < rigidBodyByType.size(); i++) {
+			/*for (int i = 0; i < rigidBodyByType.size(); i++) {
 				for (int j = 0; j < rigidBodyByType[i].size(); j++) {
 					RigidBody& rb = rigidBodyByType[i][j];
 					rb.integrate(0);	
 				}
-			}
+			}*/
 		}
-	} else {											/* INTEGRATE ONLY */
-		if (step == 0) {						// first step so only start this cycle
+	} 
+        else 
+        {	/* INTEGRATE ONLY */
+		if (step == 0) 
+                {		// first step so only start this cycle
 			print(step);
-			for (int i = 0; i < rigidBodyByType.size(); i++) {
-				for (int j = 0; j < rigidBodyByType[i].size(); j++) {
+			for (int i = 0; i < rigidBodyByType.size(); i++)
+                        {
+				for (int j = 0; j < rigidBodyByType[i].size(); j++)
+                                {
 					RigidBody& rb = rigidBodyByType[i][j];
 					rb.integrate(0);	
 				}
 			}
-		} else {										// integrate end of last step and start of this one
-			for (int i = 0; i < rigidBodyByType.size(); i++) {
-				for (int j = 0; j < rigidBodyByType[i].size(); j++) {
+		} 
+                else 
+                {       // integrate end of last step and start of this one
+			for (int i = 0; i < rigidBodyByType.size(); i++) 
+                        {
+				for (int j = 0; j < rigidBodyByType[i].size(); j++)
+                                {
 					RigidBody& rb = rigidBodyByType[i][j];
 					rb.integrate(2);	
 				}
@@ -399,23 +427,26 @@ void RigidBodyController::integrate(int step)
 	}
 }
 
-float RigidBodyController::KineticEnergy()
+void RigidBodyController::KineticEnergy()
 {
-    float e = 0.;
-    int num = 0;
+    //float e = 0.;
+    //int num = 0;
     for (int i = 0; i < rigidBodyByType.size(); i++) 
     {
         for (int j = 0; j < rigidBodyByType[i].size(); j++) 
         {
             RigidBody& rb = rigidBodyByType[i][j];
-            e += rb.Temperature();
-            num += 1;
+            rb.setKinetic(rb.Temperature());
+            //rb.kinetic=tmp;
+            //e += tmp;
+            //num += 1;
         }
     }
-    if(num > 0)
+    //return e;
+    /*if(num > 0)
         return e / num;
     else
-        return 0.;
+        return 0.;*/
 }
 
 // allocate and initialize an array of stream handles
@@ -468,7 +499,7 @@ Matrix3 RigidBodyForcePair::getBasis2(const int i) {
 }
 
 // RBTODO: bundle several rigidbodypair evaluations in single kernel call
-void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
+void RigidBodyForcePair::callGridForceKernel(int pairId, int s, int scheme) {
 	// get the force/torque between a pair of rigid bodies
 	/* printf("  Updating rbPair forces\n"); */
 	const int numGrids = gridKeyId1.size();
@@ -505,15 +536,15 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
                 
 		// RBTODO: get energy
 		if (!isPmf) {								/* pair of RBs */
-			computeGridGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3), s >>>
+			computeGridGridForce<<< nb, NUMTHREADS, 2*sizeof(ForceEnergy)*NUMTHREADS, s>>>
 				(type1->rawDensityGrids_d[k1], type2->rawPotentialGrids_d[k2],
 				 B1, B2, c,
-				 forces_d[i], torques_d[i]);
+				 forces_d[i], torques_d[i], scheme);
 		} else {										/* RB with a PMF */
-			computeGridGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3), s >>>
+			computeGridGridForce<<< nb, NUMTHREADS, 2*sizeof(ForceEnergy)*NUMTHREADS, s>>>
 				(type1->rawDensityGrids_d[k1], type2->rawPmfs_d[k2],
 				 B1, B2, c,
-				 forces_d[i], torques_d[i]);
+				 forces_d[i], torques_d[i], scheme);
 		}
 		// retrieveForcesForGrid(i); // this is slower than approach below, unsure why
 		
@@ -529,22 +560,21 @@ void RigidBodyForcePair::retrieveForcesForGrid(const int i) {
 	const cudaStream_t &s = stream[streamID[i]];
 	const int nb = numBlocks[i];
 
-	gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb,
-														cudaMemcpyDeviceToHost, s));
-	gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb,
-														cudaMemcpyDeviceToHost, s));
+	gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(ForceEnergy)*nb, cudaMemcpyDeviceToHost, s));
+	gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb, cudaMemcpyDeviceToHost, s));
 }
 void RigidBodyForcePair::processGPUForces() {
 	
 	const int numGrids = gridKeyId1.size();
-	Vector3 f = Vector3(0.0f);
-	Vector3 t = Vector3(0.0f);
-
+	Vector3 f = Vector3(0.f);
+	Vector3 t = Vector3(0.f);
+        float energy = 0.f;
 	for (int i = 0; i < numGrids; i++) {
 		const int nb = numBlocks[i];
 
-		Vector3 tmpF = Vector3(0.0f);
-		Vector3 tmpT = Vector3(0.0f);
+		//Vector3 tmpF = Vector3(0.0f);
+		ForceEnergy tmpF = ForceEnergy(0.f, 0.f);
+		Vector3 tmpT = Vector3(0.f);
 			
 		for (int j = 0; j < nb; j++) {
 			tmpF = tmpF + forces[i][j];
@@ -554,10 +584,11 @@ void RigidBodyForcePair::processGPUForces() {
 		// tmpT is the torque calculated about the origin of grid k2 (e.g. c2)
 		//   so here we transform torque to be about rb1
 		Vector3 o2 = getOrigin2(i);
-		tmpT = tmpT - (rb1->getPosition() - o2).cross( tmpF ); 
+		tmpT = tmpT - (rb1->getPosition() - o2).cross( tmpF.f ); 
 
-		// sum forces and torques
-		f = f + tmpF;
+		// sum energies,forces and torques
+                energy += tmpF.e;
+		f = f + tmpF.f;
 		t = t + tmpT;
 	}
 
@@ -566,11 +597,13 @@ void RigidBodyForcePair::processGPUForces() {
 	
 	rb1->addForce( f );
 	rb1->addTorque( t );
-
+        rb1->addEnergy( energy );
+ 
 	if (!isPmf) {
 		const Vector3 t2 = -t + (rb2->getPosition()-rb1->getPosition()).cross( f );
 		rb2->addForce( -f );
 		rb2->addTorque( t2 );
+                rb2->addEnergy( energy );
 	}
 
 	// printf("force: %s\n", f.toString().val());
@@ -611,6 +644,10 @@ void RigidBodyController::print(int step) {
 			printData(step,trajFile);
 			trajFile.flush();    
 		}
+                if(step % conf.outputEnergyPeriod == 0)
+                {
+                
+                }
     
 		// Write restart File
 		/* if ( simParams->restartFrequency && */
@@ -665,18 +702,38 @@ void RigidBodyController::printData(int step,std::ofstream &file) {
 			Matrix3 t =  rb.getOrientation();
 			file << step <<" "<< rb.getKey()
 					 <<" "<< v.x <<" "<< v.y <<" "<< v.z;
-			file <<" "<< t.exx <<" "<< t.exy <<" "<< t.exz
+			file << std::setprecision(10) <<" "<< t.exx <<" "<< t.exy <<" "<< t.exz
 					 <<" "<< t.eyx <<" "<< t.eyy <<" "<< t.eyz
 					 <<" "<< t.ezx <<" "<< t.ezy <<" "<< t.ezz;
 			v = rb.getVelocity();
-			file <<" "<< v.x <<" "<< v.y <<" "<< v.z;
+			file << std::setprecision(10) <<" "<< v.x <<" "<< v.y <<" "<< v.z;
 			v = rb.getAngularVelocity();
-			file <<" "<< v.x <<" "<< v.y <<" "<< v.z
+			file << std::setprecision(10) <<" "<< v.x <<" "<< v.y <<" "<< v.z
 					 << std::endl;
 		}
 	}
 }
 
+void RigidBodyController::printEnergyData(std::fstream &file)
+{
+    if(file.is_open())
+    {
+
+        for (int i = 0; i < rigidBodyByType.size(); i++) 
+        {
+            for(int j = 0; j < rigidBodyByType[i].size(); j++)
+            {
+                const RigidBody& rb = rigidBodyByType[i][j];
+                file << "Kinetic Energy " << rb.getKey() << ": " << rb.getKinetic() << " (kT)" << std::endl;
+                file << " Potential Energy " << rb.getKey() << ": " << rb.getEnergy() << " (kcal/mol)" << std::endl;
+            }
+       }
+    }
+    else
+    {
+        std::cout << " Error in opening files\n"; 
+    }      
+}
 int RigidBodyForcePair::initialize() {
     // printf("    Initializing (streams for) RB force pair...\n");
 
@@ -692,15 +749,17 @@ int RigidBodyForcePair::initialize() {
 		nextStreamID++;
 
 		numBlocks.push_back(nb);
-		forces.push_back( new Vector3[nb] );
+		//forces.push_back( new Vector3[nb] );
+		forces.push_back( new ForceEnergy[nb]);
 		torques.push_back( new Vector3[nb] );
 
-		forces_d.push_back( new Vector3[nb] ); // RBTODO: correct?
+		//forces_d.push_back( new Vector3[nb] ); // RBTODO: correct?
+		forces_d.push_back( new ForceEnergy[nb]);
 		torques_d.push_back( new Vector3[nb] );
 
 		// allocate device memory for numBlocks of torque, etc.
     // printf("      Allocating device memory for forces/torques\n");
-		gpuErrchk(cudaMalloc(&(forces_d[i]), sizeof(Vector3) * nb));
+		gpuErrchk(cudaMalloc(&(forces_d[i]), sizeof(ForceEnergy) * nb));
 		gpuErrchk(cudaMalloc(&(torques_d[i]), sizeof(Vector3) * nb));
 	}
 	gpuErrchk(cudaDeviceSynchronize());
diff --git a/src/RigidBodyController.h b/src/RigidBodyController.h
index f30df42..1aee363 100644
--- a/src/RigidBodyController.h
+++ b/src/RigidBodyController.h
@@ -14,6 +14,7 @@
 class RigidBodyType;
 class RigidBody;
 class Configuration;
+class ForceEnergy;
 // class RandomCPU;
 #include "RandomCPU.h"
 
@@ -64,8 +65,10 @@ private:
 
 	bool isPmf;
 	
-	std::vector<Vector3*> forces;
-	std::vector<Vector3*> forces_d;
+	//std::vector<Vector3*> forces;
+	//std::vector<Vector3*> forces_d;
+	std::vector<ForceEnergy*> forces;
+        std::vector<ForceEnergy*> forces_d;
 	std::vector<Vector3*> torques;
 	std::vector<Vector3*> torques_d;
 
@@ -78,7 +81,7 @@ private:
 	static RigidBodyForcePair* lastRbForcePair;
 	static int lastRbGridID;
 	
-	void callGridForceKernel(int pairId, int s);
+	void callGridForceKernel(int pairId, int s,int scheme);
 	void retrieveForcesForGrid(const int i);
 	void processGPUForces();
 	Matrix3 getBasis1(const int i);
@@ -98,11 +101,12 @@ public:
         void SetRandomTorques();
 	void integrate(int step);
         void integrateDLM(int step);
-	void updateForces(Vector3* pos_d, Vector3* force_d, int s);
+	void updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme);
 	void updateParticleLists(Vector3* pos_d);
         void clearForceAndTorque(); 
-        float KineticEnergy();
+        void KineticEnergy();
         void print(int step);
+        void printEnergyData(std::fstream &file);
 private:
 	bool loadRBCoordinates(const char* fileName);
 	void initializeForcePairs();
@@ -133,6 +137,6 @@ private:
 	
 	std::vector< RigidBodyForcePair > forcePairs;
 
-	
+        float* rb_energy;	
 	
 };
diff --git a/src/RigidBodyGrid.cu b/src/RigidBodyGrid.cu
index d9b32da..06d68ba 100644
--- a/src/RigidBodyGrid.cu
+++ b/src/RigidBodyGrid.cu
@@ -243,13 +243,13 @@ DEVICE ForceEnergy RigidBodyGrid::interpolateForceD(const Vector3 l) const {
 	
 	return ForceEnergy(f,e);
 }
-#define cubic
+//#define cubic
 DEVICE ForceEnergy RigidBodyGrid::interpolateForceDLinearly(const Vector3& l) const {
-#ifdef cubic
-return interpolateForceD(l);
-#elif defined(cubic_namd)
-return interpolateForceDnamd(l);
-#else
+//#ifdef cubic
+//return interpolateForceD(l);
+//#elif defined(cubic_namd)
+//return interpolateForceDnamd(l);
+//#else
 	// Find the home node.
 	const int homeX = int(floor(l.x));
 	const int homeY = int(floor(l.y));
@@ -293,7 +293,7 @@ return interpolateForceDnamd(l);
 	f.z = -      (g3[2][1] - g3[2][0]);
 	float e = wz * (g3[2][1] - g3[2][0]) + g3[2][0];
 	return ForceEnergy(f,e);
-#endif
+//#endif
 }
 DEVICE ForceEnergy RigidBodyGrid::interpolateForceDnamd(const Vector3& l) const
 {
diff --git a/src/useful.h b/src/useful.h
index 28417b7..cc6bbe0 100644
--- a/src/useful.h
+++ b/src/useful.h
@@ -131,12 +131,12 @@ String operator+(String s1, String s2);
 //
 class MY_ALIGN(16) Vector3 {
 public:
-	HOST DEVICE inline Vector3() : x(0), y(0), z(0) {}
-	HOST DEVICE inline Vector3(float s):x(s), y(s), z(s) {}
-	HOST DEVICE inline Vector3(const Vector3& v):x(v.x), y(v.y), z(v.z) {}
-	HOST DEVICE inline Vector3(float x0, float y0, float z0) : x(x0), y(y0), z(z0) {}
-	HOST DEVICE inline Vector3(const float* d) : x(d[0]), y(d[1]), z(d[2]) {}
-             DEVICE inline Vector3(const float4 a) : x(a.x ), y(a.y ), z(a.z ) {}
+	HOST DEVICE inline Vector3() : x(0), y(0), z(0), w(0) {}
+	HOST DEVICE inline Vector3(float s):x(s), y(s), z(s), w(s) {}
+	HOST DEVICE inline Vector3(const Vector3& v):x(v.x), y(v.y), z(v.z), w(v.w)  {}
+	HOST DEVICE inline Vector3(float x0, float y0, float z0) : x(x0), y(y0), z(z0), w(0) {}
+	HOST DEVICE inline Vector3(const float* d) : x(d[0]), y(d[1]), z(d[2]), w(0) {}
+             DEVICE inline Vector3(const float4 a) : x(a.x ), y(a.y ), z(a.z ), w(a.w) {}
 
 	static Vector3 random(float s);
 
-- 
GitLab