diff --git a/src/BrownianParticleType.cpp b/src/BrownianParticleType.cpp
index 0b56413236549ef34e2a0c381b54437f317f5aa4..50abb69740c2b6fdc4bad1a9083c7ae0425a943d 100644
--- a/src/BrownianParticleType.cpp
+++ b/src/BrownianParticleType.cpp
@@ -13,7 +13,7 @@ void BrownianParticleType::clear() {
         if (meanPmf != NULL) delete []  meanPmf;
 	pmf = NULL, diffusionGrid = NULL;
 	forceXGrid = NULL, forceYGrid = NULL, forceZGrid = NULL;
-	reservoir = NULL; meanPmf = NULL;
+	reservoir = NULL, meanPmf = NULL;
 }
 
 void BrownianParticleType::copy(const BrownianParticleType& src) {
diff --git a/src/BrownianParticleType.h b/src/BrownianParticleType.h
index 67e634652c90695bc58ca5736085da3f9ea9c864..68f99a9bb03d7cec972931bbebcb9347fef3f852 100644
--- a/src/BrownianParticleType.h
+++ b/src/BrownianParticleType.h
@@ -30,8 +30,8 @@ class BrownianParticleType {
 		BrownianParticleType(const String& name = "") :
 				name(name), num(0),
 				diffusion(0.0f), radius(1.0f), charge(0.0f), eps(0.0f), meanPmf(NULL),
-				reservoir(NULL), pmf(NULL), diffusionGrid(NULL),
-				forceXGrid(NULL), forceYGrid(NULL), forceZGrid(NULL), numPartGridFiles(-1) { }
+				numPartGridFiles(-1), reservoir(NULL), pmf(NULL), diffusionGrid(NULL),
+				forceXGrid(NULL), forceYGrid(NULL), forceZGrid(NULL){ }
 
 		BrownianParticleType(const BrownianParticleType& src) { copy(src); }
 
diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 903b6c22a1ade77d2e2018b86364ddfd8894d9aa..e5d212ce7b6f4dbf598b8de43ea4dbc3246f4ef9 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -6,8 +6,9 @@
 #include "ComputeForce.cuh"
 #include "Configuration.h"
 #include <cuda_profiler_api.h>
-
-
+#include <thrust/device_ptr.h>
+#include <fstream>
+#include <iostream>
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
    if (code != cudaSuccess) {
@@ -121,7 +122,7 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
                 gpuErrchk(cudaBindTexture(0, pairTabPotTypeTex, pairTabPotType_d, sizeof(int)*maxPairs)); //Han-Yi
 
                 //Han-Yi Chou
-                size_t nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
+                int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
                 //int* nCells_dev;
                 int3 *Cells_dev;
 
@@ -130,7 +131,7 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
                 gpuErrchk(cudaMalloc(&Cells_dev,sizeof(int3)));
                 //gpuErrchk(cudaMemcpy(nCells_dev,&nCells,1,cudaMemcpyHostToDevice);
                 gpuErrchk(cudaMemcpy(Cells_dev,&(decomp.nCells),sizeof(int3),cudaMemcpyHostToDevice));
-                createNeighborsList<<<1,64>>>(Cells_dev,CellNeighborsList);
+                createNeighborsList<<<256,256>>>(Cells_dev,CellNeighborsList);
                 gpuErrchk(cudaFree(Cells_dev));
                 cudaBindTexture(0, NeighborsTex, CellNeighborsList, 27*nCells*sizeof(int));
 	}
@@ -139,8 +140,7 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
 	gridSize =  num / NUM_THREADS + 1;
 
 	// Create and allocate the energy arrays
-	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * num));
-	
+	gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * num * numReplicas));
 	cudaEventCreate(&start);
 	cudaEventCreate(&stop);
 }
@@ -702,11 +702,12 @@ float ComputeForce::computeTabulated(bool get_energy) {
 	gpuErrchk(cudaDeviceSynchronize());
 
 	// RBTODO: get_energy
-	//if (get_energy)
-	if (false) 
+	if (get_energy)
+	//if (false) 
 	{
-		clearEnergies<<< nb, numThreads >>>(energies_d,num);
-		gpuErrchk(cudaDeviceSynchronize());
+		//clearEnergies<<< nb, numThreads >>>(energies_d,num);
+		//gpuErrchk(cudaDeviceSynchronize());
+		cudaMemset((void*)energies_d, 0, sizeof(float)*num*numReplicas);
 		computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, sys_d,
 						cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d, energies_d);
 	}
@@ -728,27 +729,50 @@ float ComputeForce::computeTabulated(bool get_energy) {
 
 	{
 	    //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d);
-	computeTabulatedBonds <<<nb, numThreads>>> ( forceInternal_d, pos_d, sys_d, numReplicas*numBonds/2, bondList_d, tableBond_d);
+	//computeTabulatedBonds <<<nb, numThreads>>> ( forceInternal_d, pos_d, sys_d, numReplicas*numBonds/2, bondList_d, tableBond_d);
+	  //if(get_energy)
+              //cudaMemset(bond_energy_d, 0, sizeof(float)*num);
+	    computeTabulatedBonds <<<nb, numThreads>>> ( forceInternal_d, pos_d, sys_d, numReplicas*numBonds/2, bondList_d, tableBond_d, energies_d, get_energy);
 	}
 
 	if (angleList_d != NULL && tableAngle_d != NULL)
-		computeTabulatedAngles<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numAngles*numReplicas, angleList_d, tableAngle_d);
-
+        {
+            //if(get_energy)
+		//computeTabulatedAngles<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numAngles*numReplicas, angleList_d, tableAngle_d);
+		computeTabulatedAngles<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numAngles*numReplicas, angleList_d, tableAngle_d, energies_d, get_energy);
+        }
 	if (dihedralList_d != NULL && tableDihedral_d != NULL)
-		computeTabulatedDihedrals<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numDihedrals*numReplicas, dihedralList_d, dihedralPotList_d, tableDihedral_d);
-
+        {
+            //if(get_energy)
+		//computeTabulatedDihedrals<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numDihedrals*numReplicas, dihedralList_d, dihedralPotList_d, tableDihedral_d);
+		computeTabulatedDihedrals<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numDihedrals*numReplicas, 
+                dihedralList_d, dihedralPotList_d, tableDihedral_d, energies_d, get_energy);
+        }
 	if (restraintIds_d != NULL )
 	    computeHarmonicRestraints<<<1, numThreads>>>(forceInternal_d, pos_d, sys_d, numRestraints*numReplicas, restraintIds_d, restraintLocs_d, restraintSprings_d);
 	
 
 	// Calculate the energy based on the array created by the kernel
 	// TODO: return energy
-	// if (get_energy) {
-	// 	gpuErrchk(cudaDeviceSynchronize());
-	// 	thrust::device_ptr<float> en_d(energies_d);
-	// 	energy = thrust::reduce(en_d, en_d + num);
-	// }
-
+	if (get_energy) 
+        {
+            float e = 0.f;
+	    gpuErrchk(cudaDeviceSynchronize());
+	    thrust::device_ptr<float> en_d(energies_d);
+	    e = (thrust::reduce(en_d, en_d+num*numReplicas)) / numReplicas;
+            std::fstream energy_file;
+            energy_file.open("energy_config.txt", std::fstream::out | std::fstream::app);
+            if(energy_file.is_open())
+            {
+                energy_file << "Configuation Energy: "  << e << " kcal/mol " << std::endl;
+                energy_file.close();
+            }
+            else
+            {
+                std::cout << "Error in opening energ files\n";
+            }
+            energy = e;
+        }
 	return energy;
 }
 
diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh
index 8547e602857358731280f735bd97257906a2b2df..d325f9fd221766acea6b766bcd4e7ba7b09f37e3 100644
--- a/src/ComputeForce.cuh
+++ b/src/ComputeForce.cuh
@@ -987,10 +987,14 @@ void computeAngles(Vector3 force[], Vector3 pos[],
 }
 
 // TODO: add kernels for energy calculations
-__global__ void computeTabulatedBonds(Vector3* force,
-				Vector3* __restrict__ pos,
-				BaseGrid* __restrict__ sys,
-				int numBonds, int3* __restrict__ bondList_d, TabulatedPotential** tableBond) {
+//__global__ void computeTabulatedBonds(Vector3* force,
+//				Vector3* __restrict__ pos,
+//				BaseGrid* __restrict__ sys,
+//				int numBonds, int3* __restrict__ bondList_d, TabulatedPotential** tableBond) {
+__global__
+void computeTabulatedBonds(Vector3* force, Vector3* __restrict__ pos, BaseGrid* __restrict__ sys, 
+int numBonds, int3* __restrict__ bondList_d, TabulatedPotential** tableBond, float* energy, bool get_energy)
+{
 	// Loop over ALL bonds in ALL replicas
 	for (int bid = threadIdx.x+blockIdx.x*blockDim.x; bid<numBonds; bid+=blockDim.x*gridDim.x) {
 		// Initialize interaction energy (per particle)
@@ -1003,17 +1007,19 @@ __global__ void computeTabulatedBonds(Vector3* force,
 		// wrapping this value if necessary
 		const Vector3 dr = sys->wrapDiff(pos[j] - pos[i]);
 
-		Vector3 force_local = tableBond[ bondList_d[bid].z ]->computef(dr,dr.length2());
-			
-		atomicAdd( &force[i], force_local );
-		atomicAdd( &force[j], -force_local );
-		
-		// if (get_energy)
-		// {
-		// 	//TODO: clarification on energy computation needed, consider changing.
-		// 	atomicAdd( &g_energies[i], energy_local);
-		// 	//atomicAdd( &g_energies[j], energy_local);
-		// }
+		//Vector3 force_local = tableBond[ bondList_d[bid].z ]->computef(dr,dr.length2());
+	        EnergyForce fe_local = tableBond[ bondList_d[bid].z ]->compute(dr,dr.length2());	
+		//atomicAdd( &force[i], force_local );
+		//atomicAdd( &force[j], -force_local );
+		atomicAdd( &force[i], fe_local.f );
+                atomicAdd( &force[j], -fe_local.f );
+
+		if (get_energy)
+		{
+		 	//TODO: clarification on energy computation needed, consider changing.
+		 	atomicAdd( &energy[i], fe_local.e*0.5f);
+		        atomicAdd( &energy[j], fe_local.e*0.5f);
+		}
 	}
 }
 
@@ -1021,11 +1027,11 @@ __global__
 void computeTabulatedAngles(Vector3* force,
 				Vector3* __restrict__ pos,
 				BaseGrid* __restrict__ sys,
-				int numAngles, int4* __restrict__ angleList_d, TabulatedAnglePotential** tableAngle) {
+				int numAngles, int4* __restrict__ angleList_d, TabulatedAnglePotential** tableAngle, float* energy, bool get_energy) {
 	// Loop over ALL angles in ALL replicas
 	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i<numAngles; i+=blockDim.x*gridDim.x) {
 		int4& ids = angleList_d[i];
-		computeAngle(tableAngle[ ids.w ], sys, force, pos, ids.x, ids.y, ids.z);
+		computeAngle(tableAngle[ ids.w ], sys, force, pos, ids.x, ids.y, ids.z, energy, get_energy);
 	// if (get_energy)
 	// {
 	//     //TODO: clarification on energy computation needed, consider changing.
@@ -1128,7 +1134,7 @@ __global__
 void computeTabulatedDihedrals(Vector3* force, const Vector3* __restrict__ pos,
 			       const BaseGrid* __restrict__ sys,
 			       int numDihedrals, const int4* const __restrict__ dihedralList_d,
-			       const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral) {
+			       const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral, float* energy, bool get_energy) {
 
 	// int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID
 
@@ -1136,7 +1142,7 @@ void computeTabulatedDihedrals(Vector3* force, const Vector3* __restrict__ pos,
 	for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < numDihedrals; i+=blockDim.x*gridDim.x) {
 		const int4& ids = dihedralList_d[i];
 		const int& id = dihedralPotList_d[i];
-		computeDihedral(tableDihedral[ id ], sys, force, pos, ids.x, ids.y, ids.z, ids.w);
+		computeDihedral(tableDihedral[ id ], sys, force, pos, ids.x, ids.y, ids.z, ids.w, energy, get_energy);
 
 	// if (get_energy)
 	// {
diff --git a/src/ComputeForce.h b/src/ComputeForce.h
index e3101bf63503ef10313586f1e60535e0cd2df482..2b4f7c7d6471c95208f6125c56540160750c3666 100644
--- a/src/ComputeForce.h
+++ b/src/ComputeForce.h
@@ -141,7 +141,10 @@ public:
 		return bondList_d;
 	}
 	
-
+        float* getEnergy()
+        {
+            return energies_d;
+        }
 	HOST DEVICE
 	static EnergyForce coulombForce(Vector3 r, float alpha,float start, float len);
 
diff --git a/src/ComputeGridGrid.cu b/src/ComputeGridGrid.cu
index 1cbba924bcfd0f402e985906816dbd52692795e0..6ba57915b8172e52340dabb967227fa65427134f 100644
--- a/src/ComputeGridGrid.cu
+++ b/src/ComputeGridGrid.cu
@@ -2,7 +2,6 @@
 #include "ComputeGridGrid.cuh"
 #include "RigidBodyGrid.h"
 #include "CudaUtil.cuh"
-
 //RBTODO handle periodic boundaries
 //RBTODO: add __restrict__, benchmark (Q: how to restrict member data?)
 __global__
@@ -29,17 +28,14 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u,
 
 		r_pos = basis_rho.transform( r_pos ) + origin_rho_minus_origin_u; /* real space */
 		const Vector3 u_ijk_float = basis_u_inv.transform( r_pos );
-
 		// RBTODO: Test for non-unit delta
 		/* Vector3 tmpf  = Vector3(0.0f); */
 		/* float tmpe = 0.0f; */
 		/* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */
 		const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */
 		force[tid] = fe.f;
-		
 		const float r_val = rho->val[r_id]; /* maybe move to beginning of function?  */
 		force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */
-
 		// Calculate torque about origin_u in the lab frame
 		torque[tid] = r_pos.cross(force[tid]);
 	}
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index 8dfbb22f1c43a600facaf55f3c17b3bfac68267e..bbcdebd15c1d06ade7fb3e4953787492442da721 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -1105,7 +1105,6 @@ int Configuration::readParameters(const char * config_file) {
                         {
                                 printf("The grid file name %s\n", value.val());
 				//partGridFile[currPart] = value;
-				cout << part[currPart].numPartGridFiles << endl;
 				stringToArray<String>(&value, part[currPart].numPartGridFiles, 
                                                              &partGridFile[currPart]);
                                 for(int i = 0; i < part[currPart].numPartGridFiles; ++i)
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 26df336db6a7604757d3ed6b3fd20a3887bfcb96..6e67408324cbe433f90fcc5f369aa03fe996d109 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -77,7 +77,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
             if(particle_dynamic == String("NoseHooverLangevin"))
                 random = new float[num * numReplicas];
         }
-        printf("%d\n",__LINE__);
+        //printf("%d\n",__LINE__);
         //for debug
         //force = new Vector3[num * numReplicas];
 
@@ -531,7 +531,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
     gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
-
+    //float total_energy = 0.f;
     // Main loop over Brownian dynamics steps
     for (long int s = 1; s < steps; s++)
     {
@@ -608,11 +608,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
         else if(particle_dynamic == String("NoseHooverLangevin"))
             //kernel for Nose-Hoover Langevin dynamic
             updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), 
-            internal -> getRan_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+            internal -> getRan_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, 
+            randoGen_d, numReplicas);
         //For Brownian motion
         else
             updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(),
-                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy);
 
         if(rigidbody_dynamic == String("Langevin"))
         {
@@ -751,7 +752,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         gpuErrchk(cudaDeviceSynchronize());
 
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
-            LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+            LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy);
             //gpuErrchk(cudaDeviceSynchronize());
 
         if(rigidbody_dynamic == String("Langevin"))
@@ -802,7 +803,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 wkf_timer_stop(timerS);
                 t = s * timestep;
                 // Simulation progress and statistics.
-                                   float percent = (100.0f * s) / steps;
+                float percent = (100.0f * s) / steps;
                 float msPerStep = wkf_timer_time(timerS) * 1000.0f / outputEnergyPeriod;
                 float nsPerDay = numReplicas * timestep / msPerStep * 864E5f;
 
@@ -820,7 +821,19 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
                     float e = KineticEnergy();
                     printf(" The kinetic energy is %f \n",e*(2.388458509e-1));
+                    std::fstream energy_file;
+                    energy_file.open("energy_config.txt", std::fstream::out | std::fstream::app);
+                    if(energy_file.is_open())
+                    {
+                        energy_file << "Kinetic Energy: " << e*num*0.5f*(2.388458509e-1) << " (kT) "<< std::endl;
+                        energy_file.close();
+                    }
+                    else
+                    {
+                        std::cout << "Error in opening energ files\n";
+                    }
                 }
+                
                 if(rigidbody_dynamic == String("Langevin"))
                 {
                     //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index 2ff30850f460bd5849fd6a3c6b51985824843b20..9e8ef8bdf904c4a0563f91ee67e6f8e2fb17d6a2 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -272,7 +272,7 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
 __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* __restrict__ forceInternal,
                                       int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
                                       float electricField, int tGridLength, float timestep, int num, 
-                                      BaseGrid* sys, Random* randoGen, int numReplicas)
+                                      BaseGrid* sys, Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy)
 {
     const int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
 
@@ -291,8 +291,10 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _
         {
             ForceEnergy tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(r0);
             fe.f += tmp.f;
+            fe.e += tmp.e;
         }
-
+        if(get_energy)
+            energy[idx] += fe.e;
 #ifndef FORCEGRIDOFF
         // Add a force defined via 3D FORCE maps (not 3D potential maps)
         if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
@@ -379,7 +381,7 @@ __global__
 void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], 
                   BrownianParticleType* part[],float kT, BaseGrid* kTGrid, float electricField, 
                   int tGridLength, float timestep, int num, BaseGrid* sys,
-		  Random* randoGen, int numReplicas) 
+		  Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy) 
 {
 	// Calculate this thread's ID
 	const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
@@ -405,8 +407,10 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[],
                 {
                     ForceEnergy tmp = pt.pmf[i].interpolateForceDLinearlyPeriodic(p);
                     fe.f += tmp.f;
+                    fe.e += tmp.e;
                 }
-
+                if(get_energy)
+                    energy[idx]+=fe.e;
 #ifndef FORCEGRIDOFF
 		// Add a force defined via 3D FORCE maps (not 3D potential maps)
 		if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(p);
diff --git a/src/RigidBody.cu b/src/RigidBody.cu
index 7c9fdce55165e98811e9b4a42c68a73d5c24c309..bc22c296cf9ed682b25cc092f510762261391f44 100644
--- a/src/RigidBody.cu
+++ b/src/RigidBody.cu
@@ -231,6 +231,7 @@ void RigidBody::integrateDLM(int startFinishAll)
 {
     Vector3 trans; // = *p_trans;
     //Matrix3 rot = Matrix3(1); // = *p_rot;
+
     if ( isnan(force.x) || isnan(torque.x) ) 
     {   
         // NaN check
diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu
index de54866acc4ef9ceec59b2571fe1f97de0403834..16d3f13c8a0fe48af659f24628d078bd0624d83b 100644
--- a/src/RigidBodyController.cu
+++ b/src/RigidBodyController.cu
@@ -125,13 +125,21 @@ bool RigidBodyController::loadRBCoordinates(const char* fileName) {
 		
 		delete[] tokenList;
 
-		i++;
-		if (i == imax) {
-			j++;
-			i=0;
-			if (j == jmax)
-				break;
-		}
+		//i++;
+		//if (i == imax) {
+		//	j++;
+		//	i=0;
+		//	if (j == jmax)
+		//		break;
+		//}
+                j++;
+		if (j == jmax) {
+			i++;
+			if (i == imax)
+ 				break;
+			j=0;
+			jmax = rigidBodyByType[i].size();
+                }
 	}
 	fclose(inp);
 	return true;
@@ -493,10 +501,8 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 		*/
 		Matrix3 B1 = getBasis1(i);
 		Vector3 c = getOrigin1(i) - getOrigin2(i);
-		
 		Matrix3 B2 = getBasis2(i).inverse();
-
-		
+                
 		// RBTODO: get energy
 		if (!isPmf) {								/* pair of RBs */
 			computeGridGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3), s >>>
@@ -527,7 +533,6 @@ void RigidBodyForcePair::retrieveForcesForGrid(const int i) {
 														cudaMemcpyDeviceToHost, s));
 	gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb,
 														cudaMemcpyDeviceToHost, s));
-	
 }
 void RigidBodyForcePair::processGPUForces() {
 	
diff --git a/src/TabulatedMethods.cuh b/src/TabulatedMethods.cuh
index 90a78391914cd22a53e290c4704368f46535f0e4..2f460faca6328f2742d65bc3b2ef6a1df445bbf8 100644
--- a/src/TabulatedMethods.cuh
+++ b/src/TabulatedMethods.cuh
@@ -3,7 +3,7 @@
 #define BD_PI 3.1415927f
 
 __device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* force, const Vector3* __restrict__ pos,
-				const int& i, const int& j, const int& k) {
+				const int& i, const int& j, const int& k, float* energy, bool get_energy) {
 	    
 	    
 	// Particle's type and position
@@ -54,7 +54,14 @@ __device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__
 	// Linearly interpolate the potential	
 	float U0 = a->pot[home];
 	float dUdx = (a->pot[(((home+1)==(a->size)) ? (a->size)-1 : home+1)] - U0) * a->angle_step_inv;
-	// float energy = (dUdx * angle) + U0;
+        if(get_energy)
+        {
+	    float e = ((dUdx * angle) + U0)*0.3333333333;
+            atomicAdd( &energy[i], e);
+            atomicAdd( &energy[j], e);
+            atomicAdd( &energy[k], e);
+            
+        }
 	float sin = sqrtf(1.0f - cos*cos);
 	dUdx /= abs(sin) > 1e-3 ? sin : 1e-3; // avoid singularity 
 
@@ -73,7 +80,7 @@ __device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__
 
 __device__ inline void computeDihedral(const TabulatedDihedralPotential* __restrict__ d,
 				const BaseGrid* __restrict__ sys, Vector3* forces, const Vector3* __restrict__ pos,
-				const int& i, const int& j, const int& k, const int& l) {
+				const int& i, const int& j, const int& k, const int& l, float* energy, bool get_energy) {
 	const Vector3 posa = pos[i];
 	const Vector3 posb = pos[j];
 	const Vector3 posc = pos[k];
@@ -128,8 +135,14 @@ __device__ inline void computeDihedral(const TabulatedDihedralPotential* __restr
 	// Linear interpolation
 	float U0 = d->pot[home];       // Potential
 	float dU = d->pot[home1] - U0; // Change in potential
-		
-	// energy = dU * t + U0;
+	if(get_energy)
+        {	
+	    float e_local = (dU * t + U0)*0.25f;
+            atomicAdd( &energy[i], e_local );
+            atomicAdd( &energy[j], e_local );
+            atomicAdd( &energy[k], e_local );
+            atomicAdd( &energy[l], e_local );
+        }
 	force = -dU * d->angle_step_inv;
 
 	// avoid singularity when one angle is straight 
diff --git a/src/useful.cu b/src/useful.cu
index b729cacf0719b359e0bf39b5ad7c4ba3dbe1c9e9..5e1a2b2a34a4875183133fe137c2fc970ed2f735 100644
--- a/src/useful.cu
+++ b/src/useful.cu
@@ -409,7 +409,6 @@ Matrix3::Matrix3(const float* d) {
 	ezz = d[8];	
 	setIsDiag();
 }	
-
 Matrix3 Matrix3::inverse() const {
 	Matrix3 m;
 	if (isDiag) {
diff --git a/src/useful.h b/src/useful.h
index d3a70fbef6451fe8295dbf1e8ca83050eda2fabd..28417b791eb25ef54db2188ddbb508ba804f62e8 100644
--- a/src/useful.h
+++ b/src/useful.h
@@ -390,7 +390,9 @@ public:
 	HOST DEVICE
 	Matrix3 inverse() const;
 
-	float det() const;
+        HOST DEVICE
+        float det() const;
+
         //Han-Yi Chou
 	HOST DEVICE inline Matrix3 normalized() const {