From 20b17db5e68ef08515eecc5b1fc43e192ea2c3c5 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 19 Dec 2017 13:51:36 -0600
Subject: [PATCH] HYC: Added Langevin particle dynamics, including momenta and
 kinetic energy routines; changed whitespace in some areas

---
 src/BrownParticlesKernel.h   |  207 +++++
 src/BrownianParticleType.cpp |    4 +
 src/BrownianParticleType.h   |    4 +-
 src/GrandBrownTown.cu        | 1428 +++++++++++++++++++++++++---------
 src/GrandBrownTown.cuh       |  406 +++++++++-
 src/GrandBrownTown.h         |   39 +-
 6 files changed, 1702 insertions(+), 386 deletions(-)
 create mode 100644 src/BrownParticlesKernel.h

diff --git a/src/BrownParticlesKernel.h b/src/BrownParticlesKernel.h
new file mode 100644
index 0000000..36f7daa
--- /dev/null
+++ b/src/BrownParticlesKernel.h
@@ -0,0 +1,207 @@
+#ifndef KERNEL_H_
+#define KERNEL_H_
+#include "useful.h"
+
+template<const int BlockSize>
+static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], BrownianParticleType* part[], 
+                                                   float *vec_red, int n)
+{
+    __shared__ __align__(4) float sdata[BlockSize];
+    
+    Vector3 p1, p2;
+    float mass1, mass2;
+
+    unsigned int tid = threadIdx.x;
+    unsigned int i = blockIdx.x*(BlockSize<<1) + tid;
+    unsigned int gridSize = (BlockSize<<1)*gridDim.x;
+
+    sdata[tid] = 0.f; 
+
+    while (i < n) 
+    { 
+        int t1 = type[i];
+        const BrownianParticleType& pt1 = *part[t1];
+
+        p1    = P_n[i];
+        mass1 = pt1.mass;
+        
+        if(i + BlockSize < n)
+        {
+            int t2 = type[i+BlockSize];
+            const BrownianParticleType& pt2 = *part[t2];
+
+            p2    = P_n[i+BlockSize];
+            mass2 = pt2.mass;
+
+            sdata[tid] += (p1.length2() / mass1 + p2.length2() / mass2); 
+        }
+        else
+            sdata[tid] += p1.length2() / mass1;
+
+        i += gridSize; 
+    }
+
+    sdata[tid] *= 0.50f;
+
+    __syncthreads();
+
+    if (BlockSize == 512) 
+    { 
+        if (tid < 256) 
+            sdata[tid] += sdata[tid + 256]; 
+        __syncthreads();
+       if (tid < 128)
+            sdata[tid] += sdata[tid + 128];
+        __syncthreads();
+       if (tid < 64)
+            sdata[tid] += sdata[tid + 64];
+        __syncthreads();
+       if (tid < 32)
+            sdata[tid] += sdata[tid + 32];
+        __syncthreads();
+       if (tid < 16)
+            sdata[tid] += sdata[tid + 16];
+        __syncthreads();
+       if (tid < 8)
+            sdata[tid] += sdata[tid + 8];
+        __syncthreads();
+       if (tid < 4)
+            sdata[tid] += sdata[tid + 4];
+        __syncthreads();
+       if (tid < 2)
+            sdata[tid] += sdata[tid + 2];
+        __syncthreads();
+       if (tid < 1)
+            sdata[tid] += sdata[tid + 1];
+        __syncthreads();
+    }
+    else if (BlockSize == 256) 
+    {
+        if (tid < 128)
+            sdata[tid] += sdata[tid + 128];
+        __syncthreads();
+       if (tid < 64)
+            sdata[tid] += sdata[tid + 64];
+        __syncthreads();
+       if (tid < 32)
+            sdata[tid] += sdata[tid + 32];
+        __syncthreads();
+       if (tid < 16)
+            sdata[tid] += sdata[tid + 16];
+        __syncthreads();
+       if (tid < 8)
+            sdata[tid] += sdata[tid + 8];
+        __syncthreads();
+       if (tid < 4)
+            sdata[tid] += sdata[tid + 4];
+        __syncthreads();
+       if (tid < 2)
+            sdata[tid] += sdata[tid + 2];
+        __syncthreads();
+       if (tid < 1)
+            sdata[tid] += sdata[tid + 1];
+        __syncthreads(); 
+    }
+    else if (BlockSize == 128) 
+    {
+       if (tid < 64)
+            sdata[tid] += sdata[tid + 64];
+        __syncthreads();
+       if (tid < 32)
+            sdata[tid] += sdata[tid + 32];
+        __syncthreads();
+       if (tid < 16)
+            sdata[tid] += sdata[tid + 16];
+        __syncthreads();
+       if (tid < 8)
+            sdata[tid] += sdata[tid + 8];
+        __syncthreads();
+       if (tid < 4)
+            sdata[tid] += sdata[tid + 4];
+        __syncthreads();
+       if (tid < 2)
+            sdata[tid] += sdata[tid + 2];
+        __syncthreads();
+       if (tid < 1)
+            sdata[tid] += sdata[tid + 1];
+        __syncthreads();
+ 
+    }
+    else if (BlockSize == 64)
+    {
+       if (tid < 32)
+            sdata[tid] += sdata[tid + 32];
+        __syncthreads();
+       if (tid < 16)
+            sdata[tid] += sdata[tid + 16];
+        __syncthreads();
+       if (tid < 8)
+            sdata[tid] += sdata[tid + 8];
+        __syncthreads();
+       if (tid < 4)
+            sdata[tid] += sdata[tid + 4];
+        __syncthreads();
+       if (tid < 2)
+            sdata[tid] += sdata[tid + 2];
+        __syncthreads();
+       if (tid < 1)
+            sdata[tid] += sdata[tid + 1];
+        __syncthreads();
+
+    }
+    else if (BlockSize == 32)
+    {
+       if (tid < 16)
+            sdata[tid] += sdata[tid + 16];
+        __syncthreads();
+       if (tid < 8)
+            sdata[tid] += sdata[tid + 8];
+        __syncthreads();
+       if (tid < 4)
+            sdata[tid] += sdata[tid + 4];
+        __syncthreads();
+       if (tid < 2)
+            sdata[tid] += sdata[tid + 2];
+        __syncthreads();
+       if (tid < 1)
+            sdata[tid] += sdata[tid + 1];
+        __syncthreads();
+
+    }
+    __syncthreads();
+    if (tid == 0) 
+        vec_red[blockIdx.x] = sdata[0];
+}
+//The size must be power of 2, otherwise there is error
+//This small kernel is to reduce the small vecotr obtained by
+//the reduction kernel from the above.
+//The grid size should be one.
+//This small routine is to help do further reduction of a small vector
+
+template<int BlockSize>
+static __global__ void Reduction(float* dev_vec, float* result, int Size)
+{
+    __shared__ __align__(4) float data[BlockSize];
+    const unsigned int tid = threadIdx.x;
+    
+    data[tid] = dev_vec[tid];
+    size_t idx = tid + BlockSize;
+    while(idx < Size)
+    {
+        data[tid] += dev_vec[idx];
+        idx += BlockSize;
+    }
+    __syncthreads();
+
+    int n = BlockSize;
+    while(n > 1)
+    {
+        n = (n >> 1);
+        if(tid < n)
+            data[tid] += data[tid+n];
+        __syncthreads();
+    }
+    if(tid == 0) 
+        result[0] = data[0];
+}
+#endif
diff --git a/src/BrownianParticleType.cpp b/src/BrownianParticleType.cpp
index 4adf5cf..acec0c0 100644
--- a/src/BrownianParticleType.cpp
+++ b/src/BrownianParticleType.cpp
@@ -19,10 +19,14 @@ void BrownianParticleType::copy(const BrownianParticleType& src) {
 	name = src.name;
 	num = src.num;
 	diffusion = src.diffusion;
+        mass      = src.mass;
 	charge = src.charge;
 	radius = src.radius;
 	eps = src.eps;
 	meanPmf = src.meanPmf;
+        //Han-Yi Chou
+        transDamping = src.transDamping;
+        mu = src.mu;
 	pmf = NULL, diffusionGrid = NULL;
 	forceXGrid = NULL, forceYGrid = NULL, forceZGrid = NULL;
 	reservoir = NULL;
diff --git a/src/BrownianParticleType.h b/src/BrownianParticleType.h
index b6ed4ec..da959ee 100644
--- a/src/BrownianParticleType.h
+++ b/src/BrownianParticleType.h
@@ -49,12 +49,14 @@ class BrownianParticleType {
 public:
 		String name;
 		int num; // number of particles of this type
-
+                float mass; // mass of brownian particles Han-Yi Chou
+                Vector3 transDamping; // translational damping coefficient Han-Yi Chou
 		float diffusion;
 		float radius;
 		float charge;
 		float eps;
 		float meanPmf;
+                float mu; //for Nose-Hoover Langevin dynamics
 
 		Reservoir* reservoir;
 		BaseGrid* pmf;
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 216486e..de8b301 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -2,6 +2,12 @@
 #include "GrandBrownTown.cuh"
 /* #include "ComputeGridGrid.cuh" */
 #include "WKFUtils.h"
+#include "BrownParticlesKernel.h"
+#include <stdlib.h>     /* srand, rand */
+#include <time.h>       /* time */
+#include <gsl/gsl_rng.h>
+#include <gsl/gsl_randist.h>
+#include <gsl/gsl_math.h>
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) {
@@ -19,17 +25,36 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 		const long int randomSeed, bool debug, bool imd_on, unsigned int imd_port, int numReplicas) :
 	imd_on(imd_on), imd_port(imd_port), numReplicas(numReplicas),
 	conf(c), RBC(RigidBodyController(c,outArg)) {
-
-	for (int i = 0; i < numReplicas; i++) {
+        printf("%d\n",__LINE__);
+        //Determine which dynamic. Han-Yi Chou
+        particle_dynamic  = c.ParticleDynamicType;
+        rigidbody_dynamic = c.RigidBodyDynamicType;
+        //particle_langevin_integrator = c.ParticleLangevinIntegrator;
+        printf("%d\n",__LINE__);
+	for (int i = 0; i < numReplicas; ++i) 
+        {
 		std::stringstream curr_file, restart_file, out_prefix;
 
 		curr_file << outArg << '.' << i << ".curr";
-		restart_file << outArg << '.' << i << ".restart";
+		restart_file   << outArg << '.' << i << ".restart";
 		out_prefix << outArg << '.' << i;
 
-		outCurrFiles.push_back(curr_file.str());
-		restartFiles.push_back(restart_file.str());
-		outFilePrefixes.push_back(out_prefix.str());
+                outCurrFiles.push_back(curr_file.str());
+                restartFiles.push_back(restart_file.str());
+                outFilePrefixes.push_back(out_prefix.str());
+
+                //Han-Yi Chou for flush out the momentum
+                if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+                {
+                    std::stringstream restart_file_p, out_momentum_prefix, out_force_prefix;
+                    restart_file_p << outArg << '.' << i << ".momentum.resart";
+                    out_momentum_prefix << outArg << ".momentum." << i;
+                    //out_force_prefix << outArg << ".force." << i;
+
+                    restartMomentumFiles.push_back(restart_file_p.str());//Han-Yi Chou
+                    outMomentumFilePrefixes.push_back(out_momentum_prefix.str());
+                    //outForceFilePrefixes.push_back(out_force_prefix.str());
+                }           
 	}
 
 	GrandBrownTown::DEBUG = debug;
@@ -46,6 +71,17 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 
 	// Allocate arrays of positions, types and serial numbers
 	pos    = new Vector3[num * numReplicas];  // [HOST] array of particles' positions.
+        // Allocate arrays of momentum Han-Yi Chou
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+        {
+            momentum = new Vector3[num * numReplicas]; //[HOST] array of particles' momentum
+            if(particle_dynamic == String("NoseHooverLangevin"))
+                random   = new float[num * numReplicas];
+        }
+        printf("%d\n",__LINE__);
+        //for debug
+        //force = new Vector3[num * numReplicas];
+
 	type   = new     int[num * numReplicas];  // [HOST] array of particles' types.
 	serial = new     int[num * numReplicas];  // [HOST] array of particles' serial numbers.
 
@@ -66,12 +102,20 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	  if (c.copyReplicaCoordinates > 0)
 	    std::copy(c.pos, c.pos + num, pos + r*num);
 	}
-	if (c.copyReplicaCoordinates <= 0)
-	  std::copy(c.pos, c.pos + numReplicas*num, pos);
-	
+        if (c.copyReplicaCoordinates <= 0)
+          std::copy(c.pos, c.pos + numReplicas*num, pos);
+
+        //printf("%d\n",__LINE__); 
+        //Han-Yi Chou
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+            std::copy(c.momentum,c.momentum+num*numReplicas,momentum);
+
+        //printf("%d\n",__LINE__);
+
 	currSerial = c.currSerial;  // serial number of the next new particle
 	name = c.name;              // list of particle types! useful when 'numFluct == 1'
 	posLast = c.posLast;        // previous positions of particles  (used for computing ionic current)
+        momLast = c.momLast;
 	timeLast = c.timeLast;      // previous time (used with posLast)
 	minimumSep = c.minimumSep;  // minimum separation allowed when placing new particles
 
@@ -131,6 +175,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	// TODO: bondList = c.bondList;
 
 	excludes = c.excludes;
+        part = c.part;
 	excludeMap = c.excludeMap;
 	excludeRule = c.excludeRule;
 	excludeCapacity = c.excludeCapacity;
@@ -153,6 +198,9 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	//angles_d = c.angles_d;
 	//dihedrals_d = c.dihedrals_d;
 
+        if(particle_dynamic == String("NoseHooverLangevin"))
+            InitNoseHooverBath(num * numReplicas);
+        printf("%d\n",__LINE__);
 	// Seed random number generator
 	long unsigned int r_seed;
 	if (seed == 0) {
@@ -171,18 +219,18 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 	Vector3 buffer = (sys->getCenter() + 2.0f * sys->getOrigin())/3.0f;
 	initialZ = buffer.z;
 
-	// Load random coordinates if necessary
+	// Load random coordinates if necessary Han-Yi Chou
 	if (!c.loadedCoordinates) {
-		printf("Populating\n");
-		populate();
+		//printf("Populating\n");
+		//populate(); Han-Yi Chou, Actually the list is already populated 
 		initialCond();
 		printf("Set random initial conditions.\n");
 	}
 
 	// Prepare internal force computation
-	// internal = new ComputeForce(num, part, numParts, sys, switchStart, switchLen, coulombConst,
-	// 			    fullLongRange, numBonds, numTabBondFiles, numExcludes, numAngles, numTabAngleFiles,
-	// 			    numDihedrals, numTabDihedralFiles, c.pairlistDistance, numReplicas);
+	 //internal = new ComputeForce(num, part, numParts, sys, switchStart, switchLen, coulombConst,
+	 //			    fullLongRange, numBonds, numTabBondFiles, numExcludes, numAngles, numTabAngleFiles,
+	 //			    numDihedrals, numTabDihedralFiles, c.pairlistDistance, numReplicas);
 	internal = new ComputeForce(c, numReplicas);
 	
 	//MLog: I did the other halve of the copyToCUDA function from the Configuration class here, keep an eye on any mistakes that may occur due to the location.
@@ -298,21 +346,48 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg,
 
 	// Prepare the trajectory output writer.
 	for (int repID = 0; repID < numReplicas; ++repID) {
-		TrajectoryWriter *w =
-				new TrajectoryWriter(outFilePrefixes[repID].c_str(),
-														 TrajectoryWriter::getFormatName(outputFormat),
-														 sys->getBox(), num, timestep, outputPeriod);
+		TrajectoryWriter *w = new TrajectoryWriter(outFilePrefixes[repID].c_str(), TrajectoryWriter::getFormatName(outputFormat),
+							   sys->getBox(), num, timestep, outputPeriod);
+                
 		writers.push_back(w);
 	}
+
+        //Preparing the writers for momentum if necessary Han-Yi Chou
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+        {
+            for (int repID = 0; repID < numReplicas; ++repID) 
+            {
+
+                TrajectoryWriter *w = new TrajectoryWriter(outMomentumFilePrefixes[repID].c_str(), TrajectoryWriter::getFormatName(outputFormat),
+                                                           sys->getBox(), num, timestep, outputPeriod);
+                momentum_writers.push_back(w);
+            }
+        }
 	updateNameList();
 }
 
 GrandBrownTown::~GrandBrownTown() {
 	delete[] forceInternal;
+        forceInternal = NULL;
 	delete[] pos;
+        pos = NULL;
+        //Han-Yi Chou
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+        {
+            delete[] momentum;
+            momentum = NULL;
+            if(particle_dynamic == String("NoseHooverLangevin"))
+            {
+                delete[] random;
+                random = NULL;
+            }
+        }
+        //for debug
+        //delete[] force;
+
 	delete[] type;
 	delete[] serial;
-	delete randoGen;
+	//delete randoGen;
 
 	if (numBonds > 0)
 		delete[] bondList;
@@ -322,15 +397,58 @@ GrandBrownTown::~GrandBrownTown() {
 		delete[] dihedralList;
 		delete[] dihedralPotList;
 	}
+        if(randoGen->states != NULL)
+            {
+                gpuErrchk(cudaFree(randoGen->states));
+                randoGen->states = NULL;
+            }
+            if(randoGen->integer_h != NULL)
+            {
+                delete[] randoGen->integer_h;
+                randoGen->integer_h = NULL;
+            }
+            if(randoGen->integer_d != NULL)
+            {
+                gpuErrchk(cudaFree(randoGen->integer_d));
+                randoGen->integer_d = NULL;
+            }
+            if(randoGen->uniform_h != NULL)
+            {
+                delete[] randoGen->uniform_h;
+                randoGen->uniform_h = NULL;
+            }
+            if(randoGen->uniform_d != NULL)
+            {
+                gpuErrchk(cudaFree(randoGen->uniform_d));
+                randoGen->uniform_d = NULL;
+            }
+            //curandDestroyGenerator(randoGen->generator);
+            delete randoGen;
+            gpuErrchk(cudaFree(randoGen_d));
 	
 	// Auxillary objects
 	delete internal;
+        internal = NULL;
 	for (int i = 0; i < numReplicas; ++i)
+        {
 		delete writers[i];
-
+                writers[i] = NULL;
+        }
+
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+        {
+            for (int i = 0; i < numReplicas; ++i)
+            {
+                delete momentum_writers[i];
+                momentum_writers[i]=NULL;
+             }
+            //for (int i = 0; i < numReplicas; ++i)
+                //delete force_writers[i];
+
+        }
 	//gpuErrchk(cudaFree(pos_d));
 	//gpuErrchk(cudaFree(forceInternal_d));
-	gpuErrchk(cudaFree(randoGen_d));
+	//gpuErrchk(cudaFree(randoGen_d));
 	//gpuErrchk( cudaFree(bondList_d) );
 
 	if (imd_on)
@@ -338,9 +456,427 @@ GrandBrownTown::~GrandBrownTown() {
 	
 		
 }
+//temporary test for Nose-Hoover Langevin dynamics
+//Nose Hoover is now implement for particles.
+void GrandBrownTown::RunNoseHooverLangevin()
+{
+    printf("\n\n");
+    printf("Testing for Nose-Hoover Langevin dynamics\n");
+    Vector3 runningNetForce(0.0f);
+
+    //comment this out because this is the origin points Han-Yi Chou
+    // Open the files for recording ionic currents
+    for (int repID = 0; repID < numReplicas; ++repID) 
+    {
+        newCurrent(repID);
+        writers[repID]->newFile(pos + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+            momentum_writers[repID]->newFile((momentum + repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
+        //random_writers[repID]->newFile(random + (repID * num), name, 0.0f, num);
+    }
+    // Initialize timers (util.*)
+    wkf_timerhandle timer0, timerS;
+    timer0 = wkf_timer_create();
+    timerS = wkf_timer_create();
+
+    copyToCUDA();
+
+    if(particle_dynamic == String("Langevin"))
+        internal -> copyToCUDA(forceInternal, pos,momentum);
+    else if(particle_dynamic == String("NoseHooverLangevin"))
+        internal -> copyToCUDA(forceInternal, pos, momentum, random);
+    else
+        internal -> copyToCUDA(forceInternal, pos);
+
+    // IMD Stuff
+    void* sock = NULL;
+    void* clientsock = NULL;
+    int length;
+    if (imd_on) 
+    {
+        printf("Setting up incoming socket\n");
+        vmdsock_init();
+        sock = vmdsock_create();
+        clientsock = NULL;
+        vmdsock_bind(sock, imd_port);
+
+        printf("Waiting for IMD connection on port %d...\n", imd_port);
+        vmdsock_listen(sock);
+        while (!clientsock) 
+        {
+            if (vmdsock_selread(sock, 0) > 0) 
+            {
+                clientsock = vmdsock_accept(sock);
+                if (imd_handshake(clientsock))
+                    clientsock = NULL;
+            }
+        }
+        sleep(1);
+        if (vmdsock_selread(clientsock, 0) != 1 || imd_recv_header(clientsock, &length) != IMD_GO) 
+        {
+            clientsock = NULL;
+        }
+        imdForces = new Vector3[num*numReplicas];
+        for (size_t i = 0; i < num; ++i) // clear old forces
+            imdForces[i] = Vector3(0.0f);
+
+    } // endif (imd_on)
+
+    // Start timers
+    wkf_timer_start(timer0);
+    wkf_timer_start(timerS);
+
+    if (fullLongRange == 0)
+    {
+        // cudaSetDevice(0);
+        internal->decompose();
+        gpuErrchk(cudaDeviceSynchronize());
+        RBC.updateParticleLists( internal->getPos_d() );
+    }
+
+    float t; // simulation time
+
+    int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
+    int tl = temperatureGridFile.length();
+    Vector3 *force_d;
+    gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
+
+    printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
+
+    // Main loop over Brownian dynamics steps
+    for (long int s = 1; s < steps; s++)
+    {
+        bool get_energy = ((s % outputEnergyPeriod) == 0);
+        //At the very first time step, the force is computed
+        if(s == 1)
+        {
+            // 'interparticleForce' - determines whether particles interact with each other
+            gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
+
+            if (interparticleForce)
+            {
+                if (tabulatedPotential)
+                {
+                    switch (fullLongRange)
+                    {
+                        case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
+                            if (s % decompPeriod == 0)
+                            {
+                                // cudaSetDevice(0);
+                                internal -> decompose();
+                                RBC.updateParticleLists( internal->getPos_d() );
+                            }
+                            internal -> computeTabulated(get_energy);
+                            break;
+                        default:
+                            internal->computeTabulatedFull(get_energy);
+                            break;
+                    }
+                }
+                else
+                {
+                    // Not using tabulated potentials.
+                    switch (fullLongRange)
+                    {
+                        case 0: // Use cutoff | cell decomposition.
+                            if (s % decompPeriod == 0)
+                            {
+                                // cudaSetDevice(0);
+                                internal->decompose();
+                                RBC.updateParticleLists( internal->getPos_d() );
+                            }
+                            internal->compute(get_energy);
+                            break;
+
+                        case 1: // Do not use cutoff
+                            internal->computeFull(get_energy);
+                            break;
+
+                        case 2: // Compute only softcore forces.
+                            internal->computeSoftcoreFull(get_energy);
+                            break;
+
+                        case 3: // Compute only electrostatic forces.
+                            internal->computeElecFull(get_energy);
+                            break;
+                    }
+                }
+            }//if inter-particle force
+            gpuErrchk(cudaDeviceSynchronize());
+            RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+        }//if step == 1
+
+        gpuErrchk(cudaDeviceSynchronize());
+
+        if(particle_dynamic == String("Langevin"))
+            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+        else if(particle_dynamic == String("NoseHooverLangevin"))
+            //kernel for Nose-Hoover Langevin dynamic
+            updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), 
+            internal -> getRan_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+        //For Brownian motion
+        else
+            updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(),
+                                                       part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+
+	RBC.integrate(s);
+        gpuErrchk(cudaDeviceSynchronize());
+
+        if (s % outputPeriod == 0)
+            // Copy particle positions back to CPU
+            gpuErrchk(cudaMemcpy(pos, internal ->  getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+        if (imd_on && clientsock && s % outputPeriod == 0)
+        {
+            float* coords = new float[num*3]; // TODO: move allocation out of run loop
+            int* atomIds = new int[num]; // TODO: move allocation out of run loop
+            int length;
+
+            bool paused = false;
+            while (vmdsock_selread(clientsock, 0) > 0 || paused)
+            {
+                switch (imd_recv_header(clientsock, &length))
+                {
+                        case IMD_DISCONNECT:
+                            printf("[IMD] Disconnecting...\n");
+                            imd_disconnect(clientsock);
+                            clientsock = NULL;
+                            sleep(5);
+                            break;
+                        case IMD_KILL:
+                            printf("[IMD] Killing...\n");
+                            imd_disconnect(clientsock);
+                            clientsock = NULL;
+                            steps = s; // Stop the simulation at this step
+                            sleep(5);
+                            break;
+                        case IMD_PAUSE:
+                            paused = !paused;
+                            break;
+                        case IMD_GO:
+                            printf("[IMD] Caught IMD_GO\n");
+                            break;
+                        case IMD_MDCOMM:
+                            for (size_t i = 0; i < num; ++i) // clear old forces
+                                imdForces[i] = Vector3(0.0f);
+
+                            if (imd_recv_mdcomm(clientsock, length, atomIds, coords))
+                            {
+                                printf("[IMD] Error receiving forces\n");
+                            }
+                            else
+                            {
+                                for (size_t j = 0; j < length; ++j)
+                                {
+                                    int i = atomIds[j];
+                                    imdForces[i] = Vector3(coords[j*3], coords[j*3+1], coords[j*3+2]) * conf.imdForceScale;
+                                }
+                            }
+                            break;
+                        default:
+                            printf("[IMD] Something weird happened. Disconnecting..\n");
+                            break;
+                }
+            }
+            if (clientsock)
+            {
+                    // float* coords = new float[num*3]; // TODO: move allocation out of run loop
+                    for (size_t i = 0; i < num; i++)
+                    {
+                        const Vector3& p = pos[i];
+                        coords[3*i] = p.x;
+                        coords[3*i+1] = p.y;
+                        coords[3*i+2] = p.z;
+                    }
+                    imd_send_fcoords(clientsock, num, coords);
+            }
+                delete[] coords;
+                delete[] atomIds;
+        }
+        if (imd_on && clientsock)
+            internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD
+
+        RBC.clearForceAndTorque();
+        gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
+        if (interparticleForce)
+        {
+            // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas
+            if (tabulatedPotential)
+            {
+                switch (fullLongRange)
+                {
+                    case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
+                        if (s % decompPeriod == 0)
+                        {
+                            internal -> decompose();
+                            RBC.updateParticleLists( internal->getPos_d() );
+                        }
+                        internal -> computeTabulated(get_energy);
+                        break;
+                    default: // [ N^2 ] interactions, no cutoff | decompositions
+                        internal->computeTabulatedFull(get_energy);
+                        break;
+                }
+            }
+            else
+            {
+                // Not using tabulated potentials.
+                switch (fullLongRange)
+                {
+                        case 0: // Use cutoff | cell decomposition.
+                            if (s % decompPeriod == 0)
+                            {
+                               internal->decompose();
+                               RBC.updateParticleLists( internal->getPos_d() );
+                            }
+                            internal->compute(get_energy);
+                            break;
+                        case 1: // Do not use cutoff
+                            internal->computeFull(get_energy);
+                            break;
+
+                        case 2: // Compute only softcore forces.
+                            internal->computeSoftcoreFull(get_energy);
+                            break;
+
+                        case 3: // Compute only electrostatic forces.
+                            internal->computeElecFull(get_energy);
+                            break;
+                }
+            }
+        }
+        gpuErrchk(cudaDeviceSynchronize());
+        //compute the force for rigid bodies
+        RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+        gpuErrchk(cudaDeviceSynchronize());
+
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+            LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+            //gpuErrchk(cudaDeviceSynchronize());
+
+
+        gpuErrchk(cudaDeviceSynchronize());
+        if (s % outputPeriod == 0)
+        {
+            if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+            {
+                gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+            }
+            t = s*timestep;
+            // Loop over all replicas
+            for (int repID = 0; repID < numReplicas; ++repID)
+            {
+                // *Analysis*: ionic current
+                if (currentSegmentZ <= 0.0f)
+                    writeCurrent(repID, t);
+                else
+                    writeCurrentSegment(repID, t, currentSegmentZ);
+
+                if (numberFluct == 1)
+                    updateNameList(); // no need for it here if particles stay the same
+
+                // Write the trajectory.
+                writers[repID]->append(pos + (repID*num), name, serial, t, num);
+
+                if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+                {
+                    momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num);
+                    //force_writers[repID]->append(force + (repID * num), name, serial, t, num);
+                }
+            }
+            // TODO: Currently, not compatible with replicas. Needs a fix.
+            if (numberFluct == 1)
+                updateReservoirs();
+
+           remember(t);
+        }
+        if (get_energy)
+        {
+                wkf_timer_stop(timerS);
+                t = s * timestep;
+                // Simulation progress and statistics.
+                                   float percent = (100.0f * s) / steps;
+                float msPerStep = wkf_timer_time(timerS) * 1000.0f / outputEnergyPeriod;
+                float nsPerDay = numReplicas * timestep / msPerStep * 864E5f;
+
+                // Nice thousand separator
+                setlocale(LC_NUMERIC, "");
+
+                // Do the output
+                printf("\rStep %ld [%.2f%% complete | %.3f ms/step | %.3f ns/day]",s, percent, msPerStep, nsPerDay);
+
+                // Copy positions from GPU to CPU.
+                gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+                {
+                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                    gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                    float e = KineticEnergy();
+                    printf(" The kinetic energy is %f \n",e*(2.388458509e-1));
+                }
+                if(rigidbody_dynamic == String("Langevin"))
+                {
+                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                    float e = RotKineticEnergy();
+                    printf(" The Rotational kinetic energy is %f \n",e*(2.388458509e-1));
+                }
+
+                // Write restart files for each replica.
+                for (int repID = 0; repID < numReplicas; ++repID)
+                    writeRestart(repID);
+
+                wkf_timer_start(timerS);
+         } // s % outputEnergyPeriod
+     } // done with all Brownian dynamics steps
+
+     if (imd_on and clientsock)
+     {
+            if (vmdsock_selread(clientsock, 0) == 1)
+            {
+                int length;
+                switch (imd_recv_header(clientsock, &length))
+                {
+                    case IMD_DISCONNECT:
+                        printf("\n[IMD] Disconnecting...\n");
+                        imd_disconnect(clientsock);
+                        clientsock = NULL;
+                        sleep(5);
+                        break;
+                    case IMD_KILL:
+                        printf("\n[IMD] Killing...\n");
+                        imd_disconnect(clientsock);
+                        clientsock = NULL;
+                        sleep(5);
+                        break;
+                    default:
+                        printf("\n[IMD] Something weird happened. Disconnecting..\n");
+                        break;
+                }
+            }
+     }
+     // Stop the main timer.
+     wkf_timer_stop(timer0);
+
+     // Compute performance data.
+     const float elapsed = wkf_timer_time(timer0); // seconds
+     int tot_hrs = (int) std::fmod(elapsed / 3600.0f, 60.0f);
+     int tot_min = (int) std::fmod(elapsed / 60.0f, 60.0f);
+     float tot_sec   = std::fmod(elapsed, 60.0f);
+
+     printf("\nFinal Step: %d\n", (int) steps);
+
+     printf("Total Run Time: ");
+     if (tot_hrs > 0) printf("%dh%dm%.1fs\n", tot_hrs, tot_min, tot_sec);
+     else if (tot_min > 0) printf("%dm%.1fs\n", tot_min, tot_sec);
+     else printf("%.2fs\n", tot_sec);
+
+     gpuErrchk(cudaFree(force_d));
+} // GrandBrownTown::run()
 
 // Run the Brownian Dynamics steps.
 void GrandBrownTown::run() {
+
+        RunNoseHooverLangevin();
+#if 0
 	printf("\n\n");
 	Vector3 runningNetForce(0.0f);
 	
@@ -349,19 +885,29 @@ void GrandBrownTown::run() {
 		newCurrent(repID);
 		writers[repID]->newFile(pos + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
 	}
+        //Han-Yi Chou
+        if(particle_dynamic == String("Langevin"))
+        {
+            for (int repID = 0; repID < numReplicas; ++repID)
+            {
+                momentum_writers[repID]->newFile(momentum + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay
+                //force_writers[repID]->newFile(force + (repID * num), name, 0.0f, num);
+            }
+        }
 
 	// Save (remember) particle positions for later (analysis)
 	remember(0.0f);
 
 	// Initialize timers (util.*)
-	wkf_timerhandle cputimer;
-	cputimer = wkf_timer_create();
 	wkf_timerhandle timer0, timerS;
 	timer0 = wkf_timer_create();
 	timerS = wkf_timer_create();
 
 	copyToCUDA();
-	internal -> copyToCUDA(forceInternal, pos);
+        if(particle_dynamic == String("Langevin"))
+	    internal -> copyToCUDA(forceInternal, pos,momentum);
+        else
+            internal -> copyToCUDA(forceInternal, pos);
 
 	// IMD Stuff
 	void* sock = NULL;
@@ -410,323 +956,390 @@ void GrandBrownTown::run() {
 
 	float t; // simulation time
 
+        int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
+        int tl = temperatureGridFile.length();
+        Vector3 *force_d;
+        gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
+
 	printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
 
 	// Main loop over Brownian dynamics steps
-	for (long int s = 1; s < steps; s++) {
-		// Compute the internal forces. Only calculate the energy when we are about to output.
-		bool get_energy = ((s % outputEnergyPeriod) == 0);
-		// float energy = 0.0f;
-
-		// Set the timer
-		wkf_timer_start(cputimer);
-
-		// 'interparticleForce' - determines whether particles interact with each other
-		if (interparticleForce) {
-
-			// 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas
-			if (tabulatedPotential) {
-				// Using tabulated potentials
-
-				// 'fullLongRange' - determines whether 'cutoff' is used
-				// 0 - use cutoff (cell decomposition) [ N*log(N) ]
-				// 1 - do not use cutoff [ N^2 ]
-				switch (fullLongRange) {
-					case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
-						if (s % decompPeriod == 0)
-						{
-							// cudaSetDevice(0);
-							internal -> decompose();
-							RBC.updateParticleLists( internal->getPos_d() );
-						}
+	for (long int s = 1; s < steps; s++) 
+        {
+            // Compute the internal forces. Only calculate the energy when we are about to output.
+	    bool get_energy = ((s % outputEnergyPeriod) == 0);
+            //At the very first time step, the force is computed
+            if(s == 1) 
+            {
+                // 'interparticleForce' - determines whether particles interact with each other
+                gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
+
+		if (interparticleForce) 
+                {
+                    //gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
+                    //RBC.clearForceAndTorque(); //Han-Yi Chou
+
+	            // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas
+		    if (tabulatedPotential) 
+                    {
+		        // Using tabulated potentials
+		        // 'fullLongRange' - determines whether 'cutoff' is used
+		        // 0 - use cutoff (cell decomposition) [ N*log(N) ]
+		        // 1 - do not use cutoff [ N^2 ]
+		        switch (fullLongRange) 
+                        {
+		            case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
+                            //I want to replace this
+                            // by checking how far particles move to decide whether the pair list should be updated
+                            //if(NeedUpdatePairList()) //ToDo, Han-Yi Chou
+                            //{
+                            //internal-> decompose();
+                            //  RBC.updateParticleLists( internal->getPos_d() );
+                            //}
+			        if (s % decompPeriod == 0)
+		                {
+		                    // cudaSetDevice(0);
+				    internal -> decompose();
+				    RBC.updateParticleLists( internal->getPos_d() );
+			        }
 						
-						//MLog: added Bond* bondList to the list of passed in variables.
-						/*energy = internal->computeTabulated(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d,	angles_d, dihedrals_d, get_energy);*/
-						// energy = internal -> computeTabulated(get_energy);
-						internal -> computeTabulated(get_energy);
-						break;
-					default: // [ N^2 ] interactions, no cutoff | decompositions
-					    internal->computeTabulatedFull(get_energy);
-						break;
-				}
-			
-			} else {
-				// Not using tabulated potentials.
-				
-				switch (fullLongRange) {
-
-					case 0: // Use cutoff | cell decomposition.
-						if (s % decompPeriod == 0)
-						{
-							// cudaSetDevice(0);
-							internal->decompose();
-							RBC.updateParticleLists( internal->getPos_d() );
-						}
-						internal->compute(get_energy);
-						break;
-
-					case 1: // Do not use cutoff
-						internal->computeFull(get_energy);
-						break;
-
-					case 2: // Compute only softcore forces.
-						internal->computeSoftcoreFull(get_energy);
-						break;
-
-					case 3: // Compute only electrostatic forces.
-						internal->computeElecFull(get_energy);
-						break;
-				}
-			}
+			        //MLog: added Bond* bondList to the list of passed in variables.
+			        /*energy = internal->computeTabulated(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d,	angles_d, dihedrals_d, get_energy);*/
+		               // energy = internal -> computeTabulated(get_energy);
+			        internal -> computeTabulated(get_energy);
+			        break;
+			    default: 
+                               // [ N^2 ] interactions, no cutoff | decompositions
+			        internal->computeTabulatedFull(get_energy);
+			        break;
+		        }
+                    } 
+                    else 
+                    {
+		        // Not using tabulated potentials.
+		        switch (fullLongRange) 
+                        {
+                            case 0: // Use cutoff | cell decomposition.
+		               if (s % decompPeriod == 0)
+		               {
+		                   // cudaSetDevice(0);
+				   internal->decompose();
+				   RBC.updateParticleLists( internal->getPos_d() );
+			       }
+			       internal->compute(get_energy);
+			       break;
+
+			    case 1: // Do not use cutoff
+		                internal->computeFull(get_energy);
+				break;
+
+			    case 2: // Compute only softcore forces.
+		                internal->computeSoftcoreFull(get_energy);
+				break;
+
+		            case 3: // Compute only electrostatic forces.
+				internal->computeElecFull(get_energy);
+				break;
+		        }
+	            }
+		}//if inter-particle force
+
+	        gpuErrchk(cudaDeviceSynchronize());
+		RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+
+            }//if step == 1
+
+            gpuErrchk(cudaDeviceSynchronize());
+          
+	    //Han-Yi Chou
+            //update the rigid body positions and orientation
+            //So far only brownian dynamics is used
+            //RBC.integrate(s);
+
+	    if(particle_dynamic == String("Langevin"))
+                updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+            //For Brownian motion
+            else
+                updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), 
+                                                           part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+
+
+	    RBC.integrate(s);
+
+            gpuErrchk(cudaDeviceSynchronize());
+
+            if (s % outputPeriod == 0)
+                // Copy particle positions back to CPU
+                gpuErrchk(cudaMemcpy(pos, internal ->  getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+
+            //compute again the new force with new positions.
+            //reset the internal force, I hope. Han-Yi Chou
+            //First clear the old force
+            if (imd_on && clientsock && s % outputPeriod == 0) 
+            {
+                float* coords = new float[num*3]; // TODO: move allocation out of run loop
+                int* atomIds = new int[num]; // TODO: move allocation out of run loop
+                int length;
+
+                bool paused = false;
+                while (vmdsock_selread(clientsock, 0) > 0 || paused) 
+                {
+                    switch (imd_recv_header(clientsock, &length)) 
+                    {
+                        case IMD_DISCONNECT:
+                            printf("[IMD] Disconnecting...\n");
+                            imd_disconnect(clientsock);
+                            clientsock = NULL;
+                            sleep(5);
+                            break;
+                        case IMD_KILL:
+                            printf("[IMD] Killing...\n");
+                            imd_disconnect(clientsock);
+                            clientsock = NULL;
+                            steps = s; // Stop the simulation at this step
+                            sleep(5);
+                            break;
+                        case IMD_PAUSE:
+                            paused = !paused;
+                            break;
+                        case IMD_GO:
+                            printf("[IMD] Caught IMD_GO\n");
+                            break;
+                        case IMD_MDCOMM:
+                            for (size_t i = 0; i < num; ++i) // clear old forces
+                                imdForces[i] = Vector3(0.0f);
+
+                            if (imd_recv_mdcomm(clientsock, length, atomIds, coords)) 
+                            {
+                                printf("[IMD] Error receiving forces\n");
+                            } 
+                            else 
+                            {
+                                for (size_t j = 0; j < length; ++j) 
+                                {
+                                    int i = atomIds[j];
+                                    imdForces[i] = Vector3(coords[j*3], coords[j*3+1], coords[j*3+2]) * conf.imdForceScale;
+                                }
+                            }
+                            break;
+                        default:
+                            printf("[IMD] Something weird happened. Disconnecting..\n");
+                            break;
+                    }
+                }
+                if (clientsock) 
+                {
+                    // float* coords = new float[num*3]; // TODO: move allocation out of run loop
+                    for (size_t i = 0; i < num; i++) 
+                    {
+                        const Vector3& p = pos[i];
+                        coords[3*i] = p.x;
+                        coords[3*i+1] = p.y;
+                        coords[3*i+2] = p.z;
+                    }
+                    imd_send_fcoords(clientsock, num, coords);
+                }
+                delete[] coords;
+                delete[] atomIds;
+            }
+            if (imd_on && clientsock) 
+                internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD
+            //else
+            //{ 
+                //int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
+                //MLog: along with calls to internal (ComputeForce class) this function should execute once per GPU.
+                //clearInternalForces<<< numBlocks, NUM_THREADS >>>(internal->getForceInternal_d(), num*numReplicas);
+                //use cudaMemset instead
+                //gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
+                //RBC.clearForceAndTorque();
+            //}
+            //compute the new force for particles
+            RBC.clearForceAndTorque();
+            gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3)));
+            //RBC.clearForceAndTorque();
+
+            if (interparticleForce) 
+            {
+                // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas
+                if (tabulatedPotential)
+                {
+                    switch (fullLongRange) 
+                    {
+                        case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
+                            if (s % decompPeriod == 0)
+                            {
+                                internal -> decompose();
+                                RBC.updateParticleLists( internal->getPos_d() );
+                            }
+                            internal -> computeTabulated(get_energy);
+                            break;
+                        default: // [ N^2 ] interactions, no cutoff | decompositions
+                            internal->computeTabulatedFull(get_energy);
+                            break;
+                    }
+                } 
+                else 
+                {
+                    // Not using tabulated potentials.
+                    switch (fullLongRange) 
+                    {
+                        case 0: // Use cutoff | cell decomposition.
+                            if (s % decompPeriod == 0)
+                            {
+                               internal->decompose();
+                               RBC.updateParticleLists( internal->getPos_d() );
+                            }
+                            internal->compute(get_energy);
+                            break;
+                        case 1: // Do not use cutoff
+                            internal->computeFull(get_energy);
+                            break;
+
+                        case 2: // Compute only softcore forces.
+                            internal->computeSoftcoreFull(get_energy);
+                            break;
+
+                        case 3: // Compute only electrostatic forces.
+                            internal->computeElecFull(get_energy);
+                            break;
+                    }
+                }
+            }
+            gpuErrchk(cudaDeviceSynchronize());
+
+            //compute the force for rigid bodies
+            RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);
+            //Han-Yi Chou
+            //For BAOAB, the last update is only to update the momentum
+            gpuErrchk(cudaDeviceSynchronize());
+
+            if(particle_dynamic == String("Langevin"))
+                LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
+
+           //gpuErrchk(cudaDeviceSynchronize());
+
+           gpuErrchk(cudaDeviceSynchronize());
+
+           if (s % outputPeriod == 0) 
+           {
+                if(particle_dynamic == String("Langevin"))
+                {
+                    gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+                    /*
+                    gpuErrchk(cudaMemcpy(force, force_d, sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
+                    Vector3 f(0.f), p(0.f), r0(0.f);
+                    double total_mass = 0.;
+                    for(int i = 0; i < num * numReplicas; ++i)
+                    {
+                        f = f + force[i];
+                        p = p + momentum[i];
+                        total_mass += part[type[i]].mass;
+                        r0 = r0 + part[type[i]].mass * pos[i];
+                    }
+                    printf("The COM is %f %f %f\n",r0.x / total_mass, r0.y / total_mass, r0.z / total_mass);
+                    printf("The total momentum is %f %f %f\n",p.x, p.y, p.z);
+                    printf("The total force %f %f %f\n",f.x, f.y, f.z);
+                    */
+		    // Output trajectories (to files)
 		}
-
-		/* Time force computations.
-		wkf_timer_stop(cputimer);
-		float dt1 = wkf_timer_time(cputimer);
-		printf("Force Computation Time: %f ms\n", dt1 * 1000);
-		wkf_timer_start(cputimer);
-		// */
-
-		// Make sure the force computation has completed without errors before continuing.
-		//gpuErrchk(cudaPeekAtLastError()); // Does not work on old GPUs (like mine). TODO: write a better wrapper around Peek
-		gpuErrchk(cudaDeviceSynchronize());
-		
-		// printf("  Computed energies\n");
-
-		// int numBlocks = (num * numReplicas) / NUM_THREADS + 1;
-		int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1);
-		int tl = temperatureGridFile.length();
-
-		RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s);					/* update RB forces before update particle positions... */
-
-		/* DEBUG: reduce net force on particles
-		{  
-			Vector3 netForce(0.0f);
-			Vector3* netForce_d;
-			gpuErrchk(cudaMalloc(&netForce_d, sizeof(Vector3)));
-			gpuErrchk(cudaMemcpy(netForce_d, &netForce, sizeof(Vector3), cudaMemcpyHostToDevice));
-			reduceVector<<< 500, NUM_THREADS, NUM_THREADS*sizeof(Vector3)>>>(
-				num, internal->getForceInternal_d(), netForce_d);
-			gpuErrchk(cudaDeviceSynchronize());
-			gpuErrchk(cudaMemcpy(&netForce, netForce_d, sizeof(Vector3), cudaMemcpyDeviceToHost));
-			runningNetForce = runningNetForce + netForce;
-			printf("Net Force: %f %f %f\n", runningNetForce.x,runningNetForce.y,runningNetForce.z);
-		}		
-		// */
-
-		//MLog: Call the kernel to update the positions of each particle
-		// cudaSetDevice(0);
-		updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas);
-		//gpuErrchk(cudaPeekAtLastError()); // Does not work on old GPUs (like mine). TODO: write a better wrapper around Peek
-		
-		/* Time position computations.
-		wkf_timer_stop(cputimer);
-		float dt2 = wkf_timer_time(cputimer);
-		printf("Position Update Time: %f ms\n", dt2 * 1000);
-		*/
-
-		RBC.integrate(s);
-		/* 	for (int j = 0; j < t->num; j++) { */
-		/* computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d); */
-		
-		// int numBlocks = (numRB ) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-
-		Vector3 force0(0.0f);		
-
-		
-		if (s % outputPeriod == 0) {
-			// Copy particle positions back to CPU
-			// cudaSetDevice(0);
-			gpuErrchk(cudaMemcpy(pos, internal ->  getPos_d(), sizeof(Vector3) * num * numReplicas,
-					cudaMemcpyDeviceToHost));
-		}
-
-		if (imd_on && clientsock && s % outputPeriod == 0) {
-		
-			float* coords = new float[num*3]; // TODO: move allocation out of run loop
-			int* atomIds = new int[num]; // TODO: move allocation out of run loop
-
-			int length;
-
-			bool paused = false;
-			while (vmdsock_selread(clientsock, 0) > 0 || paused) {
-				switch (imd_recv_header(clientsock, &length)) {
-				case IMD_DISCONNECT:
-					printf("[IMD] Disconnecting...\n");
-					imd_disconnect(clientsock);
-					clientsock = NULL;
-					sleep(5);
-					break;
-				case IMD_KILL:
-					printf("[IMD] Killing...\n");
-					imd_disconnect(clientsock);
-					clientsock = NULL;
-					steps = s; // Stop the simulation at this step
-					sleep(5);
-					break;
-				case IMD_PAUSE:
-					// if (paused) {
-					// 	printf("[IMD] Continuing...\n");
-					// } else {
-					// 	printf("[IMD] Pausing...\n");
-					// }
-					paused = !paused;
-					break;
-				case IMD_GO:
-					printf("[IMD] Caught IMD_GO\n");
-					break;
-
-				case IMD_MDCOMM:					
-					// if (!paused)
-					// 	printf("[IMD] Receiving %d forces...\n",length);
-					for (size_t i = 0; i < num; ++i) // clear old forces
-						imdForces[i] = Vector3(0.0f);
-
-					if (imd_recv_mdcomm(clientsock, length, atomIds, coords)) {
-						printf("[IMD] Error receiving forces\n"); 
-					} else {
-						for (size_t j = 0; j < length; ++j) {
-							int i = atomIds[j];
-							imdForces[i] = Vector3(coords[j*3], coords[j*3+1], coords[j*3+2]) * conf.imdForceScale;
-							// if (!paused)
-							  // printf("[IMD] Applying force (%0.2f, %0.2f, %0.2f) to particle %d\n", imdForces[i].x, imdForces[i].y, imdForces[i].z, i);
-							  //printf("[IMD] Applying %0.1f pN force to particle %d at step %d\n", 69.47694f * imdForces[i].length(), i, s);
-						}
-					}
-					break; 
-
-				default:
-					printf("[IMD] Something weird happened. Disconnecting..\n");
-					break;
-				}
-			}
-			if (clientsock) {
-				// cudaSetDevice(0);
-				// float* coords = new float[num*3]; // TODO: move allocation out of run loop
-				for (size_t i = 0; i < num; i++) {
-					const Vector3& p = pos[i];
-					coords[3*i] = p.x;
-					coords[3*i+1] = p.y;
-					coords[3*i+2] = p.z;
-				}
-				imd_send_fcoords(clientsock, num, coords);
-			}
-
-			delete[] coords;
-			delete[] atomIds;
-		}
-		
-
-		// Output trajectories (to files)
-		if (s % outputPeriod == 0) {
-		  /* // print IMDforces
-		  for (int i=0; i < num * numReplicas; ++i) {
-		    float f = imdForces[i].length() * 69.47694f;
-		    if (f > 0.1f)
-		      printf("[IMD] Applying %0.1f pN force to particle %d at step %d\n", f, i, s);
-		  }
-		  // */
-			// Nanoseconds computed
-			t = s*timestep;
-
-			// Loop over all replicas
-			for (int repID = 0; repID < numReplicas; ++repID) {
-
-				// *Analysis*: ionic current
-				if (currentSegmentZ <= 0.0f) writeCurrent(repID, t);
-				else writeCurrentSegment(repID, t, currentSegmentZ);
-
-				// 
-				if (numberFluct == 1) updateNameList(); // no need for it here if particles stay the same
-														// TODO: doublecheck
-
-				// Write the trajectory.
-				writers[repID]->append(pos + (repID*num), name, serial, t, num);
-			}
-
-			// Handle particle fluctuations.
-			// TODO: Currently, not compatible with replicas. Needs a fix.
-			if (numberFluct == 1) updateReservoirs();
-
-			// Store the current positions.
-			// We must do this after particles have been added.
-			remember(t);
-		} // s % outputPeriod == 0
-
-
-		// Output energy.
-		if (get_energy) {
-			// Stop the timer.
-			// cudaSetDevice(0);
-			wkf_timer_stop(timerS);
-
-			// Copy back forces to display (internal only)
-			// gpuErrchk(cudaMemcpy(&force0, internal -> getForceInternal_d(), sizeof(Vector3), cudaMemcpyDeviceToHost));
-			
-			// Nanoseconds computed
-			t = s * timestep;
-
-			// Simulation progress and statistics.
-			float percent = (100.0f * s) / steps;
-			float msPerStep = wkf_timer_time(timerS) * 1000.0f / outputEnergyPeriod;
-			float nsPerDay = numReplicas * timestep / msPerStep * 864E5f;
-
-			// Nice thousand separator
-			setlocale(LC_NUMERIC, "");
-
-			// Do the output
-			printf("\rStep %ld [%.2f%% complete | %.3f ms/step | %.3f ns/day]",
-						 s, percent, msPerStep, nsPerDay);
-		/*	printf("T: %.5g ns | E: %.5g | F: %.5g %.5g %.5g\n",
-						 t, energy, force0.x, force0.y, force0.z);
-		*/
-
-			// Copy positions from GPU to CPU.
-			gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,
-													 cudaMemcpyDeviceToHost));
-
-			// Write restart files for each replica.
-			for (int repID = 0; repID < numReplicas; ++repID)
-				writeRestart(repID);
-
-			// restart the timer
-			wkf_timer_start(timerS);
-		} // s % outputEnergyPeriod
-
-		if (imd_on && clientsock) {
-			internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD
-		} else {
-			int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-			//MLog: along with calls to internal (ComputeForce class) this function should execute once per GPU.
-			clearInternalForces<<< numBlocks, NUM_THREADS >>>(internal->getForceInternal_d(), num*numReplicas);
-		}
-		
+                //RBC.print(s);
+                t = s*timestep;
+
+		// Loop over all replicas
+		for (int repID = 0; repID < numReplicas; ++repID) 
+                {
+		    // *Analysis*: ionic current
+		    if (currentSegmentZ <= 0.0f) 
+                        writeCurrent(repID, t);
+		    else 
+                        writeCurrentSegment(repID, t, currentSegmentZ);
+
+                    if (numberFluct == 1) 
+                        updateNameList(); // no need for it here if particles stay the same
+
+                    // Write the trajectory.
+		    writers[repID]->append(pos + (repID*num), name, serial, t, num);
+
+                    if(particle_dynamic == String("Langevin"))
+                    {
+                        momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num);
+                        //force_writers[repID]->append(force + (repID * num), name, serial, t, num);
+                    }
+
+	        }
+
+                // TODO: Currently, not compatible with replicas. Needs a fix.
+		if (numberFluct == 1) 
+                    updateReservoirs();
+
+		remember(t);
+            }
+	    // Output energy.
+            if (get_energy) 
+            {
+	        wkf_timer_stop(timerS);
+	        t = s * timestep;
+	        // Simulation progress and statistics.
+	        float percent = (100.0f * s) / steps;
+	        float msPerStep = wkf_timer_time(timerS) * 1000.0f / outputEnergyPeriod;
+	        float nsPerDay = numReplicas * timestep / msPerStep * 864E5f;
+
+	        // Nice thousand separator
+	        setlocale(LC_NUMERIC, "");
+
+	        // Do the output
+	        printf("\rStep %ld [%.2f%% complete | %.3f ms/step | %.3f ns/day]",s, percent, msPerStep, nsPerDay);
+
+	        // Copy positions from GPU to CPU.
+	        gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                if(particle_dynamic == String("Langevin"))
+                {
+                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                    gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                    float e = KineticEnergy();
+                    printf(" The kinetic energy is %f \n",e*(2.388458509e-1));
+                }
+                if(rigidbody_dynamic == String("Langevin"))
+                {
+                    //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+                    float e = RotKineticEnergy();
+                    printf(" The Rotational kinetic energy is %f \n",e*(2.388458509e-1));
+                }
+       
+                // Write restart files for each replica.
+                for (int repID = 0; repID < numReplicas; ++repID)
+                    writeRestart(repID);
+                
+	        wkf_timer_start(timerS);
+            } // s % outputEnergyPeriod
 	} // done with all Brownian dynamics steps
 
 	// If IMD is on & our socket is still open.
-	if (imd_on and clientsock) {
-		if (vmdsock_selread(clientsock, 0) == 1) {
-			int length;
-			switch (imd_recv_header(clientsock, &length)) {
-				case IMD_DISCONNECT:
-					printf("\n[IMD] Disconnecting...\n");
-					imd_disconnect(clientsock);
-					clientsock = NULL;
-					sleep(5);
-					break;
-				case IMD_KILL:
-					printf("\n[IMD] Killing...\n");
-					imd_disconnect(clientsock);
-					clientsock = NULL;
-					sleep(5);
-					break;
-				default:
-					printf("\n[IMD] Something weird happened. Disconnecting..\n");
-					break;
-			}
-		}
+	if (imd_on and clientsock) 
+        {
+            if (vmdsock_selread(clientsock, 0) == 1) 
+            {
+	        int length;
+		switch (imd_recv_header(clientsock, &length)) 
+                {
+		    case IMD_DISCONNECT:
+		        printf("\n[IMD] Disconnecting...\n");
+			imd_disconnect(clientsock);
+			clientsock = NULL;
+			sleep(5);
+			break;
+		    case IMD_KILL:
+		        printf("\n[IMD] Killing...\n");
+			imd_disconnect(clientsock);
+			clientsock = NULL;
+			sleep(5);
+			break;
+		    default:
+		        printf("\n[IMD] Something weird happened. Disconnecting..\n");
+			break;
+	        }
+            }
 	}
-
 	// Stop the main timer.
 	wkf_timer_stop(timer0);
 
@@ -743,15 +1356,9 @@ void GrandBrownTown::run() {
 	else if (tot_min > 0) printf("%dm%.1fs\n", tot_min, tot_sec);
 	else printf("%.2fs\n", tot_sec);
 
-	// cudaSetDevice(0);
-	// gpuErrchk(cudaMemcpy(pos, internal -> getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost));
-
-	// // Write the restart file (once again)
-	// for (int repID = 0; repID < numReplicas; ++repID)
-	// 	writeRestart(repID);
-
+        gpuErrchk(cudaFree(force_d));
+#endif
 } // GrandBrownTown::run()
-
 // --------------------------------------------
 // Populate lists of types and serial numbers.
 //
@@ -774,15 +1381,32 @@ void GrandBrownTown::populate() {
 
 
 
-void GrandBrownTown::writeRestart(int repID) const {
-	FILE* out = fopen(restartFiles[repID].c_str(), "w");
-	const int offset = repID * num;
-	for (int i = 0; i < num; ++i) {
-		const int ind = i + offset;
-		const Vector3& p = pos[ind];
-		fprintf(out, "%d %.10g %.10g %.10g\n", type[ind], p.x, p.y, p.z);
-	}
-	fclose(out);
+void GrandBrownTown::writeRestart(int repID) const 
+{
+    FILE* out   = fopen(restartFiles[repID].c_str(), "w");
+    const int offset = repID * num;
+
+    for (int i = 0; i < num; ++i) 
+    {
+        const int ind = i + offset;
+        const Vector3& p = pos[ind];
+	fprintf(out, "%d %.10g %.10g %.10g\n", type[ind], p.x, p.y, p.z); 
+    }
+    fclose(out);
+
+    if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+    {
+        out = fopen(restartMomentumFiles[repID].c_str(), "w");
+        
+        for (int i = 0; i < num; ++i) 
+        {
+            const int ind = i + offset;
+            const Vector3& p = momentum[ind];
+            fprintf(out, "%d %.10g %.10g %.10g\n", type[ind], p.x, p.y, p.z); 
+        }
+        fclose(out);
+    }
+   
 }
 
 
@@ -802,7 +1426,7 @@ void GrandBrownTown::initialCond() {
 	}
 }
 
-
+#if 0
 // A couple old routines for getting particle positions.
 Vector3 GrandBrownTown::findPos(int typ) {
 	Vector3 r;
@@ -828,7 +1452,92 @@ Vector3 GrandBrownTown::findPos(int typ, float minZ) {
 	} while (pt.pmf->interpolatePotential(r) > pt.meanPmf and fabs(r.z) > minZ);
 	return r;
 }
+#endif 
+
+Vector3 GrandBrownTown::findPos(int typ) {
+        Vector3 r;
+        static gsl_rng *gslcpp_rng = gsl_rng_alloc(gsl_rng_default);
+        //srand (time(NULL));
+        //gsl_rng_set (gslcpp_rng, rand() % 100000);
+        
+        const BrownianParticleType& pt = part[typ];
+        do {
+                const float rx = sysDim.x * (float)(gsl_ran_flat(gslcpp_rng,0.,1.));
+                const float ry = sysDim.y * (float)(gsl_ran_flat(gslcpp_rng,0.,1.));
+                const float rz = sysDim.z * (float)(gsl_ran_flat(gslcpp_rng,0.,1.));
+                r = sys->wrap( Vector3(rx, ry, rz) );
+        } while (pt.pmf->interpolatePotential(r) > pt.meanPmf);
+        return r;
+}
+
+
+Vector3 GrandBrownTown::findPos(int typ, float minZ) {
+        Vector3 r;
+        static gsl_rng *gslcpp_rng = gsl_rng_alloc(gsl_rng_default);
+        //srand(time(NULL));
+
+        const BrownianParticleType& pt = part[typ];
+        do {
+                const float rx = sysDim.x * (float)(gsl_ran_flat(gslcpp_rng,0.,1.));
+                const float ry = sysDim.y * (float)(gsl_ran_flat(gslcpp_rng,0.,1.));
+                const float rz = sysDim.z * (float)(gsl_ran_flat(gslcpp_rng,0.,1.));
+                r = sys->wrap( Vector3(rx, ry, rz) );
+        } while (pt.pmf->interpolatePotential(r) > pt.meanPmf and fabs(r.z) > minZ);
+        return r;
+}
+
+//Compute the kinetic energy of particle and rigid body Han-Yi Chou
+float GrandBrownTown::KineticEnergy()
+{
+    float *vec_red, *energy;
+    float particle_energy;
+
+    gpuErrchk(cudaMalloc((void**)&vec_red, 512*sizeof(float)));
+    gpuErrchk(cudaMalloc((void**)&energy, sizeof(float)));
+    gpuErrchk(cudaMemset((void*)energy,0,sizeof(float)));
+
+    BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, numReplicas*num);
+    gpuErrchk(cudaDeviceSynchronize());
+
+    Reduction<64><<<dim3(1),dim3(64)>>>(vec_red, energy, 512);
+
+    gpuErrchk(cudaMemcpy((void*)&particle_energy, energy, sizeof(float), cudaMemcpyDeviceToHost));
+
+    gpuErrchk(cudaFree(vec_red));
+    gpuErrchk(cudaFree(energy));
+
+    return 2. * particle_energy / kT / num / numReplicas; //In the unit of 0.5kT
+}
+
+float GrandBrownTown::RotKineticEnergy()
+{
+    float e = RBC.KineticEnergy();
+
+    return 2. * e / numReplicas / kT; //In the unit of 0.5kT
+}
 
+void GrandBrownTown::InitNoseHooverBath(int N)
+{
+    printf("Entering Nose-Hoover Langevin\n");
+    int count = 0;
+    gsl_rng *gslcpp_rng = gsl_rng_alloc(gsl_rng_default);
+    srand (time(NULL));
+    gsl_rng_set (gslcpp_rng, rand() % 100000);
+
+    for(int i = 0; i < N; ++i)
+    {
+        int typ = type[i];
+        double mu = part[typ].mu;
+        
+        double sigma = sqrt(kT / mu);
+
+        float tmp = gsl_ran_gaussian(gslcpp_rng,sigma);
+        random[(size_t)count] = tmp;
+        ++count;
+    }
+    gsl_rng_free(gslcpp_rng);
+    printf("Done in nose-hoover bath\n");
+}
 // -----------------------------------------------------------------------------
 // Initialize file for recording ionic current
 void GrandBrownTown::newCurrent(int repID) const {
@@ -989,6 +1698,8 @@ void GrandBrownTown::updateNameList() {
 void GrandBrownTown::remember(float t) {
 	timeLast = t;
 	std::copy(pos, pos + num * numReplicas, posLast);
+        if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+            std::copy(momentum, momentum + num * numReplicas, momLast);
 }
 
 // -----------------------------------------------------------------------------
@@ -1119,9 +1830,18 @@ void GrandBrownTown::copyToCUDA() {
 	gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num,
 														cudaMemcpyHostToDevice));*/
 
-	gpuErrchk(cudaMalloc(&randoGen_d, sizeof(Random)));
-	gpuErrchk(cudaMemcpy(randoGen_d, randoGen, sizeof(Random),
-														cudaMemcpyHostToDevice));
+	//gpuErrchk(cudaMalloc(&randoGen_d, sizeof(Random)));
+	//gpuErrchk(cudaMemcpy(randoGen_d, randoGen, sizeof(Random),
+	//													cudaMemcpyHostToDevice));
+	gpuErrchk(cudaMalloc((void**)&randoGen_d, sizeof(Random)));
+        //gpuErrchk(cudaMemcpy(randoGen_d, randoGen,  sizeof(Random), cudaMemcpyHostToDevice));
+
+        //gpuErrchk(cudaMemcpy(&(randoGen_d->generator), &(randoGen->generator), sizeof(curandGenerator_t),cudaMemcpyHostToDevice));
+        gpuErrchk(cudaMemcpy(&(randoGen_d->states), &(randoGen->states), sizeof(curandState_t*),cudaMemcpyHostToDevice));
+        gpuErrchk(cudaMemcpy(&(randoGen_d->generator), &(randoGen->generator), sizeof(curandGenerator_t),cudaMemcpyHostToDevice));
+        gpuErrchk(cudaMemcpy(&(randoGen_d->integer_d), &(randoGen->integer_d), sizeof(unsigned int*),cudaMemcpyHostToDevice));
+        gpuErrchk(cudaMemcpy(&(randoGen_d->uniform_d), &(randoGen->uniform_d), sizeof(float*),cudaMemcpyHostToDevice));
+
 	// gpuErrchk(cudaDeviceSynchronize());
 }
 
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index 78c4f0b..4fc8d0f 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -2,27 +2,354 @@
 //
 // Terrance Howard <heyterrance@gmail.com>
 #pragma once
-
+//#define MDSTEP
+#define Unit1 4.18679994e4
+#define Unit2 2.046167337
+//#define Debug
 #include "CudaUtil.cuh"
 #include "RigidBodyType.h"
 #include "RigidBodyGrid.h"
 
 __device__
-Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
+Vector3 step(Vector3& r0, float kTlocal, Vector3 force, float diffusion, Vector3 diffGrad,
 						 float timestep, BaseGrid *sys, Random *randoGen, int num);
 
-__device__
-Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
-						 Vector3 diffGrad, float timestep, BaseGrid *sys,
-						 Random *randoGen, int num);
+//update both position and momentum for Langevin dynamics
+//Han-Yi Chou
+#if 0
+__global__ void updateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal, 
+                             int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField, 
+                             int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas) 
+{
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
-__global__
-void clearInternalForces(Vector3* __restrict__ forceInternal, const int num) {
-	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-	if (idx < num)
-		forceInternal[idx] = Vector3(0.0f);
+    if (idx < num * numReplicas)
+    {
+        int t = type[idx];
+        //Vector3 r0(tex1Dfetch(PosTex, idx));
+        //Vector3 p0(tex1Dfetch(MomTex, idx));
+        Vector3 r0 = pos[idx];
+        Vector3 p0 = momentum[idx];
+        
+        const BrownianParticleType& pt = *part[t];
+        float mass      = pt.mass;
+        r0 = r0 + (timestep * 5e3 * p0) / mass;
+        r0 = sys->wrap(r0);
+        pos[idx]      = r0;
+
+#if 0
+        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
+
+        // Compute PMF
+        // TODO: maybe make periodic and nonPeriodic versions of this kernel
+        ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
+
+#ifndef FORCEGRIDOFF
+        // Add a force defined via 3D FORCE maps (not 3D potential maps)
+        if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
+#endif
+        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+
+        // Get local kT value
+        float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */
+
+        // Update the particle's position using the calculated values for time, force, etc.
+        float mass      = pt.mass;
+        Vector3 gamma   = pt.transDamping;
+
+        if (pt.diffusionGrid != NULL) 
+        {
+
+            Vector3 gridCenter = pt.diffusionGrid->origin +
+            pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny,
+                                                       0.5*pt.diffusionGrid->nz));
+            Vector3 p2 = r0 - gridCenter;
+            p2 = sys->wrapDiff( p2 ) + gridCenter;
+            ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2);
+            gamma = Vector3(kTlocal / (mass * diff.e));
+            //diffGrad = diff.f;
+        }
+
+        float tmp = sqrtf(0.50f * kTlocal * mass * timestep);
+        Vector3 rando = randoGen->gaussian_vector(idx, num * numReplicas);
+
+        #ifdef MDSTEP
+        force = Vector3(-r0.x, -r0.y, -r0.z);
+        #endif
+
+        p0.x = (1.0f - 0.50f * timestep * gamma.x) * p0.x + (0.50f * timestep * force.x * Unit1 + tmp * sqrtf(gamma.x) * rando.x * Unit2);
+        p0.y = (1.0f - 0.50f * timestep * gamma.y) * p0.y + (0.50f * timestep * force.y * Unit1 + tmp * sqrtf(gamma.y) * rando.y * Unit2);
+        p0.z = (1.0f - 0.50f * timestep * gamma.z) * p0.z + (0.50f * timestep * force.z * Unit1 + tmp * sqrtf(gamma.z) * rando.z * Unit2);
+
+        r0 = r0 + (timestep * 1e4 * p0) / mass;
+        r0 = sys->wrap(r0);
+ 
+        //step(r0, p0, kTlocal, force, tmp, -diffGrad, mass, gamma, timestep, sys, randoGen, num * numReplicas);
+        pos[idx]      = r0;
+        momentum[idx] = p0;
+#endif
+
+    }
+}
+#endif
+////The kernel is for Nose-Hoover Langevin dynamics
+__global__ void updateKernelNoseHooverLangevin(Vector3* pos, Vector3* momentum, float* random, Vector3* __restrict__ forceInternal,
+                                  int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField,
+                                  int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas)
+{
+    const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+    if (idx < num * numReplicas)
+    {
+        int t = type[idx];
+
+        Vector3 r0  = pos[idx];
+        Vector3 p0  = momentum[idx];
+        float   ran = random[idx];
+
+        const BrownianParticleType& pt = *part[t];
+        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
+        ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
+
+#ifndef FORCEGRIDOFF
+        // Add a force defined via 3D FORCE maps (not 3D potential maps)
+        if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
+#endif
+        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+        #ifdef Debug
+        forceInternal[idx] = -force;
+        #endif
+        // Get local kT value
+        float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */
+
+        // Update the particle's position using the calculated values for time, force, etc.
+        float mass  = pt.mass;
+        float mu    = pt.mu;
+        Vector3 gamma   = pt.transDamping;
+        float rando = (*randoGen).gaussian(idx, num * numReplicas);
+
+        float tmp   = sqrtf(kTlocal * (1.f - expf(-2.f * gamma.x * timestep)) / mu);
+
+        if (pt.diffusionGrid != NULL)
+        {
+
+            Vector3 gridCenter = pt.diffusionGrid->origin +
+            pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny,
+                                                       0.5*pt.diffusionGrid->nz));
+            Vector3 p2 = r0 - gridCenter;
+            p2 = sys->wrapDiff( p2 ) + gridCenter;
+            ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2);
+            gamma = Vector3(kTlocal / (mass * diff.e));
+        }
+
+        #ifdef MDSTEP
+        force = Vector3(-r0.x, -r0.y, -r0.z);
+        #endif
+
+        p0  = p0  + 0.5f * timestep * force * Unit1;
+
+        r0  = r0  + 0.5f * timestep * p0 * 1e4 / mass;
+        r0 = sys->wrap(r0);
+
+        ran = ran + 0.5f * (p0.length2() / mass * 0.238845899f - 3.f * kTlocal) / mu;
+
+        p0  = expf(-0.5f * ran) * p0;
+
+        ran = expf(-gamma.x * timestep) * ran + tmp * rando;
+
+        p0  = expf(-0.5f * ran) * p0;
+
+        ran = ran + 0.5f * (p0.length2() / mass * 0.238845899 - 3.f * kTlocal) / mu;
+ 
+        r0  = r0  + 0.5f * timestep * p0 * 1e4 / mass;
+        r0 = sys->wrap(r0);
+
+        pos[idx] = r0;
+        momentum[idx] = p0;
+        random[idx]= ran;     
+    }
 }
 
+
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//This is the kernel for BAOAB (velocity verlet algorithm) which is symplectic. For more information,
+//please infer http://www.MolecularDynamics.info
+//The original BBK kernel is no longer used since the random numbers should be reused 
+//which is not possible in GPU code.
+//Han-Yi Chou
+__global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
+                                  int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField,
+                                  int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas)
+{
+    const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (idx < num * numReplicas)
+    {
+        int t = type[idx];
+
+        Vector3 r0 = pos[idx];
+        Vector3 p0 = momentum[idx];
+
+        const BrownianParticleType& pt = *part[t];
+        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
+
+        ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
+
+#ifndef FORCEGRIDOFF
+        // Add a force defined via 3D FORCE maps (not 3D potential maps)
+        if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
+#endif
+        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+#ifdef Debug
+        forceInternal[idx] = -force;
+        #endif
+
+        // Get local kT value
+        float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */
+
+        // Update the particle's position using the calculated values for time, force, etc.
+        float mass      = pt.mass;
+        Vector3 gamma   = pt.transDamping;
+        Vector3 rando = (*randoGen).gaussian_vector(idx, num * numReplicas);
+        //printf("%f %f %f\n", rando.x, rando.y, rando.z);
+        if (pt.diffusionGrid != NULL) 
+        {
+
+            Vector3 gridCenter = pt.diffusionGrid->origin +
+            pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny,
+                                                       0.5*pt.diffusionGrid->nz));
+            Vector3 p2 = r0 - gridCenter;
+            p2 = sys->wrapDiff( p2 ) + gridCenter;
+            ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2);
+            gamma = Vector3(kTlocal / (mass * diff.e));
+        }
+
+        #ifdef MDSTEP
+        force = Vector3(-r0.x, -r0.y, -r0.z);
+        #endif
+
+        p0 = p0 + 0.5f * timestep * force * Unit1;
+        r0 = r0 + 0.5f * timestep / mass * p0 * 1e4;
+
+        p0.x = expf(-timestep * gamma.x) * p0.x + sqrtf(mass * kTlocal * (1.f-expf(-2.f*timestep*gamma.x))) * rando.x * Unit2;
+        p0.y = expf(-timestep * gamma.y) * p0.y + sqrtf(mass * kTlocal * (1.f-expf(-2.f*timestep*gamma.y))) * rando.y * Unit2;
+        p0.z = expf(-timestep * gamma.z) * p0.z + sqrtf(mass * kTlocal * (1.f-expf(-2.f*timestep*gamma.z))) * rando.z * Unit2;
+
+        r0 = r0 + 0.5f * timestep * p0 * 1e4 / mass;
+        r0 = sys->wrap(r0);
+        
+        pos[idx]      = r0;
+        momentum[idx] = p0;
+
+        //if(idx == 0)
+          //  printf("%f %f %f\n", pos[idx].x,pos[idx].y,pos[idx].z);
+    }
+}
+
+//update momentum in the last step of BAOAB integrator for the Langevin dynamics. Han-Yi Chou
+__global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* __restrict__ forceInternal,
+                                      int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField,
+                                      int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas)
+{
+    const int idx  = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+
+    if (idx < num * numReplicas)
+    {
+        int t = type[idx];
+        Vector3 r0 = pos[idx];
+        Vector3 p0 = momentum[idx];
+
+        const BrownianParticleType& pt = *part[t];
+        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
+
+        ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
+#ifndef FORCEGRIDOFF
+        // Add a force defined via 3D FORCE maps (not 3D potential maps)
+        if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
+#endif
+        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+#ifdef Debug
+        forceInternal[idx] = -force;
+        #endif
+
+        #ifdef MDSTEP
+        force = Vector3(-r0.x, -r0.y, -r0.z);
+        #endif
+
+        p0 = p0 + 0.5f * timestep * force * Unit1;
+        momentum[idx] = p0;
+    }
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//The last step of BBK integrator to update the momentum for Langevin dynamics.
+#if 0
+__global__ void LastUpdateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
+                                 int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField,
+                                 int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas)
+{
+    const int idx  = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < num * numReplicas)
+    {
+        int t = type[idx];
+        Vector3 r0 = pos[idx];
+        Vector3 p0 = momentum[idx];
+
+        const BrownianParticleType& pt = *part[t];
+        Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField);
+
+        ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0);
+
+#ifndef FORCEGRIDOFF
+        // Add a force defined via 3D FORCE maps (not 3D potential maps)
+        if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0);
+        if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0);
+#endif
+        Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+
+        float mass      = pt.mass;
+        Vector3 gamma   = pt.transDamping;
+        
+        float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */
+
+        float tmp = sqrtf(0.50f * kTlocal * mass * timestep);
+
+        if (pt.diffusionGrid != NULL)
+        {
+
+            Vector3 gridCenter = pt.diffusionGrid->origin +
+            pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny,
+                                                       0.5*pt.diffusionGrid->nz));
+            Vector3 p2 = r0 - gridCenter;
+            p2 = sys->wrapDiff( p2 ) + gridCenter;
+            ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2);
+            gamma = Vector3(kTlocal / (mass * diff.e));
+        }
+
+        Vector3 rando = randoGen->gaussian_vector(idx, num * numReplicas);
+        #ifdef MDSTEP
+        force = Vector3(-r0.x, -r0.y, -r0.z);
+        #endif
+
+        //step(p0, kTlocal, force, tmp, -diffGrad, mass, gamma, timestep, sys, randoGen, num * numReplicas);
+        p0.x = (p0.x + 0.50 * timestep * force.x * Unit1 + tmp * sqrtf(gamma.x) * rando.x * Unit2) / (1.0f+0.5f*timestep*gamma.x);
+        p0.y = (p0.y + 0.50 * timestep * force.y * Unit1 + tmp * sqrtf(gamma.y) * rando.y * Unit2) / (1.0f+0.5f*timestep*gamma.y);
+        p0.z = (p0.z + 0.50 * timestep * force.z * Unit1 + tmp * sqrtf(gamma.z) * rando.z * Unit2) / (1.0f+0.5f*timestep*gamma.z);
+
+        momentum[idx] = p0;
+    }
+}
+#endif
+
+//Update kernel for Brownian dynamics
 __global__
 void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 									int type[], BrownianParticleType* part[],
@@ -31,16 +358,16 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 									float timestep, int num, BaseGrid* sys,
 									Random* randoGen, int numReplicas) {
 	// Calculate this thread's ID
-	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-
+	const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+        
 	// TODO: Make this a grid-stride loop to make efficient reuse of RNG states 
 	// Loop over ALL particles in ALL replicas
 	if (idx < num * numReplicas) {
 		const int t = type[idx];
-		Vector3& p = pos[idx];
+		Vector3   p = pos[idx];
 
 		const BrownianParticleType& pt = *part[t];
-
+                
 	 	/* printf("atom %d: forceInternal: %f %f %f\n", idx, forceInternal[idx].x, forceInternal[idx].y, forceInternal[idx].z);  */
 
 		// Compute external force
@@ -62,9 +389,12 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 		//	  External:  electric field (now this is basically a constant vector)
 		//	  forceGrid: ADD force due to PMF or other potentials defined in 3D space
 		Vector3 force = forceInternal[idx] + forceExternal + fe.f;
+#ifdef Debug
+        forceInternal[idx] = -force;
+        #endif
 
-		if (idx == 0)
-			forceInternal[idx] = force; // write it back out for force0 in run()
+		//if (idx == 0)
+			//forceInternal[idx] = force; // write it back out for force0 in run()
 
 		// Get local kT value
 		float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(p); /* periodic */
@@ -101,17 +431,49 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal,
 		Vector3 tmp = step(p, kTlocal, force, diffusion, -diffGrad, timestep, sys, randoGen, num * numReplicas);
 		// assert( tmp.length() < 10000.0f );
 		pos[idx] = tmp;
+		
 			
 	}
 }
+/*
+//This is the BBK Langevin integrator for Langevin dynamics Han-Yi Chou
+__device__ inline void step(Vector3& r0, Vector3& p0, float kTlocal, Vector3 force, float diffusion, Vector3 diffGrad,
+                            float mass, Vector3& gamma, float timestep, BaseGrid *sys, Random *randoGen, int num)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    Vector3 rando = randoGen->gaussian_vector(idx, num);
+    float tmp = sqrtf(diffusion * timestep);
+
+    p0.x = (1.0f - 0.50f * timestep * gamma.x) * p0.x + (0.50f * timestep * force.x * Unit1 + (tmp * sqrtf(gamma.x) * rando.x) * mass * Unit2);
+    p0.y = (1.0f - 0.50f * timestep * gamma.y) * p0.y + (0.50f * timestep * force.y * Unit1 + (tmp * sqrtf(gamma.y) * rando.y) * mass * Unit2);
+    p0.z = (1.0f - 0.50f * timestep * gamma.z) * p0.z + (0.50f * timestep * force.z * Unit1 + (tmp * sqrtf(gamma.z) * rando.z) * mass * Unit2);
 
+    r0 = r0 + (timestep * p0) / mass * 1e4;
+    Vector3 r = r0;
+    r0 = sys->wrap(r);
+}
+
+//This is the BBK integrator for updating momentum in Langevin dynamics Han-Yi Chou
+__device__ inline void step(Vector3& p0, float kTlocal, Vector3 force, float diffusion, Vector3 diffGrad,
+                            float mass, Vector3& gamma, float timestep, BaseGrid *sys, Random *randoGen, int num)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    Vector3 rando = randoGen->gaussian_vector(idx, num);
+    float tmp = sqrtf(diffusion * timestep);
+    
+    p0.x = (p0.x + (0.50 * timestep * force.x * Unit1 + (tmp * sqrtf(gamma.x) * rando.x) * mass * Unit2)) / (1.0f+0.5f*timestep*gamma.x);
+    p0.y = (p0.y + (0.50 * timestep * force.y * Unit1 + (tmp * sqrtf(gamma.y) * rando.y) * mass * Unit2)) / (1.0f+0.5f*timestep*gamma.y);
+    p0.z = (p0.z + (0.50 * timestep * force.z * Unit1 + (tmp * sqrtf(gamma.z) * rando.z) * mass * Unit2)) / (1.0f+0.5f*timestep*gamma.z);
+}
+*/
+//For Brownian dynamics
 __device__
-inline Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion,
-						 Vector3 diffGrad, float timestep, BaseGrid *sys,
-						 Random *randoGen, int num) {
-	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+inline Vector3 step(Vector3& r0, float kTlocal, Vector3 force, float diffusion,
+						Vector3 diffGrad, float timestep, BaseGrid *sys,
+						Random *randoGen, int num) {
+	const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
 	// TODO: improve performance by storing state locally, then sending it back to GPU
-	Vector3 rando = randoGen->gaussian_vector(idx, num);
+	Vector3 rando = (*randoGen).gaussian_vector(idx, num);
 
 	diffusion *= timestep;
 	Vector3 r = r0 + (diffusion / kTlocal) * force
diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h
index 444349f..b5d8430 100644
--- a/src/GrandBrownTown.h
+++ b/src/GrandBrownTown.h
@@ -61,25 +61,19 @@ public:
 	~GrandBrownTown();
 
 	void run();
+        void RunNoseHooverLangevin();
 	static bool DEBUG;
 
 private:  
-	Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion);
-	Vector3 step(Vector3 r0, float kTlocal, Vector3 force, float diffusion, Vector3 diffGrad);
 
 	// Given the numbers of each particle, populate the type list.
 	void populate();
 
-	// Load coordinates from a three column text file.
-	bool loadCoordinates(const char* fileName);
-
 	// Count the number of atoms in the restart file.
 	int countRestart(const char* fileName);
 
 	void writeRestart(int repID) const;
-		
-	// Load coordinates from a four column text file (type x y z).
-	void loadRestart(const char* fileName);
+        void writeMomentumRestart(int repID) const;
 
 	void initialCondCen();
 	void initialCond();
@@ -99,6 +93,13 @@ private:
 	
 	void copyToCUDA();
 
+        //Compute the kinetic energy in general. Han-Yi Chou
+        float KineticEnergy();
+        float RotKineticEnergy();
+
+        //Initialize the Nose-Hoover auxilliary variables
+        void InitNoseHooverBath(int N);
+        //curandState_t *randoDevice;
 public:
 	// Compute the current in nanoamperes.
 	float current(float t) const;
@@ -126,7 +127,17 @@ private:
 	std::vector<std::string> outCurrFiles;
 	std::vector<std::string> restartFiles;
 	std::vector<std::string> outFilePrefixes;
+
+        //Hna-Yi Chou Langevin Dynamics
+        std::vector<std::string> restartMomentumFiles;
+        std::vector<std::string> outMomentumFilePrefixes;//, outForceFilePrefixes;
+
 	std::vector<TrajectoryWriter*> writers;
+
+        //For momentum, i.e. Langevin dynamic Han-Yi Chou
+        std::vector<TrajectoryWriter*> momentum_writers;
+        //std::vector<TrajectoryWriter*> force_writers;
+
 	Vector3 sysDim;
 
 	// Integrator variables
@@ -143,11 +154,15 @@ private:
 	int numCap; 		// max number of particles
 	int num; 			// number of particles
 	Vector3* pos; 		// particle positions
+        Vector3* momentum;      // particle momentum Han-Yi Chou
+        float *random;
+        //Vector3* force;
 	int* type; 			// particle types: 0, 1, ... -> num * numReplicas
 	String* name; 		// particle types: POT, CLA, ... -> num * numReplicas
 	int* serial; 		// particle serial numbers
 	int currSerial; 	// the serial number of the next new particle
 	Vector3* posLast; 	// previous positions of particles
+        Vector3* momLast;
 	float timeLast; 	// used with posLast
 	float minimumSep; 	// minimum separation allowed with placing new particles
 
@@ -159,7 +174,7 @@ private:
 	//int *type_d;
 	BrownianParticleType **part_d;
 	BaseGrid *sys_d, *kTGrid_d;
-	Random *randoGen_d;
+	Random* randoGen_d;
 	//Bond* bonds_d;
 	//int2* bondMap_d;
 	//Exclude* excludes_d;
@@ -250,6 +265,11 @@ private:
 	int4 *dihedralList;
 	int  *dihedralPotList;
 
+        //Han-Yi Chou
+        String particle_dynamic;
+        String rigidbody_dynamic;
+        String particle_langevin_integrator;
+
 	void updateNameList();
 
 	void remember(float t);
@@ -268,6 +288,7 @@ private:
 	// Add or delete particles in the reservoirs.
 	// Reservoirs are not wrapped.
 	void updateReservoirs();
+
 };
 
 #endif
-- 
GitLab