From ec1b6e9d5379ced4200ff0fcd82c78d1c00fb7eb Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Thu, 21 Mar 2024 14:29:02 -0500
Subject: [PATCH] Improved fetching energy so it will be exactly synced with
 coordinates

---
 src/GrandBrownTown.cu  | 63 +++++++++++++++++++++++++-----------------
 src/GrandBrownTown.cuh | 30 +++++++++++++++-----
 2 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 29f23b3..c3b313a 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -646,10 +646,16 @@ void GrandBrownTown::run()
 
     //float total_energy = 0.f;
     // Main loop over Brownian dynamics steps
+
+    float kinetic_energy = 0.0f;
+    float potential_energy = 0.0f;
+    
+    bool get_energy = false;
+    bool print_energy = false;
+    
     for (long int s = 1; s < steps; s++)
     {
       PUSH_NVTX("Main loop timestep",0)
-        bool get_energy = ((s % outputEnergyPeriod) == 0);
         //At the very first time step, the force is computed
         if(s == 1)
         {
@@ -750,13 +756,13 @@ void GrandBrownTown::run()
 		if (get_energy) {
 		    compute_position_dependent_force_for_rb_attached_particles
 			<<< numBlocks, NUM_THREADS >>> (
-			    internal -> getPos_d()[0], internal -> getForceInternal_d()[0],
+			    internal -> getPos_d()[0],
+			    internal -> getForceInternal_d()[0], internal -> getEnergy(),
 			    internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
 		} else {
 		    compute_position_dependent_force_for_rb_attached_particles
 			<<< numBlocks, NUM_THREADS >>> (
-			    internal -> getPos_d()[0],
-			    internal -> getForceInternal_d()[0], internal -> getEnergy(),
+			    internal -> getPos_d()[0], internal -> getForceInternal_d()[0],
 			    internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
 		}
 	    }
@@ -795,17 +801,14 @@ void GrandBrownTown::run()
 
         }//if step == 1
 
-	PUSH_NVTX("Clear particle energy data",1)
-	internal->clear_energy();
 	gpuman.sync();
-	POP_NVTX
 
 	PUSH_NVTX("Integrate particles",2)
         if(particle_dynamic == String("Langevin"))
-            updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
+            updateKernelBAOAB<s == 1><<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
         else if(particle_dynamic == String("NoseHooverLangevin"))
             //kernel for Nose-Hoover Langevin dynamic
-            updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), 
+            updateKernelNoseHooverLangevin<s == 1><<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), 
             internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d,
             randoGen_d, numReplicas, ParticleInterpolationType);
         ////For Brownian motion
@@ -953,10 +956,32 @@ void GrandBrownTown::run()
 	  POP_NVTX
 	}
 
+	if (get_energy) {
+	    PUSH_NVTX("Fetch particle energy data",1)
+	    thrust::device_ptr<float> en_d(internal->getEnergy());
+	    potential_energy = (thrust::reduce(en_d, en_d+(num+num_rb_attached_particles)*numReplicas)) / numReplicas;
+	    if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
+	    {
+		gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
+		kinetic_energy = KineticEnergy();
+	    }
+	    gpuman.sync();
+	    POP_NVTX	
+	}
+	
+	get_energy = (((s+1) % outputEnergyPeriod) == 0) || (((s+1) % outputPeriod) == 0);
+	print_energy = ((s % outputEnergyPeriod) == 0);
+
+	if (get_energy) {
+	    PUSH_NVTX("Clear particle energy data",1)
+	    internal->clear_energy();
+	    POP_NVTX	
+	}
+	    
         if (imd_on && clientsock)
             internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD // TODO add multigpu support with IMD
 	else {
-	  PUSH_NVTX("Clear particle forces",2)
+	    PUSH_NVTX("Clear particle forces",2)
             internal->clear_force();
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
@@ -1135,7 +1160,7 @@ void GrandBrownTown::run()
 
            remember(t);
         }
-        if (get_energy)
+        if (print_energy)
         {
                 wkf_timer_stop(timerS);
                 t = s * timestep;
@@ -1149,27 +1174,15 @@ void GrandBrownTown::run()
 
                 // Do the output
                 printf("\rStep %ld [%.2f%% complete | %.3f ms/step | %.3f ns/day]",s, percent, msPerStep, nsPerDay);
-        //}
-        //if (get_energy)
-        //{
 
                 // Copy positions from GPU to CPU.
                 //gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                float e = 0.f;
-                float V = 0.f;
-                thrust::device_ptr<float> en_d(internal->getEnergy());
-                V = (thrust::reduce(en_d, en_d+(num+num_rb_attached_particles)*numReplicas)) / numReplicas;
-                if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
-                {
-                    gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost));
-                    e = KineticEnergy();
-                }   
                 std::fstream energy_file;
                 energy_file.open( (outFilePrefixes[0]+".energy.dat").c_str(), std::fstream::out | std::fstream::app);
                 if(energy_file.is_open())
                 {
-                    energy_file << "Kinetic Energy: " << e*num*0.5f*(2.388458509e-1) << " (kT) "<< std::endl;
-                    energy_file << "Potential Energy: " << V << " (kcal/mol) " << std::endl;
+                    energy_file << "Kinetic Energy: " << kinetic_energy*num*0.5f*(2.388458509e-1) << " (kT) "<< std::endl;
+                    energy_file << "Potential Energy: " << potential_energy << " (kcal/mol) " << std::endl;
                     energy_file.close();
                 }
                 else
diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh
index 2783943..b0b6bdc 100644
--- a/src/GrandBrownTown.cuh
+++ b/src/GrandBrownTown.cuh
@@ -61,6 +61,7 @@ ForceEnergy compute_position_dependent_force(
 
 
 ////The kernel is for Nose-Hoover Langevin dynamics
+template <bool need_position_dep_force = false>
 __global__ void 
 updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__ momentum, float* random, 
                                Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], 
@@ -72,9 +73,12 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
     {
 	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
 
-	ForceEnergy fe = compute_position_dependent_force(
-	    pos, forceInternal, type, part, electricField, scheme, idx );
-
+	ForceEnergy fe;
+	if (need_position_dep_force) {
+	    fe = compute_position_dependent_force(
+		pos, forceInternal, type, part, electricField, scheme, idx );
+	}
+	
         int t = type[idx];
         Vector3 r0  = pos[idx];
         Vector3 p0  = momentum[idx];
@@ -82,7 +86,10 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
 
         const BrownianParticleType& pt = *part[t];
 
-        Vector3 force = forceInternal[idx] + fe.f;
+	if (need_position_dep_force) {
+	    forceInternal[idx] = forceInternal[idx] + fe.f;
+	}
+        Vector3 force = forceInternal[idx];
         #ifdef Debug
         forceInternal[idx] = -force;
         #endif
@@ -152,6 +159,7 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__
 //The original BBK kernel is no longer used since the random numbers should be reused 
 //which is not possible in GPU code.
 //Han-Yi Chou
+template <bool need_position_dep_force = false>
 __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal,
                                   int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, 
                                   float electricField,int tGridLength, float timestep,
@@ -165,8 +173,11 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
     {
 	idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles);
 
-	ForceEnergy fe = compute_position_dependent_force(
-	    pos, forceInternal, type, part, electricField, scheme, idx );
+	ForceEnergy fe;
+	if (need_position_dep_force) {
+	    fe = compute_position_dependent_force(
+		pos, forceInternal, type, part, electricField, scheme, idx );
+	}
 	// if (get_energy) energy[idx] += fe.e;
 
         int t = type[idx];
@@ -174,7 +185,11 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re
         Vector3 p0 = momentum[idx];
         const BrownianParticleType& pt = *part[t];
 
-        Vector3 force = forceInternal[idx] + fe.f;
+	if (need_position_dep_force) { // only needed at ts == 1
+	    forceInternal[idx] = forceInternal[idx] + fe.f;
+	}
+	Vector3 force = forceInternal[idx];
+	
 #ifdef Debug
         forceInternal[idx] = -force;
 #endif
@@ -249,6 +264,7 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _
         Vector3 p0 = momentum[idx];
 
         Vector3 force = forceInternal[idx] + fe.f;
+        forceInternal[idx] = forceInternal[idx] + fe.f; // Adding position-dep force so it doesn't need to be calculated in the "plain" kernel 
 #ifdef Debug
         forceInternal[idx] = -force;
 #endif
-- 
GitLab