From 37bb1694999c555a9b9a28a14ca010c76a0eb466 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Mon, 7 Nov 2022 14:31:27 -0600
Subject: [PATCH] Add NVTX

---
 src/CMakeLists.txt    |  3 +++
 src/GrandBrownTown.cu | 47 +++++++++++++++++++++++++++++++++++++++----
 src/nvtx_defs.h       | 24 ++++++++++++++++++++++
 3 files changed, 70 insertions(+), 4 deletions(-)
 create mode 100644 src/nvtx_defs.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9f8440f..f8f9af0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -131,5 +131,8 @@ endif()
 if(USE_NCCL)
   target_link_libraries(arbd PRIVATE nccl)
 endif()
+if(USE_NVTX)
+  target_link_libraries(arbd PRIVATE nvToolsExt)
+endif()
 
 install(TARGETS arbd)
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 3f0fc8c..d610c5d 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -3,6 +3,7 @@
 /* #include "ComputeGridGrid.cuh" */
 #include "WKFUtils.h"
 #include "BrownParticlesKernel.h"
+#include "nvtx_defs.h"
 #include <stdlib.h>     /* srand, rand */
 #include <time.h>       /* time */
 #include <thrust/device_ptr.h>
@@ -599,6 +600,7 @@ void GrandBrownTown::run()
     // Main loop over Brownian dynamics steps
     for (long int s = 1; s < steps; s++)
     {
+      PUSH_NVTX("Main loop timestep",0)
         bool get_energy = ((s % outputEnergyPeriod) == 0);
         //At the very first time step, the force is computed
         if(s == 1)
@@ -744,10 +746,12 @@ void GrandBrownTown::run()
 
         }//if step == 1
 
+	PUSH_NVTX("Clear particle force data",1)
 	internal->clear_energy();
 	gpuman.sync();
+	POP_NVTX
 
-
+	PUSH_NVTX("Integrate particles",2)
         if(particle_dynamic == String("Langevin"))
             updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType);
         else if(particle_dynamic == String("NoseHooverLangevin"))
@@ -761,6 +765,9 @@ void GrandBrownTown::run()
                                                        part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas,
                                                        internal->getEnergy(), get_energy, ParticleInterpolationType);
 
+	POP_NVTX
+
+	PUSH_NVTX("Integrate rigid bodies",2)
         if(rigidbody_dynamic == String("Langevin"))
         {
             #ifdef _OPENMP
@@ -786,7 +793,9 @@ void GrandBrownTown::run()
                 RBC[i]->print(s);
             }
         }
+	POP_NVTX
 
+	PUSH_NVTX("Update rigid body attached particle positions",3)
 	if (num_rb_attached_particles > 0) {
 	    #pragma omp parallel for
 	    for(int i = 0; i < numReplicas; ++i) {
@@ -797,11 +806,14 @@ void GrandBrownTown::run()
 		    sys_d, num, num_rb_attached_particles, numReplicas);
 	    }
 	}
-	
+	POP_NVTX
+
         if (s % outputPeriod == 0) {
+	    PUSH_NVTX("Copy particle positions to host for output",7)
             // Copy particle positions back to CPU
 	    gpuErrchk(cudaDeviceSynchronize());
             gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * (num+num_rb_attached_particles) * numReplicas, cudaMemcpyDeviceToHost));
+	    POP_NVTX
 	}
         if (imd_on && clientsock && s % outputPeriod == 0)
         {
@@ -876,20 +888,26 @@ void GrandBrownTown::run()
         #ifdef _OPENMP
         omp_set_num_threads(4);
         #endif
+
+	PUSH_NVTX("Clear rigid body forces",2)
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i) 
             RBC[i]->clearForceAndTorque();
+	POP_NVTX
 
 	
 	if (numGroupSites > 0) {
+ 	  PUSH_NVTX("Update collective coordinates",2)
 	    gpuman.sync();
 	    updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas);
 	    gpuman.sync();
+	  POP_NVTX
 	}
 
         if (imd_on && clientsock)
             internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD // TODO add multigpu support with IMD
 	else {
+	  PUSH_NVTX("Clear particle forces",2)
             internal->clear_force();
 	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
@@ -898,6 +916,7 @@ void GrandBrownTown::run()
 		gpuman.nccl_broadcast(0, _p, _p, (num+num_rb_attached_particles+numGroupSites)*numReplicas, nccl_broadcast_streams);
 	    }
 	    #endif
+	    POP_NVTX
     	}
 
         if (interparticleForce)
@@ -910,19 +929,27 @@ void GrandBrownTown::run()
                     case 0: // [ N*log(N) ] interactions, + cutoff | decomposition
                         if (s % decompPeriod == 0)
                         {
+			  PUSH_NVTX("Decompose particles",5)
                             internal -> decompose();
+			  POP_NVTX
                             #ifdef _OPENMP
                             omp_set_num_threads(4);
                             #endif
+			    PUSH_NVTX("Update rigid body particle lists",6)
                             #pragma omp parallel for
                             for(int i = 0; i < numReplicas; ++i)
                                 RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d);
+			    POP_NVTX
                         }
+			PUSH_NVTX("Calculate particle-particle forces",7)
                         internal -> computeTabulated(get_energy);
+			POP_NVTX
 			#ifdef USE_NCCL
 			if (gpuman.gpus.size() > 1) {
+			  PUSH_NVTX("Reduce particle forces",6)
 			    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
 			    gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles)*numReplicas, -1);
+			  POP_NVTX
 			}
 			#endif
                         break;
@@ -932,7 +959,7 @@ void GrandBrownTown::run()
                 }
             }
             else
-            {
+            { 
                 // Not using tabulated potentials.
                 switch (fullLongRange)
                 {
@@ -964,6 +991,7 @@ void GrandBrownTown::run()
             }
         }
 
+	PUSH_NVTX("Compute RB attached particle forces",4)
 	if (get_energy) {
 	    compute_position_dependent_force_for_rb_attached_particles
 		<<< numBlocks, NUM_THREADS >>> (
@@ -976,18 +1004,22 @@ void GrandBrownTown::run()
 		    internal -> getForceInternal_d()[0], internal -> getEnergy(),
 		    internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType);
 	}
+	POP_NVTX
 
 
         //compute the force for rigid bodies
         #ifdef _OPENMP
         omp_set_num_threads(4);
         #endif
+	PUSH_NVTX("Compute RB-RB forces forces",5)
         #pragma omp parallel for
         for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition
             RBC[i]->updateForces((internal->getPos_d()[0])+i*(num+num_rb_attached_particles), (internal->getForceInternal_d()[0])+i*(num+num_rb_attached_particles), s, (internal->getEnergy())+i*(num+num_rb_attached_particles), get_energy,
 				 RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles);
+	POP_NVTX
 
 	if (numGroupSites > 0) {
+	  PUSH_NVTX("Spread collective coordinate forces to constituent particles",4)
 	    gpuman.sync();
 	    // if ((s%100) == 0) {
 	    distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
@@ -995,14 +1027,18 @@ void GrandBrownTown::run()
 	//     distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas);
 	// }
 	    gpuman.sync();
+	  POP_NVTX
 	}
 
+	PUSH_NVTX("Update particle coordinates",2)
         if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], 
             internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy,
             ParticleInterpolationType);
             //gpuErrchk(cudaDeviceSynchronize());
-
+	POP_NVTX
+  
+	PUSH_NVTX("Update RB coordinates",3)
         if(rigidbody_dynamic == String("Langevin"))
         {
             #ifdef _OPENMP
@@ -1018,9 +1054,11 @@ void GrandBrownTown::run()
                 RBC[i]->print(s);
             }
         }
+	POP_NVTX
 
         if (s % outputPeriod == 0)
         {
+	  PUSH_NVTX("Copy and write particle and RB coordinates for output",3)
             if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
             {
                 gpuErrchk(cudaMemcpy(momentum, internal ->  getMom_d(), sizeof(Vector3) * (num) * numReplicas, cudaMemcpyDeviceToHost));
@@ -1131,6 +1169,7 @@ void GrandBrownTown::run()
                     writeRestart(repID);
 
                 wkf_timer_start(timerS);
+		POP_NVTX
          } // s % outputEnergyPeriod
      } // done with all Brownian dynamics steps
 
diff --git a/src/nvtx_defs.h b/src/nvtx_defs.h
new file mode 100644
index 0000000..01099f5
--- /dev/null
+++ b/src/nvtx_defs.h
@@ -0,0 +1,24 @@
+/* Adapted from: https://developer.nvidia.com/blog/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ */
+#ifdef USE_NVTX
+#include <nvToolsExt.h>
+
+const uint32_t nvtx_colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff };
+const int num_nvtx_colors = sizeof(nvtx_colors)/sizeof(uint32_t);
+
+#define PUSH_NVTX(name,cid) { \
+    int color_id = cid; \
+    color_id = color_id%num_nvtx_colors;\
+    nvtxEventAttributes_t eventAttrib = {0}; \
+    eventAttrib.version = NVTX_VERSION; \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \
+    eventAttrib.colorType = NVTX_COLOR_ARGB; \
+    eventAttrib.color = nvtx_colors[color_id]; \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name; \
+    nvtxRangePushEx(&eventAttrib); \
+}
+#define POP_NVTX nvtxRangePop();
+#else
+#define PUSH_NVTX(name,cid)
+#define POP_NVTX
+#endif
-- 
GitLab