From 37bb1694999c555a9b9a28a14ca010c76a0eb466 Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Mon, 7 Nov 2022 14:31:27 -0600 Subject: [PATCH] Add NVTX --- src/CMakeLists.txt | 3 +++ src/GrandBrownTown.cu | 47 +++++++++++++++++++++++++++++++++++++++---- src/nvtx_defs.h | 24 ++++++++++++++++++++++ 3 files changed, 70 insertions(+), 4 deletions(-) create mode 100644 src/nvtx_defs.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9f8440f..f8f9af0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -131,5 +131,8 @@ endif() if(USE_NCCL) target_link_libraries(arbd PRIVATE nccl) endif() +if(USE_NVTX) + target_link_libraries(arbd PRIVATE nvToolsExt) +endif() install(TARGETS arbd) diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu index 3f0fc8c..d610c5d 100644 --- a/src/GrandBrownTown.cu +++ b/src/GrandBrownTown.cu @@ -3,6 +3,7 @@ /* #include "ComputeGridGrid.cuh" */ #include "WKFUtils.h" #include "BrownParticlesKernel.h" +#include "nvtx_defs.h" #include <stdlib.h> /* srand, rand */ #include <time.h> /* time */ #include <thrust/device_ptr.h> @@ -599,6 +600,7 @@ void GrandBrownTown::run() // Main loop over Brownian dynamics steps for (long int s = 1; s < steps; s++) { + PUSH_NVTX("Main loop timestep",0) bool get_energy = ((s % outputEnergyPeriod) == 0); //At the very first time step, the force is computed if(s == 1) @@ -744,10 +746,12 @@ void GrandBrownTown::run() }//if step == 1 + PUSH_NVTX("Clear particle force data",1) internal->clear_energy(); gpuman.sync(); + POP_NVTX - + PUSH_NVTX("Integrate particles",2) if(particle_dynamic == String("Langevin")) updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType); else if(particle_dynamic == String("NoseHooverLangevin")) @@ -761,6 +765,9 @@ void GrandBrownTown::run() part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, ParticleInterpolationType); + POP_NVTX + + PUSH_NVTX("Integrate rigid bodies",2) if(rigidbody_dynamic == String("Langevin")) { #ifdef _OPENMP @@ -786,7 +793,9 @@ void GrandBrownTown::run() RBC[i]->print(s); } } + POP_NVTX + PUSH_NVTX("Update rigid body attached particle positions",3) if (num_rb_attached_particles > 0) { #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) { @@ -797,11 +806,14 @@ void GrandBrownTown::run() sys_d, num, num_rb_attached_particles, numReplicas); } } - + POP_NVTX + if (s % outputPeriod == 0) { + PUSH_NVTX("Copy particle positions to host for output",7) // Copy particle positions back to CPU gpuErrchk(cudaDeviceSynchronize()); gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * (num+num_rb_attached_particles) * numReplicas, cudaMemcpyDeviceToHost)); + POP_NVTX } if (imd_on && clientsock && s % outputPeriod == 0) { @@ -876,20 +888,26 @@ void GrandBrownTown::run() #ifdef _OPENMP omp_set_num_threads(4); #endif + + PUSH_NVTX("Clear rigid body forces",2) #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) RBC[i]->clearForceAndTorque(); + POP_NVTX if (numGroupSites > 0) { + PUSH_NVTX("Update collective coordinates",2) gpuman.sync(); updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas); gpuman.sync(); + POP_NVTX } if (imd_on && clientsock) internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD // TODO add multigpu support with IMD else { + PUSH_NVTX("Clear particle forces",2) internal->clear_force(); #ifdef USE_NCCL if (gpuman.gpus.size() > 1) { @@ -898,6 +916,7 @@ void GrandBrownTown::run() gpuman.nccl_broadcast(0, _p, _p, (num+num_rb_attached_particles+numGroupSites)*numReplicas, nccl_broadcast_streams); } #endif + POP_NVTX } if (interparticleForce) @@ -910,19 +929,27 @@ void GrandBrownTown::run() case 0: // [ N*log(N) ] interactions, + cutoff | decomposition if (s % decompPeriod == 0) { + PUSH_NVTX("Decompose particles",5) internal -> decompose(); + POP_NVTX #ifdef _OPENMP omp_set_num_threads(4); #endif + PUSH_NVTX("Update rigid body particle lists",6) #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d); + POP_NVTX } + PUSH_NVTX("Calculate particle-particle forces",7) internal -> computeTabulated(get_energy); + POP_NVTX #ifdef USE_NCCL if (gpuman.gpus.size() > 1) { + PUSH_NVTX("Reduce particle forces",6) const std::vector<Vector3*>& _f = internal->getForceInternal_d(); gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles)*numReplicas, -1); + POP_NVTX } #endif break; @@ -932,7 +959,7 @@ void GrandBrownTown::run() } } else - { + { // Not using tabulated potentials. switch (fullLongRange) { @@ -964,6 +991,7 @@ void GrandBrownTown::run() } } + PUSH_NVTX("Compute RB attached particle forces",4) if (get_energy) { compute_position_dependent_force_for_rb_attached_particles <<< numBlocks, NUM_THREADS >>> ( @@ -976,18 +1004,22 @@ void GrandBrownTown::run() internal -> getForceInternal_d()[0], internal -> getEnergy(), internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType); } + POP_NVTX //compute the force for rigid bodies #ifdef _OPENMP omp_set_num_threads(4); #endif + PUSH_NVTX("Compute RB-RB forces forces",5) #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition RBC[i]->updateForces((internal->getPos_d()[0])+i*(num+num_rb_attached_particles), (internal->getForceInternal_d()[0])+i*(num+num_rb_attached_particles), s, (internal->getEnergy())+i*(num+num_rb_attached_particles), get_energy, RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles); + POP_NVTX if (numGroupSites > 0) { + PUSH_NVTX("Spread collective coordinate forces to constituent particles",4) gpuman.sync(); // if ((s%100) == 0) { distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas); @@ -995,14 +1027,18 @@ void GrandBrownTown::run() // distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas); // } gpuman.sync(); + POP_NVTX } + PUSH_NVTX("Update particle coordinates",2) if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, ParticleInterpolationType); //gpuErrchk(cudaDeviceSynchronize()); - + POP_NVTX + + PUSH_NVTX("Update RB coordinates",3) if(rigidbody_dynamic == String("Langevin")) { #ifdef _OPENMP @@ -1018,9 +1054,11 @@ void GrandBrownTown::run() RBC[i]->print(s); } } + POP_NVTX if (s % outputPeriod == 0) { + PUSH_NVTX("Copy and write particle and RB coordinates for output",3) if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) { gpuErrchk(cudaMemcpy(momentum, internal -> getMom_d(), sizeof(Vector3) * (num) * numReplicas, cudaMemcpyDeviceToHost)); @@ -1131,6 +1169,7 @@ void GrandBrownTown::run() writeRestart(repID); wkf_timer_start(timerS); + POP_NVTX } // s % outputEnergyPeriod } // done with all Brownian dynamics steps diff --git a/src/nvtx_defs.h b/src/nvtx_defs.h new file mode 100644 index 0000000..01099f5 --- /dev/null +++ b/src/nvtx_defs.h @@ -0,0 +1,24 @@ +/* Adapted from: https://developer.nvidia.com/blog/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ */ +#ifdef USE_NVTX +#include <nvToolsExt.h> + +const uint32_t nvtx_colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff }; +const int num_nvtx_colors = sizeof(nvtx_colors)/sizeof(uint32_t); + +#define PUSH_NVTX(name,cid) { \ + int color_id = cid; \ + color_id = color_id%num_nvtx_colors;\ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = nvtx_colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ +} +#define POP_NVTX nvtxRangePop(); +#else +#define PUSH_NVTX(name,cid) +#define POP_NVTX +#endif -- GitLab