diff --git a/src/GPUManager.h b/src/GPUManager.h
index 1d3335242d431e4b751e6264ee87599f4df809b5..e29569bb1c8bef382fefb0b377511e525a448447 100644
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -142,6 +142,35 @@ public:
         }
         NCCLCHECK(ncclGroupEnd());
     }
+    // Broadcast `size` elements of type T from NCCL rank `root` to every
+    // managed GPU, issuing the call for GPU i on the caller-supplied
+    // streams[i].
+    //
+    // Does nothing when exactly one GPU is managed.
+    //
+    // Parameters:
+    //   root    - rank passed to ncclBroadcast as the source of the data
+    //   send_d  - per-GPU send buffers; send_d[i] is used with comms[i]
+    //   recv_d  - per-GPU receive buffers; recv_d[i] is used with comms[i]
+    //   size    - number of T elements to broadcast
+    //   streams - array indexed by GPU; must hold at least gpus.size() entries
+    template<typename T>
+    void nccl_broadcast(int root, const std::vector<T*>& send_d, const std::vector<T*>& recv_d, unsigned int size, cudaStream_t* streams) {
+        // The payload is shipped as ncclFloat, so the float count computed
+        // below is only exact when T is a whole number of floats.
+        static_assert(sizeof(T) % sizeof(float) == 0,
+                      "nccl_broadcast requires sizeof(T) to be a multiple of sizeof(float)");
+        if (gpus.size() == 1) return;
+        const size_t float_count = size * sizeof(T) / sizeof(float);
+        NCCLCHECK(ncclGroupStart());
+        for (size_t i = 0; i < gpus.size(); ++i) {
+            NCCLCHECK( ncclBroadcast((const void*) send_d[i], (void*) recv_d[i],
+                                     float_count, ncclFloat, root,
+                                     comms[i], streams[i]) );
+        }
+        NCCLCHECK(ncclGroupEnd());
+    }
 
     template<typename T>
     void nccl_reduce(int root, const std::vector<T*> send_d, const std::vector<T*> recv_d, const unsigned int size, const int stream_id) {
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index ff823d00d3a821190bb711ff777b971dea59aeda..f302d91c473f5c93c01d74e52a95da8ea48ece3e 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -7,6 +7,7 @@
 #include <time.h> /* time */
 #include <thrust/device_ptr.h>
 #include <fstream>
+#include <cuda_profiler_api.h>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -527,6 +528,9 @@ void GrandBrownTown::RunNoseHooverLangevin()
     timer0 = wkf_timer_create();
     timerS = wkf_timer_create();
 
+    cudaStream_t* nccl_broadcast_streams = new cudaStream_t[gpuman.gpus.size()];
+    for (int i=0; i< gpuman.gpus.size(); ++i) nccl_broadcast_streams[i] = 0;
+
     copyToCUDA();
 
     if(particle_dynamic == String("Langevin"))
@@ -596,6 +600,8 @@ void GrandBrownTown::RunNoseHooverLangevin()
     gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
 
+    gpuErrchk( cudaProfilerStart() );
+
     //float total_energy = 0.f;
     // Main loop over Brownian dynamics steps
     for (long int s = 1; s < steps; s++)
@@ -608,8 +614,9 @@ void GrandBrownTown::RunNoseHooverLangevin()
internal->clear_force(); internal->clear_energy(); const std::vector<Vector3*>& _pos = internal->getPos_d(); - if (gpuman.gpus.size() > 1) + if (gpuman.gpus.size() > 1) { gpuman.nccl_broadcast(0, _pos, _pos, num*numReplicas, -1); + } gpuman.sync(); #ifdef _OPENMP @@ -832,7 +839,8 @@ void GrandBrownTown::RunNoseHooverLangevin() internal->clear_force(); if (gpuman.gpus.size() > 1) { const std::vector<Vector3*>& _p = internal->getPos_d(); - gpuman.nccl_broadcast(0, _p, _p, num*numReplicas, -1); + nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream(); + gpuman.nccl_broadcast(0, _p, _p, num*numReplicas, nccl_broadcast_streams); } }