From 7aba75b5832a44992ea2993ba92704d1cf1e8bab Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Mon, 4 Jan 2021 18:08:43 -0600 Subject: [PATCH] Fixed nccl_reduce --- src/ComputeForce.cu | 8 +------- src/GrandBrownTown.cu | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu index dec829c..5379e31 100644 --- a/src/ComputeForce.cu +++ b/src/ComputeForce.cu @@ -587,13 +587,7 @@ void ComputeForce::decompose() { gpuman.nccl_broadcast(0, pairTabPotType_d, pairTabPotType_d, numPairs, -1); gpuman.nccl_broadcast(0, pairLists_d, pairLists_d, numPairs, -1); } - - for (size_t i = 0; i < gpuman.gpus.size(); ++i) { - gpuman.use(i); - gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ - } - gpuman.use(0); - + gpuman.sync(); //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], // pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu index f302d91..9a21faf 100644 --- a/src/GrandBrownTown.cu +++ b/src/GrandBrownTown.cu @@ -600,7 +600,11 @@ void GrandBrownTown::RunNoseHooverLangevin() gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas)); printf("Configuration: %d particles | %d replicas\n", num, numReplicas); - gpuErrchk( cudaProfilerStart() ); + for (int i=0; i< gpuman.gpus.size(); ++i) { + gpuman.use(i); + gpuErrchk( cudaProfilerStart() ); + } + gpuman.use(0); //float total_energy = 0.f; // Main loop over Brownian dynamics steps @@ -704,12 +708,13 @@ void GrandBrownTown::RunNoseHooverLangevin() RBC[i]->AddLangevin(); } } + if (gpuman.gpus.size() > 1) { + const std::vector<Vector3*>& _f = internal->getForceInternal_d(); + gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1); + } + }//if step == 1 - if (gpuman.gpus.size() > 1) { - const std::vector<Vector3*>& _f = internal->getForceInternal_d(); - gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1); - } internal->clear_energy(); gpuman.sync(); @@ -863,6 +868,10 @@ void GrandBrownTown::RunNoseHooverLangevin() RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); } internal -> computeTabulated(get_energy); + if (gpuman.gpus.size() > 1) { + const std::vector<Vector3*>& _f = internal->getForceInternal_d(); + gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1); + } break; default: // [ N^2 ] interactions, no cutoff | decompositions internal->computeTabulatedFull(get_energy); @@ -906,7 +915,7 @@ void GrandBrownTown::RunNoseHooverLangevin() omp_set_num_threads(4); #endif #pragma omp parallel for - for(int i = 0; i < numReplicas; ++i) + for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, RigidBodyInterpolationType, sys, sys_d); @@ -932,11 +941,6 @@ void GrandBrownTown::RunNoseHooverLangevin() } } - if (gpuman.gpus.size() > 1) { - const std::vector<Vector3*>& _f = internal->getForceInternal_d(); - gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1); - } - if (s % outputPeriod == 0) { if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) -- GitLab