From 7aba75b5832a44992ea2993ba92704d1cf1e8bab Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Mon, 4 Jan 2021 18:08:43 -0600
Subject: [PATCH] Fixed nccl_reduce

---
 src/ComputeForce.cu   |  8 +-------
 src/GrandBrownTown.cu | 26 +++++++++++++++-----------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index dec829c..5379e31 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -587,13 +587,7 @@ void ComputeForce::decompose() {
 	  gpuman.nccl_broadcast(0, pairTabPotType_d, pairTabPotType_d, numPairs, -1);
 	  gpuman.nccl_broadcast(0, pairLists_d, pairLists_d, numPairs, -1);
       }
-
-      for (size_t i = 0; i < gpuman.gpus.size(); ++i) {
-	  gpuman.use(i);
-	  gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
-      }
-      gpuman.use(0);
-
+      gpuman.sync();
 
     //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
     //                                                                  pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index f302d91..9a21faf 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -600,7 +600,11 @@ void GrandBrownTown::RunNoseHooverLangevin()
     gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
-    gpuErrchk( cudaProfilerStart() );
+    for (int i=0; i< gpuman.gpus.size(); ++i) {
+	gpuman.use(i);
+	gpuErrchk( cudaProfilerStart() );
+    }
+    gpuman.use(0);
 
     //float total_energy = 0.f;
     // Main loop over Brownian dynamics steps
@@ -704,12 +708,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     RBC[i]->AddLangevin();
                 }
             }
+	    if (gpuman.gpus.size() > 1) {
+		const std::vector<Vector3*>& _f = internal->getForceInternal_d();
+		gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+	    }
+
         }//if step == 1
 
-	if (gpuman.gpus.size() > 1) {
-	    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-	    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
-	}
 	internal->clear_energy();
 	gpuman.sync();
 
@@ -863,6 +868,10 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
                         }
                         internal -> computeTabulated(get_energy);
+			if (gpuman.gpus.size() > 1) {
+			    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
+			    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+			}
                         break;
                     default: // [ N^2 ] interactions, no cutoff | decompositions
                         internal->computeTabulatedFull(get_energy);
@@ -906,7 +915,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         omp_set_num_threads(4);
         #endif
         #pragma omp parallel for
-        for(int i = 0; i < numReplicas; ++i)
+        for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition
             RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
                                  RigidBodyInterpolationType, sys, sys_d);
 
@@ -932,11 +941,6 @@ void GrandBrownTown::RunNoseHooverLangevin()
             }
         }
 
-	if (gpuman.gpus.size() > 1) {
-	    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-	    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
-	}
-
         if (s % outputPeriod == 0)
         {
             if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))
-- 
GitLab