diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index dec829c8f63e5d0f3c35a3da168746e0664259ae..5379e310f29d40f45eddbaf89203f4b00b772180 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -587,13 +587,7 @@ void ComputeForce::decompose() {
 	  gpuman.nccl_broadcast(0, pairTabPotType_d, pairTabPotType_d, numPairs, -1);
 	  gpuman.nccl_broadcast(0, pairLists_d, pairLists_d, numPairs, -1);
       }
-
-      for (size_t i = 0; i < gpuman.gpus.size(); ++i) {
-	  gpuman.use(i);
-	  gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
-      }
-      gpuman.use(0);
-
+      gpuman.sync();
 
     //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
     //                                                                  pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index f302d91c473f5c93c01d74e52a95da8ea48ece3e..9a21faf3fec61d464850a53f1af8f2cec3830f39 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -600,7 +600,11 @@ void GrandBrownTown::RunNoseHooverLangevin()
     gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
-    gpuErrchk( cudaProfilerStart() );
+    for (int i=0; i< gpuman.gpus.size(); ++i) {
+	gpuman.use(i);
+	gpuErrchk( cudaProfilerStart() );
+    }
+    gpuman.use(0);
 
     //float total_energy = 0.f;
     // Main loop over Brownian dynamics steps
@@ -704,12 +708,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     RBC[i]->AddLangevin();
                 }
             }
+	    if (gpuman.gpus.size() > 1) {
+		const std::vector<Vector3*>& _f = internal->getForceInternal_d();
+		gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+	    }
+
         }//if step == 1
 
-	if (gpuman.gpus.size() > 1) {
-	    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-	    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
-	}
 	internal->clear_energy();
 	gpuman.sync();
 
@@ -863,6 +868,10 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
                         }
                         internal -> computeTabulated(get_energy);
+			if (gpuman.gpus.size() > 1) {
+			    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
+			    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
+			}
                         break;
                     default: // [ N^2 ] interactions, no cutoff | decompositions
                         internal->computeTabulatedFull(get_energy);
@@ -906,7 +915,7 @@ void GrandBrownTown::RunNoseHooverLangevin()
         omp_set_num_threads(4);
         #endif
         #pragma omp parallel for
-        for(int i = 0; i < numReplicas; ++i)
+        for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition
             RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, 
                                  RigidBodyInterpolationType, sys, sys_d);
 
@@ -932,11 +941,6 @@ void GrandBrownTown::RunNoseHooverLangevin()
             }
         }
 
-	if (gpuman.gpus.size() > 1) {
-	    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
-	    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
-	}
-
         if (s % outputPeriod == 0)
         {
             if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin"))