diff --git a/src/GPUManager.h b/src/GPUManager.h
index 1d3335242d431e4b751e6264ee87599f4df809b5..e29569bb1c8bef382fefb0b377511e525a448447 100644
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -142,6 +142,17 @@ public:
 	}
 	NCCLCHECK(ncclGroupEnd());
     }
+    // Broadcast `size` elements of T from rank `root` to all GPUs as raw bytes, passing streams[i] as the stream for GPU i's call.
+    // send_d, recv_d and streams are indexed 0..gpus.size()-1 and are not bounds-checked, so each must hold at least gpus.size() entries.
+    template<typename T>
+	void nccl_broadcast(int root, const std::vector<T*>& send_d, const std::vector<T*>& recv_d, unsigned int size, cudaStream_t* streams) {
+	if (gpus.size() == 1) return; // single GPU: nothing to exchange
+	NCCLCHECK(ncclGroupStart()); // open an NCCL group around the per-GPU calls below
+	for (size_t i = 0; i < gpus.size(); ++i) {
+	    NCCLCHECK( ncclBroadcast((const void*) send_d[i], (void*) recv_d[i], static_cast<size_t>(size) * sizeof(T), ncclChar, root, comms[i], streams[i]) ); // byte count: nothing is truncated when sizeof(T) is not a multiple of sizeof(float)
+	}
+	NCCLCHECK(ncclGroupEnd());
+    }
 
     template<typename T>
     void nccl_reduce(int root, const std::vector<T*> send_d, const std::vector<T*> recv_d, const unsigned int size, const int stream_id) {
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index ff823d00d3a821190bb711ff777b971dea59aeda..f302d91c473f5c93c01d74e52a95da8ea48ece3e 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -7,6 +7,7 @@
 #include <time.h>       /* time */
 #include <thrust/device_ptr.h>
 #include <fstream>
+#include <cuda_profiler_api.h>
 
 #ifdef _OPENMP
 #include <omp.h>
@@ -527,6 +528,9 @@ void GrandBrownTown::RunNoseHooverLangevin()
     timer0 = wkf_timer_create();
     timerS = wkf_timer_create();
 
+    cudaStream_t* nccl_broadcast_streams = new cudaStream_t[gpuman.gpus.size()];
+    for (int i=0; i< gpuman.gpus.size(); ++i) nccl_broadcast_streams[i] = 0;
+
     copyToCUDA();
 
     if(particle_dynamic == String("Langevin"))
@@ -596,6 +600,8 @@ void GrandBrownTown::RunNoseHooverLangevin()
     gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas));
 
     printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
+    gpuErrchk( cudaProfilerStart() );
+
     //float total_energy = 0.f;
     // Main loop over Brownian dynamics steps
     for (long int s = 1; s < steps; s++)
@@ -608,8 +614,9 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    internal->clear_force();
 	    internal->clear_energy();
 	    const std::vector<Vector3*>& _pos = internal->getPos_d();
-	    if (gpuman.gpus.size() > 1)
+	    if (gpuman.gpus.size() > 1) {
 		gpuman.nccl_broadcast(0, _pos, _pos, num*numReplicas, -1);
+	    }
 	    gpuman.sync();
 
             #ifdef _OPENMP
@@ -832,7 +839,8 @@ void GrandBrownTown::RunNoseHooverLangevin()
             internal->clear_force();
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _p = internal->getPos_d();
-		gpuman.nccl_broadcast(0, _p, _p, num*numReplicas, -1);
+		nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream();
+		gpuman.nccl_broadcast(0, _p, _p, num*numReplicas, nccl_broadcast_streams);
 	    }
     	}