diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu
index aa062e642546343b647e0461c6b2d7df1e43c76b..899bc780f600917d69c3e765ee071ebd65ff7ddb 100644
--- a/GrandBrownTown.cu
+++ b/GrandBrownTown.cu
@@ -1,5 +1,7 @@
 #include "GrandBrownTown.h"
 #include "GrandBrownTown.cuh"
+#include "nvtxEvents.h"
+
 /* #include "ComputeGridGrid.cuh" */
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
@@ -313,7 +315,14 @@ void GrandBrownTown::run() {
 
 	printf("Configuration: %d particles | %d replicas\n", num, numReplicas);
 
-	// Main loop over Brownian dynamics steps
+	
+#pragma omp parallel sections
+	{
+		
+/* # pragma omp single nowait */
+		{
+			
+// Main loop over Brownian dynamics steps
 	for (long int s = 1; s < steps; s++) {
 		// Compute the internal forces. Only calculate the energy when we are about to output.
 		bool get_energy = ((s % outputEnergyPeriod) == 0);
@@ -322,6 +331,10 @@ void GrandBrownTown::run() {
 		// Set the timer
 		rt_timer_start(cputimer);
 
+
+		// compute forces
+		nvtxRangeId_t rid0 = nvtxStart("Compute 'atom' forces", 1); 
+		
 		// 'interparticleForce' - determines whether particles interact with each other
 		if (interparticleForce) {
 
@@ -345,8 +358,7 @@ void GrandBrownTown::run() {
 																								get_energy);
 						break;
 					default: // [ N^2 ] interactions, no cutoff | decompositions
-						energy =
-								internal->computeTabulatedFull(forceInternal_d, pos_d, type_d,
+						energy = internal->computeTabulatedFull(forceInternal_d, pos_d, type_d,
 																							 bonds_d, bondMap_d,
 																							 excludes_d, excludeMap_d,
 																							 angles_d, dihedrals_d,
@@ -401,6 +413,8 @@ void GrandBrownTown::run() {
 		int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
 		int tl = temperatureGrid.length();
 
+		nvtxRangeEnd(rid0);
+		rid0 = nvtxStart("RigidBody and update", 1); 
 		RBC.updateForces(s);					/* update RB forces before update particle positions... */
 
 		// Call the kernel to update the positions of each particle
@@ -421,7 +435,11 @@ void GrandBrownTown::run() {
 		/* computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d); */
 		
 		// int numBlocks = (numRB ) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-		
+
+
+		nvtxRangeEnd(rid0);
+		rid0 = nvtxStart("I/O", 1); 
+
 		Vector3 force0(0.0f);
 
 		if (imd_on && clientsock && s % outputPeriod == 0) {
@@ -460,37 +478,48 @@ void GrandBrownTown::run() {
 			}
 		}
 
+		bool posFetched = false;
 		// Output trajectories (to files)
 		if (s % outputPeriod == 0) {
+
+			// wait for previous writes before copying new data 
+#pragma omp taskwait
 			// Copy particle positions back to CPU
-			gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas,
-					cudaMemcpyDeviceToHost));
+			posFetched=true;
+#pragma omp task untied
+			{
+				nvtxRangeId_t rid = nvtxStart("Write output", 1); 
 
-			// Nanoseconds computed
-			t = s*timestep;
+				gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas,
+												cudaMemcpyDeviceToHost));
+				// Nanoseconds computed
+				t = s*timestep;
 
-			// Loop over all replicas
-			for (int repID = 0; repID < numReplicas; ++repID) {
+				// Loop over all replicas
+				for (int repID = 0; repID < numReplicas; ++repID) {
 
-				// *Analysis*: ionic current
-				if (currentSegmentZ <= 0.0f) writeCurrent(repID, t);
-				else writeCurrentSegment(repID, t, currentSegmentZ);
+					// *Analysis*: ionic current
+					if (currentSegmentZ <= 0.0f) writeCurrent(repID, t);
+					else writeCurrentSegment(repID, t, currentSegmentZ);
 
-				// 
-				if (numberFluct == 1) updateNameList(); // no need for it here if particles stay the same
-														// TODO: doublecheck
+					// 
+					if (numberFluct == 1) updateNameList(); // no need for it here if particles stay the same
+					// TODO: doublecheck
 
-				// Write the trajectory.
-				writers[repID]->append(pos + (repID*num), name, serial, t, num);
-			}
+					// Write the trajectory.
+					writers[repID]->append(pos + (repID*num), name, serial, t, num);
+				}
 
-			// Handle particle fluctuations.
-			// TODO: Currently, not compatible with replicas. Needs a fix.
-			if (numberFluct == 1) updateReservoirs();
+				// Handle particle fluctuations.
+				// TODO: Currently, not compatible with replicas. Needs a fix.
+				if (numberFluct == 1) updateReservoirs();
 
-			// Store the current positions.
-			// We must do this after particles have been added.
-			remember(t);
+				// Store the current positions.
+				// We must do this after particles have been added.
+				remember(t);
+				nvtxRangeEnd(rid);
+			}
+			
 		} // s % outputPeriod == 0
 
 
@@ -500,7 +529,7 @@ void GrandBrownTown::run() {
 			rt_timer_stop(timerS);
 
 			// Copy back forces to display (internal only)
-			gpuErrchk(cudaMemcpy(&force0, forceInternal_d, sizeof(Vector3), cudaMemcpyDeviceToHost));
+			// gpuErrchk(cudaMemcpy(&force0, forceInternal_d, sizeof(Vector3), cudaMemcpyDeviceToHost));	
 			
 			// Nanoseconds computed
 			t = s * timestep;
@@ -520,25 +549,31 @@ void GrandBrownTown::run() {
 						 t, energy, force0.x, force0.y, force0.z);
 		*/
 
-			// Copy positions from GPU to CPU.
-			gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas,
-													 cudaMemcpyDeviceToHost));
 
-			// Write restart files for each replica.
-			for (int repID = 0; repID < numReplicas; ++repID)
-				writeRestart(repID);
+				// Copy positions from GPU to CPU.
+				#pragma omp taskwait
+			if (!posFetched) {
+				gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas,
+												cudaMemcpyDeviceToHost));
+			}
 
+      #pragma omp task untied
+			{ // Write restart files for each replica.
+				nvtxRangeId_t rid = nvtxStart("Write Restart", 1); 
+				for (int repID = 0; repID < numReplicas; ++repID)
+					writeRestart(repID);
+				nvtxRangeEnd(rid);
+			}
+			
 			// restart the timer
 			rt_timer_start(timerS);
-		} // s % outputEnergyPeriod
-
-		{
-			int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-			clearInternalForces<<< numBlocks, NUM_THREADS >>>(forceInternal_d, num, numReplicas);
 		}
-		
+
+		nvtxRangeEnd(rid0);
+
 	} // done with all Brownian dynamics steps
 
+		}
 	// If IMD is on & our socket is still open.
 	if (imd_on and clientsock) {
 		if (vmdsock_selread(clientsock, 0) == 1) {
@@ -562,6 +597,7 @@ void GrandBrownTown::run() {
 			}
 		}
 	}
+		}
 
 	// Stop the main timer.
 	rt_timer_stop(timer0);
@@ -612,10 +648,9 @@ void GrandBrownTown::populate() {
 void GrandBrownTown::writeRestart(int repID) const {
 	FILE* out = fopen(restartFiles[repID].c_str(), "w");
 	const int offset = repID * num;
-	for (int i = 0; i < num; ++i) {
-		const int ind = i + offset;
-		const Vector3& p = pos[ind];
-		fprintf(out, "%d %.10g %.10g %.10g\n", type[ind], p.x, p.y, p.z);
+	for (int i = offset; i < offset+num; ++i) {
+		const Vector3& p = pos[i];
+		fprintf(out, "%d %.10g %.10g %.10g\n", type[i], p.x, p.y, p.z);
 	}
 	fclose(out);
 }
diff --git a/makefile b/makefile
index 6ec127e3d6645931c09d1d86b2464603ea4e8870..5c7ecc0dd31b86145a66f325737fa7f28456ade1 100644
--- a/makefile
+++ b/makefile
@@ -13,9 +13,11 @@ INCLUDE = $(CUDA_PATH)/include
 
 
 # DEBUG = -g -O0
-CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic
+CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic -fopenmp	
 # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no
-NV_FLAGS = -Xptxas -v # -v,-abi=no 
+NV_FLAGS = -Xptxas -v # -v,-abi=no
+NV_FLAGS += -Xcompiler -fopenmp
+
 ifneq ($(DEBUG),)
 	NV_FLAGS += -g -G								#debug
 	EX_FLAGS = -O0 -m$(OS_SIZE)
@@ -56,7 +58,7 @@ NV_FLAGS += -gencode arch=compute_$(SM),code=compute_$(SM)
 NVLD_FLAGS := $(NV_FLAGS) --device-link 
 # NV_FLAGS += -rdc=true
 
-LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY)
+LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -lnvToolsExt -Wl,-rpath,$(LIBRARY)
 LD_FLAGS += -lcuda 
 
 
diff --git a/nvtxEvents.h b/nvtxEvents.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c9ffe5640c07968628745e696aa7e4615aed541
--- /dev/null
+++ b/nvtxEvents.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <nvToolsExt.h>
+
+// http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx
+nvtxRangeId_t nvtxStart(const char* name, int colorId) {
+	return nvtxRangeStartA(name);
+}
+
+