diff --git a/RigidBodyController.cu b/RigidBodyController.cu
index e2a2221723d147c9a67e64f5e227122d44ae9d72..1054f61c3d670b4aa57024ea59a6dfcfafa75c9b 100644
--- a/RigidBodyController.cu
+++ b/RigidBodyController.cu
@@ -167,24 +167,10 @@ void RigidBodyController::initializeForcePairs() {
 }
 	
 void RigidBodyController::updateForces(int s) {
-	/*â€“â€“{ RBTODO }â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“.
-	| probably coalesce kernel calls, or move this to a device kernel caller   |
-	|                                                                          |
-	| - consider removing references (unless they are optimized out!) ---      |
-	| - caclulate numthreads && numblocks                                      |
-	|                                                                          |
-	| all threads in a block should: ----------------------------------------  |
-	|   (1) apply the same transformations to get the data point position in a |
-	|   destination grid ----------------------------------------------------- |
-	|   (2) reduce forces and torques to the same location ------------------- |
-	|   (3) ???  ------------------------------------------------------------- |
-	|                                                                          |
-	| Opportunities for memory bandwidth savings:                              |
-	`â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“*/
-	// int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1);
-	// int numBlocks = 1;
-	/* int numThreads = 256; */
+	if (s <= 1)
+		gpuErrchk( cudaProfilerStart() );
 
+	
 	// clear old forces
 	for (int i = 0; i < rigidBodyByType.size(); i++) {
 		for (int j = 0; j < rigidBodyByType[i].size(); j++) {
@@ -202,11 +188,12 @@ void RigidBodyController::updateForces(int s) {
 	fp->retrieveForcesForGrid( fp->lastRbGridID );
 	fp->lastRbGridID = -1;
 
-	// sync streams
-	for (int i = 0; i < NUMSTREAMS; i++) {
-		const cudaStream_t &s = fp->stream[i];
-		gpuErrchk(cudaStreamSynchronize( s )); 
-	}
+	// calling cudaStreamSynchronize on each stream was slower than a single cudaDeviceSynchronize, so the per-stream loop is left disabled below
+	/* for (int i = 0; i < NUMSTREAMS; i++) { */
+	/* 	const cudaStream_t &s = RigidBodyForcePair::stream[i]; */
+	/* 	gpuErrchk(cudaStreamSynchronize( s ));  */
+	/* } */
+	gpuErrchk(cudaDeviceSynchronize());
 	
 	for (int i=0; i < forcePairs.size(); i++)
 		forcePairs[i].processGPUForces();
@@ -277,7 +264,6 @@ int RigidBodyForcePair::lastRbGridID = -1; /* used to schedule kernel interactio
 RigidBodyForcePair* RigidBodyForcePair::lastRbForcePair = NULL;
 
 void RigidBodyForcePair::createStreams() {
-	gpuErrchk( cudaProfilerStart() );
 	for (int i = 0; i < NUMSTREAMS; i++)
 		gpuErrchk( cudaStreamCreate( &(stream[i]) ) );
 		// gpuErrchk( cudaStreamCreateWithFlags( &(stream[i]) , cudaStreamNonBlocking ) );
@@ -355,7 +341,8 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 				 B1, B2, c,
 				 forces_d[i], torques_d[i]);
 		}
-
+		// retrieveForcesForGrid(i); // disabled: retrieving immediately here is slower than the lastRbForcePair retrieval below; cause unknown
+		
 		if (lastRbGridID >= 0)
 			lastRbForcePair->retrieveForcesForGrid(lastRbGridID);
 		lastRbForcePair = this;