working on stream synchronization

a3658fe7 · cmaffeo2 · 7785a80d · a3658fe7 · a3658fe7
Commit a3658fe7 authored 9 years ago by cmaffeo2
--- a/RigidBodyController.cu
+++ b/RigidBodyController.cu
@@ -10,6 +10,8 @@
 #include "RigidBodyType.h"
 #include "ComputeGridGrid.cuh"

+#include <cuda_profiler_api.h>
+
 // #include <vector>
 #include "Debug.h"

@@ -157,6 +159,12 @@ void RigidBodyController::initializeForcePairs() {
 			}
 		}
 	}
+
+	// Initialize device data for RB force pairs after std::vector is done growing
+	for (int i = 0; i < forcePairs.size(); i++)
+		forcePairs[i].initialize();
+		
+	
 }
 	
 void RigidBodyController::updateForces(int s) {
@@ -194,8 +202,9 @@ void RigidBodyController::updateForces(int s) {
 		forcePairs[i].retrieveForces();

 	// RBTODO: see if there is a better way to sync
-	gpuErrchk(cudaDeviceSynchronize());
-
+	// gpuErrchk(cudaDeviceSynchronize());
+	
+	
 	/*/ debug
 	if (s %10 == 0) {
 		int tmp = 0;
@@ -271,9 +280,11 @@ void RigidBodyController::integrate(int step) {

 // allocate and initialize an array of stream handles
 cudaStream_t *RigidBodyForcePair::stream = (cudaStream_t *) malloc(NUMSTREAMS * sizeof(cudaStream_t));
+bool *RigidBodyForcePair::isStreamLaunched = (bool *) calloc(NUMSTREAMS, sizeof(bool)); // per-stream "kernel launched" flags; calloc zero-fills so every stream starts as false (malloc left them indeterminate before the first read in callGridForceKernel)
 // new cudaStream_t[NUMSTREAMS];
 int RigidBodyForcePair::nextStreamID = 0;
 void RigidBodyForcePair::createStreams() {
+	gpuErrchk( cudaProfilerStart() );
 	for (int i = 0; i < NUMSTREAMS; i++)
 		gpuErrchk( cudaStreamCreate( &(stream[i]) ) );
 		// gpuErrchk( cudaStreamCreateWithFlags( &(stream[i]) , cudaStreamNonBlocking ) );
@@ -315,7 +326,12 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 		const int nb = numBlocks[i];
 		const int k1 = gridKeyId1[i];
 		const int k2 = gridKeyId2[i];
-		const cudaStream_t &s = stream[streamID[i]];
+		const int sid = streamID[i];
+		const cudaStream_t &s = stream[sid];
+
+		if (isStreamLaunched[sid])
+			retrieveForcesForGrid(i);
+			
 		/*
 			ijk: index of grid value
 			r: position of point ijk in real space
@@ -337,6 +353,7 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 		
 		Matrix3 B2 = getBasis2(i).inverse();

+		
 		// RBTODO: get energy
 		if (!isPmf) {								/* pair of RBs */
 			computeGridGridForce<<< nb, numThreads, NUMTHREADS*2*sizeof(Vector3), s >>>
@@ -349,19 +366,39 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) {
 				 B1, B2, c,
 				 forces_d[i], torques_d[i]);
 		}
+		isStreamLaunched[sid] = true;
+
+		/* gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb, */
+		/* 													cudaMemcpyDeviceToHost, s)); */
+		/* gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb, */
+		/* 													cudaMemcpyDeviceToHost, s)); */
+
 	}
 }
-
+void RigidBodyForcePair::retrieveForcesForGrid(const int i) {
+	// Enqueue device-to-host copies of the force and torque buffers of grid i on that grid's stream. i: grid ID (less than numGrids)
+	const int sid = streamID[i];	// index into the shared static stream[] array
+	const cudaStream_t &s = stream[sid];
+	const int nb = numBlocks[i];	// nb Vector3 entries are copied from each buffer
+
+	gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb,
+														cudaMemcpyDeviceToHost, s));
+	gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb,
+														cudaMemcpyDeviceToHost, s));
+  //gpuErrchk(cudaStreamSynchronize( s ));  // disabled: this function does not wait for the copies; NOTE(review): confirm the caller syncs before reading forces[i]/torques[i]
+	
+}
 void RigidBodyForcePair::retrieveForces() {
 	// sum forces + torques
 	const int numGrids = gridKeyId1.size();
 	Vector3 f = Vector3(0.0f);
 	Vector3 t = Vector3(0.0f);

-	// RBTODO better way to sync?
 	for (int i = 0; i < numGrids; i++) {
-		const cudaStream_t &s = stream[streamID[i]];
+		const int sid = streamID[i];
+		const cudaStream_t &s = stream[sid];
 		const int nb = numBlocks[i];
+		isStreamLaunched[sid] = false;

 		gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb,
 															cudaMemcpyDeviceToHost, s));
@@ -642,7 +679,7 @@ void RigidBodyController::printData(int step,std::ofstream &file) {
 }

 int RigidBodyForcePair::initialize() {
-	printf("    Initializing (memory for) RB force pair...\n");
+	printf("    Initializing (streams for) RB force pair...\n");

 	const int numGrids = gridKeyId1.size();
 	// RBTODO assert gridKeysIds are same size 
@@ -700,17 +737,19 @@ RigidBodyForcePair::~RigidBodyForcePair() {
 	// RBTODO assert gridKeysIds are same size 

 	// free memory for forces/torques
-	for (int i = 0; i < numGrids; i++) {
-		const int k1 = gridKeyId1[i];
-		const int nb = numBlocks[i];
-
-		// free device memory for numBlocks of torque, etc.
-		// printf("      Freeing device memory for forces/torques\n");
-		gpuErrchk(cudaFree( forces_d[i] ));	
-		gpuErrchk(cudaFree( torques_d[i] ));
+	if (streamID.size() > 0) {
+		for (int i = 0; i < numGrids; i++) {
+			const int k1 = gridKeyId1[i];
+			const int nb = numBlocks[i];
+
+			// free device memory for numBlocks of torque, etc.
+			// printf("      Freeing device memory for forces/torques\n");
+			gpuErrchk(cudaFree( forces_d[i] ));	
+			gpuErrchk(cudaFree( torques_d[i] ));
+		}
+		gpuErrchk(cudaDeviceSynchronize());
 	}
-	gpuErrchk(cudaDeviceSynchronize());
-	
+	streamID.clear();
 	numBlocks.clear();
 	forces.clear();
 	forces_d.clear();

--- a/RigidBodyController.h
+++ b/RigidBodyController.h
@@ -23,14 +23,14 @@ public:
 		gridKeyId1(gridKeyId1), gridKeyId2(gridKeyId2), isPmf(isPmf)
 		{
 			printf("    Constructing RB force pair...\n");
-			initialize();
+			/* initialize(); */
 			// printf("    done constructing RB force pair\n");
 		}
 	RigidBodyForcePair(const RigidBodyForcePair& o) :
 		type1(o.type1), type2(o.type2), rb1(o.rb1), rb2(o.rb2),
 		gridKeyId1(o.gridKeyId1), gridKeyId2(o.gridKeyId2), isPmf(o.isPmf) {
 		printf("    Copying RB force pair...\n");
-		initialize();
+		/* initialize(); */
 	}
 	RigidBodyForcePair& operator=(RigidBodyForcePair& o) {
 		printf("    Copying assigning RB force pair...\n");
@@ -65,8 +65,11 @@ private:
 	static int nextStreamID; 
 	std::vector<int> streamID;
 	static cudaStream_t* stream;
+	static bool* isStreamLaunched;
 	static void createStreams();
+
 	void callGridForceKernel(int pairId, int s);
+	void retrieveForcesForGrid(const int i);
 	void retrieveForces();
 	Matrix3 getBasis1(const int i);
 	Matrix3 getBasis2(const int i);