diff --git a/RigidBodyController.cu b/RigidBodyController.cu index c5bb54357ebd954e1f5b0806fc205762a2017938..e2a2221723d147c9a67e64f5e227122d44ae9d72 100644 --- a/RigidBodyController.cu +++ b/RigidBodyController.cu @@ -197,33 +197,20 @@ void RigidBodyController::updateForces(int s) { for (int i=0; i < forcePairs.size(); i++) forcePairs[i].callGridForceKernel(i,s); - /* for (int i=0; i < forcePairs.size(); i++) */ - /* forcePairs[i].retrieveForces(); */ - RigidBodyForcePair::lastRigidBodyForcePair->retrieveForcesForGrid( - RigidBodyForcePair::lastRigidBodyGridID); - RigidBodyForcePair::lastRigidBodyGridID = -1; + // each kernel call is followed by async memcpy for previous; now get last + RigidBodyForcePair* fp = RigidBodyForcePair::lastRbForcePair; + fp->retrieveForcesForGrid( fp->lastRbGridID ); + fp->lastRbGridID = -1; + + // sync streams + for (int i = 0; i < NUMSTREAMS; i++) { + const cudaStream_t &s = fp->stream[i]; + gpuErrchk(cudaStreamSynchronize( s )); + } for (int i=0; i < forcePairs.size(); i++) - forcePairs[i].processForces(); + forcePairs[i].processGPUForces(); - // RBTODO: see if there is a better way to sync - // gpuErrchk(cudaDeviceSynchronize()); - - - /*/ debug - if (s %10 == 0) { - int tmp = 0; - for (int i = 0; i < rigidBodyByType.size(); i++) { - for (int j = 0; j < rigidBodyByType[i].size(); j++) { - RigidBody& rb = rigidBodyByType[i][j]; - tmp++; - Vector3 p = rb.getPosition(); - Vector3 t = rb.torque; - printf("RBTORQUE: %d %f %f %f %f %f %f\n", tmp, p.x, p.y, p.z, t.x,t.y,t.z); - } - } - } - */ } void RigidBodyController::integrate(int step) { // tell RBs to integrate @@ -285,11 +272,10 @@ void RigidBodyController::integrate(int step) { // allocate and initialize an array of stream handles cudaStream_t *RigidBodyForcePair::stream = (cudaStream_t *) malloc(NUMSTREAMS * sizeof(cudaStream_t)); -bool *RigidBodyForcePair::isStreamLaunched = (bool *) malloc(NUMSTREAMS * sizeof(bool)); -// new cudaStream_t[NUMSTREAMS]; int RigidBodyForcePair::nextStreamID = 0; /* used during stream init */ -int RigidBodyForcePair::lastRigidBodyGridID = -1; /* used to schedule kernel interaction */ -RigidBodyForcePair* RigidBodyForcePair::lastRigidBodyForcePair = NULL; +int RigidBodyForcePair::lastRbGridID = -1; /* used to schedule kernel interaction */ +RigidBodyForcePair* RigidBodyForcePair::lastRbForcePair = NULL; + void RigidBodyForcePair::createStreams() { gpuErrchk( cudaProfilerStart() ); for (int i = 0; i < NUMSTREAMS; i++) @@ -333,12 +319,8 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) { const int nb = numBlocks[i]; const int k1 = gridKeyId1[i]; const int k2 = gridKeyId2[i]; - const int sid = streamID[i]; - const cudaStream_t &s = stream[sid]; + const cudaStream_t &s = stream[streamID[i]]; - /* if (isStreamLaunched[sid]) */ - /* retrieveForcesForGrid(i); */ - /* ijk: index of grid value r: postion of point ijk in real space @@ -374,66 +356,31 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) { forces_d[i], torques_d[i]); } - if (lastRigidBodyGridID >= 0) - lastRigidBodyForcePair->retrieveForcesForGrid(lastRigidBodyGridID); - lastRigidBodyForcePair = this; - lastRigidBodyGridID = i; - - /* isStreamLaunched[sid] = true; */ - - /* gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb, */ - /* cudaMemcpyDeviceToHost, s)); */ - /* gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb, */ - /* cudaMemcpyDeviceToHost, s)); */ - + if (lastRbGridID >= 0) + lastRbForcePair->retrieveForcesForGrid(lastRbGridID); + lastRbForcePair = this; + lastRbGridID = i; } } void RigidBodyForcePair::retrieveForcesForGrid(const int i) { // i: grid ID (less than numGrids) - const int sid = streamID[i]; - const cudaStream_t &s = stream[sid]; + const cudaStream_t &s = stream[streamID[i]]; const int nb = numBlocks[i]; gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb, cudaMemcpyDeviceToHost, s)); gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb, cudaMemcpyDeviceToHost, s)); - //gpuErrchk(cudaStreamSynchronize( s )); } -void RigidBodyForcePair::retrieveForces() { - // sum forces + torques - const int numGrids = gridKeyId1.size(); - Vector3 f = Vector3(0.0f); - Vector3 t = Vector3(0.0f); - - for (int i = 0; i < numGrids; i++) - retrieveForcesForGrid(i); -/* { */ - /* const int sid = streamID[i]; */ - /* const cudaStream_t &s = stream[sid]; */ - /* const int nb = numBlocks[i]; */ - /* isStreamLaunched[sid] = false; */ - - /* gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb, */ - /* cudaMemcpyDeviceToHost, s)); */ - /* gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb, */ - /* cudaMemcpyDeviceToHost, s)); */ - /* } */ -} -void RigidBodyForcePair::processForces() { +void RigidBodyForcePair::processGPUForces() { const int numGrids = gridKeyId1.size(); Vector3 f = Vector3(0.0f); Vector3 t = Vector3(0.0f); for (int i = 0; i < numGrids; i++) { - const int sid = streamID[i]; - const cudaStream_t &s = stream[sid]; const int nb = numBlocks[i]; - isStreamLaunched[sid] = false; - - gpuErrchk(cudaStreamSynchronize( s )); Vector3 tmpF = Vector3(0.0f); Vector3 tmpT = Vector3(0.0f); @@ -453,13 +400,6 @@ void RigidBodyForcePair::processForces() { t = t + tmpT; } - // transform torque from lab-frame origin to rb centers - // add forces to rbs - /* Vector3 tmp; */ - /* /\* tmp = rb1->position; *\/ */ - /* /\* printf("rb1->position: (%f,%f,%f)\n", tmp.x, tmp.y, tmp.z); *\/ */ - /* tmp = rb1->getPosition(); */ - /* printf("rb1->getPosition(): (%f,%f,%f)\n", tmp.x, tmp.y, tmp.z); */ rb1->addForce( f ); rb1->addTorque( t ); diff --git a/RigidBodyController.h b/RigidBodyController.h index 902b5a313dd5af7e1b902cc47f61e1f23cecf0bb..ec31395e1021444b1fa923075adce64000cd8720 100644 --- a/RigidBodyController.h +++ b/RigidBodyController.h @@ -65,17 +65,15 @@ private: static int nextStreamID; std::vector<int> streamID; static cudaStream_t* stream; - static bool* isStreamLaunched; static void createStreams(); static int lastStreamID; - static RigidBodyForcePair* lastRigidBodyForcePair; - static int lastRigidBodyGridID; + static RigidBodyForcePair* lastRbForcePair; + static int lastRbGridID; void callGridForceKernel(int pairId, int s); void retrieveForcesForGrid(const int i); - void retrieveForces(); - void processForces(); + void processGPUForces(); Matrix3 getBasis1(const int i); Matrix3 getBasis2(const int i); Vector3 getOrigin1(const int i);