From 650cb4fc402e4cc85a901b876af6e4b339df8cb1 Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Fri, 8 Jan 2016 12:37:44 -0600 Subject: [PATCH] Attempt to parallelize kernel; Added names to multiple instances of an RB; rotation integration still appears flawed --- ComputeGridGrid.cuh | 20 +++--- RigidBody.cu | 4 +- RigidBody.h | 10 +-- RigidBodyController.cu | 84 ++++++++++++++++--------- RigidBodyController.h | 18 ++++-- RigidBodyGrid.cu | 140 ++++++++++++----------------------------- RigidBodyGrid.h | 32 ++++------ makefile | 15 +++-- 8 files changed, 145 insertions(+), 178 deletions(-) diff --git a/ComputeGridGrid.cuh b/ComputeGridGrid.cuh index b9fede9..664732a 100644 --- a/ComputeGridGrid.cuh +++ b/ComputeGridGrid.cuh @@ -34,27 +34,24 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, /* return; */ // RBTODO: maybe organize data into compact regions for grid data for better use of shared memory... - Vector3 r_ijk = rho->getPosition(r_id); /* i,j,k value of voxel */ - Vector3 r_pos = basis_rho.transform( r_ijk ) + origin_rho; + const Vector3 r_ijk = rho->getPosition(r_id); /* i,j,k value of voxel */ + const Vector3 r_pos = basis_rho.transform( r_ijk ) + origin_rho; // Vector3 u_ijk_float = basis_u.transform( r_pos - origin_u ); - Matrix3 basis_u_inv = basis_u.inverse(); - Vector3 u_ijk_float = basis_u_inv.transform( r_pos - origin_u ); + const Matrix3 basis_u_inv = basis_u.inverse(); + const Vector3 u_ijk_float = basis_u_inv.transform( r_pos - origin_u ); - float r_val = rho->val[r_id]; + const float r_val = rho->val[r_id]; float energy = r_val * u->interpolatePotential( u_ijk_float ); // RBTODO What about non-unit delta? // RBTODO combine interp methods and reduce repetition! - Vector3 f = r_val*u->interpolateForceD( u_ijk_float ); /* in coord frame of u */ + const Vector3 f = r_val*u->interpolateForceD( u_ijk_float ); /* in coord frame of u */ // f = basis_u.inverse().transpose().transform( f ); /* transform to lab frame */ - f = basis_u_inv.transpose().transform( f ); /* transform to lab frame */ + force[tid] = basis_u_inv.transpose().transform( f ); /* transform to lab frame */ // Calculate torque about lab-frame origin - Vector3 t = r_pos.cross(f); // RBTODO: test if sign is correct! + torque[tid] = r_pos.cross(f); // RBTODO: test if sign is correct! - force[tid] = f; - torque[tid] = t; - __syncthreads(); /*/ debug forces float cutoff = 1e-3; @@ -65,6 +62,7 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, // Reduce force and torques // http://www.cuvilib.com/Reduction.pdf // RBTODO optimize + __syncthreads(); for (unsigned int offset = blockDim.x/2; offset > 0; offset >>= 1) { if (tid < offset) { unsigned int oid = tid + offset; diff --git a/RigidBody.cu b/RigidBody.cu index 10a6aa4..9cfe33e 100644 --- a/RigidBody.cu +++ b/RigidBody.cu @@ -11,8 +11,8 @@ #include "Debug.h" -RigidBody::RigidBody(const Configuration& cref, RigidBodyType& tref) - : c(&cref), t(&tref), impulse_to_momentum(4.184e8f) { +RigidBody::RigidBody(String name, const Configuration& cref, RigidBodyType& tref) + : name(name), c(&cref), t(&tref), impulse_to_momentum(4.184e8f) { // units "(kcal_mol/AA) * ns" "amu AA/ns" * 4.184e+08 timestep = c->timestep; diff --git a/RigidBody.h b/RigidBody.h index b1eddcc..dd71ebe 100644 --- a/RigidBody.h +++ b/RigidBody.h @@ -27,7 +27,7 @@ class RigidBody { // host side representation of rigid bodies | splitting methods for rigid body molecular dynamics". J Chem Phys. (1997) | `––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––*/ public: - HOST DEVICE RigidBody(const Configuration& c, RigidBodyType& t); + HOST DEVICE RigidBody(String name, const Configuration& c, RigidBodyType& t); /* HOST DEVICE RigidBody(RigidBodyType t); */ HOST DEVICE ~RigidBody(); @@ -43,8 +43,9 @@ class RigidBody { // host side representation of rigid bodies HOST DEVICE void integrate(int startFinishAll); // HOST DEVICE inline String getKey() const { return key; } - HOST DEVICE inline String getKey() const { return t->name; } - + // HOST DEVICE inline String getKey() const { return t->name; } + HOST DEVICE inline String getKey() const { return name; } + HOST DEVICE inline Vector3 getPosition() const { return position; } HOST DEVICE inline Matrix3 getOrientation() const { return orientation; } // HOST DEVICE inline Matrix3 getBasis() const { return orientation; } @@ -59,7 +60,8 @@ class RigidBody { // host side representation of rigid bodies Vector3 torque; // lab frame (except in integrate()) private: - String key; + // String key; + String name; /* static const SimParameters * simParams; */ Vector3 position; // Q = orientation.transpose(); in Dullweber et al diff --git a/RigidBodyController.cu b/RigidBodyController.cu index 0142f4c..adf6d47 100644 --- a/RigidBodyController.cu +++ b/RigidBodyController.cu @@ -39,16 +39,24 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* out for (int i = 0; i < conf.numRigidTypes; i++) { numRB += conf.rigidBody[i].num; std::vector<RigidBody> tmp; - for (int j = 0; j < conf.rigidBody[i].num; j++) { - RigidBody r(conf, conf.rigidBody[i]); + // RBTODO: change conf.rigidBody to conf.rigidBodyType + const int jmax = conf.rigidBody[i].num; + for (int j = 0; j < jmax; j++) { + String name = conf.rigidBody[i].name; + if (jmax > 1) { + char tmp[128]; + snprintf(tmp, 128, "#%d", j); + name.add( tmp ); + } + RigidBody r(name, conf, conf.rigidBody[i]); tmp.push_back( r ); - } - rigidBodyByType.push_back(tmp); } + rigidBodyByType.push_back(tmp); +} random = new RandomCPU(conf.seed + 1); /* +1 to avoid using same seed as RandomCUDA */ - initializeForcePairs(); +initializeForcePairs(); } RigidBodyController::~RigidBodyController() { for (int i = 0; i < rigidBodyByType.size(); i++) @@ -60,6 +68,7 @@ RigidBodyController::~RigidBodyController() { void RigidBodyController::initializeForcePairs() { // Loop over all pairs of rigid body types // the references here make the code more readable, but they may incur a performance loss + RigidBodyForcePair::createStreams(); printf("Initializing force pairs\n"); for (int ti = 0; ti < conf.numRigidTypes; ti++) { RigidBodyType& t1 = conf.rigidBody[ti]; @@ -178,9 +187,11 @@ void RigidBodyController::updateForces(int s) { } } - for (int i=0; i < forcePairs.size(); i++) { - forcePairs[i].updateForces(i,s); - } + for (int i=0; i < forcePairs.size(); i++) + forcePairs[i].callGridForceKernel(i,s); + + for (int i=0; i < forcePairs.size(); i++) + forcePairs[i].retrieveForces(); // RBTODO: see if there is a better way to sync gpuErrchk(cudaDeviceSynchronize()); @@ -257,21 +268,32 @@ void RigidBodyController::integrate(int step) { } } } - + +// allocate and initialize an array of stream handles +cudaStream_t *RigidBodyForcePair::stream = (cudaStream_t *) malloc(NUMSTREAMS * sizeof(cudaStream_t)); +// new cudaStream_t[NUMSTREAMS]; +int RigidBodyForcePair::nextStreamID = 0; +void RigidBodyForcePair::createStreams() { + for (int i = 0; i < NUMSTREAMS; i++) + gpuErrchk( cudaStreamCreate( &(stream[i]) ) ); + // gpuErrchk( cudaStreamCreateWithFlags( &(stream[i]) , cudaStreamNonBlocking ) ); +} + // RBTODO: bundle several rigidbodypair evaluations in single kernel call -void RigidBodyForcePair::updateForces(int pairId, int s) { +void RigidBodyForcePair::callGridForceKernel(int pairId, int s) { // get the force/torque between a pair of rigid bodies /* printf(" Updating rbPair forces\n"); */ const int numGrids = gridKeyId1.size(); - if (s%10 != 0) - pairId = -1000; + /* if (s%10 != 0) */ + /* pairId = -1000; */ // RBTODO: precompute certain common transformations and pass in kernel call for (int i = 0; i < numGrids; i++) { const int nb = numBlocks[i]; const int k1 = gridKeyId1[i]; const int k2 = gridKeyId2[i]; + const cudaStream_t &s = stream[streamID[i]]; /* ijk: index of grid value @@ -301,7 +323,7 @@ void RigidBodyForcePair::updateForces(int pairId, int s) { c2 = rb2->getOrientation()*type2->potentialGrids[k2].getOrigin() + rb2->getPosition(); // RBTODO: get energy - computeGridGridForce<<< nb, numThreads >>> + computeGridGridForce<<< nb, numThreads, 0, s >>> (type1->rawDensityGrids_d[k1], type2->rawPotentialGrids_d[k2], B1, c1, B2, c2, forces_d[i], torques_d[i], pairId+i); @@ -311,37 +333,39 @@ void RigidBodyForcePair::updateForces(int pairId, int s) { B2 = type2->rawPmfs[k2].getBasis(); c2 = type2->rawPmfs[k2].getOrigin(); - computeGridGridForce<<< nb, numThreads >>> + computeGridGridForce<<< nb, numThreads, 0, s >>> (type1->rawDensityGrids_d[k1], type2->rawPmfs_d[k2], B1, c1, B2, c2, forces_d[i], torques_d[i], pairId+i); } - - } - // RBTODO better way to sync? - gpuErrchk(cudaDeviceSynchronize()); - for (int i = 0; i < numGrids; i++) { - const int nb = numBlocks[i]; - gpuErrchk(cudaMemcpy(forces[i], forces_d[i], sizeof(Vector3)*nb, - cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(torques[i], torques_d[i], sizeof(Vector3)*nb, - cudaMemcpyDeviceToHost)); } - gpuErrchk(cudaDeviceSynchronize()); +} +void RigidBodyForcePair::retrieveForces() { // sum forces + torques + const int numGrids = gridKeyId1.size(); Vector3 f = Vector3(0.0f); Vector3 t = Vector3(0.0f); + // RBTODO better way to sync? for (int i = 0; i < numGrids; i++) { + const cudaStream_t &s = stream[streamID[i]]; const int nb = numBlocks[i]; + + gpuErrchk(cudaMemcpyAsync(forces[i], forces_d[i], sizeof(Vector3)*nb, + cudaMemcpyDeviceToHost, s)); + gpuErrchk(cudaMemcpyAsync(torques[i], torques_d[i], sizeof(Vector3)*nb, + cudaMemcpyDeviceToHost, s)); + + gpuErrchk(cudaStreamSynchronize( s )); + for (int j = 0; j < nb; j++) { f = f + forces[i][j]; t = t + torques[i][j]; } } - + // transform torque from lab-frame origin to rb centers // add forces to rbs /* Vector3 tmp; */ @@ -350,7 +374,7 @@ void RigidBodyForcePair::updateForces(int pairId, int s) { /* tmp = rb1->getPosition(); */ /* printf("rb1->getPosition(): (%f,%f,%f)\n", tmp.x, tmp.y, tmp.z); */ Vector3 t1 = t - rb1->getPosition().cross( f ); - rb1->addForce( f); + rb1->addForce( f ); rb1->addTorque(t1); if (!isPmf) { @@ -775,7 +799,7 @@ RigidBodyParams* RigidBodyParamsList::find_key(const char* key) { /* printf(" Done constructing RB force pair\n"); */ /* } */ -void RigidBodyForcePair::initialize() { +int RigidBodyForcePair::initialize() { printf(" Initializing (memory for) RB force pair...\n"); const int numGrids = gridKeyId1.size(); @@ -786,6 +810,8 @@ void RigidBodyForcePair::initialize() { const int k1 = gridKeyId1[i]; const int sz = type1->rawDensityGrids[k1].getSize(); const int nb = sz / numThreads + ((sz % numThreads == 0) ? 0:1 ); + streamID.push_back( nextStreamID % NUMSTREAMS ); + nextStreamID++; numBlocks.push_back(nb); forces.push_back( new Vector3[nb] ); @@ -801,7 +827,7 @@ void RigidBodyForcePair::initialize() { } gpuErrchk(cudaDeviceSynchronize()); // printf(" Done initializing RB force pair\n"); - + return nextStreamID; } /* RigidBodyForcePair::RigidBodyForcePair(const RigidBodyForcePair& orig ) : */ diff --git a/RigidBodyController.h b/RigidBodyController.h index 4acfa5f..d54d48c 100644 --- a/RigidBodyController.h +++ b/RigidBodyController.h @@ -5,7 +5,8 @@ #include <cuda.h> #include <cuda_runtime.h> -#define NUMTHREADS 256 +#define NUMTHREADS 256 /* try with 64, every 32+ */ +#define NUMSTREAMS 8 class Configuration; class RandomCPU; @@ -39,11 +40,11 @@ public: ~RigidBodyForcePair(); private: - void initialize(); + int initialize(); void swap(RigidBodyForcePair& a, RigidBodyForcePair& b); static const int numThreads = NUMTHREADS; - + bool isPmf; RigidBodyType* type1; @@ -54,13 +55,18 @@ private: std::vector<int> gridKeyId1; std::vector<int> gridKeyId2; std::vector<int> numBlocks; - + std::vector<Vector3*> forces; std::vector<Vector3*> forces_d; std::vector<Vector3*> torques; std::vector<Vector3*> torques_d; - - void updateForces(int pairId, int s); + + static int nextStreamID; + std::vector<int> streamID; + static cudaStream_t* stream; + static void createStreams(); + void callGridForceKernel(int pairId, int s); + void retrieveForces(); }; class RigidBodyController { diff --git a/RigidBodyGrid.cu b/RigidBodyGrid.cu index c43fb36..4544d10 100644 --- a/RigidBodyGrid.cu +++ b/RigidBodyGrid.cu @@ -343,32 +343,29 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const { home[1] = homeY; home[2] = homeZ; - // Get the grid dimensions. - int g[3]; - g[0] = nx; - g[1] = ny; - g[2] = nz; - // Get the interpolation coordinates. - float w[3]; - w[0] = l.x - homeX; - w[1] = l.y - homeY; - w[2] = l.z - homeZ; + const float wx = l.x - homeX; + const float wy = l.y - homeY; + const float wz = l.z - homeZ; + + // RBTODO: remove wrap + // RBTODO: // Find the values at the neighbors. float g1[4][4][4]; for (int ix = 0; ix < 4; ix++) { int jx = ix-1 + home[0]; - jx = wrap(jx, g[0]); + jx = wrap(jx, nx); for (int iy = 0; iy < 4; iy++) { int jy = iy-1 + home[1]; - jy = wrap(jy, g[1]); + jy = wrap(jy, ny); for (int iz = 0; iz < 4; iz++) { // Wrap around the periodic boundaries. int jz = iz-1 + home[2]; - jz = wrap(jz, g[2]); - + jz = wrap(jz, nz); + int ind = jz*jump[2] + jy*jump[1] + jx*jump[0]; + // g1[ix][iy][iz] = jz < 0 || jz > home[2] || ... ? val[ind] : 0; g1[ix][iy][iz] = val[ind]; } } @@ -384,7 +381,7 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const { a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]); a0 = g1[1][iy][iz]; - g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0; + g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; } } @@ -396,7 +393,7 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const { a1 = 0.5f*(-g2[0][iz] + g2[2][iz]); a0 = g2[1][iz]; - g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0; + g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; } // Mix along z. @@ -405,11 +402,11 @@ HOST DEVICE float RigidBodyGrid::interpolatePotential(Vector3 l) const { a1 = 0.5f*(-g3[0] + g3[2]); a0 = g3[1]; - return a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0; + return a3*wz*wz*wz + a2*wz*wz + a1*wz + a0; } /** interpolateForce() to be used on CUDA Device **/ -HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(Vector3 l) const { +HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(const Vector3 l) const { Vector3 f; // Vector3 l = basisInv.transform(pos - origin); int homeX = int(floor(l.x)); @@ -426,17 +423,11 @@ HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(Vector3 l) const { home[1] = homeY; home[2] = homeZ; - // Shift the indices in the grid dimensions. - int g[3]; - g[0] = nx; - g[1] = ny; - g[2] = nz; - // Get the interpolation coordinates. - float w[3]; - w[0] = l.x - homeX; - w[1] = l.y - homeY; - w[2] = l.z - homeZ; + const float wx = l.x - homeX; + const float wy = l.y - homeY; + const float wz = l.z - homeZ; + // Find the values at the neighbors. float g1[4][4][4]; /* neighborhood */ for (int ix = 0; ix < 4; ix++) { @@ -445,78 +436,25 @@ HOST DEVICE Vector3 RigidBodyGrid::interpolateForceD(Vector3 l) const { // RBTODO: probably don't wrap! // Wrap around the periodic boundaries. int jx = ix-1 + home[0]; - jx = wrap(jx, g[0]); + jx = wrap(jx, nx); int jy = iy-1 + home[1]; - jy = wrap(jy, g[1]); + jy = wrap(jy, ny); int jz = iz-1 + home[2]; - jz = wrap(jz, g[2]); + jz = wrap(jz, nz); int ind = jz*jump[2] + jy*jump[1] + jx*jump[0]; g1[ix][iy][iz] = val[ind]; } } } - f.x = interpolateDiffX(w, g1); - f.y = interpolateDiffY(w, g1); - f.z = interpolateDiffZ(w, g1); + f.x = interpolateDiffX(wx,wy,wz, g1); + f.y = interpolateDiffY(wx,wy,wz, g1); + f.z = interpolateDiffZ(wx,wy,wz, g1); // Vector3 f1 = basisInv.transpose().transform(f); // return f1; return f; } -/* inline virtual Vector3 interpolateForce(Vector3 pos) const { */ -/* Vector3 f; */ -/* Vector3 l = basisInv.transform(pos - origin); */ -/* int homeX = int(floor(l.x)); */ -/* int homeY = int(floor(l.y)); */ -/* int homeZ = int(floor(l.z)); */ -/* // Get the array jumps with shifted indices. */ -/* int jump[3]; */ -/* jump[0] = nz*ny; */ -/* jump[1] = nz; */ -/* jump[2] = 1; */ -/* // Shift the indices in the home array. */ -/* int home[3]; */ -/* home[0] = homeX; */ -/* home[1] = homeY; */ -/* home[2] = homeZ; */ - -/* // Shift the indices in the grid dimensions. */ -/* int g[3]; */ -/* g[0] = nx; */ -/* g[1] = ny; */ -/* g[2] = nz; */ - -/* // Get the interpolation coordinates. */ -/* float w[3]; */ -/* w[0] = l.x - homeX; */ -/* w[1] = l.y - homeY; */ -/* w[2] = l.z - homeZ; */ -/* // Find the values at the neighbors. */ -/* float g1[4][4][4]; */ -/* //RBTODO parallelize? */ -/* for (int ix = 0; ix < 4; ix++) { */ -/* for (int iy = 0; iy < 4; iy++) { */ -/* for (int iz = 0; iz < 4; iz++) { */ -/* // Wrap around the periodic boundaries. */ -/* int jx = ix-1 + home[0]; */ -/* jx = wrap(jx, g[0]); */ -/* int jy = iy-1 + home[1]; */ -/* jy = wrap(jy, g[1]); */ -/* int jz = iz-1 + home[2]; */ -/* jz = wrap(jz, g[2]); */ -/* int ind = jz*jump[2] + jy*jump[1] + jx*jump[0]; */ -/* g1[ix][iy][iz] = val[ind]; */ -/* } */ -/* } */ -/* } */ -/* f.x = interpolateDiffX(pos, w, g1); */ -/* f.y = interpolateDiffY(pos, w, g1); */ -/* f.z = interpolateDiffZ(pos, w, g1); */ -/* Vector3 f1 = basisInv.transpose().transform(f); */ -/* return f1; */ -/* } */ - -HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4][4][4]) const { +HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(const float wx, const float wy, const float wz, float g1[4][4][4]) const { float a0, a1, a2, a3; // RBTODO further parallelize loops? unlikely? @@ -529,8 +467,8 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4] a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]); a0 = g1[1][iy][iz]; - //g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0; - g2[iy][iz] = 3.0f*a3*w[0]*w[0] + 2.0f*a2*w[0] + a1; + //g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; + g2[iy][iz] = 3.0f*a3*wx*wx + 2.0f*a2*wx + a1; } } @@ -542,7 +480,7 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4] a1 = 0.5f*(-g2[0][iz] + g2[2][iz]); a0 = g2[1][iz]; - g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0; + g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; } // Mix along z. @@ -551,11 +489,11 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffX(float w[3], float g1[4] a1 = 0.5f*(-g3[0] + g3[2]); a0 = g3[1]; - float retval = -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0); + float retval = -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0); return retval; } -HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4][4][4]) const { +HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(const float wx, const float wy, const float wz, float g1[4][4][4]) const { float a0, a1, a2, a3; // Mix along x, taking the derivative. @@ -567,7 +505,7 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4] a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]); a0 = g1[1][iy][iz]; - g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0; + g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; } } @@ -579,8 +517,8 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4] a1 = 0.5f*(-g2[0][iz] + g2[2][iz]); a0 = g2[1][iz]; - //g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0; - g3[iz] = 3.0f*a3*w[1]*w[1] + 2.0f*a2*w[1] + a1; + //g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; + g3[iz] = 3.0f*a3*wy*wy + 2.0f*a2*wy + a1; } // Mix along z. @@ -589,10 +527,10 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffY(float w[3], float g1[4] a1 = 0.5f*(-g3[0] + g3[2]); a0 = g3[1]; - return -(a3*w[2]*w[2]*w[2] + a2*w[2]*w[2] + a1*w[2] + a0); + return -(a3*wz*wz*wz + a2*wz*wz + a1*wz + a0); } -HOST DEVICE inline float RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4][4][4]) const { +HOST DEVICE inline float RigidBodyGrid::interpolateDiffZ(const float wx, const float wy, const float wz, float g1[4][4][4]) const { float a0, a1, a2, a3; // Mix along x, taking the derivative. @@ -604,7 +542,7 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4 a1 = 0.5f*(-g1[0][iy][iz] + g1[2][iy][iz]); a0 = g1[1][iy][iz]; - g2[iy][iz] = a3*w[0]*w[0]*w[0] + a2*w[0]*w[0] + a1*w[0] + a0; + g2[iy][iz] = a3*wx*wx*wx + a2*wx*wx + a1*wx + a0; } } @@ -616,7 +554,7 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4 a1 = 0.5f*(-g2[0][iz] + g2[2][iz]); a0 = g2[1][iz]; - g3[iz] = a3*w[1]*w[1]*w[1] + a2*w[1]*w[1] + a1*w[1] + a0; + g3[iz] = a3*wy*wy*wy + a2*wy*wy + a1*wy + a0; } // Mix along z. @@ -625,7 +563,7 @@ HOST DEVICE inline float RigidBodyGrid::interpolateDiffZ(float w[3], float g1[4 a1 = 0.5f*(-g3[0] + g3[2]); a0 = g3[1]; - return -(3.0f*a3*w[2]*w[2] + 2.0f*a2*w[2] + a1); + return -(3.0f*a3*wz*wz + 2.0f*a2*wz + a1); } diff --git a/RigidBodyGrid.h b/RigidBodyGrid.h index b456af8..518e496 100644 --- a/RigidBodyGrid.h +++ b/RigidBodyGrid.h @@ -142,9 +142,9 @@ public: // Added by Rogan for times when simpler calculations are required. virtual float interpolatePotentialLinearly(Vector3 pos) const; - HOST DEVICE float interpolateDiffX(float w[3], float g1[4][4][4]) const; - HOST DEVICE float interpolateDiffY(float w[3], float g1[4][4][4]) const; - HOST DEVICE float interpolateDiffZ(float w[3], float g1[4][4][4]) const; + HOST DEVICE float interpolateDiffX(const float wx, const float wy, const float wz, float g1[4][4][4]) const; + HOST DEVICE float interpolateDiffY(const float wx, const float wy, const float wz, float g1[4][4][4]) const; + HOST DEVICE float interpolateDiffZ(const float wx, const float wy, const float wz, float g1[4][4][4]) const; HOST DEVICE float interpolatePotential(Vector3 l) const; @@ -180,17 +180,11 @@ public: home[1] = homeY; home[2] = homeZ; - // Shift the indices in the grid dimensions. - int g[3]; - g[0] = nx; - g[1] = ny; - g[2] = nz; - // Get the interpolation coordinates. - float w[3]; - w[0] = l.x - homeX; - w[1] = l.y - homeY; - w[2] = l.z - homeZ; + const float wx = l.x - homeX; + const float wy = l.y - homeY; + const float wz = l.z - homeZ; + // Find the values at the neighbors. float g1[4][4][4]; //RBTODO parallelize? @@ -199,19 +193,19 @@ public: for (int iz = 0; iz < 4; iz++) { // Wrap around the periodic boundaries. int jx = ix-1 + home[0]; - jx = wrap(jx, g[0]); + jx = wrap(jx, nx); int jy = iy-1 + home[1]; - jy = wrap(jy, g[1]); + jy = wrap(jy, ny); int jz = iz-1 + home[2]; - jz = wrap(jz, g[2]); + jz = wrap(jz, nz); int ind = jz*jump[2] + jy*jump[1] + jx*jump[0]; g1[ix][iy][iz] = val[ind]; } } } - f.x = interpolateDiffX( w, g1); - f.y = interpolateDiffY( w, g1); - f.z = interpolateDiffZ( w, g1); + f.x = interpolateDiffX( wx, wy, wz, g1 ); + f.y = interpolateDiffY( wx, wy, wz, g1 ); + f.z = interpolateDiffZ( wx, wy, wz, g1 ); // Vector3 f1 = basisInv.transpose().transform(f); // return f1; return f; diff --git a/makefile b/makefile index 247504d..add8f91 100644 --- a/makefile +++ b/makefile @@ -37,14 +37,16 @@ endif #CODE_20 := -gencode arch=compute_20,code=sm_20 CODE_20 := -arch=sm_20 #CODE_30 := -gencode arch=compute_30,code=sm_30 -#CODE_35 := -gencode arch=compute_35,code=\"sm_35,compute_35\" +# CODE_35 := -gencode arch=compute_35,code=\"sm_35,compute_35\" +# CODE_35 := -arch=compute_35 -NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35) +# NV_FLAGS += $(CODE_10) $(CODE_12) $(CODE_20) $(CODE_30) $(CODE_35) +NV_FLAGS += -arch=sm_35 NVLD_FLAGS := $(NV_FLAGS) --device-link -NV_FLAGS += -rdc=true +# NV_FLAGS += -rdc=true -LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -Wl,-rpath,$(LIBRARY) +LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY) ### Sources @@ -65,7 +67,8 @@ all: $(TARGET) @echo "Done ->" $(TARGET) $(TARGET): $(CU_OBJ) $(CC_OBJ) runBrownTown.cpp vmdsock.c imd.c imd.h - $(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o +# $(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o + $(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o $(EXEC) $(CC) $(CC_FLAGS) $(EX_FLAGS) runBrownTown.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS) -o $(TARGET) # $(EXEC) $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o @@ -73,7 +76,7 @@ $(TARGET): $(CU_OBJ) $(CC_OBJ) runBrownTown.cpp vmdsock.c imd.c imd.h .SECONDEXPANSION: $(CU_OBJ): %.o: %.cu $$(wildcard %.h) $$(wildcard %.cuh) - $(EXEC) $(NVCC) $(NV_FLAGS) $(EX_FLAGS) -c $< -o $@ + $(EXEC) $(NVCC) $(NV_FLAGS) $(EX_FLAGS) -dc $< -o $@ $(CC_OBJ): %.o: %.cpp %.h $(EXEC) $(CC) $(CC_FLAGS) $(EX_FLAGS) -c $< -o $@ -- GitLab