From 4650488b8f6bcdf99210c6a397b0af626d47bc6a Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Fri, 16 Dec 2016 17:07:29 -0600 Subject: [PATCH] Multiple bug fixes: (1) PairlistDistance is properly passed on to ComputeForce objects; (2) Power of 2 threads are now used in particle-grid force reduction; (3) Fixed accidental transpose of threads and blocks in grids' particleList kernel call --- src/ComputeGridGrid.cu | 10 +++++----- src/ComputeGridGrid.cuh | 2 +- src/Configuration.cpp | 15 +++++++++------ src/CudaUtil.cuh | 2 +- src/GrandBrownTown.cu | 12 ++++++------ src/GrandBrownTown.cuh | 8 ++++++++ src/RigidBody.cu | 11 +++++++---- src/RigidBodyController.cu | 12 +++--------- src/RigidBodyController.h | 4 ---- 9 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/ComputeGridGrid.cu b/src/ComputeGridGrid.cu index b074fde..1cbba92 100644 --- a/src/ComputeGridGrid.cu +++ b/src/ComputeGridGrid.cu @@ -36,7 +36,7 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, /* const ForceEnergy fe = ForceEnergy( tmpf, tmpe); */ const ForceEnergy fe = u->interpolateForceDLinearly( u_ijk_float ); /* in coord frame of u */ force[tid] = fe.f; - + const float r_val = rho->val[r_id]; /* maybe move to beginning of function? */ force[tid] = basis_u_inv.transpose().transform( r_val*force[tid] ); /* transform to lab frame, with correct scaling factor */ @@ -47,6 +47,7 @@ void computeGridGridForce(const RigidBodyGrid* rho, const RigidBodyGrid* u, // Reduce force and torques // http://www.cuvilib.com/Reduction.pdf // RBTODO optimize further, perhaps + // assert( NUMTHREADS==32 || NUMTHREADS==64 || NUMTHREADS==128 || NUMTHREADS==256 || NUMTHREADS==512 ); __syncthreads(); for (int offset = blockDim.x/2; offset > 0; offset >>= 1) { if (tid < offset) { @@ -73,7 +74,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc extern __shared__ Vector3 s[]; Vector3 *force = s; Vector3 *torque = &s[NUMTHREADS]; - + const int tid = threadIdx.x; const int i = blockIdx.x * blockDim.x + threadIdx.x; @@ -94,6 +95,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc } // Reduce force and torques + // assert( NUMTHREADS==32 || NUMTHREADS==64 || NUMTHREADS==128 || NUMTHREADS==256 || NUMTHREADS==512 ); __syncthreads(); for (int offset = blockDim.x/2; offset > 0; offset >>= 1) { if (tid < offset) { @@ -103,7 +105,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc } __syncthreads(); } - + if (tid == 0) { retForce[blockIdx.x] = force[0]; retTorque[blockIdx.x] = torque[0]; @@ -117,8 +119,6 @@ void createPartlist(const Vector3* __restrict__ pos, const Vector3 gridCenter, const float radius2) { const int tid = threadIdx.x; const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */ - const int split = 32; /* numblocks should be divisible by split */ - /* const int blocksPerCell = gridDim.x/split; */ const int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numTypeParticles) { diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh index 6150ae6..0140616 100644 --- a/src/ComputeGridGrid.cuh +++ b/src/ComputeGridGrid.cuh @@ -1,7 +1,7 @@ // Included in RigidBodyController.cu #pragma once #include "useful.h" -#define NUMTHREADS 96 +#define NUMTHREADS 128 #define WARPSIZE 32 class RigidBodyGrid; diff --git a/src/Configuration.cpp b/src/Configuration.cpp index 4440e42..c38ba2a 100644 --- a/src/Configuration.cpp +++ b/src/Configuration.cpp @@ -1,4 +1,5 @@ #include "Configuration.h" +#include "myAssert.h" #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } @@ -1223,14 +1224,16 @@ void Configuration::readExcludes() for (int i = 0; i < numExcludes; i++) { if (excludes[i].ind1 != currPart) { currPart = excludes[i].ind1; - if (currPart < num) { - excludeMap[currPart].x = i; - if (lastPart >= 0) - excludeMap[lastPart].y = i; - lastPart = currPart; - } + myAssert(currPart < num); + excludeMap[currPart].x = i; + if (lastPart >= 0) + excludeMap[lastPart].y = i; + lastPart = currPart; } } + if (excludeMap[lastPart].x > 0) + excludeMap[lastPart].y = numExcludes; + } void Configuration::readAngles() { diff --git a/src/CudaUtil.cuh b/src/CudaUtil.cuh index 3442969..6775ea4 100644 --- a/src/CudaUtil.cuh +++ b/src/CudaUtil.cuh @@ -76,7 +76,7 @@ __device__ inline void inclIntCumSum(int* in, const int n) { __syncthreads(); } -__device__ inline void atomicAdd(Vector3* address, Vector3 val) { +__device__ inline void atomicAdd(Vector3* address, const Vector3 val) { atomicAdd( &(address->x), val.x); atomicAdd( &(address->y), val.y); atomicAdd( &(address->z), val.z); diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu index 356d854..9e85ef4 100644 --- a/src/GrandBrownTown.cu +++ b/src/GrandBrownTown.cu @@ -179,8 +179,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, // Prepare internal force computation internal = new ComputeForce(num, part, numParts, sys, switchStart, switchLen, coulombConst, - fullLongRange, numBonds, numTabBondFiles, numExcludes, numAngles, numTabAngleFiles, - numDihedrals, numTabDihedralFiles, numReplicas); + fullLongRange, numBonds, numTabBondFiles, numExcludes, numAngles, numTabAngleFiles, + numDihedrals, numTabDihedralFiles, c.pairlistDistance, numReplicas); //MLog: I did the other halve of the copyToCUDA function from the Configuration class here, keep an eye on any mistakes that may occur due to the location. internal -> copyToCUDA(c.simNum, c.type, c.bonds, c.bondMap, c.excludes, c.excludeMap, c.angles, c.dihedrals); @@ -480,7 +480,7 @@ void GrandBrownTown::run() { // Make sure the force computation has completed without errors before continuing. //gpuErrchk(cudaPeekAtLastError()); // Does not work on old GPUs (like mine). TODO: write a better wrapper around Peek gpuErrchk(cudaDeviceSynchronize()); - + // printf(" Computed energies\n"); // int numBlocks = (num * numReplicas) / NUM_THREADS + 1; @@ -503,7 +503,7 @@ void GrandBrownTown::run() { printf("Net Force: %f %f %f\n", runningNetForce.x,runningNetForce.y,runningNetForce.z); } // */ - + //MLog: Call the kernel to update the positions of each particle // cudaSetDevice(0); updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas); @@ -520,10 +520,10 @@ void GrandBrownTown::run() { /* computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d); */ // int numBlocks = (numRB ) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1); - - Vector3 force0(0.0f); + Vector3 force0(0.0f); + if (s % outputPeriod == 0) { // Copy particle positions back to CPU // cudaSetDevice(0); diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh index 401d066..fab9c77 100644 --- a/src/GrandBrownTown.cuh +++ b/src/GrandBrownTown.cuh @@ -153,3 +153,11 @@ __global__ void devicePrint(RigidBodyType* rb[]) { // // i, j, k, rb[i]->rawPotentialGrids[j]).val[k]; // } + + +// __device__ Vector3* totalForce; +// Vector3* totalForce_h; +// void initTotalForce() { +// cudaMalloc( &totalForce_h, sizeof(Vector3) ); +// cudaMemcpyToSymbol(totalForce, &totalForce_h, sizeof(totalForce_h)); +// } diff --git a/src/RigidBody.cu b/src/RigidBody.cu index 54d1b17..bb498d5 100644 --- a/src/RigidBody.cu +++ b/src/RigidBody.cu @@ -47,10 +47,12 @@ void RigidBody::init() { particleForces_d = new Vector3*[numGrids]; particleTorques_d = new Vector3*[numGrids]; for (int i = 0; i < numGrids; ++i) { + numParticles[i] = -1; const int& n = t->numParticles[i]; const int nb = (n/NUMTHREADS)+1; // max number of blocks if (n > 0) { - gpuErrchk(cudaMalloc( &particles_d[i], 0.5*sizeof(int)*n )); + // gpuErrchk(cudaMalloc( &particles_d[i], 0.5*sizeof(int)*n )); // not sure why 0.5 was here; prolly bug + gpuErrchk(cudaMalloc( &particles_d[i], sizeof(int)*n )); // TODO: dynamically allocate memory as needed particleForces[i] = new Vector3[nb]; particleTorques[i] = new Vector3[nb]; gpuErrchk(cudaMalloc( &particleForces_d[i], sizeof(Vector3)*nb )); @@ -104,15 +106,16 @@ void RigidBody::updateParticleList(Vector3* pos_d) { int nb = floor(tnp/NUMTHREADS) + 1; #if __CUDA_ARCH__ >= 300 - createPartlist<<<NUMTHREADS,nb>>>(pos_d, tnp, t->particles_d[i], + createPartlist<<<nb,NUMTHREADS>>>(pos_d, tnp, t->particles_d[i], tmp_d, particles_d[i], gridCenter + position, cutoff*cutoff); #else - createPartlist<<<NUMTHREADS,nb,NUMTHREADS/WARPSIZE>>>(pos_d, tnp, t->particles_d[i], + createPartlist<<<nb,NUMTHREADS,NUMTHREADS/WARPSIZE>>>(pos_d, tnp, t->particles_d[i], tmp_d, particles_d[i], gridCenter + position, cutoff*cutoff); #endif gpuErrchk(cudaMemcpy(&numParticles[i], tmp_d, sizeof(int), cudaMemcpyDeviceToHost )); + gpuErrchk(cudaFree( tmp_d )); } } } @@ -220,7 +223,7 @@ void RigidBody::integrate(int startFinishAll) { Vector3 trans; // = *p_trans; Matrix3 rot = Matrix3(1); // = *p_rot; - /* printf("Rigid Body force\n"); */ + // printf("Rigid Body force\n"); force.print(); #ifdef DEBUGM switch (startFinishAll) { diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu index 8b57969..1fc9a6a 100644 --- a/src/RigidBodyController.cu +++ b/src/RigidBodyController.cu @@ -1,9 +1,3 @@ -/* #ifndef MIN_DEBUG_LEVEL */ -/* #define MIN_DEBUG_LEVEL 5 */ -/* #endif */ -/* #define DEBUGM */ -/* #include "Debug.h" */ - /* #include "RigidBody.h" */ #include "RigidBodyController.h" #include "Configuration.h" @@ -436,12 +430,12 @@ void RigidBodyForcePair::callGridForceKernel(int pairId, int s) { // RBTODO: get energy if (!isPmf) { /* pair of RBs */ - computeGridGridForce<<< nb, numThreads, NUMTHREADS*2*sizeof(Vector3), s >>> + computeGridGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3), s >>> (type1->rawDensityGrids_d[k1], type2->rawPotentialGrids_d[k2], B1, B2, c, forces_d[i], torques_d[i]); } else { /* RB with a PMF */ - computeGridGridForce<<< nb, numThreads, NUMTHREADS*2*sizeof(Vector3), s >>> + computeGridGridForce<<< nb, NUMTHREADS, NUMTHREADS*2*sizeof(Vector3), s >>> (type1->rawDensityGrids_d[k1], type2->rawPmfs_d[k2], B1, B2, c, forces_d[i], torques_d[i]); @@ -618,7 +612,7 @@ int RigidBodyForcePair::initialize() { for (int i = 0; i < numGrids; i++) { const int k1 = gridKeyId1[i]; const int sz = type1->rawDensityGrids[k1].getSize(); - const int nb = sz / numThreads + ((sz % numThreads == 0) ? 0:1 ); + const int nb = sz / NUMTHREADS + ((sz % NUMTHREADS == 0) ? 0:1 ); streamID.push_back( nextStreamID % NUMSTREAMS ); nextStreamID++; diff --git a/src/RigidBodyController.h b/src/RigidBodyController.h index ad06f20..35ffe73 100644 --- a/src/RigidBodyController.h +++ b/src/RigidBodyController.h @@ -7,8 +7,6 @@ #include <cuda_runtime.h> #include "useful.h" -// #define NUMTHREADS 128 /* try with 64, every 32+ */ -#define NUMTHREADS 96 #define NUMSTREAMS 8 // #include "RigidBody.h" @@ -55,8 +53,6 @@ private: int updatePeriod; - static const int numThreads = NUMTHREADS; - RigidBodyType* type1; RigidBodyType* type2; RigidBody* rb1; -- GitLab