From ee3383f62c02c34e9de38c3c1ac7fdbe5f2ee48c Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Tue, 27 Sep 2016 22:08:51 +0000 Subject: [PATCH] fixed *char to const *char warnings; created list-based kernel for angles and dihedrals (untested) --- CellDecomposition.cu | 2 +- ComputeForce.cu | 63 ++++++++++++++------- ComputeForce.cuh | 67 ++++++++++++++++++++-- ComputeForce.h | 8 ++- Configuration.cpp | 2 +- CudaUtil.cuh | 2 +- GrandBrownTown.cu | 28 +++++++--- GrandBrownTown.h | 3 + TabulatedAngle.h | 7 ++- TabulatedDihedral.h | 5 +- TabulatedMethods.cuh | 128 +++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 271 insertions(+), 44 deletions(-) create mode 100644 TabulatedMethods.cuh diff --git a/CellDecomposition.cu b/CellDecomposition.cu index 07a7b02..a17133f 100644 --- a/CellDecomposition.cu +++ b/CellDecomposition.cu @@ -7,7 +7,7 @@ // ***************************************************************************** // Error Check -inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line); diff --git a/ComputeForce.cu b/ComputeForce.cu index aee09cd..68224de 100644 --- a/ComputeForce.cu +++ b/ComputeForce.cu @@ -8,7 +8,7 @@ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); @@ -596,19 +596,22 @@ float ComputeForce::computeTabulated(bool get_energy) { /* printPairForceCounter<<<1,32>>>(); */ /* gpuErrchk(cudaDeviceSynchronize()); */ - computeAngles<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, angles_d, - tableAngle_d, numAngles, num, sys_d, energies_d, get_energy); - //Mlog: the commented function doesn't use bondList, uncomment for testing. //if(bondMap_d != NULL && tableBond_d != NULL) if(bondList_d != NULL && tableBond_d != NULL) { - //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d); - computeTabulatedBonds <<<numBlocks, numThreads>>> ( forceInternal_d, pos_d, num, numParts, sys_d, bondList_d, (numBonds/2), numReplicas, energies_d, get_energy, tableBond_d); + //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d); + computeTabulatedBonds <<<numBlocks, numThreads>>> ( forceInternal_d, pos_d, num, numParts, sys_d, bondList_d, (numBonds/2), numReplicas, energies_d, get_energy, tableBond_d); } - computeDihedrals<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, dihedrals_d, - tableDihedral_d, numDihedrals, num, sys_d, energies_d, get_energy); + if (angleList_d != NULL && tableAngle_d != NULL) + computeTabulatedAngles<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, sys_d, numReplicas, + numAngles, angleList_d, tableAngle_d); + + if (dihedralList_d != NULL && tableDihedral_d != NULL) + computeTabulatedDihedrals<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, sys_d, numReplicas, + numDihedrals, dihedralList_d, dihedralPotList_d, tableDihedral_d); + // Calculate the energy based on the array created by the kernel if (get_energy) { @@ -709,17 +712,35 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, gpuErrchk(cudaDeviceSynchronize()); } -void ComputeForce::createBondList(int3 *bondList) -{ - size_t size = (numBonds / 2) * numReplicas * sizeof(int3); - gpuErrchk( cudaMalloc( &bondList_d, size ) ); - gpuErrchk( cudaMemcpyAsync( bondList_d, bondList, size, cudaMemcpyHostToDevice) ); - - for(int i = 0 ; i < (numBonds / 2) * numReplicas ; i++) - { - cout << "Displaying: bondList_d["<< i <<"].x = " << bondList[i].x << ".\n" - << "Displaying: bondList_d["<< i <<"].y = " << bondList[i].y << ".\n" - << "Displaying: bondList_d["<< i <<"].z = " << bondList[i].z << ".\n"; - - } +// void ComputeForce::createBondList(int3 *bondList) +// { +// size_t size = (numBonds / 2) * numReplicas * sizeof(int3); +// gpuErrchk( cudaMalloc( &bondList_d, size ) ); +// gpuErrchk( cudaMemcpyAsync( bondList_d, bondList, size, cudaMemcpyHostToDevice) ); + +// for(int i = 0 ; i < (numBonds / 2) * numReplicas ; i++) +// { +// cout << "Displaying: bondList_d["<< i <<"].x = " << bondList[i].x << ".\n" +// << "Displaying: bondList_d["<< i <<"].y = " << bondList[i].y << ".\n" +// << "Displaying: bondList_d["<< i <<"].z = " << bondList[i].z << ".\n"; + +// } +// } + +void ComputeForce::copyBondedListsToGPU(int3 *bondList, int4 *angleList, int4 *dihedralList, int *dihedralPotList) { + size_t size = (numBonds / 2) * numReplicas * sizeof(int3); + gpuErrchk( cudaMalloc( &bondList_d, size ) ); + gpuErrchk( cudaMemcpyAsync( bondList_d, bondList, size, cudaMemcpyHostToDevice) ); + + size = numAngles * numReplicas * sizeof(int4); + gpuErrchk( cudaMalloc( &angleList_d, size ) ); + gpuErrchk( cudaMemcpyAsync( angleList_d, angleList, size, cudaMemcpyHostToDevice) ); + + size = numDihedrals * numReplicas * sizeof(int4); + gpuErrchk( cudaMalloc( &dihedralList_d, size ) ); + gpuErrchk( cudaMemcpyAsync( dihedralList_d, dihedralList, size, cudaMemcpyHostToDevice) ); + + size = numDihedrals * numReplicas * sizeof(int); + gpuErrchk( cudaMalloc( &dihedralPotList_d, size ) ); + gpuErrchk( cudaMemcpyAsync( dihedralPotList_d, dihedralPotList, size, cudaMemcpyHostToDevice) ); } diff --git a/ComputeForce.cuh b/ComputeForce.cuh index cfb2b96..37b6264 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -3,8 +3,9 @@ // Terrance Howard <heyterrance@gmail.com> #pragma once -#include "CudaUtil.cuh" #include <assert.h> +#include "CudaUtil.cuh" +#include "TabulatedMethods.cuh" #define BD_PI 3.1415927f @@ -228,7 +229,7 @@ void createPairlistsOld(Vector3* __restrict__ pos, int num, int numReplicas, const int cID = bid / blocksPerCell; /* cellID */ const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */ - const int wid = tid/WARPSIZE; + // const int wid = tid/WARPSIZE; const int blockLane = bid % blocksPerCell; if (cID >= nCells) return; @@ -444,7 +445,6 @@ __global__ void computeTabulatedKernel( const int2 pair = g_pair[i]; const int ai = pair.x; const int aj = pair.y; - // BONDS: RBTODO: handle through an independent kernel call // Particle's type and position const int ind = g_pairTabPotType[i]; @@ -619,7 +619,7 @@ void computeAngles(Vector3 force[], Vector3 pos[], Angle& a = angles[i]; const int ind = a.getIndex(idx); if (ind >= 0) { - EnergyForce ef = tableAngle[a.tabFileIndex]->compute(&a, pos, sys, ind); + EnergyForce ef = tableAngle[a.tabFileIndex]->computeOLD(&a, pos, sys, ind); force_local += ef.f; if (ind == 1 and get_energy) energy_local += ef.e; @@ -733,6 +733,30 @@ __global__ void computeTabulatedBonds( Vector3* force, Vector3* pos, int nu } } +// TODO: add kernel for energy calculation +__global__ +void computeTabulatedAngles(Vector3* __restrict__ force, Vector3* __restrict__ pos, + BaseGrid* __restrict__ sys, int numReplicas, + int numAngles, int4* angleList_d, TabulatedAnglePotential** tableAngle) { + + int currAngle = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID + + // Loop over ALL angles in ALL replicas + if( currAngle < (numAngles * numReplicas) ) { + + int4& ids = angleList_d[currAngle]; + computeAngle(tableAngle[ ids.w ], sys, force, pos, ids.x, ids.y, ids.z); + + // if (get_energy) + // { + // //TODO: clarification on energy computation needed, consider changing. + // atomicAdd( &g_energies[i], energy_local); + // //atomicAdd( &g_energies[j], energy_local); + // } + } +} + + __global__ void computeDihedrals(Vector3 force[], Vector3 pos[], Dihedral dihedrals[], @@ -740,7 +764,7 @@ void computeDihedrals(Vector3 force[], Vector3 pos[], int numDihedrals, int num, BaseGrid* sys, float g_energies[], bool get_energy) { int i = blockIdx.x * blockDim.x + threadIdx.x; - float energy_local = 0.0f; + // float energy_local = 0.0f; Vector3 force_local(0.0f); if (i < numDihedrals) { @@ -813,3 +837,36 @@ void computeDihedrals(Vector3 force[], Vector3 pos[], } } } + + + // void computeTabulatedDihedrals(Vector3* __restrict__ force, Vector3* __restrict__ pos, int num, + // int numParts, BaseGrid* __restrict__ sys, int4* __restrict__ dihedralList_d, + // int* __restrict__ dihedralPotList_d, + // int numDihedrals, int numReplicas, float* __restrict g_energies, + // bool get_energy, TabulatedDihedralPotential** __restrict__ tableDihedral) { + +__global__ +void computeTabulatedDihedrals(Vector3* __restrict__ force, const Vector3* __restrict__ pos, + const BaseGrid* __restrict__ sys, int numReplicas, + int numDihedrals, const int4* __restrict__ dihedralList_d, + const int* __restrict__ dihedralPotList_d, TabulatedDihedralPotential** tableDihedral) { + + int currDihedral = blockIdx.x * blockDim.x + threadIdx.x; // first particle ID + + // Loop over ALL dihedrals in ALL replicas + // TODO make grid stride loop + if( currDihedral < (numDihedrals * numReplicas) ) { + const int4& ids = dihedralList_d[currDihedral]; + const int& id = dihedralPotList_d[currDihedral]; + + computeDihedral(tableDihedral[ id ], sys, force, pos, ids.x, ids.y, ids.z, ids.w); + + // if (get_energy) + // { + // //TODO: clarification on energy computation needed, consider changing. + // atomicAdd( &g_energies[i], energy_local); + // //atomicAdd( &g_energies[j], energy_local); + // } + } +} + diff --git a/ComputeForce.h b/ComputeForce.h index 22fd9ea..66d6e75 100644 --- a/ComputeForce.h +++ b/ComputeForce.h @@ -74,8 +74,9 @@ public: void copyToCUDA(Vector3* forceInternal, Vector3* pos); void copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals); - void createBondList(int3 *bondList); - + // void createBondList(int3 *bondList); + void copyBondedListsToGPU(int3 *bondList, int4 *angleList, int4 *dihedralList, int *dihedralPotList); + //MLog: because of the move of a lot of private variables, some functions get starved necessary memory access to these variables, below is a list of functions that return the specified private variable. Vector3* getPos_d() { @@ -190,6 +191,9 @@ private: Angle* angles_d; Dihedral* dihedrals_d; int3* bondList_d; + int4* angleList_d; + int4* dihedralList_d; + int* dihedralPotList_d; }; #endif diff --git a/Configuration.cpp b/Configuration.cpp index 9332a16..ce253af 100644 --- a/Configuration.cpp +++ b/Configuration.cpp @@ -2,7 +2,7 @@ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), __FILE__, line); if (abort) exit(code); diff --git a/CudaUtil.cuh b/CudaUtil.cuh index 1daf958..49e0f83 100644 --- a/CudaUtil.cuh +++ b/CudaUtil.cuh @@ -86,7 +86,7 @@ __device__ inline void inclIntCumSum(int* in, const int n) { __syncthreads(); } -__device__ void atomicAdd( Vector3* address, Vector3 val) { +__device__ inline void atomicAdd(Vector3* address, Vector3 val) { atomicAdd( &(address->x), val.x); atomicAdd( &(address->y), val.y); atomicAdd( &(address->z), val.z); diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu index b459fed..ca30125 100644 --- a/GrandBrownTown.cu +++ b/GrandBrownTown.cu @@ -3,7 +3,7 @@ /* #include "ComputeGridGrid.cuh" */ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); @@ -243,16 +243,28 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, } } } + // internal->createBondList(bondList); - internal -> createBondList(bondList); - - // TODO: copy data to device (not here) - // TODO: use bondList in computeTabulatedBonds kernel - + angleList = new int4[ (numAngles) * numReplicas ]; + for(int k = 0 ; k < numReplicas; k++) { + for(int i = 0; i < numAngles; ++i) { + angleList[i] = make_int4( angles[i].ind1+k*num, angles[i].ind2+k*num, angles[i].ind3+k*num, angles[i].tabFileIndex ); + } + } + + dihedralList = new int4[ (numDihedrals) * numReplicas ]; + dihedralPotList = new int[ (numDihedrals) * numReplicas ]; + for(int k = 0 ; k < numReplicas; k++) { + for(int i = 0; i < numDihedrals; ++i) { + dihedralList[i] = make_int4( dihedrals[i].ind1+k*num, dihedrals[i].ind2+k*num, dihedrals[i].ind3+k*num, dihedrals[i].ind4+k*num); + dihedralPotList[i] = dihedrals[i].tabFileIndex; + } + } + internal->copyBondedListsToGPU(bondList,angleList,dihedralList,dihedralPotList); + forceInternal = new Vector3[num * numReplicas]; - if (fullLongRange != 0) - printf("No cell decomposition created.\n"); + printf("No cell decomposition created.\n"); // Prepare the trajectory output writer. for (int repID = 0; repID < numReplicas; ++repID) { diff --git a/GrandBrownTown.h b/GrandBrownTown.h index 57bd2c1..44a783e 100644 --- a/GrandBrownTown.h +++ b/GrandBrownTown.h @@ -241,10 +241,13 @@ private: Angle* angles; String* angleTableFile; int numTabAngleFiles; + int4 *angleList; Dihedral* dihedrals; String* dihedralTableFile; int numTabDihedralFiles; + int4 *dihedralList; + int *dihedralPotList; void updateNameList(); diff --git a/TabulatedAngle.h b/TabulatedAngle.h index 777832b..0b53e65 100644 --- a/TabulatedAngle.h +++ b/TabulatedAngle.h @@ -7,7 +7,9 @@ #include "Angle.h" #include "TabulatedPotential.h" #include "BaseGrid.h" -#include <cuda.h> + +__device__ void atomicAdd( Vector3* address, Vector3 val); + class TabulatedAnglePotential { @@ -20,7 +22,8 @@ public: float angle_step; // 'step' angle in potential file. potential file might not go 1, 2, 3,...,360, it could be in steps of .5 or something smaller int size; // The number of data points in the file String fileName; - HOST DEVICE inline EnergyForce compute(Angle* a, Vector3* pos, BaseGrid* sys, int index) { + + HOST DEVICE inline EnergyForce computeOLD(Angle* a, Vector3* pos, BaseGrid* sys, int index) { // First, we must find the actual angle we're working with. // Grab the positions of each particle in the angle const Vector3 posa = pos[a->ind1]; diff --git a/TabulatedDihedral.h b/TabulatedDihedral.h index 84518c6..1a8e9ee 100644 --- a/TabulatedDihedral.h +++ b/TabulatedDihedral.h @@ -4,9 +4,8 @@ #ifndef TABULATEDDIHEDRAL_H #define TABULATEDDIHEDRAL_H -#include <cuda.h> - #include "useful.h" + #include "Dihedral.h" #include "TabulatedPotential.h" #include "BaseGrid.h" @@ -29,7 +28,7 @@ public: String fileName; // RBTODO: deprecate - HOST DEVICE inline EnergyForce compute(Dihedral* d, Vector3* pos, BaseGrid* sys, int index) { + HOST DEVICE inline EnergyForce computeOLD(Dihedral* d, Vector3* pos, BaseGrid* sys, int index) { const Vector3 posa = d->ind1; const Vector3 posb = d->ind2; const Vector3 posc = d->ind3; diff --git a/TabulatedMethods.cuh b/TabulatedMethods.cuh new file mode 100644 index 0000000..8986e1c --- /dev/null +++ b/TabulatedMethods.cuh @@ -0,0 +1,128 @@ +#pragma once + +#define BD_PI 3.1415927f + +__device__ inline void computeAngle(const TabulatedAnglePotential* __restrict__ a, const BaseGrid* __restrict__ sys, Vector3* __restrict__ force, const Vector3* __restrict__ pos, + const int& i, const int& j, const int& k) { + + + // Particle's type and position + Vector3 posa = pos[i]; + Vector3 posb = pos[j]; + Vector3 posc = pos[k]; + + // The vectors between each pair of particles + const Vector3 ab = sys->wrapDiff(posa - posb); + const Vector3 bc = sys->wrapDiff(posb - posc); + const Vector3 ac = sys->wrapDiff(posc - posa); + + // Find the distance between each pair of particles + const float distab = ab.length(); + const float distbc = bc.length(); + const float distac = ac.length(); + + // Find the cosine of the angle we want - <ABC + float cos = (distbc * distbc + distab * distab - distac * distac) / (2.0f * distbc * distab); + + // If the cosine is illegitimate, set it to 1 or -1 so that acos won't fail + if (cos < -1.0f) cos = -1.0f; + if (cos > 1.0f) cos = 1.0f; + + // Find the sine while we're at it. + float sin = sqrtf(1.0f - cos*cos); + + // Now we can use the cosine to find the actual angle (in radians) + float angle = acos(cos); + + // transform angle to units of tabulated array index + angle /= a->angle_step; + + // tableAngle[0] stores the potential at angle_step + // tableAngle[1] stores the potential at angle_step * 2, etc. + // 'home' is the index after which 'convertedAngle' would appear if it were stored in the table + int home = int(floor(angle)); + assert(home >= 0); + assert(home < a->size); + + // Make angle the distance from [0,1) from the first index in the potential array index + angle -= home; + + // Linearly interpolate the potential + float U0 = a->pot[home]; + float dUdx = (a->pot[(home+1)] - U0) / a->angle_step; + // float energy = (dUdx * angle) + U0; + dUdx /= sqrtf(1.0f-cos*cos); + + // Calculate the forces + Vector3 force1 = (dUdx/distab) * (ab * (cos/distab) - bc / distbc); // force on particle 1 + Vector3 force3 = (dUdx/distbc) * (bc * (cos/distbc) - ab / distab); // force on particle 3 + atomicAdd( &force[i], force1 ); + atomicAdd( &force[k], force3 ); + atomicAdd( &force[j], -(force1 + force3) ); +} + + +__device__ inline void computeDihedral(const TabulatedDihedralPotential* __restrict__ d, + const BaseGrid* __restrict__ sys, Vector3* __restrict__ forces, const Vector3* __restrict__ pos, + const int& i, const int& j, const int& k, const int& l) { + const Vector3 posa = pos[i]; + const Vector3 posb = pos[j]; + const Vector3 posc = pos[k]; + const Vector3 posd = pos[l]; + + const Vector3 ab = sys->wrapDiff(posa - posb); + const Vector3 bc = sys->wrapDiff(posb - posc); + const Vector3 cd = sys->wrapDiff(posc - posd); + + //const float distab = ab.length(); + const float distbc = bc.length(); + //const float distcd = cd.length(); + + Vector3 crossABC = ab.cross(bc); + Vector3 crossBCD = bc.cross(cd); + Vector3 crossX = bc.cross(crossABC); + + const float cos_phi = crossABC.dot(crossBCD) / (crossABC.length() * crossBCD.length()); + const float sin_phi = crossX.dot(crossBCD) / (crossX.length() * crossBCD.length()); + + const float angle = -atan2(sin_phi, cos_phi); + + // float energy = 0.0f; + float force = 0.0f; + + Vector3 f1, f2, f3; // forces + f1 = -distbc * crossABC.rLength2() * crossABC; + f3 = -distbc * crossBCD.rLength2() * crossBCD; + f2 = -(ab.dot(bc) * bc.rLength2()) * f1 - (bc.dot(cd) * bc.rLength2()) * f3; + + // Shift "angle" by "PI" since -PI < dihedral < PI + // And our tabulated potential data: 0 < angle < 2 PI + float t = (angle + BD_PI) / d->angle_step; + int home = (int) floorf(t); + t = t - home; + + assert(home >= 0); + assert(home < d->size); + // home = home % size; + int home1 = (home + 1) >= d->size ? (home+1-d->size) : home+1; + + //================================================ + // Linear interpolation + float U0 = d->pot[home]; // Potential + float dU = d->pot[home1] - U0; // Change in potential + + // energy = dU * t + U0; + force = -dU / d->angle_step; + + // avoid singularity when one angle is straight + force = (crossABC.rLength() > 1.0f || crossBCD.rLength() > 1.0f) ? 0.0f : force; + + f1 *= force; + f2 *= force; + f3 *= force; + + atomicAdd( &forces[i], f1 ); + atomicAdd( &forces[j], f2-f1 ); + atomicAdd( &forces[k], f3-f2 ); + atomicAdd( &forces[l], -f3 ); +} -- GitLab