From 367fb57fe5f5c3c625915b2f4dacfdff96912cbe Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Wed, 30 Dec 2020 13:47:04 -0600 Subject: [PATCH] Initial port to cuda11: include c++14 std, use texture objects instead of deprecated texture refs, squash warnings --- src/ComputeForce.cu | 132 ++++++++++++++++++++++++++++----------- src/ComputeForce.cuh | 32 +++++----- src/ComputeForce.h | 6 ++ src/Makefile | 4 +- src/TabulatedPotential.h | 13 ++-- 5 files changed, 127 insertions(+), 60 deletions(-) diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu index df06503..fe11c6a 100644 --- a/src/ComputeForce.cu +++ b/src/ComputeForce.cu @@ -118,23 +118,83 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : const int maxPairs = 1<<25; gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int))); gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs)); - gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi + // gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs)); - gpuErrchk(cudaBindTexture(0, pairTabPotTypeTex, pairTabPotType_d, sizeof(int)*maxPairs)); //Han-Yi + + // create texture object + { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = pairLists_d; + resDesc.res.linear.desc.f = cudaChannelFormatKindSigned; + resDesc.res.linear.desc.x = 32; // bits per channel + resDesc.res.linear.desc.y = 32; // bits per channel + resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int2); + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + // create texture object: we only have to do this once! + pairLists_tex=0; + cudaCreateTextureObject(&pairLists_tex, &resDesc, &texDesc, NULL); + + } + + // create texture object + { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = pairTabPotType_d; + resDesc.res.linear.desc.f = cudaChannelFormatKindSigned; + resDesc.res.linear.desc.x = 32; // bits per channel + resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int); + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + // create texture object: we only have to do this once! + pairTabPotType_tex = 0; + cudaCreateTextureObject(&pairTabPotType_tex, &resDesc, &texDesc, NULL); + + } + //Han-Yi Chou int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z; //int* nCells_dev; if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) { int3 *Cells_dev; - gpuErrchk(cudaMalloc(&CellNeighborsList,sizeof(int)*27*nCells)); + size_t sz = 27*nCells*sizeof(int); + gpuErrchk(cudaMalloc(&CellNeighborsList, sz)); //gpuErrchk(cudaMalloc(&nCells_dev,sizeof(int))); gpuErrchk(cudaMalloc(&Cells_dev,sizeof(int3))); //gpuErrchk(cudaMemcpy(nCells_dev,&nCells,1,cudaMemcpyHostToDevice); gpuErrchk(cudaMemcpy(Cells_dev,&(decomp.nCells),sizeof(int3),cudaMemcpyHostToDevice)); createNeighborsList<<<256,256>>>(Cells_dev,CellNeighborsList); gpuErrchk(cudaFree(Cells_dev)); - cudaBindTexture(0, NeighborsTex, CellNeighborsList, 27*nCells*sizeof(int)); + + // create texture object + { + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = CellNeighborsList; + resDesc.res.linear.desc.f = cudaChannelFormatKindSigned; + resDesc.res.linear.desc.x = 32; // bits per channel + resDesc.res.linear.sizeInBytes = sz; + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + // create texture object: we only have to do this once! + neighbors_tex=0; + cudaCreateTextureObject(&neighbors_tex, &resDesc, &texDesc, NULL); + } } } @@ -185,7 +245,7 @@ ComputeForce::~ComputeForce() { gpuErrchk(cudaFree(sys_d)); //Han-Yi Unbind the texture - gpuErrchk(cudaUnbindTexture(PosTex)); + gpuErrchk(cudaDestroyTextureObject(pos_tex)); gpuErrchk( cudaFree(pos_d) ); gpuErrchk( cudaFree(forceInternal_d) ); gpuErrchk( cudaFree(type_d) ); @@ -215,11 +275,11 @@ ComputeForce::~ComputeForce() { } gpuErrchk(cudaFree(numPairs_d)); - gpuErrchk(cudaUnbindTexture(pairListsTex)); //Han-Yi + gpuErrchk(cudaDestroyTextureObject(pairLists_tex)); gpuErrchk(cudaFree(pairLists_d)); - gpuErrchk(cudaUnbindTexture(pairTabPotTypeTex)); + gpuErrchk(cudaDestroyTextureObject(pairTabPotType_tex)); gpuErrchk(cudaFree(pairTabPotType_d)); - gpuErrchk(cudaUnbindTexture(NeighborsTex)); + gpuErrchk(cudaDestroyTextureObject(neighbors_tex)); gpuErrchk(cudaFree( CellNeighborsList)); } @@ -521,11 +581,11 @@ void ComputeForce::decompose() { #if __CUDA_ARCH__ >= 520 createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, - excludeMap_d, numExcludes, pairlistdist2); + excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex); #else //__CUDA_ARCH__ == 300 createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, - excludeMap_d, numExcludes, pairlistdist2); + excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex); #endif gpuKernelCheck(); @@ -736,8 +796,8 @@ float ComputeForce::computeTabulated(bool get_energy) { { //gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*num*numReplicas)); //computeTabulatedKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, sys_d, - computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, pos_d, sys_d, - cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d); + computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, sys_d, + cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d, pairLists_tex, pos_tex, pairTabPotType_tex); gpuKernelCheck(); //gpuErrchk(cudaUnbindTexture(PosTex)); } @@ -843,8 +903,28 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos) const size_t tot_num = num * numReplicas; gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num)); + { //Han-Yi bind to the texture - gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num)); + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = pos_d; + resDesc.res.linear.desc.f = cudaChannelFormatKindFloat; + resDesc.res.linear.desc.x = 32; // bits per channel + resDesc.res.linear.desc.y = 32; // bits per channel + resDesc.res.linear.desc.z = 32; // bits per channel + resDesc.res.linear.desc.w = 32; // bits per channel + resDesc.res.linear.sizeInBytes = tot_num*sizeof(float4); + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + // create texture object: we only have to do this once! + pos_tex=0; + cudaCreateTextureObject(&pos_tex, &resDesc, &texDesc, NULL); + } + gpuErrchk(cudaDeviceSynchronize()); gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); @@ -858,43 +938,21 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom { const size_t tot_num = num * numReplicas; - gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num)); gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num)); - //Han-Yi Chou bind to the texture - gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num)); - //gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num)); - gpuErrchk(cudaDeviceSynchronize()); - - gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); - gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas)); - gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); - + copyToCUDA(forceInternal,pos); gpuErrchk(cudaDeviceSynchronize()); } void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom, float* random) { const size_t tot_num = num * numReplicas; - gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num)); - gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num)); gpuErrchk(cudaMalloc(&ran_d, sizeof(float) * tot_num)); - - //Han-Yi Chou bind to the texture - gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num)); - //gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num)); - gpuErrchk(cudaDeviceSynchronize()); - - gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); gpuErrchk(cudaMemcpyAsync(ran_d, random, sizeof(float) * tot_num, cudaMemcpyHostToDevice)); - gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas)); - gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); - + copyToCUDA(forceInternal, pos, mom); gpuErrchk(cudaDeviceSynchronize()); - } void ComputeForce::setForceInternalOnDevice(Vector3* f) { diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh index 0129319..4f7dfef 100644 --- a/src/ComputeForce.cuh +++ b/src/ComputeForce.cuh @@ -13,10 +13,10 @@ #define MAX_CELLS_FOR_CELLNEIGHBORLIST 1<<25 -texture<int, 1, cudaReadModeElementType> NeighborsTex; -texture<int, 1, cudaReadModeElementType> pairTabPotTypeTex; -texture<int2, 1, cudaReadModeElementType> pairListsTex; -texture<float4, 1, cudaReadModeElementType> PosTex; +// texture<int, 1, cudaReadModeElementType> NeighborsTex; +// texture<int, 1, cudaReadModeElementType> pairTabPotTypeTex; +//texture<int2, 1, cudaReadModeElementType> pairListsTex; +// texture<float4, 1, cudaReadModeElementType> PosTex; __host__ __device__ EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha, @@ -303,7 +303,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp, const int nCells,int* g_numPairs, int2* g_pair, int numParts, const int* __restrict__ type, int* __restrict__ g_pairTabPotType, const Exclude* __restrict__ excludes, - const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2) + const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2, cudaTextureObject_t PosTex, cudaTextureObject_t NeighborsTex) { __shared__ float4 __align__(16) particle[N]; __shared__ int Index_i[N]; @@ -336,7 +336,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const if(tid + pid_i < Ni && tid < N) { Index_i [tid] = cellInfo[rangeI.first+pid_i+tid].particle; - particle[tid] = tex1Dfetch(PosTex,Index_i[tid]); + particle[tid] = tex1Dfetch<float4>(PosTex,Index_i[tid]); } __syncthreads(); @@ -360,7 +360,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const int neighbor_cell; if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) { - neighbor_cell = tex1Dfetch(NeighborsTex,idx+27*cellid_i); + neighbor_cell = tex1Dfetch<int>(NeighborsTex,idx+27*cellid_i); } else { int3 cells = decomp->nCells; int3 cell_idx = make_int3(cellid_i % cells.z, @@ -405,7 +405,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const continue; } - float4 b = tex1Dfetch(PosTex,aj); + float4 b = tex1Dfetch<float4>(PosTex,aj); Vector3 B(b.x,b.y,b.z); float dr = (sys->wrapDiff(A-B)).length2(); @@ -453,7 +453,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const { int ai = cellInfo[rangeI.first+pid_i].particle; - float4 a = tex1Dfetch(PosTex,ai); + float4 a = tex1Dfetch<float4>(PosTex,ai); Vector3 A(a.x,a.y,a.z); int2 ex_pair = make_int2(-1,-1); @@ -812,9 +812,11 @@ __global__ void computeTabulatedKernel( #endif //#if 0 template<const int BlockSize> -__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, const BaseGrid* __restrict__ sys, +__global__ void computeTabulatedKernel(Vector3* force, const BaseGrid* __restrict__ sys, float cutoff2, const int* __restrict__ g_numPairs, const int2* __restrict__ g_pair, - const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot) + const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot, + cudaTextureObject_t pairListsTex, cudaTextureObject_t PosTex, cudaTextureObject_t pairTabPotTypeTex + ) { const int numPairs = *g_numPairs; const int tid = threadIdx.x + blockDim.x * threadIdx.y @@ -826,7 +828,7 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict for (int i = tid; i < numPairs; i += TotalThreads) { //int2 pair = g_pair[i]; - int2 pair = tex1Dfetch(pairListsTex,i); + int2 pair = tex1Dfetch<int2>(pairListsTex,i); //int ind = tex1Dfetch(pairTabPotTypeTex,i); int ai = pair.x; @@ -834,12 +836,12 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict //int ind = g_pairTabPotType[i]; - Vector3 a(tex1Dfetch(PosTex, ai)); - Vector3 b(tex1Dfetch(PosTex, aj)); + Vector3 a(tex1Dfetch<float4>(PosTex, ai)); + Vector3 b(tex1Dfetch<float4>(PosTex, aj)); Vector3 dr = sys->wrapDiff(b-a); float d2 = dr.length2(); - int ind = tex1Dfetch(pairTabPotTypeTex,i); + int ind = tex1Dfetch<int>(pairTabPotTypeTex,i); if (tablePot[ind] != NULL && d2 <= cutoff2) { Vector3 f = tablePot[ind]->computef(dr,d2); diff --git a/src/ComputeForce.h b/src/ComputeForce.h index a523404..c58b997 100644 --- a/src/ComputeForce.h +++ b/src/ComputeForce.h @@ -218,17 +218,23 @@ private: // Pairlists float pairlistdist2; int2 *pairLists_d; + cudaTextureObject_t pairLists_tex; + int *pairTabPotType_d; + cudaTextureObject_t pairTabPotType_tex; int *numPairs_d; //Han-Yi Chou int *CellNeighborsList; //MLog: List of variables that need to be moved over to ComputeForce class. Members of this list will be set to static to avoid large alterations in working code, thereby allowing us to access these variables easily. + cudaTextureObject_t neighbors_tex; + //BrownianParticleType* part; //float electricConst; //int fullLongRange; Vector3* pos_d; + cudaTextureObject_t pos_tex; Vector3* mom_d; float* ran_d; Vector3* forceInternal_d; diff --git a/src/Makefile b/src/Makefile index 2ada08a..3c94c53 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,7 +4,7 @@ INCLUDE = $(CUDA_PATH)/include ifeq ($(dbg),1) - NV_FLAGS += -g -G + NV_FLAGS += -g -G -O0 EX_FLAGS = -g -m$(OS_SIZE) else NV_FLAGS = -Xptxas -O3 @@ -31,7 +31,7 @@ CC_FLAGS += -DVERSION="\"$(VERSION)\"" -DSIGNAL CC_FLAGS += -I$(CUDA_PATH)/include -CC_FLAGS += -Wall -Wno-write-strings -std=c++0x -pedantic# TODO: test on Mac OSX and other architectures +CC_FLAGS += -Wall -Wno-write-strings -std=c++14 -pedantic# TODO: test on Mac OSX and other architectures ifeq ($(dbg),1) NV_FLAGS += -lineinfo else diff --git a/src/TabulatedPotential.h b/src/TabulatedPotential.h index 807db9e..e409bdb 100644 --- a/src/TabulatedPotential.h +++ b/src/TabulatedPotential.h @@ -282,20 +282,21 @@ public: template <bool is_periodic> HOST DEVICE inline float2 linearly_interpolate(float x, float start=0.0f) const { float w = (x - start) * step_inv; - int home = int( floorf(w) ); - w = w - home; + int signed_home = int( floorf(w) ); + w = w - signed_home; // if (home < 0) return EnergyForce(v0[0], Vector3(0.0f)); - if (home < 0) { - if (is_periodic) home += size; + if (signed_home < 0) { + if (is_periodic) signed_home += size; else return make_float2(pot[0],0.0f); } - else if (home >= size) { + unsigned int home = signed_home; + if (home >= size) { if (is_periodic) home -= size; else return make_float2(pot[size-1],0.0f); } float u0 = pot[home]; - float du = home+1 < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0; + float du = ((unsigned int) home+1) < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0; return make_float2(du*w+u0, du*step_inv); } -- GitLab