Skip to content
Snippets Groups Projects
Commit 367fb57f authored by cmaffeo2's avatar cmaffeo2
Browse files

Initial port to cuda11: include c++14 std, use texture objects instead of...

Initial port to cuda11: include c++14 std, use texture objects instead of deprecated texture refs, squash warnings
parent c6c19c64
No related branches found
No related tags found
No related merge requests found
......@@ -118,23 +118,83 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
const int maxPairs = 1<<25;
gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int)));
gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs));
gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi
// gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi
gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs));
gpuErrchk(cudaBindTexture(0, pairTabPotTypeTex, pairTabPotType_d, sizeof(int)*maxPairs)); //Han-Yi
// create texture object
{
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = pairLists_d;
resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
resDesc.res.linear.desc.x = 32; // bits per channel
resDesc.res.linear.desc.y = 32; // bits per channel
resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int2);
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
// create texture object: we only have to do this once!
pairLists_tex=0;
cudaCreateTextureObject(&pairLists_tex, &resDesc, &texDesc, NULL);
}
// create texture object
{
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = pairTabPotType_d;
resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
resDesc.res.linear.desc.x = 32; // bits per channel
resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int);
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
// create texture object: we only have to do this once!
pairTabPotType_tex = 0;
cudaCreateTextureObject(&pairTabPotType_tex, &resDesc, &texDesc, NULL);
}
//Han-Yi Chou
int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
//int* nCells_dev;
if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) {
int3 *Cells_dev;
gpuErrchk(cudaMalloc(&CellNeighborsList,sizeof(int)*27*nCells));
size_t sz = 27*nCells*sizeof(int);
gpuErrchk(cudaMalloc(&CellNeighborsList, sz));
//gpuErrchk(cudaMalloc(&nCells_dev,sizeof(int)));
gpuErrchk(cudaMalloc(&Cells_dev,sizeof(int3)));
//gpuErrchk(cudaMemcpy(nCells_dev,&nCells,1,cudaMemcpyHostToDevice);
gpuErrchk(cudaMemcpy(Cells_dev,&(decomp.nCells),sizeof(int3),cudaMemcpyHostToDevice));
createNeighborsList<<<256,256>>>(Cells_dev,CellNeighborsList);
gpuErrchk(cudaFree(Cells_dev));
cudaBindTexture(0, NeighborsTex, CellNeighborsList, 27*nCells*sizeof(int));
// create texture object
{
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = CellNeighborsList;
resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
resDesc.res.linear.desc.x = 32; // bits per channel
resDesc.res.linear.sizeInBytes = sz;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
// create texture object: we only have to do this once!
neighbors_tex=0;
cudaCreateTextureObject(&neighbors_tex, &resDesc, &texDesc, NULL);
}
}
}
......@@ -185,7 +245,7 @@ ComputeForce::~ComputeForce() {
gpuErrchk(cudaFree(sys_d));
//Han-Yi Unbind the texture
gpuErrchk(cudaUnbindTexture(PosTex));
gpuErrchk(cudaDestroyTextureObject(pos_tex));
gpuErrchk( cudaFree(pos_d) );
gpuErrchk( cudaFree(forceInternal_d) );
gpuErrchk( cudaFree(type_d) );
......@@ -215,11 +275,11 @@ ComputeForce::~ComputeForce() {
}
gpuErrchk(cudaFree(numPairs_d));
gpuErrchk(cudaUnbindTexture(pairListsTex)); //Han-Yi
gpuErrchk(cudaDestroyTextureObject(pairLists_tex));
gpuErrchk(cudaFree(pairLists_d));
gpuErrchk(cudaUnbindTexture(pairTabPotTypeTex));
gpuErrchk(cudaDestroyTextureObject(pairTabPotType_tex));
gpuErrchk(cudaFree(pairTabPotType_d));
gpuErrchk(cudaUnbindTexture(NeighborsTex));
gpuErrchk(cudaDestroyTextureObject(neighbors_tex));
gpuErrchk(cudaFree( CellNeighborsList));
}
......@@ -521,11 +581,11 @@ void ComputeForce::decompose() {
#if __CUDA_ARCH__ >= 520
createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d,
pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d,
excludeMap_d, numExcludes, pairlistdist2);
excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex);
#else //__CUDA_ARCH__ == 300
createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d,
pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d,
excludeMap_d, numExcludes, pairlistdist2);
excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex);
#endif
gpuKernelCheck();
......@@ -736,8 +796,8 @@ float ComputeForce::computeTabulated(bool get_energy) {
{
//gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*num*numReplicas));
//computeTabulatedKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, sys_d,
computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, pos_d, sys_d,
cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d);
computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, sys_d,
cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d, pairLists_tex, pos_tex, pairTabPotType_tex);
gpuKernelCheck();
//gpuErrchk(cudaUnbindTexture(PosTex));
}
......@@ -843,8 +903,28 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
const size_t tot_num = num * numReplicas;
gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
{
//Han-Yi bind to the texture
gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num));
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeLinear;
resDesc.res.linear.devPtr = pos_d;
resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
resDesc.res.linear.desc.x = 32; // bits per channel
resDesc.res.linear.desc.y = 32; // bits per channel
resDesc.res.linear.desc.z = 32; // bits per channel
resDesc.res.linear.desc.w = 32; // bits per channel
resDesc.res.linear.sizeInBytes = tot_num*sizeof(float4);
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
// create texture object: we only have to do this once!
pos_tex=0;
cudaCreateTextureObject(&pos_tex, &resDesc, &texDesc, NULL);
}
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
......@@ -858,43 +938,21 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom
{
const size_t tot_num = num * numReplicas;
gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
//Han-Yi Chou bind to the texture
gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num));
//gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas));
gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
copyToCUDA(forceInternal,pos);
gpuErrchk(cudaDeviceSynchronize());
}
void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom, float* random)
{
const size_t tot_num = num * numReplicas;
gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
gpuErrchk(cudaMalloc(&ran_d, sizeof(float) * tot_num));
//Han-Yi Chou bind to the texture
gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num));
//gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num));
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpyAsync(ran_d, random, sizeof(float) * tot_num, cudaMemcpyHostToDevice));
gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas));
gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
copyToCUDA(forceInternal, pos, mom);
gpuErrchk(cudaDeviceSynchronize());
}
void ComputeForce::setForceInternalOnDevice(Vector3* f) {
......
......@@ -13,10 +13,10 @@
#define MAX_CELLS_FOR_CELLNEIGHBORLIST 1<<25
texture<int, 1, cudaReadModeElementType> NeighborsTex;
texture<int, 1, cudaReadModeElementType> pairTabPotTypeTex;
texture<int2, 1, cudaReadModeElementType> pairListsTex;
texture<float4, 1, cudaReadModeElementType> PosTex;
// texture<int, 1, cudaReadModeElementType> NeighborsTex;
// texture<int, 1, cudaReadModeElementType> pairTabPotTypeTex;
//texture<int2, 1, cudaReadModeElementType> pairListsTex;
// texture<float4, 1, cudaReadModeElementType> PosTex;
__host__ __device__
EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha,
......@@ -303,7 +303,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
const int nCells,int* g_numPairs, int2* g_pair, int numParts, const int* __restrict__ type,
int* __restrict__ g_pairTabPotType, const Exclude* __restrict__ excludes,
const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2)
const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2, cudaTextureObject_t PosTex, cudaTextureObject_t NeighborsTex)
{
__shared__ float4 __align__(16) particle[N];
__shared__ int Index_i[N];
......@@ -336,7 +336,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
if(tid + pid_i < Ni && tid < N)
{
Index_i [tid] = cellInfo[rangeI.first+pid_i+tid].particle;
particle[tid] = tex1Dfetch(PosTex,Index_i[tid]);
particle[tid] = tex1Dfetch<float4>(PosTex,Index_i[tid]);
}
__syncthreads();
......@@ -360,7 +360,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
int neighbor_cell;
if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) {
neighbor_cell = tex1Dfetch(NeighborsTex,idx+27*cellid_i);
neighbor_cell = tex1Dfetch<int>(NeighborsTex,idx+27*cellid_i);
} else {
int3 cells = decomp->nCells;
int3 cell_idx = make_int3(cellid_i % cells.z,
......@@ -405,7 +405,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
continue;
}
float4 b = tex1Dfetch(PosTex,aj);
float4 b = tex1Dfetch<float4>(PosTex,aj);
Vector3 B(b.x,b.y,b.z);
float dr = (sys->wrapDiff(A-B)).length2();
......@@ -453,7 +453,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
{
int ai = cellInfo[rangeI.first+pid_i].particle;
float4 a = tex1Dfetch(PosTex,ai);
float4 a = tex1Dfetch<float4>(PosTex,ai);
Vector3 A(a.x,a.y,a.z);
int2 ex_pair = make_int2(-1,-1);
......@@ -812,9 +812,11 @@ __global__ void computeTabulatedKernel(
#endif
//#if 0
template<const int BlockSize>
__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, const BaseGrid* __restrict__ sys,
__global__ void computeTabulatedKernel(Vector3* force, const BaseGrid* __restrict__ sys,
float cutoff2, const int* __restrict__ g_numPairs, const int2* __restrict__ g_pair,
const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot)
const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot,
cudaTextureObject_t pairListsTex, cudaTextureObject_t PosTex, cudaTextureObject_t pairTabPotTypeTex
)
{
const int numPairs = *g_numPairs;
const int tid = threadIdx.x + blockDim.x * threadIdx.y
......@@ -826,7 +828,7 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict
for (int i = tid; i < numPairs; i += TotalThreads)
{
//int2 pair = g_pair[i];
int2 pair = tex1Dfetch(pairListsTex,i);
int2 pair = tex1Dfetch<int2>(pairListsTex,i);
//int ind = tex1Dfetch(pairTabPotTypeTex,i);
int ai = pair.x;
......@@ -834,12 +836,12 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict
//int ind = g_pairTabPotType[i];
Vector3 a(tex1Dfetch(PosTex, ai));
Vector3 b(tex1Dfetch(PosTex, aj));
Vector3 a(tex1Dfetch<float4>(PosTex, ai));
Vector3 b(tex1Dfetch<float4>(PosTex, aj));
Vector3 dr = sys->wrapDiff(b-a);
float d2 = dr.length2();
int ind = tex1Dfetch(pairTabPotTypeTex,i);
int ind = tex1Dfetch<int>(pairTabPotTypeTex,i);
if (tablePot[ind] != NULL && d2 <= cutoff2)
{
Vector3 f = tablePot[ind]->computef(dr,d2);
......
......@@ -218,17 +218,23 @@ private:
// Pairlists
float pairlistdist2;
int2 *pairLists_d;
cudaTextureObject_t pairLists_tex;
int *pairTabPotType_d;
cudaTextureObject_t pairTabPotType_tex;
int *numPairs_d;
//Han-Yi Chou
int *CellNeighborsList;
//MLog: List of variables that need to be moved over to ComputeForce class. Members of this list will be set to static to avoid large alterations in working code, thereby allowing us to access these variables easily.
cudaTextureObject_t neighbors_tex;
//BrownianParticleType* part;
//float electricConst;
//int fullLongRange;
Vector3* pos_d;
cudaTextureObject_t pos_tex;
Vector3* mom_d;
float* ran_d;
Vector3* forceInternal_d;
......
......@@ -4,7 +4,7 @@ INCLUDE = $(CUDA_PATH)/include
ifeq ($(dbg),1)
NV_FLAGS += -g -G
NV_FLAGS += -g -G -O0
EX_FLAGS = -g -m$(OS_SIZE)
else
NV_FLAGS = -Xptxas -O3
......@@ -31,7 +31,7 @@ CC_FLAGS += -DVERSION="\"$(VERSION)\"" -DSIGNAL
CC_FLAGS += -I$(CUDA_PATH)/include
CC_FLAGS += -Wall -Wno-write-strings -std=c++0x -pedantic# TODO: test on Mac OSX and other architectures
CC_FLAGS += -Wall -Wno-write-strings -std=c++14 -pedantic# TODO: test on Mac OSX and other architectures
ifeq ($(dbg),1)
NV_FLAGS += -lineinfo
else
......
......@@ -282,20 +282,21 @@ public:
template <bool is_periodic>
HOST DEVICE inline float2 linearly_interpolate(float x, float start=0.0f) const {
float w = (x - start) * step_inv;
int home = int( floorf(w) );
w = w - home;
int signed_home = int( floorf(w) );
w = w - signed_home;
// if (home < 0) return EnergyForce(v0[0], Vector3(0.0f));
if (home < 0) {
if (is_periodic) home += size;
if (signed_home < 0) {
if (is_periodic) signed_home += size;
else return make_float2(pot[0],0.0f);
}
else if (home >= size) {
unsigned int home = signed_home;
if (home >= size) {
if (is_periodic) home -= size;
else return make_float2(pot[size-1],0.0f);
}
float u0 = pot[home];
float du = home+1 < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0;
float du = ((unsigned int) home+1) < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0;
return make_float2(du*w+u0, du*step_inv);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment