From 367fb57fe5f5c3c625915b2f4dacfdff96912cbe Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Wed, 30 Dec 2020 13:47:04 -0600
Subject: [PATCH] Initial port to cuda11: include c++14 std, use texture
 objects instead of deprecated texture refs, squash warnings

---
 src/ComputeForce.cu      | 132 ++++++++++++++++++++++++++++-----------
 src/ComputeForce.cuh     |  32 +++++-----
 src/ComputeForce.h       |   6 ++
 src/Makefile             |   4 +-
 src/TabulatedPotential.h |  13 ++--
 5 files changed, 127 insertions(+), 60 deletions(-)

diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index df06503..fe11c6a 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -118,23 +118,83 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
 		const int maxPairs = 1<<25;
 		gpuErrchk(cudaMalloc(&numPairs_d,       sizeof(int)));
 		gpuErrchk(cudaMalloc(&pairLists_d,      sizeof(int2)*maxPairs));
-                gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi
+                // gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi
 		gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs));
-                gpuErrchk(cudaBindTexture(0, pairTabPotTypeTex, pairTabPotType_d, sizeof(int)*maxPairs)); //Han-Yi
+
+		// create texture object
+		{
+		    cudaResourceDesc resDesc;
+		    memset(&resDesc, 0, sizeof(resDesc));
+		    resDesc.resType = cudaResourceTypeLinear;
+		    resDesc.res.linear.devPtr = pairLists_d;
+		    resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+		    resDesc.res.linear.desc.x = 32; // bits per channel
+		    resDesc.res.linear.desc.y = 32; // bits per channel
+		    resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int2);
+
+		    cudaTextureDesc texDesc;
+		    memset(&texDesc, 0, sizeof(texDesc));
+		    texDesc.readMode = cudaReadModeElementType;
+
+		    // create texture object: we only have to do this once!
+		    pairLists_tex=0;
+		    cudaCreateTextureObject(&pairLists_tex, &resDesc, &texDesc, NULL);
+
+		}
+
+		// create texture object
+		{
+		    cudaResourceDesc resDesc;
+		    memset(&resDesc, 0, sizeof(resDesc));
+		    resDesc.resType = cudaResourceTypeLinear;
+		    resDesc.res.linear.devPtr = pairTabPotType_d;
+		    resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+		    resDesc.res.linear.desc.x = 32; // bits per channel
+		    resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int);
+
+		    cudaTextureDesc texDesc;
+		    memset(&texDesc, 0, sizeof(texDesc));
+		    texDesc.readMode = cudaReadModeElementType;
+
+		    // create texture object: we only have to do this once!
+		    pairTabPotType_tex = 0;
+		    cudaCreateTextureObject(&pairTabPotType_tex, &resDesc, &texDesc, NULL);
+
+		}
+
 
                 //Han-Yi Chou
                 int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
                 //int* nCells_dev;
 		if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) {
 		    int3 *Cells_dev;
-		    gpuErrchk(cudaMalloc(&CellNeighborsList,sizeof(int)*27*nCells));
+		    size_t sz = 27*nCells*sizeof(int);
+		    gpuErrchk(cudaMalloc(&CellNeighborsList, sz));
 		    //gpuErrchk(cudaMalloc(&nCells_dev,sizeof(int)));
 		    gpuErrchk(cudaMalloc(&Cells_dev,sizeof(int3)));
 		    //gpuErrchk(cudaMemcpy(nCells_dev,&nCells,1,cudaMemcpyHostToDevice);
 		    gpuErrchk(cudaMemcpy(Cells_dev,&(decomp.nCells),sizeof(int3),cudaMemcpyHostToDevice));
 		    createNeighborsList<<<256,256>>>(Cells_dev,CellNeighborsList);
 		    gpuErrchk(cudaFree(Cells_dev));
-		    cudaBindTexture(0, NeighborsTex, CellNeighborsList, 27*nCells*sizeof(int));
+
+		    // create texture object
+		    {
+			cudaResourceDesc resDesc;
+			memset(&resDesc, 0, sizeof(resDesc));
+			resDesc.resType = cudaResourceTypeLinear;
+			resDesc.res.linear.devPtr = CellNeighborsList;
+			resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+			resDesc.res.linear.desc.x = 32; // bits per channel
+			resDesc.res.linear.sizeInBytes = sz;
+
+			cudaTextureDesc texDesc;
+			memset(&texDesc, 0, sizeof(texDesc));
+			texDesc.readMode = cudaReadModeElementType;
+
+			// create texture object: we only have to do this once!
+			neighbors_tex=0;
+			cudaCreateTextureObject(&neighbors_tex, &resDesc, &texDesc, NULL);
+		    }
 		}
 	}
 	
@@ -185,7 +245,7 @@ ComputeForce::~ComputeForce() {
 
 		gpuErrchk(cudaFree(sys_d));
                 //Han-Yi Unbind the texture
-                gpuErrchk(cudaUnbindTexture(PosTex));
+                gpuErrchk(cudaDestroyTextureObject(pos_tex));
 		gpuErrchk( cudaFree(pos_d) );
 		gpuErrchk( cudaFree(forceInternal_d) );
 		gpuErrchk( cudaFree(type_d) );
@@ -215,11 +275,11 @@ ComputeForce::~ComputeForce() {
 	}
 
 	gpuErrchk(cudaFree(numPairs_d));
-        gpuErrchk(cudaUnbindTexture(pairListsTex)); //Han-Yi 
+	gpuErrchk(cudaDestroyTextureObject(pairLists_tex));
 	gpuErrchk(cudaFree(pairLists_d));
-        gpuErrchk(cudaUnbindTexture(pairTabPotTypeTex));
+        gpuErrchk(cudaDestroyTextureObject(pairTabPotType_tex));
 	gpuErrchk(cudaFree(pairTabPotType_d));
-        gpuErrchk(cudaUnbindTexture(NeighborsTex));
+        gpuErrchk(cudaDestroyTextureObject(neighbors_tex));
         gpuErrchk(cudaFree( CellNeighborsList));
 
 }
@@ -521,11 +581,11 @@ void ComputeForce::decompose() {
       #if __CUDA_ARCH__ >= 520
       createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d,
                                                                              pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d,
-                                                                             excludeMap_d, numExcludes, pairlistdist2);
+									   excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex);
       #else //__CUDA_ARCH__ == 300
       createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d,
                                                                            pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, 
-                                                                           excludeMap_d, numExcludes, pairlistdist2);
+                                                                           excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex);
       #endif
        
       gpuKernelCheck();
@@ -736,8 +796,8 @@ float ComputeForce::computeTabulated(bool get_energy) {
 	{
                 //gpuErrchk(cudaBindTexture(0,  PosTex, pos_d,sizeof(Vector3)*num*numReplicas));
 		//computeTabulatedKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, sys_d,
-	    computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, pos_d, sys_d,
-													cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d);
+	    computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, sys_d,
+													cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d, pairLists_tex, pos_tex, pairTabPotType_tex);
                   gpuKernelCheck();
                 //gpuErrchk(cudaUnbindTexture(PosTex));
 	}
@@ -843,8 +903,28 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 	const size_t tot_num = num * numReplicas;
 
 	gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
+	{
         //Han-Yi bind to the texture
-        gpuErrchk(cudaBindTexture(0,  PosTex, pos_d,sizeof(Vector3)*tot_num));
+	    cudaResourceDesc resDesc;
+	    memset(&resDesc, 0, sizeof(resDesc));
+	    resDesc.resType = cudaResourceTypeLinear;
+	    resDesc.res.linear.devPtr = pos_d;
+	    resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
+	    resDesc.res.linear.desc.x = 32; // bits per channel
+	    resDesc.res.linear.desc.y = 32; // bits per channel
+	    resDesc.res.linear.desc.z = 32; // bits per channel
+	    resDesc.res.linear.desc.w = 32; // bits per channel
+	    resDesc.res.linear.sizeInBytes = tot_num*sizeof(float4);
+	    
+	    cudaTextureDesc texDesc;
+	    memset(&texDesc, 0, sizeof(texDesc));
+	    texDesc.readMode = cudaReadModeElementType;
+	    
+	    // create texture object: we only have to do this once!
+	    pos_tex=0;
+	    cudaCreateTextureObject(&pos_tex, &resDesc, &texDesc, NULL);
+	}
+
         gpuErrchk(cudaDeviceSynchronize());
 
 	gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
@@ -858,43 +938,21 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom
 {
         const size_t tot_num = num * numReplicas;
 
-        gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
         gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
-        //Han-Yi Chou bind to the texture
-                  gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num));
-        //gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num));
-                  gpuErrchk(cudaDeviceSynchronize());
-
-        gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
         gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
 
-        gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas));
-        gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
-
+	copyToCUDA(forceInternal,pos);
         gpuErrchk(cudaDeviceSynchronize());
 }
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom, float* random)
 {
         const size_t tot_num = num * numReplicas;
 
-        gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
-        gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
         gpuErrchk(cudaMalloc(&ran_d, sizeof(float) * tot_num));
-
-        //Han-Yi Chou bind to the texture
-                  gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num));
-        //gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num));
-                  gpuErrchk(cudaDeviceSynchronize());
-
-        gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
-        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
         gpuErrchk(cudaMemcpyAsync(ran_d, random, sizeof(float) * tot_num, cudaMemcpyHostToDevice));
 
-        gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas));
-        gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
-
+	copyToCUDA(forceInternal, pos, mom);
         gpuErrchk(cudaDeviceSynchronize());
-
 }
 
 void ComputeForce::setForceInternalOnDevice(Vector3* f) {
diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh
index 0129319..4f7dfef 100644
--- a/src/ComputeForce.cuh
+++ b/src/ComputeForce.cuh
@@ -13,10 +13,10 @@
 
 #define MAX_CELLS_FOR_CELLNEIGHBORLIST 1<<25
 
-texture<int,    1, cudaReadModeElementType>      NeighborsTex;
-texture<int,    1, cudaReadModeElementType> pairTabPotTypeTex;
-texture<int2,   1, cudaReadModeElementType>      pairListsTex;
-texture<float4, 1, cudaReadModeElementType>            PosTex;
+// texture<int,    1, cudaReadModeElementType>      NeighborsTex;
+// texture<int,    1, cudaReadModeElementType> pairTabPotTypeTex;
+//texture<int2,   1, cudaReadModeElementType>      pairListsTex;
+// texture<float4, 1, cudaReadModeElementType>            PosTex;
 
 __host__ __device__
 EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha,
@@ -303,7 +303,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
                                 const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
                                 const int nCells,int* g_numPairs, int2* g_pair, int numParts, const int* __restrict__ type,
                                 int* __restrict__ g_pairTabPotType, const Exclude* __restrict__ excludes,
-                                const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2)
+                                const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2, cudaTextureObject_t PosTex, cudaTextureObject_t NeighborsTex)
 {
     __shared__ float4 __align__(16) particle[N];
     __shared__ int     Index_i[N];
@@ -336,7 +336,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
                 if(tid + pid_i < Ni && tid < N)
                 {
                     Index_i [tid] = cellInfo[rangeI.first+pid_i+tid].particle;
-                    particle[tid] = tex1Dfetch(PosTex,Index_i[tid]);
+                    particle[tid] = tex1Dfetch<float4>(PosTex,Index_i[tid]);
                 }
                 __syncthreads();
 
@@ -360,7 +360,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
 
 			int neighbor_cell;
 			if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) {
-			    neighbor_cell = tex1Dfetch(NeighborsTex,idx+27*cellid_i);
+			    neighbor_cell = tex1Dfetch<int>(NeighborsTex,idx+27*cellid_i);
 			} else {
 			    int3 cells = decomp->nCells;
 			    int3 cell_idx = make_int3(cellid_i %  cells.z,
@@ -405,7 +405,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
                                 continue;
                             }
 
-                            float4 b = tex1Dfetch(PosTex,aj);
+                            float4 b = tex1Dfetch<float4>(PosTex,aj);
                             Vector3 B(b.x,b.y,b.z);
 
                             float dr = (sys->wrapDiff(A-B)).length2();
@@ -453,7 +453,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
         {
             int ai      = cellInfo[rangeI.first+pid_i].particle;
 
-            float4 a = tex1Dfetch(PosTex,ai);
+            float4 a = tex1Dfetch<float4>(PosTex,ai);
             Vector3 A(a.x,a.y,a.z);
 
             int2 ex_pair = make_int2(-1,-1); 
@@ -812,9 +812,11 @@ __global__ void computeTabulatedKernel(
 #endif
 //#if 0
 template<const int BlockSize>
-__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, const BaseGrid* __restrict__ sys, 
+__global__ void computeTabulatedKernel(Vector3* force, const BaseGrid* __restrict__ sys, 
                                        float cutoff2, const int* __restrict__ g_numPairs, const int2* __restrict__ g_pair, 
-                                       const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot)
+                                       const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot,
+				       cudaTextureObject_t pairListsTex, cudaTextureObject_t PosTex, cudaTextureObject_t pairTabPotTypeTex
+    )
 {
     const int numPairs = *g_numPairs;
     const int tid = threadIdx.x + blockDim.x * threadIdx.y
@@ -826,7 +828,7 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict
     for (int i = tid; i < numPairs; i += TotalThreads) 
     {
         //int2 pair = g_pair[i];
-        int2 pair = tex1Dfetch(pairListsTex,i);
+        int2 pair = tex1Dfetch<int2>(pairListsTex,i);
         //int  ind  = tex1Dfetch(pairTabPotTypeTex,i); 
 
         int ai = pair.x;
@@ -834,12 +836,12 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict
                         
         //int ind = g_pairTabPotType[i];
 
-        Vector3 a(tex1Dfetch(PosTex, ai));
-        Vector3 b(tex1Dfetch(PosTex, aj));
+        Vector3 a(tex1Dfetch<float4>(PosTex, ai));
+        Vector3 b(tex1Dfetch<float4>(PosTex, aj));
         Vector3 dr = sys->wrapDiff(b-a);
         
         float d2 = dr.length2();
-        int  ind  = tex1Dfetch(pairTabPotTypeTex,i);
+        int  ind  = tex1Dfetch<int>(pairTabPotTypeTex,i);
         if (tablePot[ind] != NULL && d2 <= cutoff2) 
         {
             Vector3 f = tablePot[ind]->computef(dr,d2);
diff --git a/src/ComputeForce.h b/src/ComputeForce.h
index a523404..c58b997 100644
--- a/src/ComputeForce.h
+++ b/src/ComputeForce.h
@@ -218,17 +218,23 @@ private:
 	// Pairlists
 	float pairlistdist2;
 	int2 *pairLists_d;
+	cudaTextureObject_t pairLists_tex;
+
 	int *pairTabPotType_d;
+	cudaTextureObject_t pairTabPotType_tex;
 
 	int *numPairs_d;
 
         //Han-Yi Chou
         int *CellNeighborsList;	
 	//MLog: List of variables that need to be moved over to ComputeForce class. Members of this list will be set to static to avoid large alterations in working code, thereby allowing us to access these variables easily.
+	cudaTextureObject_t neighbors_tex;
+
 	//BrownianParticleType* part;
 	//float electricConst;
 	//int fullLongRange;
 	Vector3* pos_d;
+	cudaTextureObject_t pos_tex;
         Vector3* mom_d;
         float*   ran_d;
 	Vector3* forceInternal_d;
diff --git a/src/Makefile b/src/Makefile
index 2ada08a..3c94c53 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,7 +4,7 @@ INCLUDE = $(CUDA_PATH)/include
 
 
 ifeq ($(dbg),1)
-	NV_FLAGS += -g -G 
+	NV_FLAGS += -g -G -O0
 	EX_FLAGS = -g -m$(OS_SIZE)
 else
 	NV_FLAGS = -Xptxas -O3
@@ -31,7 +31,7 @@ CC_FLAGS += -DVERSION="\"$(VERSION)\"" -DSIGNAL
 
 
 CC_FLAGS += -I$(CUDA_PATH)/include
-CC_FLAGS += -Wall -Wno-write-strings -std=c++0x -pedantic# TODO: test on Mac OSX and other architectures
+CC_FLAGS += -Wall -Wno-write-strings -std=c++14 -pedantic# TODO: test on Mac OSX and other architectures
 ifeq ($(dbg),1)
 NV_FLAGS += -lineinfo
 else
diff --git a/src/TabulatedPotential.h b/src/TabulatedPotential.h
index 807db9e..e409bdb 100644
--- a/src/TabulatedPotential.h
+++ b/src/TabulatedPotential.h
@@ -282,20 +282,21 @@ public:
     template <bool is_periodic>
 	HOST DEVICE inline float2 linearly_interpolate(float x, float start=0.0f) const {
 	float w = (x - start) * step_inv;
-	int home = int( floorf(w) );
-	w = w - home;
+	int signed_home = int( floorf(w) );
+	w = w - signed_home;
 	// if (home < 0) return EnergyForce(v0[0], Vector3(0.0f));
-	if (home < 0) {
-	    if (is_periodic) home += size;
+	if (signed_home < 0) {
+	    if (is_periodic) signed_home += size;
 	    else return make_float2(pot[0],0.0f);
 	}
-	else if (home >= size) {
+	unsigned int home = signed_home;
+	if (home >= size) {
 	    if (is_periodic) home -= size;
 	    else return make_float2(pot[size-1],0.0f);
 	}
 
 	float u0 = pot[home];
-	float du = home+1 < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0;
+	float du = ((unsigned int) home+1) < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0;
 
 	return make_float2(du*w+u0, du*step_inv);
     }
-- 
GitLab