Initial port to cuda11: include c++14 std, use texture objects instead of...

Initial port to cuda11: include c++14 std, use texture objects instead of deprecated texture refs, squash warnings

Initial port to cuda11: include c++14 std, use texture objects instead of...
367fb57f · cmaffeo2 · c6c19c64 · 367fb57f · 367fb57f · 367fb57f
Commit 367fb57f authored 4 years ago by cmaffeo2
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -118,23 +118,83 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) :
 		const int maxPairs = 1<<25;
 		gpuErrchk(cudaMalloc(&numPairs_d,       sizeof(int)));
 		gpuErrchk(cudaMalloc(&pairLists_d,      sizeof(int2)*maxPairs));
-                gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi
+                // gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi
 		gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs));
-                gpuErrchk(cudaBindTexture(0, pairTabPotTypeTex, pairTabPotType_d, sizeof(int)*maxPairs)); //Han-Yi
+
+		// create texture object
+		{
+		    cudaResourceDesc resDesc;
+		    memset(&resDesc, 0, sizeof(resDesc));
+		    resDesc.resType = cudaResourceTypeLinear;
+		    resDesc.res.linear.devPtr = pairLists_d;
+		    resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+		    resDesc.res.linear.desc.x = 32; // bits per channel
+		    resDesc.res.linear.desc.y = 32; // bits per channel
+		    resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int2);
+
+		    cudaTextureDesc texDesc;
+		    memset(&texDesc, 0, sizeof(texDesc));
+		    texDesc.readMode = cudaReadModeElementType;
+
+		    // create texture object: we only have to do this once!
+		    pairLists_tex=0;
+		    cudaCreateTextureObject(&pairLists_tex, &resDesc, &texDesc, NULL);
+
+		}
+
+		// create texture object
+		{
+		    cudaResourceDesc resDesc;
+		    memset(&resDesc, 0, sizeof(resDesc));
+		    resDesc.resType = cudaResourceTypeLinear;
+		    resDesc.res.linear.devPtr = pairTabPotType_d;
+		    resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+		    resDesc.res.linear.desc.x = 32; // bits per channel
+		    resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int);
+
+		    cudaTextureDesc texDesc;
+		    memset(&texDesc, 0, sizeof(texDesc));
+		    texDesc.readMode = cudaReadModeElementType;
+
+		    // create texture object: we only have to do this once!
+		    pairTabPotType_tex = 0;
+		    cudaCreateTextureObject(&pairTabPotType_tex, &resDesc, &texDesc, NULL);
+
+		}
+

                //Han-Yi Chou
                int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z;
                //int* nCells_dev;
 		if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) {
 		    int3 *Cells_dev;
-		    gpuErrchk(cudaMalloc(&CellNeighborsList,sizeof(int)*27*nCells));
+		    size_t sz = 27*nCells*sizeof(int);
+		    gpuErrchk(cudaMalloc(&CellNeighborsList, sz));
 		    //gpuErrchk(cudaMalloc(&nCells_dev,sizeof(int)));
 		    gpuErrchk(cudaMalloc(&Cells_dev,sizeof(int3)));
 		    //gpuErrchk(cudaMemcpy(nCells_dev,&nCells,1,cudaMemcpyHostToDevice);
 		    gpuErrchk(cudaMemcpy(Cells_dev,&(decomp.nCells),sizeof(int3),cudaMemcpyHostToDevice));
 		    createNeighborsList<<<256,256>>>(Cells_dev,CellNeighborsList);
 		    gpuErrchk(cudaFree(Cells_dev));
-		    cudaBindTexture(0, NeighborsTex, CellNeighborsList, 27*nCells*sizeof(int));
+
+		    // create texture object
+		    {
+			cudaResourceDesc resDesc;
+			memset(&resDesc, 0, sizeof(resDesc));
+			resDesc.resType = cudaResourceTypeLinear;
+			resDesc.res.linear.devPtr = CellNeighborsList;
+			resDesc.res.linear.desc.f = cudaChannelFormatKindSigned;
+			resDesc.res.linear.desc.x = 32; // bits per channel
+			resDesc.res.linear.sizeInBytes = sz;
+
+			cudaTextureDesc texDesc;
+			memset(&texDesc, 0, sizeof(texDesc));
+			texDesc.readMode = cudaReadModeElementType;
+
+			// create texture object: we only have to do this once!
+			neighbors_tex=0;
+			cudaCreateTextureObject(&neighbors_tex, &resDesc, &texDesc, NULL);
+		    }
 		}
 	}
 	
@@ -185,7 +245,7 @@ ComputeForce::~ComputeForce() {

 		gpuErrchk(cudaFree(sys_d));
                //Han-Yi Unbind the texture
-                gpuErrchk(cudaUnbindTexture(PosTex));
+                gpuErrchk(cudaDestroyTextureObject(pos_tex));
 		gpuErrchk( cudaFree(pos_d) );
 		gpuErrchk( cudaFree(forceInternal_d) );
 		gpuErrchk( cudaFree(type_d) );
@@ -215,11 +275,11 @@ ComputeForce::~ComputeForce() {
 	}

 	gpuErrchk(cudaFree(numPairs_d));
-        gpuErrchk(cudaUnbindTexture(pairListsTex)); //Han-Yi 
+	gpuErrchk(cudaDestroyTextureObject(pairLists_tex));
 	gpuErrchk(cudaFree(pairLists_d));
-        gpuErrchk(cudaUnbindTexture(pairTabPotTypeTex));
+        gpuErrchk(cudaDestroyTextureObject(pairTabPotType_tex));
 	gpuErrchk(cudaFree(pairTabPotType_d));
-        gpuErrchk(cudaUnbindTexture(NeighborsTex));
+        gpuErrchk(cudaDestroyTextureObject(neighbors_tex));
        gpuErrchk(cudaFree( CellNeighborsList));

 }
@@ -521,11 +581,11 @@ void ComputeForce::decompose() {
      #if __CUDA_ARCH__ >= 520
      createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d,
                                                                             pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d,
-                                                                             excludeMap_d, numExcludes, pairlistdist2);
+									   excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex);
      #else //__CUDA_ARCH__ == 300
      createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d,
                                                                           pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, 
-                                                                           excludeMap_d, numExcludes, pairlistdist2);
+                                                                           excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex);
      #endif
       
      gpuKernelCheck();
@@ -736,8 +796,8 @@ float ComputeForce::computeTabulated(bool get_energy) {
 	{
                //gpuErrchk(cudaBindTexture(0,  PosTex, pos_d,sizeof(Vector3)*num*numReplicas));
 		//computeTabulatedKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, sys_d,
-	    computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, pos_d, sys_d,
-													cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d);
+	    computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, sys_d,
+													cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d, pairLists_tex, pos_tex, pairTabPotType_tex);
                  gpuKernelCheck();
                //gpuErrchk(cudaUnbindTexture(PosTex));
 	}
@@ -843,8 +903,28 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos)
 	const size_t tot_num = num * numReplicas;

 	gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
+	{
        //Han-Yi bind to the texture
-        gpuErrchk(cudaBindTexture(0,  PosTex, pos_d,sizeof(Vector3)*tot_num));
+	    cudaResourceDesc resDesc;
+	    memset(&resDesc, 0, sizeof(resDesc));
+	    resDesc.resType = cudaResourceTypeLinear;
+	    resDesc.res.linear.devPtr = pos_d;
+	    resDesc.res.linear.desc.f = cudaChannelFormatKindFloat;
+	    resDesc.res.linear.desc.x = 32; // bits per channel
+	    resDesc.res.linear.desc.y = 32; // bits per channel
+	    resDesc.res.linear.desc.z = 32; // bits per channel
+	    resDesc.res.linear.desc.w = 32; // bits per channel
+	    resDesc.res.linear.sizeInBytes = tot_num*sizeof(float4);
+	    
+	    cudaTextureDesc texDesc;
+	    memset(&texDesc, 0, sizeof(texDesc));
+	    texDesc.readMode = cudaReadModeElementType;
+	    
+	    // create texture object: we only have to do this once!
+	    pos_tex=0;
+	    cudaCreateTextureObject(&pos_tex, &resDesc, &texDesc, NULL);
+	}
+
        gpuErrchk(cudaDeviceSynchronize());

 	gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
@@ -858,43 +938,21 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom
 {
        const size_t tot_num = num * numReplicas;

-        gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
        gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
-        //Han-Yi Chou bind to the texture
-                  gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num));
-        //gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num));
-                  gpuErrchk(cudaDeviceSynchronize());
-
-        gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));

-        gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas));
-        gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
-
+	copyToCUDA(forceInternal,pos);
        gpuErrchk(cudaDeviceSynchronize());
 }
 void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom, float* random)
 {
        const size_t tot_num = num * numReplicas;

-        gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num));
-        gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num));
        gpuErrchk(cudaMalloc(&ran_d, sizeof(float) * tot_num));
-
-        //Han-Yi Chou bind to the texture
-                  gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*tot_num));
-        //gpuErrchk(cudaBindTexture(0, MomTex, mom_d,sizeof(Vector3)*tot_num));
-                  gpuErrchk(cudaDeviceSynchronize());
-
-        gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
-        gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpyAsync(ran_d, random, sizeof(float) * tot_num, cudaMemcpyHostToDevice));

-        gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas));
-        gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice));
-
+	copyToCUDA(forceInternal, pos, mom);
        gpuErrchk(cudaDeviceSynchronize());
-
 }

 void ComputeForce::setForceInternalOnDevice(Vector3* f) {

--- a/src/ComputeForce.cuh
+++ b/src/ComputeForce.cuh
@@ -13,10 +13,10 @@

 #define MAX_CELLS_FOR_CELLNEIGHBORLIST 1<<25

-texture<int,    1, cudaReadModeElementType>      NeighborsTex;
-texture<int,    1, cudaReadModeElementType> pairTabPotTypeTex;
-texture<int2,   1, cudaReadModeElementType>      pairListsTex;
-texture<float4, 1, cudaReadModeElementType>            PosTex;
+// texture<int,    1, cudaReadModeElementType>      NeighborsTex;
+// texture<int,    1, cudaReadModeElementType> pairTabPotTypeTex;
+//texture<int2,   1, cudaReadModeElementType>      pairListsTex;
+// texture<float4, 1, cudaReadModeElementType>            PosTex;

 __host__ __device__
 EnergyForce ComputeForce::coulombForce(Vector3 r, float alpha,
@@ -303,7 +303,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
                                const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp,
                                const int nCells,int* g_numPairs, int2* g_pair, int numParts, const int* __restrict__ type,
                                int* __restrict__ g_pairTabPotType, const Exclude* __restrict__ excludes,
-                                const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2)
+                                const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2, cudaTextureObject_t PosTex, cudaTextureObject_t NeighborsTex)
 {
    __shared__ float4 __align__(16) particle[N];
    __shared__ int     Index_i[N];
@@ -336,7 +336,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
                if(tid + pid_i < Ni && tid < N)
                {
                    Index_i [tid] = cellInfo[rangeI.first+pid_i+tid].particle;
-                    particle[tid] = tex1Dfetch(PosTex,Index_i[tid]);
+                    particle[tid] = tex1Dfetch<float4>(PosTex,Index_i[tid]);
                }
                __syncthreads();

@@ -360,7 +360,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const

 			int neighbor_cell;
 			if (nCells < MAX_CELLS_FOR_CELLNEIGHBORLIST) {
-			    neighbor_cell = tex1Dfetch(NeighborsTex,idx+27*cellid_i);
+			    neighbor_cell = tex1Dfetch<int>(NeighborsTex,idx+27*cellid_i);
 			} else {
 			    int3 cells = decomp->nCells;
 			    int3 cell_idx = make_int3(cellid_i %  cells.z,
@@ -405,7 +405,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
                                continue;
                            }

-                            float4 b = tex1Dfetch(PosTex,aj);
+                            float4 b = tex1Dfetch<float4>(PosTex,aj);
                            Vector3 B(b.x,b.y,b.z);

                            float dr = (sys->wrapDiff(A-B)).length2();
@@ -453,7 +453,7 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const
        {
            int ai      = cellInfo[rangeI.first+pid_i].particle;

-            float4 a = tex1Dfetch(PosTex,ai);
+            float4 a = tex1Dfetch<float4>(PosTex,ai);
            Vector3 A(a.x,a.y,a.z);

            int2 ex_pair = make_int2(-1,-1); 
@@ -812,9 +812,11 @@ __global__ void computeTabulatedKernel(
 #endif
 //#if 0
 template<const int BlockSize>
-__global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict__ pos, const BaseGrid* __restrict__ sys, 
+__global__ void computeTabulatedKernel(Vector3* force, const BaseGrid* __restrict__ sys, 
                                       float cutoff2, const int* __restrict__ g_numPairs, const int2* __restrict__ g_pair, 
-                                       const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot)
+                                       const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot,
+				       cudaTextureObject_t pairListsTex, cudaTextureObject_t PosTex, cudaTextureObject_t pairTabPotTypeTex
+    )
 {
    const int numPairs = *g_numPairs;
    const int tid = threadIdx.x + blockDim.x * threadIdx.y
@@ -826,7 +828,7 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict
    for (int i = tid; i < numPairs; i += TotalThreads) 
    {
        //int2 pair = g_pair[i];
-        int2 pair = tex1Dfetch(pairListsTex,i);
+        int2 pair = tex1Dfetch<int2>(pairListsTex,i);
        //int  ind  = tex1Dfetch(pairTabPotTypeTex,i); 

        int ai = pair.x;
@@ -834,12 +836,12 @@ __global__ void computeTabulatedKernel(Vector3* force, const Vector3* __restrict
                        
        //int ind = g_pairTabPotType[i];

-        Vector3 a(tex1Dfetch(PosTex, ai));
-        Vector3 b(tex1Dfetch(PosTex, aj));
+        Vector3 a(tex1Dfetch<float4>(PosTex, ai));
+        Vector3 b(tex1Dfetch<float4>(PosTex, aj));
        Vector3 dr = sys->wrapDiff(b-a);
        
        float d2 = dr.length2();
-        int  ind  = tex1Dfetch(pairTabPotTypeTex,i);
+        int  ind  = tex1Dfetch<int>(pairTabPotTypeTex,i);
        if (tablePot[ind] != NULL && d2 <= cutoff2) 
        {
            Vector3 f = tablePot[ind]->computef(dr,d2);

--- a/src/ComputeForce.h
+++ b/src/ComputeForce.h
@@ -218,17 +218,23 @@ private:
 	// Pairlists
 	float pairlistdist2;
 	int2 *pairLists_d;
+	cudaTextureObject_t pairLists_tex;
+
 	int *pairTabPotType_d;
+	cudaTextureObject_t pairTabPotType_tex;

 	int *numPairs_d;

        //Han-Yi Chou
        int *CellNeighborsList;	
 	//MLog: List of variables that need to be moved over to ComputeForce class. Members of this list will be set to static to avoid large alterations in working code, thereby allowing us to access these variables easily.
+	cudaTextureObject_t neighbors_tex;
+
 	//BrownianParticleType* part;
 	//float electricConst;
 	//int fullLongRange;
 	Vector3* pos_d;
+	cudaTextureObject_t pos_tex;
        Vector3* mom_d;
        float*   ran_d;
 	Vector3* forceInternal_d;

--- a/src/Makefile
+++ b/src/Makefile
@@ -4,7 +4,7 @@ INCLUDE = $(CUDA_PATH)/include


 ifeq ($(dbg),1)
-	NV_FLAGS += -g -G 
+	NV_FLAGS += -g -G -O0
 	EX_FLAGS = -g -m$(OS_SIZE)
 else
 	NV_FLAGS = -Xptxas -O3
@@ -31,7 +31,7 @@ CC_FLAGS += -DVERSION="\"$(VERSION)\"" -DSIGNAL


 CC_FLAGS += -I$(CUDA_PATH)/include
-CC_FLAGS += -Wall -Wno-write-strings -std=c++0x -pedantic# TODO: test on Mac OSX and other architectures
+CC_FLAGS += -Wall -Wno-write-strings -std=c++14 -pedantic# TODO: test on Mac OSX and other architectures
 ifeq ($(dbg),1)
 NV_FLAGS += -lineinfo
 else

--- a/src/TabulatedPotential.h
+++ b/src/TabulatedPotential.h
@@ -282,20 +282,21 @@ public:
    template <bool is_periodic>
 	HOST DEVICE inline float2 linearly_interpolate(float x, float start=0.0f) const {
 	float w = (x - start) * step_inv;
-	int home = int( floorf(w) );
-	w = w - home;
+	int signed_home = int( floorf(w) );
+	w = w - signed_home;
 	// if (home < 0) return EnergyForce(v0[0], Vector3(0.0f));
-	if (home < 0) {
-	    if (is_periodic) home += size;
+	if (signed_home < 0) {
+	    if (is_periodic) signed_home += size;
 	    else return make_float2(pot[0],0.0f);
 	}
-	else if (home >= size) {
+	unsigned int home = signed_home;
+	if (home >= size) {
 	    if (is_periodic) home -= size;
 	    else return make_float2(pot[size-1],0.0f);
 	}

 	float u0 = pot[home];
-	float du = home+1 < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0;
+	float du = ((unsigned int) home+1) < size ? pot[home+1]-u0 : is_periodic ? pot[0]-u0 : 0;

 	return make_float2(du*w+u0, du*step_inv);
    }