diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu index fe11c6a73b369782db0e8095d315ff24e7d9392f..7d3ce8118d94fef022be78b043f611a234cd5046 100644 --- a/src/ComputeForce.cu +++ b/src/ComputeForce.cu @@ -8,6 +8,8 @@ #include <cuda_profiler_api.h> #include <fstream> #include <iostream> + +#ifndef gpuErrchk #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -15,6 +17,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t if (abort) exit(code); } } +#endif #define gpuKernelCheck() {kernelCheck( __FILE__, __LINE__); } inline void kernelCheck(const char* file, int line) @@ -30,6 +33,8 @@ inline void kernelCheck(const char* file, int line) cudaEvent_t start, stop; +GPUManager ComputeForce::gpuman = GPUManager(); + void runSort(int2 *d1, int *d2, float *key, int2 *scratch1, int *scratch2, float *scratchKey, unsigned int count); @@ -46,6 +51,20 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : numBondAngles(c.numBondAngles), numProductPotentials(c.numProductPotentials), numReplicas(numReplicas) { + // Grow vectors for per-gpu device pointers + for (int i = 0; i < gpuman.gpus.size(); ++i) { + int s = gpuman.gpus.size(); + sys_d = std::vector<BaseGrid*>(s); + tablePot_d = std::vector<TabulatedPotential**>(s); + pairLists_d = std::vector<int2*>(s); + pairLists_tex = std::vector<cudaTextureObject_t>(s); + pairTabPotType_d = std::vector<int*>(s); + pairTabPotType_tex = std::vector<cudaTextureObject_t>(s); + numPairs_d = std::vector<int*>(s); + pos_d = std::vector<Vector3*>(s); + forceInternal_d = std::vector<Vector3*>(s); + } + // Allocate the parameter tables. decomp_d = NULL; @@ -61,8 +80,8 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : gpuErrchk(cudaMalloc(&tableEps_d, tableSize)); gpuErrchk(cudaMalloc(&tableRad6_d, tableSize)); gpuErrchk(cudaMalloc(&tableAlpha_d, tableSize)); - gpuErrchk(cudaMalloc(&sys_d, sizeof(BaseGrid))); - gpuErrchk(cudaMemcpyAsync(sys_d, sys, sizeof(BaseGrid), cudaMemcpyHostToDevice)); + gpuErrchk(cudaMalloc(&sys_d[0], sizeof(BaseGrid))); + gpuErrchk(cudaMemcpyAsync(sys_d[0], sys, sizeof(BaseGrid), cudaMemcpyHostToDevice)); // Build the parameter tables. makeTables(c.part); @@ -78,7 +97,7 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : tablePot_addr[i] = NULL; tablePot[i] = NULL; } - gpuErrchk(cudaMalloc(&tablePot_d, sizeof(TabulatedPotential*) * np2)); + gpuErrchk(cudaMalloc(&tablePot_d[0], sizeof(TabulatedPotential*) * np2)); // Create the bond table tableBond = new TabulatedPotential*[numTabBondFiles]; @@ -116,17 +135,17 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : { // allocate device for pairlists // RBTODO: select maxpairs in better way; add assertion in kernel to avoid going past this const int maxPairs = 1<<25; - gpuErrchk(cudaMalloc(&numPairs_d, sizeof(int))); - gpuErrchk(cudaMalloc(&pairLists_d, sizeof(int2)*maxPairs)); - // gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d, sizeof(int2)*maxPairs)); //Han-Yi - gpuErrchk(cudaMalloc(&pairTabPotType_d, sizeof(int)*maxPairs)); + gpuErrchk(cudaMalloc(&numPairs_d[0], sizeof(int))); + gpuErrchk(cudaMalloc(&pairLists_d[0], sizeof(int2)*maxPairs)); + // gpuErrchk(cudaBindTexture(0, pairListsTex, pairLists_d[0], sizeof(int2)*maxPairs)); //Han-Yi + gpuErrchk(cudaMalloc(&pairTabPotType_d[0], sizeof(int)*maxPairs)); // create texture object { cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeLinear; - resDesc.res.linear.devPtr = pairLists_d; + resDesc.res.linear.devPtr = pairLists_d[0]; resDesc.res.linear.desc.f = cudaChannelFormatKindSigned; resDesc.res.linear.desc.x = 32; // bits per channel resDesc.res.linear.desc.y = 32; // bits per channel @@ -137,8 +156,8 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : texDesc.readMode = cudaReadModeElementType; // create texture object: we only have to do this once! - pairLists_tex=0; - cudaCreateTextureObject(&pairLists_tex, &resDesc, &texDesc, NULL); + pairLists_tex[0]=0; + cudaCreateTextureObject(&pairLists_tex[0], &resDesc, &texDesc, NULL); } @@ -147,7 +166,7 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeLinear; - resDesc.res.linear.devPtr = pairTabPotType_d; + resDesc.res.linear.devPtr = pairTabPotType_d[0]; resDesc.res.linear.desc.f = cudaChannelFormatKindSigned; resDesc.res.linear.desc.x = 32; // bits per channel resDesc.res.linear.sizeInBytes = maxPairs*sizeof(int); @@ -157,8 +176,8 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : texDesc.readMode = cudaReadModeElementType; // create texture object: we only have to do this once! - pairTabPotType_tex = 0; - cudaCreateTextureObject(&pairTabPotType_tex, &resDesc, &texDesc, NULL); + pairTabPotType_tex[0] = 0; + cudaCreateTextureObject(&pairTabPotType_tex[0], &resDesc, &texDesc, NULL); } @@ -210,7 +229,6 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : cudaEventCreate(&start); cudaEventCreate(&stop); } -GPUManager ComputeForce::gpuman = GPUManager(); ComputeForce::~ComputeForce() { delete[] tableEps; @@ -243,11 +261,13 @@ ComputeForce::~ComputeForce() { gpuErrchk(cudaFree(energies_d)); - gpuErrchk(cudaFree(sys_d)); + gpuErrchk(cudaFree(sys_d[0])); //Han-Yi Unbind the texture gpuErrchk(cudaDestroyTextureObject(pos_tex)); - gpuErrchk( cudaFree(pos_d) ); - gpuErrchk( cudaFree(forceInternal_d) ); + for (int i = 0; i < forceInternal_d.size(); ++i) { + gpuErrchk( cudaFree(forceInternal_d[i]) ); + } + gpuErrchk( cudaFree(pos_d[0]) ); gpuErrchk( cudaFree(type_d) ); if (numBonds > 0) { gpuErrchk( cudaFree(bonds_d) ); @@ -274,11 +294,11 @@ ComputeForce::~ComputeForce() { } } - gpuErrchk(cudaFree(numPairs_d)); - gpuErrchk(cudaDestroyTextureObject(pairLists_tex)); - gpuErrchk(cudaFree(pairLists_d)); - gpuErrchk(cudaDestroyTextureObject(pairTabPotType_tex)); - gpuErrchk(cudaFree(pairTabPotType_d)); + gpuErrchk(cudaFree(numPairs_d[0])); + gpuErrchk(cudaDestroyTextureObject(pairLists_tex[0])); + gpuErrchk(cudaFree(pairLists_d[0])); + gpuErrchk(cudaDestroyTextureObject(pairTabPotType_tex[0])); + gpuErrchk(cudaFree(pairTabPotType_d[0])); gpuErrchk(cudaDestroyTextureObject(neighbors_tex)); gpuErrchk(cudaFree( CellNeighborsList)); @@ -384,7 +404,7 @@ bool ComputeForce::addTabulatedPotential(String fileName, int type0, int type1) t->v0 = NULL; t->v1 = NULL; t->v2 = NULL; t->v3 = NULL; delete t; - gpuErrchk(cudaMemcpy(tablePot_d, tablePot_addr, + gpuErrchk(cudaMemcpy(tablePot_d[0], tablePot_addr, sizeof(TabulatedPotential*) * numParts * numParts, cudaMemcpyHostToDevice)); return true; @@ -524,11 +544,11 @@ void ComputeForce::decompose() { cudaFree(decomp_d); decomp_d = NULL; } - decomp.decompose_d(pos_d, num); + decomp.decompose_d(pos_d[0], num); decomp_d = decomp.copyToCUDA(); // Update pairlists using cell decomposition (not sure this is really needed or good) - //RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d, num, numReplicas, sys_d, decomp_d); + //RBTODO updatePairlists<<< nBlocks, NUM_THREADS >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d); /* size_t free, total; */ /* { */ @@ -549,15 +569,15 @@ void ComputeForce::decompose() { //const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1; // const size_t nBlocks = nCells*blocksPerCell; - /* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d, decomp_d); */ + /* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d[0], decomp_d); */ /* gpuErrchk(cudaDeviceSynchronize()); */ /* pairlistTest<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */ - /* sys_d, decomp_d, nCells, blocksPerCell, */ - /* numPairs_d, pairListListI_d, pairListListJ_d); */ + /* sys_d[0], decomp_d, nCells, blocksPerCell, */ + /* numPairs_d[0], pairListListI_d, pairListListJ_d); */ /* gpuErrchk(cudaDeviceSynchronize()); */ int tmp = 0; - gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp, sizeof(int), cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpyAsync(numPairs_d[0], &tmp, sizeof(int), cudaMemcpyHostToDevice)); gpuErrchk(cudaDeviceSynchronize()); // printf("Pairlistdist: %f\n",sqrt(pairlistdist2)); @@ -567,64 +587,64 @@ void ComputeForce::decompose() { #endif //Han-Yi Chou bind texture //printf("%d\n", sizeof(Vector3)); - //gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*num*numReplicas)); + //gpuErrchk(cudaBindTexture(0, PosTex, pos_d[0],sizeof(Vector3)*num*numReplicas)); //gpuErrchk(cudaBindTexture(0,CellsTex, decomp_d->getCells_d(),sizeof(CellDecomposition::cell_t)*num*numReplicas)); //#if __CUDA_ARCH__ >= 300 - //createPairlists_debug<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2); + //createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); //#ifdef NEW //for sm52 - //createPairlists<32,64,1><<< dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, + //createPairlists<32,64,1><<< dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], //GTX 980 //Han-Yi Chou 2017 my code #if __CUDA_ARCH__ >= 520 - createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, - pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, + createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], + pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex); #else //__CUDA_ARCH__ == 300 - createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, - pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, + createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], + pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2, pos_tex, neighbors_tex); #endif gpuKernelCheck(); gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ - //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, - // pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, + //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], + // pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, // excludeMap_d, numExcludes, pairlistdist2); //#else - //createPairlists_debug<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, - // pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2); + //createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, + // pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); //#endif //#else // Use shared memory for warp_bcast function - //createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2); + //createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); //#ifdef NEW //for sm52 - //createPairlists<32,64,1><<<dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, + //createPairlists<32,64,1><<<dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], //GTX 980 - //createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, + //createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], //GTX 680 - //createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, - // pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, + //createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], + // pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, // excludeMap_d, numExcludes, pairlistdist2); - //createPairlists<64,64><<<dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, - // pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, + //createPairlists<64,64><<<dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], + // pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, // excludeMap_d, numExcludes, pairlistdist2); //#else - //createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, - // pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2, CellNeighborsList); + //createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, + // pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2, CellNeighborsList); //#endif //#endif #if 0 //////debug section // DEBUGING - gpuErrchk(cudaMemcpy(&tmp, numPairs_d, sizeof(int), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(&tmp, numPairs_d[0], sizeof(int), cudaMemcpyDeviceToHost)); //printf("CreatePairlist found %d pairs\n",tmp); gpuErrchk(cudaDeviceSynchronize()); @@ -634,12 +654,12 @@ void ComputeForce::decompose() { if (decomp_d) cudaFree(decomp_d); - decomp.decompose_d(pos_d, num); + decomp.decompose_d(pos_d[0], num); decomp_d = decomp.copyToCUDA(); gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ int tmp1 = 0; - gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp1, sizeof(int), cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpyAsync(numPairs_d[0], &tmp1, sizeof(int), cudaMemcpyHostToDevice)); gpuErrchk(cudaDeviceSynchronize()); // printf("Pairlistdist: %f\n",sqrt(pairlistdist2)); @@ -648,12 +668,12 @@ void ComputeForce::decompose() { gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ #endif #if __CUDA_ARCH__ >= 300 - createPairlists_debug<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2); + createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); #else // Use shared memory for warp_bcast function - createPairlists_debug<<< 2048, 64, 2048/WARPSIZE >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, excludes_d, excludeMap_d, numExcludes, pairlistdist2); + createPairlists_debug<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); #endif - gpuErrchk(cudaMemcpy(&tmp1, numPairs_d, sizeof(int), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(&tmp1, numPairs_d[0], sizeof(int), cudaMemcpyDeviceToHost)); printf("Difference CreatePairlist found %d pairs\n",tmp-tmp1); gpuErrchk(cudaDeviceSynchronize()); @@ -682,8 +702,8 @@ float ComputeForce::computeFull(bool get_energy) { dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeFullKernel<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, type_d, tableAlpha_d, - tableEps_d, tableRad6_d, num, numParts, sys_d, energies_d, gridSize, + computeFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d, tableAlpha_d, + tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize, numReplicas, get_energy); // Calculate energy based on the array created by the kernel @@ -703,8 +723,8 @@ float ComputeForce::computeSoftcoreFull(bool get_energy) { dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeSoftcoreFullKernel<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, type_d, - tableEps_d, tableRad6_d, num, numParts, sys_d, energies_d, gridSize, + computeSoftcoreFullKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d, + tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize, numReplicas, get_energy); // Calculate energy based on the array created by the kernel @@ -725,8 +745,8 @@ float ComputeForce::computeElecFull(bool get_energy) { dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeElecFullKernel<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, type_d, - tableAlpha_d, num, numParts, sys_d, energies_d, gridSize, numReplicas, + computeElecFullKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d, + tableAlpha_d, num, numParts, sys_d[0], energies_d, gridSize, numReplicas, get_energy); // Calculate energy based on the array created by the kernel @@ -748,8 +768,8 @@ float ComputeForce::compute(bool get_energy) { dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeKernel<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, type_d, - tableAlpha_d, tableEps_d, tableRad6_d, num, numParts, sys_d, + computeKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d, + tableAlpha_d, tableEps_d, tableRad6_d, num, numParts, sys_d[0], decomp_d, energies_d, switchStart, switchLen, gridSize, numReplicas, get_energy); @@ -788,16 +808,16 @@ float ComputeForce::computeTabulated(bool get_energy) { //clearEnergies<<< nb, numThreads >>>(energies_d,num); //gpuErrchk(cudaDeviceSynchronize()); cudaMemset((void*)energies_d, 0, sizeof(float)*num*numReplicas); - computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, sys_d, - cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d, energies_d); + computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d[0], pos_d[0], sys_d[0], + cutoff2, numPairs_d[0], pairLists_d[0], pairTabPotType_d[0], tablePot_d[0], energies_d); } else { - //gpuErrchk(cudaBindTexture(0, PosTex, pos_d,sizeof(Vector3)*num*numReplicas)); - //computeTabulatedKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, sys_d, - computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d, sys_d, - cutoff2, numPairs_d, pairLists_d, pairTabPotType_d, tablePot_d, pairLists_tex, pos_tex, pairTabPotType_tex); + //gpuErrchk(cudaBindTexture(0, PosTex, pos_d[0],sizeof(Vector3)*num*numReplicas)); + //computeTabulatedKernel<<< nb, numThreads >>>(forceInternal_d[0], pos_d[0], sys_d[0], + computeTabulatedKernel<64><<< dim3(2048,1,1), dim3(64,1,1), 0, gpuman.get_next_stream() >>>(forceInternal_d[0], sys_d[0], + cutoff2, numPairs_d[0], pairLists_d[0], pairTabPotType_d[0], tablePot_d[0], pairLists_tex[0], pos_tex, pairTabPotType_tex[0]); gpuKernelCheck(); //gpuErrchk(cudaUnbindTexture(PosTex)); } @@ -808,41 +828,41 @@ float ComputeForce::computeTabulated(bool get_energy) { if(product_potential_list_d != NULL && product_potentials_d != NULL) { - computeProductPotentials <<<nb, numThreads, 0, gpuman.get_next_stream()>>> ( forceInternal_d, pos_d, sys_d, numReplicas*numProductPotentials, product_potential_particles_d, product_potentials_d, product_potential_list_d, productCount_d, energies_d, get_energy); + computeProductPotentials <<<nb, numThreads, 0, gpuman.get_next_stream()>>> ( forceInternal_d[0], pos_d[0], sys_d[0], numReplicas*numProductPotentials, product_potential_particles_d, product_potentials_d, product_potential_list_d, productCount_d, energies_d, get_energy); } if(bondAngleList_d != NULL && tableBond_d != NULL && tableAngle_d != NULL) { - computeTabulatedBondAngles <<<nb, numThreads, 0, gpuman.get_next_stream()>>> ( forceInternal_d, pos_d, sys_d, numReplicas*numBondAngles, bondAngleList_d, tableAngle_d, tableBond_d, energies_d, get_energy); + computeTabulatedBondAngles <<<nb, numThreads, 0, gpuman.get_next_stream()>>> ( forceInternal_d[0], pos_d[0], sys_d[0], numReplicas*numBondAngles, bondAngleList_d, tableAngle_d, tableBond_d, energies_d, get_energy); } if(bondList_d != NULL && tableBond_d != NULL) { - //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d); - //computeTabulatedBonds <<<nb, numThreads>>> ( forceInternal_d, pos_d, sys_d, numReplicas*numBonds/2, bondList_d, tableBond_d); + //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d[0], bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d); + //computeTabulatedBonds <<<nb, numThreads>>> ( forceInternal_d[0], pos_d[0], sys_d[0], numReplicas*numBonds/2, bondList_d, tableBond_d); //if(get_energy) //cudaMemset(bond_energy_d, 0, sizeof(float)*num); - computeTabulatedBonds <<<nb, numThreads, 0, gpuman.get_next_stream()>>> ( forceInternal_d, pos_d, sys_d, numReplicas*numBonds/2, bondList_d, tableBond_d, energies_d, get_energy); + computeTabulatedBonds <<<nb, numThreads, 0, gpuman.get_next_stream()>>> ( forceInternal_d[0], pos_d[0], sys_d[0], numReplicas*numBonds/2, bondList_d, tableBond_d, energies_d, get_energy); } if (angleList_d != NULL && tableAngle_d != NULL) { //if(get_energy) - //computeTabulatedAngles<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numAngles*numReplicas, angleList_d, tableAngle_d); - computeTabulatedAngles<<<nb, numThreads, 0, gpuman.get_next_stream()>>>(forceInternal_d, pos_d, sys_d, numAngles*numReplicas, angleList_d, tableAngle_d, energies_d, get_energy); + //computeTabulatedAngles<<<nb, numThreads>>>(forceInternal_d[0], pos_d[0], sys_d[0], numAngles*numReplicas, angleList_d, tableAngle_d); + computeTabulatedAngles<<<nb, numThreads, 0, gpuman.get_next_stream()>>>(forceInternal_d[0], pos_d[0], sys_d[0], numAngles*numReplicas, angleList_d, tableAngle_d, energies_d, get_energy); } if (dihedralList_d != NULL && tableDihedral_d != NULL) { //if(get_energy) - //computeTabulatedDihedrals<<<nb, numThreads>>>(forceInternal_d, pos_d, sys_d, numDihedrals*numReplicas, dihedralList_d, dihedralPotList_d, tableDihedral_d); - computeTabulatedDihedrals<<<nb, numThreads, 0, gpuman.get_next_stream()>>>(forceInternal_d, pos_d, sys_d, numDihedrals*numReplicas, + //computeTabulatedDihedrals<<<nb, numThreads>>>(forceInternal_d[0], pos_d[0], sys_d[0], numDihedrals*numReplicas, dihedralList_d, dihedralPotList_d, tableDihedral_d); + computeTabulatedDihedrals<<<nb, numThreads, 0, gpuman.get_next_stream()>>>(forceInternal_d[0], pos_d[0], sys_d[0], numDihedrals*numReplicas, dihedralList_d, dihedralPotList_d, tableDihedral_d, energies_d, get_energy); } // TODO: Sum energy if (restraintIds_d != NULL ) - computeHarmonicRestraints<<<1, numThreads, 0, gpuman.get_next_stream()>>>(forceInternal_d, pos_d, sys_d, numRestraints*numReplicas, restraintIds_d, restraintLocs_d, restraintSprings_d); + computeHarmonicRestraints<<<1, numThreads, 0, gpuman.get_next_stream()>>>(forceInternal_d[0], pos_d[0], sys_d[0], numRestraints*numReplicas, restraintIds_d, restraintLocs_d, restraintSprings_d); // Calculate the energy based on the array created by the kernel @@ -877,16 +897,16 @@ float ComputeForce::computeTabulatedFull(bool get_energy) { dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, type_d, tablePot_d, tableBond_d, num, numParts, sys_d, bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d); + computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d, tablePot_d[0], tableBond_d, num, numParts, sys_d[0], bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d); gpuErrchk(cudaDeviceSynchronize()); - computeAngles<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, angles_d, tableAngle_d, - numAngles, num, sys_d, energies_d, + computeAngles<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], angles_d, tableAngle_d, + numAngles, num, sys_d[0], energies_d, get_energy); gpuErrchk(cudaDeviceSynchronize()); - computeDihedrals<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, dihedrals_d, + computeDihedrals<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], dihedrals_d, tableDihedral_d, numDihedrals, - num, sys_d, energies_d, + num, sys_d[0], energies_d, get_energy); // Calculate the energy based on the array created by the kernel if (get_energy) { @@ -902,13 +922,13 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos) { const size_t tot_num = num * numReplicas; - gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num)); + gpuErrchk(cudaMalloc(&pos_d[0], sizeof(Vector3) * tot_num)); { //Han-Yi bind to the texture cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeLinear; - resDesc.res.linear.devPtr = pos_d; + resDesc.res.linear.devPtr = pos_d[0]; resDesc.res.linear.desc.f = cudaChannelFormatKindFloat; resDesc.res.linear.desc.x = 32; // bits per channel resDesc.res.linear.desc.y = 32; // bits per channel @@ -927,10 +947,10 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos) gpuErrchk(cudaDeviceSynchronize()); - gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpyAsync(pos_d[0], pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); - gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas)); - gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMalloc(&forceInternal_d[0], sizeof(Vector3) * num * numReplicas)); + gpuErrchk(cudaMemcpyAsync(forceInternal_d[0], forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); gpuErrchk(cudaDeviceSynchronize()); } @@ -957,7 +977,7 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom void ComputeForce::setForceInternalOnDevice(Vector3* f) { const size_t tot_num = num * numReplicas; - gpuErrchk(cudaMemcpy(forceInternal_d, f, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpy(forceInternal_d[0], f, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); } diff --git a/src/ComputeForce.h b/src/ComputeForce.h index c58b997ab5d58b2ca413343f15bb57269a782618..c26347a902813941597475c3c1271d28e738e2db 100644 --- a/src/ComputeForce.h +++ b/src/ComputeForce.h @@ -34,6 +34,16 @@ #include <thrust/transform_reduce.h> // thrust::reduce #include <thrust/functional.h> // thrust::plus +#ifndef gpuErrchk +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=true) { + if (code != cudaSuccess) { + fprintf(stderr,"CUDA Error: %s %s:%d\n", cudaGetErrorString(code), __FILE__, line); + if (abort) exit(code); + } +} +#endif + #ifdef USE_BOOST #include <boost/unordered_map.hpp> typedef boost::unordered_map<String,unsigned int> XpotMap; @@ -106,7 +116,7 @@ public: void copyBondedListsToGPU(int3 *bondList, int4 *angleList, int4 *dihedralList, int *dihedralPotList, int4 *bondAngleList); //MLog: because of the move of a lot of private variables, some functions get starved necessary memory access to these variables, below is a list of functions that return the specified private variable. - Vector3* getPos_d() + std::vector<Vector3*> getPos_d() { return pos_d; } @@ -119,7 +129,7 @@ public: return ran_d; } - Vector3* getForceInternal_d() + std::vector<Vector3*> getForceInternal_d() { return forceInternal_d; } @@ -169,6 +179,18 @@ public: { return energies_d; } + + void clear_force() { + for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) { + gpuman.use(i); + gpuErrchk(cudaMemsetAsync((void*)(forceInternal_d[i]),0,num*numReplicas*sizeof(Vector3))); + } + gpuman.use(0); // TODO move to a paradigm where gpu0 is not preferentially treated + } + void clear_energy() { + gpuErrchk(cudaMemsetAsync((void*)(energies_d), 0, sizeof(float)*num*numReplicas)); // TODO make async + } + HOST DEVICE static EnergyForce coulombForce(Vector3 r, float alpha,float start, float len); @@ -205,25 +227,28 @@ private: float energy; // Device Variables - BaseGrid* sys_d; + std::vector<BaseGrid*> sys_d; CellDecomposition* decomp_d; float *energies_d; float *tableEps_d, *tableRad6_d, *tableAlpha_d; int gridSize; - TabulatedPotential **tablePot_d, **tablePot_addr; + // TabulatedPotential **tablePot_d, **tablePot_addr; + std::vector<TabulatedPotential**> tablePot_d; + TabulatedPotential** tablePot_addr; + TabulatedPotential **tableBond_d, **tableBond_addr; TabulatedAnglePotential **tableAngle_d, **tableAngle_addr; TabulatedDihedralPotential **tableDihedral_d, **tableDihedral_addr; // Pairlists float pairlistdist2; - int2 *pairLists_d; - cudaTextureObject_t pairLists_tex; + std::vector<int2*> pairLists_d; + std::vector<cudaTextureObject_t> pairLists_tex; - int *pairTabPotType_d; - cudaTextureObject_t pairTabPotType_tex; + std::vector<int*> pairTabPotType_d; + std::vector<cudaTextureObject_t> pairTabPotType_tex; - int *numPairs_d; + std::vector<int*> numPairs_d; //Han-Yi Chou int *CellNeighborsList; @@ -233,11 +258,12 @@ private: //BrownianParticleType* part; //float electricConst; //int fullLongRange; - Vector3* pos_d; + std::vector<Vector3*> pos_d; cudaTextureObject_t pos_tex; Vector3* mom_d; float* ran_d; - Vector3* forceInternal_d; + + std::vector<Vector3*> forceInternal_d; // vector for multigpu int* type_d; Bond* bonds_d; diff --git a/src/Configuration.cpp b/src/Configuration.cpp index 34f5048175938869c6d2f8fcb4569bcda57c358b..e0477785132816e068b7a9dea72a2a64b7469178 100644 --- a/src/Configuration.cpp +++ b/src/Configuration.cpp @@ -10,6 +10,7 @@ #include <iostream> using namespace std; +#ifndef gpuErrchk #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -17,6 +18,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t if (abort) exit(code); } } +#endif + namespace { template<class T> diff --git a/src/GPUManager.cpp b/src/GPUManager.cpp index 69a87ddf8c03fdd48bc5e9155edb4774293a03ff..31965eb2cc1f7d672aa334f3c2c5d73f382dabcb 100644 --- a/src/GPUManager.cpp +++ b/src/GPUManager.cpp @@ -1,5 +1,6 @@ #include "GPUManager.h" +#ifndef gpuErrchk #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -7,6 +8,9 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=t if (abort) exit(code); } } +#endif + +#define WITH_GPU(id,code) { int wg_curr; cudaGetDevice(&wg_curr); cudaSetDevice(id); code ; cudaSetDevice(wg_curr); } int GPUManager::nGPUs = 0; bool GPUManager::is_safe = true; @@ -119,12 +123,29 @@ void GPUManager::select_gpus(std::vector<unsigned int>& gpu_ids) { void GPUManager::use(int gpu_id) { gpu_id = gpu_id % (int) gpus.size(); - printf("Setting device to %d\n",gpus[gpu_id].id); + // printf("Setting device to %d\n",gpus[gpu_id].id); gpuErrchk( cudaSetDevice(gpus[gpu_id].id) ); - gpuErrchk( cudaDeviceSynchronize() ); // printf("Done setting device\n"); } +void GPUManager::sync(int gpu_id) { + WITH_GPU( gpus[gpu_id].id, + gpuErrchk( cudaDeviceSynchronize() )); + // int wg_curr; + // gpuErrchk( cudaGetDevice(&wg_curr) ); + // gpuErrchk( cudaSetDevice(gpus[gpu_id].id) ); + // gpuErrchk( cudaSetDevice(wg_curr) ); +} +void GPUManager::sync() { + int curr; + gpuErrchk( cudaGetDevice(&curr) ); + for (auto it = gpus.begin(); it != gpus.end(); ++it) { + gpuErrchk( cudaSetDevice(it->id) ); + gpuErrchk( cudaDeviceSynchronize() ); + } + gpuErrchk( cudaSetDevice(curr) ); +} + int GPUManager::current() { int c; cudaGetDevice(&c); diff --git a/src/GPUManager.h b/src/GPUManager.h index c07cc7741b57d2c85dbe056c0b4a03bb91d730f5..e5095dd11c122575ab1c88c30e08ab025dc62a7e 100644 --- a/src/GPUManager.h +++ b/src/GPUManager.h @@ -75,6 +75,9 @@ public: // Use the GPU using local index 0..N (not cudaGetDevice index) static void use(int gpu_id); + static void sync(int gpu_id); + static void sync(); + // current // @return the current GPU a thread is using static int current(); diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu index 1c2a5630ec764347091c92f7aac1d8ce0280fa3e..07b6d7b7716f5c21a00fe604e3f3d12bcaaac1b9 100644 --- a/src/GrandBrownTown.cu +++ b/src/GrandBrownTown.cu @@ -30,11 +30,13 @@ bool GrandBrownTown::DEBUG; cudaEvent_t START, STOP; +GPUManager GrandBrownTown::gpuman = GPUManager(); + GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, bool debug, bool imd_on, unsigned int imd_port, int numReplicas) : imd_on(imd_on), imd_port(imd_port), numReplicas(numReplicas), //conf(c), RBC(RigidBodyController(c,outArg)) { - conf(c){ + conf(c) { RBC.resize(numReplicas); for(int i = 0; i < numReplicas; ++i) @@ -582,7 +584,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); gpuErrchk(cudaDeviceSynchronize()); } @@ -603,8 +605,10 @@ void GrandBrownTown::RunNoseHooverLangevin() if(s == 1) { // 'interparticleForce' - determines whether particles interact with each other - gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3))); - gpuErrchk(cudaMemset((void*)(internal->getEnergy()), 0, sizeof(float)*num*numReplicas)); + internal->clear_force(); + internal->clear_energy(); + gpuman.sync(); + #ifdef _OPENMP omp_set_num_threads(4); #endif @@ -628,7 +632,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); } internal -> computeTabulated(get_energy); break; @@ -652,7 +656,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); } internal->compute(get_energy); break; @@ -676,7 +680,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateForces((internal->getPos_d())+i*num, (internal->getForceInternal_d())+i*num, s, (internal->getEnergy())+i*num, get_energy, + RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, RigidBodyInterpolationType, sys, sys_d); if(rigidbody_dynamic == String("Langevin")) { @@ -692,17 +696,19 @@ void GrandBrownTown::RunNoseHooverLangevin() } }//if step == 1 - gpuErrchk(cudaMemset((void*)(internal->getEnergy()), 0, sizeof(float)*num*numReplicas)); // TODO: make async + internal->clear_energy(); + gpuman.sync(); + if(particle_dynamic == String("Langevin")) - updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType); + updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType); else if(particle_dynamic == String("NoseHooverLangevin")) //kernel for Nose-Hoover Langevin dynamic - updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), - internal -> getRan_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, + updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), + internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType); ////For Brownian motion else - updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), + updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, ParticleInterpolationType); @@ -735,7 +741,7 @@ void GrandBrownTown::RunNoseHooverLangevin() if (s % outputPeriod == 0) { // Copy particle positions back to CPU gpuErrchk(cudaDeviceSynchronize()); - gpuErrchk(cudaMemcpy(pos, internal -> getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); } if (imd_on && clientsock && s % outputPeriod == 0) { @@ -813,9 +819,9 @@ void GrandBrownTown::RunNoseHooverLangevin() for(int i = 0; i < numReplicas; ++i) RBC[i]->clearForceAndTorque(); if (imd_on && clientsock) - internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD + internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD // TODO add multigpu support with IMD else { - gpuErrchk(cudaMemsetAsync((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3))); + internal->clear_force(); } if (interparticleForce) @@ -834,7 +840,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); } internal -> computeTabulated(get_energy); break; @@ -857,7 +863,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d())+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); } internal->compute(get_energy); break; @@ -881,11 +887,11 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateForces((internal->getPos_d())+i*num, (internal->getForceInternal_d())+i*num, s, (internal->getEnergy())+i*num, get_energy, + RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, RigidBodyInterpolationType, sys, sys_d); if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) - LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), + LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, ParticleInterpolationType); //gpuErrchk(cudaDeviceSynchronize()); diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h index 877f5a6ed5fc288119d12c3467514deb825b5374..8a50140b8c2b452446233a2b0edd02aba7a903f8 100644 --- a/src/GrandBrownTown.h +++ b/src/GrandBrownTown.h @@ -23,6 +23,7 @@ #include <cuda_runtime.h> #include <curand_kernel.h> +#include "GPUManager.h" #include "useful.h" #include "BaseGrid.h" #include "OverlordGrid.h" @@ -116,6 +117,7 @@ public: Vector3 freePosition(Vector3 r0, Vector3 r1, float minDist); private: + static GPUManager gpuman; const Configuration& conf; int numReplicas; diff --git a/src/RigidBody.cu b/src/RigidBody.cu index 88f020d4db21255fa7f5fd7a6aa093b346d721a7..94e726112c97e8fc6eddde9d2585f8aa01157df5 100644 --- a/src/RigidBody.cu +++ b/src/RigidBody.cu @@ -8,6 +8,7 @@ #include "Debug.h" +#ifndef gpuErrchk #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -15,7 +16,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t if (abort) exit(code); } } - +#endif RigidBody::RigidBody(String name, const Configuration& cref, const RigidBodyType& tref, RigidBodyController* RBCref) : name(name), c(&cref), t(&tref), RBC(RBCref), impulse_to_momentum(4.1867999435271e4) /*impulse_to_momentum(4.184e8f)*/ { init(); } diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu index e63ffe80b1791973ab8f4e924ec37877c6b9af32..5c89ba26f440bbc2659aea47b31671ecf7ce9cd0 100644 --- a/src/RigidBodyController.cu +++ b/src/RigidBodyController.cu @@ -16,6 +16,7 @@ #include "RandomCPU.h" /* RBTODO: fix this? */ +#ifndef gpuErrchk #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, String file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -23,6 +24,8 @@ inline void gpuAssert(cudaError_t code, String file, int line, bool abort=true) if (abort) exit(code); } } +#endif + // allocate and initialize an array of stream handles cudaStream_t *RigidBodyForcePair::stream = (cudaStream_t *) malloc(NUMSTREAMS * sizeof(cudaStream_t)); int RigidBodyForcePair::nextStreamID = 0; /* used during stream init */ diff --git a/src/RigidBodyType.cu b/src/RigidBodyType.cu index 6ba407a3d21fa56bda497d4033623740ac8de6b8..969a16ad7d6c3183d9a0f69ee0c4bffd142faf0f 100644 --- a/src/RigidBodyType.cu +++ b/src/RigidBodyType.cu @@ -5,6 +5,7 @@ #include "BaseGrid.h" #include "RigidBodyGrid.h" +#ifndef gpuErrchk #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -12,6 +13,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t if (abort) exit(code); } } +#endif void RigidBodyType::clear() { num = 0; // RBTODO: not 100% sure about this