diff --git a/src/BaseGrid.cu b/src/BaseGrid.cu index 75ccfbee4144e028bc69dc9c6e5236c5d0e64c90..0a8d9701061761b2a8105f3d95d0d2227bcdd91e 100644 --- a/src/BaseGrid.cu +++ b/src/BaseGrid.cu @@ -13,7 +13,6 @@ // Also, allocate the main value array. void BaseGrid::init() { basisInv = basis.inverse(); - nynz = ny*nz; size = nx*ny*nz; val = new float[size]; } @@ -267,7 +266,6 @@ if(strcmp("origi", start) == 0) { //printf("Size: %d %d %d\n", nx, ny, nz); if (read == 3) { size = nx*ny*nz; - nynz = ny*nz; val = new float[size]; zero(); } @@ -537,7 +535,7 @@ void BaseGrid::averageProfile(const char* fileName, int axis) { int dir2 = (axis+2)%3; int jump[3]; - jump[0] = nynz; + jump[0] = ny*nz; jump[1] = nz; jump[2] = 1; @@ -602,11 +600,10 @@ bool BaseGrid::crop(int x0, int y0, int z0, int x1, int y1, int z1, bool keep_or float *new_val = new float[new_size]; int ind = 0; - int nynz = ny * nz; for (int i = x0; i < x1; i++) for (int j = y0; j < y1; j++) for (int k = z0; k < z1; k++) { - int ind1 = k + j * nz + i * nynz; + int ind1 = k + j * nz + i * ny*nz; new_val[ind++] = val[ind1]; } @@ -658,7 +655,7 @@ void BaseGrid::getNeighbors(int j, int* indexBuffer) const { for (int ix = -1; ix <= 1; ix++) { for (int iy = -1; iy <= 1; iy++) { for (int iz = -1; iz <= 1; iz++) { - int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + nynz*wrap(jx+ix,nx); + int ind = wrap(jz+iz,nz) + nz*wrap(jy+iy,ny) + ny*nz*wrap(jx+ix,nx); indexBuffer[k] = ind; k++; } @@ -673,7 +670,7 @@ void BaseGrid::getNeighborValues(NeighborList* neigh, int homeX, int homeY, int for (int ix = -1; ix <= 1; ix++) { for (int iy = -1; iy <= 1; iy++) { for (int iz = -1; iz <= 1; iz++) { - int ind = wrap(homeZ+iz,nz) + nz*wrap(homeY+iy,ny) + nynz*wrap(homeX+ix,nx); + int ind = wrap(homeZ+iz,nz) + nz*wrap(homeY+iy,ny) + ny*nz*wrap(homeX+ix,nx); neigh->v[ix+1][iy+1][iz+1] = val[ind]; } } diff --git a/src/BaseGrid.h b/src/BaseGrid.h index 55669bd770225d88c910ce94f3f6c9e7e15dc348..b3a4cfa170d62790c73306dc3c659328b9208815 100644 --- a/src/BaseGrid.h +++ b/src/BaseGrid.h @@ -21,6 +21,14 @@ #include <ctime> // #include <cuda.h> +#ifndef gpuErrchk +#define delgpuErrchk +#define gpuErrchk(code) { if ((code) != cudaSuccess) { \ + fprintf(stderr,"CUDA Error: %s %s %d\n", cudaGetErrorString(code), __FILE__, __LINE__); \ + }} +#endif + + enum BoundaryCondition { dirichlet, neumann, periodic }; enum InterpolationOrder { linear = 1, cubic = 3 }; @@ -1044,14 +1052,49 @@ public: void getNeighborValues(NeighborList* neigh, int homeX, int homeY, int homeZ) const; inline void setVal(float* v) { val = v; } + BaseGrid* copy_to_cuda() const { + BaseGrid* g_d = NULL; + BaseGrid g_tmp; + float* val_d = NULL; + size_t sz = sizeof(float) * size; + gpuErrchk(cudaMalloc(&g_d, sizeof(BaseGrid))); + gpuErrchk(cudaMalloc(&val_d, sz)); + gpuErrchk(cudaMemcpy(val_d, val, sz, cudaMemcpyHostToDevice)); + g_tmp.origin = origin; + g_tmp.basis = basis; + g_tmp.nx = nx; + g_tmp.ny = ny; + g_tmp.nz = nz; + g_tmp.size= size; + g_tmp.basisInv = basisInv; + g_tmp.val = val_d; + gpuErrchk(cudaMemcpy(g_d, &g_tmp, sizeof(BaseGrid), cudaMemcpyHostToDevice)); + g_tmp.val = NULL; + return g_d; + } + + static void remove_from_cuda(BaseGrid* g_d) { + BaseGrid g_tmp; + gpuErrchk(cudaMemcpy(&g_tmp, g_d, sizeof(BaseGrid), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaFree(&(g_tmp.val))); + g_tmp.val = NULL; + gpuErrchk(cudaMemcpy(g_d, &g_tmp, sizeof(BaseGrid), cudaMemcpyHostToDevice)); // copy NULL back to device + gpuErrchk(cudaFree(&g_d)); + } + public: Vector3 origin; Matrix3 basis; int nx, ny, nz; - int nynz; int size; Matrix3 basisInv; public: float* val; }; + +#ifndef delgpuErrchk +#undef delgpuErrchk +#undef gpuErrchk +#endif + #endif diff --git a/src/BrownParticlesKernel.h b/src/BrownParticlesKernel.h index 36f7daa3ea95f0e3ddafdabde661b143382f17c7..5475a454d4a7b3ef4c70a904c8ce262a8d71780e 100644 --- a/src/BrownParticlesKernel.h +++ b/src/BrownParticlesKernel.h @@ -4,7 +4,7 @@ template<const int BlockSize> static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], BrownianParticleType* part[], - float *vec_red, int n) + float *vec_red, int num, int num_rb_attached_particles, int num_replicas) { __shared__ __align__(4) float sdata[BlockSize]; @@ -17,9 +17,12 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro sdata[tid] = 0.f; + int n = (num*num_replicas); + while (i < n) - { - int t1 = type[i]; + { + const int i1 = (i % num) + (i/num)*(num+num_rb_attached_particles); + const int t1 = type[i1]; const BrownianParticleType& pt1 = *part[t1]; p1 = P_n[i]; @@ -27,7 +30,8 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro if(i + BlockSize < n) { - int t2 = type[i+BlockSize]; + const int i2 = ((i+BlockSize) % num) + ((i+BlockSize)/num)*(num+num_rb_attached_particles); + const int t2 = type[i2]; const BrownianParticleType& pt2 = *part[t2]; p2 = P_n[i+BlockSize]; @@ -38,7 +42,7 @@ static __global__ void BrownParticlesKineticEnergy(Vector3* P_n, int type[], Bro else sdata[tid] += p1.length2() / mass1; - i += gridSize; + i += gridSize; } sdata[tid] *= 0.50f; diff --git a/src/BrownianParticleType.cpp b/src/BrownianParticleType.cpp index e514500ec3b6b87058cf58389c2a99ab7ddd506d..226814419a20fef51671d3a17ada2539b445f738 100644 --- a/src/BrownianParticleType.cpp +++ b/src/BrownianParticleType.cpp @@ -5,6 +5,7 @@ ////////////////////////////////////////// void BrownianParticleType::clear() { if (pmf != NULL) delete [] pmf; + if (pmf_scale != NULL) delete [] pmf_scale; if (diffusionGrid != NULL) delete diffusionGrid; if (forceXGrid != NULL) delete forceXGrid; if (forceYGrid != NULL) delete forceYGrid; @@ -12,6 +13,7 @@ void BrownianParticleType::clear() { if (reservoir != NULL) delete reservoir; if (meanPmf != NULL) delete [] meanPmf; pmf = NULL, diffusionGrid = NULL; + pmf_scale = NULL; pmf_boundary_conditions = NULL; forceXGrid = NULL, forceYGrid = NULL, forceZGrid = NULL; reservoir = NULL, meanPmf = NULL; @@ -26,6 +28,7 @@ void BrownianParticleType::copy(const BrownianParticleType& src) { radius = src.radius; eps = src.eps; pmf = src.pmf; + pmf_scale = src.pmf_scale; pmf_boundary_conditions = src.pmf_boundary_conditions; meanPmf = src.meanPmf; numPartGridFiles = src.numPartGridFiles; diff --git a/src/BrownianParticleType.h b/src/BrownianParticleType.h index 524a5b66be452771f0aa8aec65d76dfddb60b7dd..8af1385a803768eabf9d3efe225ac01a31be38de 100644 --- a/src/BrownianParticleType.h +++ b/src/BrownianParticleType.h @@ -30,7 +30,7 @@ class BrownianParticleType { BrownianParticleType(const String& name = "") : name(name), num(0), diffusion(0.0f), radius(1.0f), charge(0.0f), eps(0.0f), meanPmf(NULL), - numPartGridFiles(-1), reservoir(NULL), pmf(NULL), pmf_boundary_conditions(NULL), + numPartGridFiles(-1), reservoir(NULL), pmf(NULL), pmf_scale(NULL), pmf_boundary_conditions(NULL), diffusionGrid(NULL), forceXGrid(NULL), forceYGrid(NULL), forceZGrid(NULL){ } @@ -62,8 +62,9 @@ public: float mu; //for Nose-Hoover Langevin dynamics Reservoir* reservoir; - BaseGrid* pmf; - BoundaryCondition* pmf_boundary_conditions; + BaseGrid** pmf; + float* pmf_scale; + BoundaryCondition* pmf_boundary_conditions; BaseGrid* diffusionGrid; BaseGrid* forceXGrid; BaseGrid* forceYGrid; diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu index 4f39b3fe3a905fca9703f03423b357a401bef198..440b2580a07b3bcb508913bdf40b0d56539a2442 100644 --- a/src/ComputeForce.cu +++ b/src/ComputeForce.cu @@ -40,14 +40,15 @@ void runSort(int2 *d1, int *d2, float *key, unsigned int count); ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : - num(c.num), numParts(c.numParts), sys(c.sys), switchStart(c.switchStart), + num(c.num), numParts(c.numParts), num_rb_attached_particles(c.num_rb_attached_particles), + sys(c.sys), switchStart(c.switchStart), switchLen(c.switchLen), electricConst(c.coulombConst), cutoff2((c.switchLen + c.switchStart) * (c.switchLen + c.switchStart)), decomp(c.sys->getBox(), c.sys->getOrigin(), c.switchStart + c.switchLen + c.pairlistDistance, numReplicas), numBonds(c.numBonds), numTabBondFiles(c.numTabBondFiles), numExcludes(c.numExcludes), numAngles(c.numAngles), numTabAngleFiles(c.numTabAngleFiles), numDihedrals(c.numDihedrals), - numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints), + numTabDihedralFiles(c.numTabDihedralFiles), numRestraints(c.numRestraints), numGroupSites(c.numGroupSites), numReplicas(numReplicas) { @@ -234,10 +235,10 @@ ComputeForce::ComputeForce(const Configuration& c, const int numReplicas = 1) : restraintIds_d = NULL; //Calculate the number of blocks the grid should contain - gridSize = num / NUM_THREADS + 1; + gridSize = (num+num_rb_attached_particles) / NUM_THREADS + 1; // Create and allocate the energy arrays - gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * (num+numGroupSites) * numReplicas)); + gpuErrchk(cudaMalloc(&energies_d, sizeof(float) * (num+num_rb_attached_particles+numGroupSites) * numReplicas)); cudaEventCreate(&start); cudaEventCreate(&stop); } @@ -257,6 +258,7 @@ ComputeForce::~ComputeForce() { for (std::size_t g = 0; g < gpuman.gpus.size(); ++g) { gpuman.use(g); tablePot_addr[g][ind]->free_from_cuda(tablePot_addr[g][ind]); + tablePot_addr[g][ind] = NULL; } delete tablePot[ind]; } @@ -341,7 +343,7 @@ void ComputeForce::updateNumber(int newNum) { // Recalculate the number of blocks in the grid gridSize = 0; - while ((int)sqrt(NUM_THREADS) * gridSize < num) + while ((int)sqrt(NUM_THREADS) * gridSize < num+num_rb_attached_particles) ++gridSize; gpuErrchk(cudaFree(energies_d)); @@ -374,7 +376,7 @@ bool ComputeForce::addTabulatedPotential(String fileName, int type0, int type1) for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) { gpuman.use(i); tablePot_addr[i][ind]->free_from_cuda(tablePot_addr[i][ind]); - delete tablePot_addr[i][ind]; + tablePot_addr[i][ind] = NULL; } gpuman.use(0); delete tablePot[ind]; @@ -500,7 +502,7 @@ void ComputeForce::decompose() { cudaFree(decomp_d); decomp_d = NULL; } - decomp.decompose_d(pos_d[0], num); + decomp.decompose_d(pos_d[0], num+num_rb_attached_particles); decomp_d = decomp.copyToCUDA(); // Update pairlists using cell decomposition (not sure this is really needed or good) @@ -514,52 +516,25 @@ void ComputeForce::decompose() { // initializePairlistArrays int nCells = decomp.nCells.x * decomp.nCells.y * decomp.nCells.z; - - // int blocksPerCell = 10; - /* cuMemGetInfo(&free,&total); */ /* printf("Free memory: %zu / %zu\n", free, total); */ - // const int NUMTHREADS = 128; - //const size_t nBlocks = (num * numReplicas) / NUM_THREADS + 1; - // const size_t nBlocks = nCells*blocksPerCell; - - /* clearPairlists<<< 1, 32 >>>(pos, num, numReplicas, sys_d[0], decomp_d); */ - /* gpuErrchk(cudaDeviceSynchronize()); */ - /* pairlistTest<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */ - /* sys_d[0], decomp_d, nCells, blocksPerCell, */ - /* numPairs_d[0], pairListListI_d, pairListListJ_d); */ - /* gpuErrchk(cudaDeviceSynchronize()); */ - int tmp = 0; gpuErrchk(cudaMemcpyAsync(numPairs_d[0], &tmp, sizeof(int), cudaMemcpyHostToDevice)); gpuErrchk(cudaDeviceSynchronize()); - // printf("Pairlistdist: %f\n",sqrt(pairlistdist2)); #ifdef DEBUGEXCLUSIONS initExSum(); gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ #endif - //Han-Yi Chou bind texture - //printf("%d\n", sizeof(Vector3)); - //gpuErrchk(cudaBindTexture(0, PosTex, pos_d[0],sizeof(Vector3)*num*numReplicas)); - //gpuErrchk(cudaBindTexture(0,CellsTex, decomp_d->getCells_d(),sizeof(CellDecomposition::cell_t)*num*numReplicas)); - -//#if __CUDA_ARCH__ >= 300 - //createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); - //#ifdef NEW - //for sm52 - //createPairlists<32,64,1><<< dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], - //GTX 980 - //Han-Yi Chou 2017 my code #if __CUDA_ARCH__ >= 520 - createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], + createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num+num_rb_attached_particles, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2, pos_tex[0], neighbors_tex); #else //__CUDA_ARCH__ == 300 - createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], + createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num+num_rb_attached_particles, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2, pos_tex[0], neighbors_tex); #endif @@ -576,77 +551,6 @@ void ComputeForce::decompose() { } gpuman.sync(); #endif - - //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], - // pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, - // excludeMap_d, numExcludes, pairlistdist2); - - //#else - //createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, - // pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); - //#endif -//#else - // Use shared memory for warp_bcast function - //createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); - //#ifdef NEW - //for sm52 - //createPairlists<32,64,1><<<dim3(256,128,numReplicas),dim3(32,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], - //GTX 980 - //createPairlists<64,64,8><<<dim3(128,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], - //GTX 680 - //createPairlists<64,64,8><<<dim3(256,256,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], - // pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, - // excludeMap_d, numExcludes, pairlistdist2); - //createPairlists<64,64><<<dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], - // pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, - // excludeMap_d, numExcludes, pairlistdist2); - - //#else - //createPairlists<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, - // pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2, CellNeighborsList); - //#endif - -//#endif -#if 0 -//////debug section - // DEBUGING - gpuErrchk(cudaMemcpy(&tmp, numPairs_d[0], sizeof(int), cudaMemcpyDeviceToHost)); - //printf("CreatePairlist found %d pairs\n",tmp); - gpuErrchk(cudaDeviceSynchronize()); - - gpuErrchk( cudaProfilerStart() ); - - // Reset the cell decomposition. - if (decomp_d) - cudaFree(decomp_d); - - decomp.decompose_d(pos_d[0], num); - decomp_d = decomp.copyToCUDA(); - - gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ - int tmp1 = 0; - gpuErrchk(cudaMemcpyAsync(numPairs_d[0], &tmp1, sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaDeviceSynchronize()); - // printf("Pairlistdist: %f\n",sqrt(pairlistdist2)); - -#ifdef DEBUGEXCLUSIONS - initExSum(); - gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */ -#endif - #if __CUDA_ARCH__ >= 300 - createPairlists_debug<<< 2048, 64 >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); -#else - // Use shared memory for warp_bcast function - createPairlists_debug<<< 2048, 64, 2048/WARPSIZE >>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0], pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d, excludeMap_d, numExcludes, pairlistdist2); -#endif - gpuErrchk(cudaMemcpy(&tmp1, numPairs_d[0], sizeof(int), cudaMemcpyDeviceToHost)); - printf("Difference CreatePairlist found %d pairs\n",tmp-tmp1); - gpuErrchk(cudaDeviceSynchronize()); - -#ifdef DEBUGEXCLUSIONS - printf("Counted %d exclusions\n", getExSum()); -#endif -#endif } IndexList ComputeForce::decompDim() const { @@ -663,20 +567,20 @@ float ComputeForce::decompCutoff() { return decomp.getCutoff(); } float ComputeForce::computeFull(bool get_energy) { float energy = 0.0f; - gridSize = (num * numReplicas) / NUM_THREADS + 1; + gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1; dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces computeFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d, tableAlpha_d, - tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize, + tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0], energies_d, gridSize, numReplicas, get_energy); // Calculate energy based on the array created by the kernel if (get_energy) { gpuErrchk(cudaDeviceSynchronize()); thrust::device_ptr<float> en_d(energies_d); - energy = thrust::reduce(en_d, en_d + num + numGroupSites); + energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles + numGroupSites); } return energy; @@ -684,20 +588,20 @@ float ComputeForce::computeFull(bool get_energy) { float ComputeForce::computeSoftcoreFull(bool get_energy) { float energy = 0.0f; - gridSize = (num * numReplicas) / NUM_THREADS + 1; + gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1; dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces computeSoftcoreFullKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d, - tableEps_d, tableRad6_d, num, numParts, sys_d[0], energies_d, gridSize, + tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0], energies_d, gridSize, numReplicas, get_energy); // Calculate energy based on the array created by the kernel if (get_energy) { cudaDeviceSynchronize(); thrust::device_ptr<float> en_d(energies_d); - energy = thrust::reduce(en_d, en_d + num); + energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles); } return energy; @@ -729,13 +633,13 @@ float ComputeForce::computeElecFull(bool get_energy) { float ComputeForce::compute(bool get_energy) { float energy = 0.0f; - gridSize = (num * numReplicas) / NUM_THREADS + 1; + gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1; dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces computeKernel<<<numBlocks, numThreads>>>(forceInternal_d[0], pos_d[0], type_d, - tableAlpha_d, tableEps_d, tableRad6_d, num, numParts, sys_d[0], + tableAlpha_d, tableEps_d, tableRad6_d, num+num_rb_attached_particles, numParts, sys_d[0], decomp_d, energies_d, switchStart, switchLen, gridSize, numReplicas, get_energy); @@ -744,7 +648,7 @@ float ComputeForce::compute(bool get_energy) { if (get_energy) { gpuErrchk(cudaDeviceSynchronize()); thrust::device_ptr<float> en_d(energies_d); - energy = thrust::reduce(en_d, en_d + num + numGroupSites); + energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles + numGroupSites); } return energy; @@ -757,7 +661,7 @@ float ComputeForce::compute(bool get_energy) { float ComputeForce::computeTabulated(bool get_energy) { float energy = 0.0f; - gridSize = (num * numReplicas) / NUM_THREADS + 1; + gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1; dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); @@ -773,7 +677,7 @@ float ComputeForce::computeTabulated(bool get_energy) { { //clearEnergies<<< nb, numThreads >>>(energies_d,num); //gpuErrchk(cudaDeviceSynchronize()); - cudaMemset((void*)energies_d, 0, sizeof(float)*(num+numGroupSites)*numReplicas); + cudaMemset((void*)energies_d, 0, sizeof(float)*(num+num_rb_attached_particles+numGroupSites)*numReplicas); computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d[0], pos_d[0], sys_d[0], cutoff2, numPairs_d[0], pairLists_d[0], pairTabPotType_d[0], tablePot_d[0], energies_d); } @@ -866,27 +770,27 @@ float ComputeForce::computeTabulated(bool get_energy) { float ComputeForce::computeTabulatedFull(bool get_energy) { energy = 0.0f; - gridSize = (num * numReplicas) / NUM_THREADS + 1; + gridSize = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + 1; dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d, tablePot_d[0], tableBond_d, num, numParts, sys_d[0], bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d); + computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], type_d, tablePot_d[0], tableBond_d, num+num_rb_attached_particles, numParts, sys_d[0], bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d); gpuErrchk(cudaDeviceSynchronize()); computeAngles<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], angles_d, tableAngle_d, - numAngles, num, sys_d[0], energies_d, + numAngles, num+num_rb_attached_particles, sys_d[0], energies_d, get_energy); gpuErrchk(cudaDeviceSynchronize()); computeDihedrals<<< numBlocks, numThreads >>>(forceInternal_d[0], pos_d[0], dihedrals_d, tableDihedral_d, numDihedrals, - num, sys_d[0], energies_d, + num+num_rb_attached_particles, sys_d[0], energies_d, get_energy); // Calculate the energy based on the array created by the kernel if (get_energy) { gpuErrchk(cudaDeviceSynchronize()); thrust::device_ptr<float> en_d(energies_d); - energy = thrust::reduce(en_d, en_d + num); + energy = thrust::reduce(en_d, en_d + num + num_rb_attached_particles); } return energy; @@ -894,7 +798,7 @@ float ComputeForce::computeTabulatedFull(bool get_energy) { void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos) { - const size_t tot_num = (num+numGroupSites) * numReplicas; + const size_t tot_num = (num+num_rb_attached_particles+numGroupSites) * numReplicas; for (std::size_t i = 0; i < gpuman.gpus.size(); ++i) { gpuman.use(i); @@ -935,11 +839,10 @@ void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos) } void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos, Vector3* mom) { - const size_t tot_num = (num+numGroupSites) * numReplicas; - const size_t small_tot_num = num * numReplicas; + const size_t tot_num = num * numReplicas; gpuErrchk(cudaMalloc(&mom_d, sizeof(Vector3) * tot_num)); - gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * small_tot_num, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpyAsync(mom_d, mom, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); copyToCUDA(forceInternal,pos); gpuErrchk(cudaDeviceSynchronize()); @@ -965,10 +868,11 @@ void ComputeForce::setForceInternalOnDevice(Vector3* f) { void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals, const Restraint* const restraints) { assert(simNum == numReplicas); // Not sure why we have both of these things - int tot_num = num * simNum; + int tot_num_with_rb = (num+num_rb_attached_particles) * simNum; + int tot_num_with_rb_group = (num+num_rb_attached_particles+numGroupSites) * simNum; // type_d - gpuErrchk(cudaMalloc(&type_d, sizeof(int) * tot_num)); - gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * tot_num, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMalloc(&type_d, sizeof(int) * tot_num_with_rb)); + gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * tot_num_with_rb, cudaMemcpyHostToDevice)); if (numBonds > 0) { @@ -977,8 +881,8 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, gpuErrchk(cudaMemcpyAsync(bonds_d, bonds, sizeof(Bond) * numBonds, cudaMemcpyHostToDevice)); // bondMap_d - gpuErrchk(cudaMalloc(&bondMap_d, sizeof(int2) * num)); - gpuErrchk(cudaMemcpyAsync(bondMap_d, bondMap, sizeof(int2) * num, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMalloc(&bondMap_d, sizeof(int2) * tot_num_with_rb_group)); + gpuErrchk(cudaMemcpyAsync(bondMap_d, bondMap, sizeof(int2) * tot_num_with_rb_group, cudaMemcpyHostToDevice)); } if (numExcludes > 0) { @@ -990,8 +894,8 @@ void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, cudaMemcpyHostToDevice)); // excludeMap_d - gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * num)); - gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * num, + gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * tot_num_with_rb)); + gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * tot_num_with_rb, cudaMemcpyHostToDevice)); } diff --git a/src/ComputeForce.cuh b/src/ComputeForce.cuh index 8735799be098867866ddc865fa39579e3ff77bc5..2d6dac72262eb436802303010731b03040de542e 100644 --- a/src/ComputeForce.cuh +++ b/src/ComputeForce.cuh @@ -296,7 +296,6 @@ void createNeighborsList(const int3 *Cells,int* __restrict__ CellNeighborsList) } } } -//#ifdef NEW template<const int BlockSize,const int Size,const int N> __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas, const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp, @@ -424,101 +423,6 @@ __global__ void createPairlists(Vector3* __restrict__ pos, const int num, const } } -#if 0 -template<const int BlockSize,const int Size> -__global__ void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas, - const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp, - const int nCells,int* g_numPairs, int2* g_pair, int numParts, const int* __restrict__ type, - int* __restrict__ g_pairTabPotType, const Exclude* __restrict__ excludes, - const int2* __restrict__ excludeMap, const int numExcludes, float pairlistdist2) -{ - const int TotalBlocks = gridDim.x * gridDim.y; - const int cells = TotalBlocks / Size; - const int cell_start = (blockIdx.x + gridDim.x * blockIdx.y) / Size; - const int pid_start = (blockIdx.x + gridDim.x * blockIdx.y) % Size; - const int tid = threadIdx.x + blockDim.x * threadIdx.y - + blockDim.x * blockDim.y * threadIdx.z; - const int warpLane = tid % WARPSIZE; - const int repID = blockIdx.z; - - const CellDecomposition::cell_t* __restrict__ cellInfo = decomp->getCells_d(); - - for(int cellid_i = cell_start; cellid_i < nCells; cellid_i += cells) - { - CellDecomposition::range_t rangeI = decomp->getRange(cellid_i,repID); - int Ni = rangeI.last-rangeI.first; - - for(int pid_i = pid_start; pid_i < Ni; pid_i += Size) - { - int ai = cellInfo[rangeI.first+pid_i].particle; - - float4 a = tex1Dfetch<float4>(PosTex,ai); - Vector3 A(a.x,a.y,a.z); - - int2 ex_pair = make_int2(-1,-1); - if(numExcludes > 0 && excludeMap != NULL) - { - ex_pair = excludeMap[ai]; - } - - int currEx = ex_pair.x; - int nextEx = (ex_pair.x >= 0) ? excludes[currEx].ind2 : -1; - - //loop over neighbor directions - for(int idx = 0; idx < 27; ++idx) - { - int neighbor_cell = tex1Dfetch(NeighborsTex,idx+27*cellid_i); - - if(neighbor_cell < 0) - { - continue; - } - - CellDecomposition::range_t rangeJ = decomp->getRange(neighbor_cell,repID); - int Nj = rangeJ.last-rangeJ.first; - - // In each neighbor cell, loop over particles - for(int pid_j = tid; pid_j < Nj; pid_j += BlockSize) - { - int aj = cellInfo[pid_j+rangeJ.first].particle; - if(aj <= ai) - { - continue; - } - - while (nextEx >= 0 && nextEx < ( aj - repID * num)) - { - nextEx = (currEx < ex_pair.y - 1) ? excludes[++currEx].ind2 : -1; - } - - if (nextEx == (aj - repID * num)) - { - #ifdef DEBUGEXCLUSIONS - atomicAggInc( exSum, warpLane ); - #endif - nextEx = (currEx < ex_pair.y - 1) ? excludes[++currEx].ind2 : -1; - continue; - } - - float4 b = tex1Dfetch(PosTex,aj); - Vector3 B(b.x,b.y,b.z); - - float dr = (sys->wrapDiff(A-B)).length2(); - - if(dr < pairlistdist2) - { - int gid = atomicAggInc( g_numPairs, warpLane ); - int pairType = type[ai] + type[aj] * numParts; - - g_pair[gid] = make_int2(ai,aj); - g_pairTabPotType[gid] = pairType; - } - } - } - } - } -} -#endif __global__ void createPairlists_debug(Vector3* __restrict__ pos, const int num, const int numReplicas, const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp, @@ -597,104 +501,6 @@ void createPairlists_debug(Vector3* __restrict__ pos, const int num, const int n } } -//#else -#if 0 -__global__ -void createPairlists(Vector3* __restrict__ pos, const int num, const int numReplicas, - const BaseGrid* __restrict__ sys, const CellDecomposition* __restrict__ decomp, - const int nCells, - int* g_numPairs, int2* g_pair, - int numParts, const int* __restrict__ type, int* __restrict__ g_pairTabPotType, - const Exclude* __restrict__ excludes, const int2* __restrict__ excludeMap, const int numExcludes, - float pairlistdist2, const int* __restrict__ CellNeighborsList) { - // TODO: loop over all cells with edges within pairlistdist2 - - // Loop over threads searching for atom pairs - // Each thread has designated values in shared memory as a buffer - // A sync operation periodically moves data from shared to global - const int tid = threadIdx.x; - const int warpLane = tid % WARPSIZE; /* RBTODO: optimize */ - const int split = 32; /* numblocks should be divisible by split */ - /* const int blocksPerCell = gridDim.x/split; */ - const CellDecomposition::cell_t* __restrict__ cellInfo = decomp->getCells(); - for (int cID = 0 + (blockIdx.x % split); cID < nCells; cID += split) { - // for (int cID = blockIdx.x/blocksPerCell; cID < nCells; cID += split ) { - for (int repID = 0; repID < numReplicas; repID++) { - const CellDecomposition::range_t rangeI = decomp->getRange(cID, repID); - - for (int ci = rangeI.first + blockIdx.x/split; ci < rangeI.last; ci += gridDim.x/split) { - // ai - index of the particle in the original, unsorted array - const int ai = cellInfo[ci].particle; - // const CellDecomposition::cell_t celli = decomp->getCellForParticle(ai); - const CellDecomposition::cell_t celli = cellInfo[ci]; - // Vector3 posi = pos[ai]; - - // Same as for bonds, but for exclusions now - const int ex_start = (numExcludes > 0 && excludeMap != NULL) ? excludeMap[ai -repID*num].x : -1; - const int ex_end = (numExcludes > 0 && excludeMap != NULL) ? excludeMap[ai -repID*num].y : -1; - /* - for (int x = -1; x <= 1; ++x) { - for (int y = -1; y <= 1; ++y) { - #pragma unroll(3) - for (int z = -1; z <= 1; ++z) { - */ - for(int x = 0; x < 27; ++x) { - //const int nID = decomp->getNeighborID(celli, x, y, z); - const int nID = CellNeighborsList[x+27*cID];//elli.id]; - if (nID < 0) continue; - - // Initialize exclusions - // TODO: optimize exclusion code (and entire kernel) - int currEx = ex_start; - int nextEx = (ex_start >= 0) ? excludes[currEx].ind2 : -1; - int ajLast = -1; // TODO: remove this sanity check - - const CellDecomposition::range_t range = decomp->getRange(nID, repID); - - for (int n = range.first + tid; n < range.last; n+=blockDim.x) { - const int aj = cellInfo[n].particle; - if (aj <= ai) continue; - - // Skip excludes - // Implementation requires that aj increases monotonically - assert( ajLast < aj ); ajLast = aj; // TODO: remove this sanity check - while (nextEx >= 0 && nextEx < (aj - repID * num)) // TODO get rid of this - nextEx = (currEx < ex_end - 1) ? excludes[++currEx].ind2 : -1; - if (nextEx == (aj - repID * num)) { -#ifdef DEBUGEXCLUSIONS - atomicAggInc( exSum, warpLane ); -#endif - nextEx = (currEx < ex_end - 1) ? excludes[++currEx].ind2 : -1; - continue; - } - // TODO: Skip non-interacting types for efficiency - - // Skip ones that are too far away - const float dr = (sys->wrapDiff(pos[aj] - pos[ai])).length2(); - // const float dr = (sys->wrapDiff(pos[aj] - posi)).length2(); - if (dr > pairlistdist2) continue; - - // Add to pairlist - - int gid = atomicAggInc( g_numPairs, warpLane ); - int pairType = type[ai] + type[aj] * numParts; - /* assert( ai >= 0 ); */ - /* assert( aj >= 0 ); */ - - g_pair[gid] = make_int2(ai,aj); - g_pairTabPotType[gid] = pairType; - - // g_pairDists[gid] = dr; - } // atoms J - //} // z - //} // y - } // x - } // atoms I - } // replicas - /* if (tid == 0) printf("Cell%d: found %d pairs\n",cID,g_numPairs[cID]); */ - } -} -#endif // TODO: deprecate? __global__ void computeKernel(Vector3 force[], Vector3 pos[], int type[], @@ -774,42 +580,6 @@ __global__ void printPairForceCounter() { printf("Computed the force for %d pairs\n", pairForceCounter); } -// ============================================================================ -// Kernel computes forces between Brownian particles (ions) -// using cell decomposition -// -#if 0 -__global__ void computeTabulatedKernel( - Vector3* force, const Vector3* __restrict__ pos, - const BaseGrid* __restrict__ sys, float cutoff2, - const int* __restrict__ g_numPairs, const int2* __restrict__ g_pair, const int* __restrict__ g_pairTabPotType, TabulatedPotential** __restrict__ tablePot) { -//remove int* type. remove bond references - - const int numPairs = *g_numPairs; - for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < numPairs; i+=blockDim.x*gridDim.x) { - const int2 pair = g_pair[i]; - const int ai = pair.x; - const int aj = pair.y; - - // Particle's type and position - const int ind = g_pairTabPotType[i]; - - /* printf("tid,bid,pos[ai(%d)]: %d %d %f %f %f\n", ai, threadIdx.x, blockIdx.x, pos[ai].x, pos[ai].y, pos[ai].z); //*/ - - Vector3 dr = pos[aj] - pos[ai]; - dr = sys->wrapDiff(dr); - - // Calculate the force based on the tabulatedPotential file - float d2 = dr.length2(); - if (tablePot[ind] != NULL && d2 <= cutoff2) { - Vector3 f = tablePot[ind]->computef(dr,d2); - atomicAdd( &force[ai], f ); - atomicAdd( &force[aj], -f ); - } - } -} -#endif -//#if 0 template<const int BlockSize> __device__ inline void _computeTabulatedKernel(Vector3* force, const BaseGrid* __restrict__ sys, float cutoff2, const int numPairs, const int2* __restrict__ g_pair, @@ -870,11 +640,7 @@ __global__ void computeTabulatedKernel(Vector3* force, const BaseGrid* __restric cutoff2, numPairs, g_pair+start, g_pairTabPotType+start, tablePot, pairListsTex, PosTex, pairTabPotTypeTex); -} - - -//#endif - +} __global__ void clearEnergies(float* __restrict__ g_energies, int num) { for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) { diff --git a/src/ComputeForce.h b/src/ComputeForce.h index 5d602e4a609482a73ba7c639f8a575fc3023b2a7..b3bc258541f9603f387221363f3c39fe7fbae881 100644 --- a/src/ComputeForce.h +++ b/src/ComputeForce.h @@ -174,6 +174,7 @@ private: int numReplicas; int num; int numParts; + int num_rb_attached_particles; int numBonds; int numExcludes; int numTabBondFiles; diff --git a/src/ComputeGridGrid.cu b/src/ComputeGridGrid.cu index 1ecfb9fad191cb1e70d787e384198b57b2a87883..f2eb6fde0bd9802b89537f39af10efd9f32618df 100644 --- a/src/ComputeGridGrid.cu +++ b/src/ComputeGridGrid.cu @@ -2,7 +2,6 @@ #include "ComputeGridGrid.cuh" #include "RigidBodyGrid.h" #include "CudaUtil.cuh" -//RBTODO handle periodic boundaries //RBTODO: add __restrict__, benchmark (Q: how to restrict member data?) class GridPositionTransformer { @@ -17,6 +16,7 @@ private: const Vector3 c; const BaseGrid* s; }; + //class PmfPositionTransformer : public BasePositionTransformer { class PmfPositionTransformer { public: @@ -52,8 +52,7 @@ inline void common_computeGridGridForce(const RigidBodyGrid* rho, const RigidBod Vector3 r_pos= rho->getPosition(r_id); /* i,j,k value of voxel */ r_pos = basis_rho.transform( r_pos ); - r_pos = transformer(r_pos); - const Vector3 u_ijk_float = basis_u_inv.transform( r_pos ); + const Vector3 u_ijk_float = basis_u_inv.transform( transformer( r_pos ) ); // RBTODO: Test for non-unit delta /* Vector3 tmpf = Vector3(0.0f); */ /* float tmpe = 0.0f; */ @@ -71,7 +70,8 @@ inline void common_computeGridGridForce(const RigidBodyGrid* rho, const RigidBod const float r_val = rho->val[r_id]; /* maybe move to beginning of function? */ force[tid].f = basis_u_inv.transpose().transform( r_val*(force[tid].f) ); /* transform to lab frame, with correct scaling factor */ force[tid].e = r_val; - // Calculate torque about origin_u in the lab frame + + // Calculate torque about origin_rho in the lab frame torque[tid].f = r_pos.cross(force[tid].f); } @@ -171,9 +171,11 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc } } + __global__ void createPartlist(const Vector3* __restrict__ pos, const int numTypeParticles, const int* __restrict__ typeParticles_d, + const int attached_particle_start, const int attached_particle_end, int* numParticles_d, int* particles_d, const Vector3 gridCenter, const float radius2, BaseGrid* sys_d) { const int tid = threadIdx.x; @@ -182,11 +184,13 @@ void createPartlist(const Vector3* __restrict__ pos, const int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numTypeParticles) { int aid = typeParticles_d[i]; - float dist = (sys_d->wrapDiff(pos[aid] - gridCenter)).length2(); - - if (dist <= radius2) { + if (aid < attached_particle_start || aid >= attached_particle_end) { + float dist = (sys_d->wrapDiff(pos[aid] - gridCenter)).length2(); + + if (dist <= radius2) { int tmp = atomicAggInc(numParticles_d, warpLane); particles_d[tmp] = aid; + } } } } diff --git a/src/ComputeGridGrid.cuh b/src/ComputeGridGrid.cuh index 739aa622e2514840f8586c47d0a277d306427ff4..27fc32ccb6b3b3fd43dd005fd9eda7eb1689c8f8 100644 --- a/src/ComputeGridGrid.cuh +++ b/src/ComputeGridGrid.cuh @@ -29,6 +29,7 @@ void computePartGridForce(const Vector3* __restrict__ pos, Vector3* particleForc extern __global__ void createPartlist(const Vector3* __restrict__ pos, const int numTypeParticles, const int* __restrict__ typeParticles_d, + const int attached_particle_start, const int attached_particle_end, int* numParticles_d, int* particles_d, const Vector3 gridCenter, const float radius2, BaseGrid* sys_d); diff --git a/src/Configuration.cpp b/src/Configuration.cpp index 41b4cf3a1dd9c97bd1bf9b28b2591058627b0287..e7bdfd71c57e163646c644d6ef3bb85485f4e7a7 100644 --- a/src/Configuration.cpp +++ b/src/Configuration.cpp @@ -6,6 +6,7 @@ #include <cassert> #include <stdlib.h> /* srand, rand */ #include <time.h> /* time */ +#include <string> #include <iostream> using namespace std; @@ -96,8 +97,24 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : } } // end result: variable "num" is set + // Count particles associated with rigid bodies + num_rb_attached_particles = 0; + if (numRigidTypes > 0) { + // grow list of rbs + for (int i = 0; i < numRigidTypes; i++) { + RigidBodyType &rbt = rigidBody[i]; + rbt.attach_particles(); + num_rb_attached_particles += rbt.num * rbt.num_attached_particles(); + } + } + assert( num_rb_attached_particles == 0 || simNum == 1 ); // replicas not yet implemented + // num = num+num_rb_attached_particles; + + // Set the number capacity printf("\n%d particles\n", num); + printf("%d particles attached to RBs\n", num_rb_attached_particles); + if (numCap <= 0) numCap = numCapFactor*num; // max number of particles if (numCap <= 0) numCap = 20; @@ -105,18 +122,35 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : printf("%d groups\n", numGroupSites); // Allocate particle variables. - pos = new Vector3[num * simNum]; + // Each replica works with num+num_rb_attached_particles in array + pos = new Vector3[ (num+num_rb_attached_particles) * simNum]; + //Han-Yi Chou if (ParticleDynamicType == String("Langevin") || ParticleDynamicType == String("NoseHooverLangevin")) - momentum = new Vector3[num * simNum]; - - type = new int[num * simNum]; - serial = new int[num * simNum]; - posLast = new Vector3[num * simNum]; + momentum = new Vector3[(num+num_rb_attached_particles) * simNum]; + + type = new int[(num+num_rb_attached_particles) * simNum]; + serial = new int[(num+num_rb_attached_particles) * simNum]; + posLast = new Vector3[(num+num_rb_attached_particles) * simNum]; + + { + int pidx = 0; + for (int i = 0; i < numRigidTypes; i++) { // Loop over RB types + RigidBodyType &rbt = rigidBody[i]; + for (int j = 0; j < rbt.num; ++j) { // Loop over RBs + for (const int& t: rbt.get_attached_particle_types()) { + type[num+pidx] = t; + serial[num+pidx] = num+pidx; + pidx++; + } + } + } + } + //Han-Yi Chou if(ParticleDynamicType == String("Langevin") || ParticleDynamicType == String("NoseHooverLangevin")) - momLast = new Vector3[num * simNum]; - name = new String[num * simNum]; + momLast = new Vector3[(num+num_rb_attached_particles) * simNum]; + name = new String[(num+num_rb_attached_particles) * simNum]; currSerial = 0; @@ -163,7 +197,12 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : String* tokenList = new String[numTokens]; partsFromFile[i].tokenize(tokenList); - int currType = 0; + int currType = find_particle_type(tokenList[2]); + if (currType == -1) { + printf("Error: Unable to find particle type %s\n", tokenList[2].val()); + exit(1); + + } for (int j = 0; j < numParts; j++) if (tokenList[2] == part[j].name) currType = j; @@ -215,7 +254,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : printf("done!\n"); } else - loadedMomentum = Boltzmann(COM_Velocity, num * simNum); + loadedMomentum = Boltzmann(COM_Velocity, (num * simNum)); } } } @@ -257,72 +296,66 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : } printf("\nFound %d particle types.\n", numParts); - // Load the potential grids. + printf("Loading the potential grids...\n"); + // First load a single copy of each grid for (int i = 0; i < numParts; i++) { - String map = partGridFile[i][0]; - int len = map.length(); - - if (len >= 3 && map[len-3]=='.' && map[len-2]=='d' && map[len-1]=='x') - { - part[i].pmf = new BaseGrid[part[i].numPartGridFiles]; - part[i].meanPmf = new float[part[i].numPartGridFiles]; - for(int j = 0; j < part[i].numPartGridFiles; ++j) - { - map = partGridFile[i][j]; - len = map.length(); - if (!(len >= 3 && map[len-3]=='.' && map[len-2]=='d' && map[len-1]=='x')) - { - cout << "currently do not support different format " << endl; - exit(1); - } - // Decide which type of grid is given. - //String map = partGridFile[i]; - - // A dx file. Load the old-fashioned way. - //part[i].pmf[j] = new BaseGrid(map.val()); - BaseGrid tmp(map.val()); - part[i].pmf[j] = tmp; - if (partGridFileScale[i][j] != 1.0f) - part[i].pmf[j].scale(partGridFileScale[i][j]); - - //part[i].meanPmf = part[i].pmf->mean(); - part[i].meanPmf[j] = part[i].pmf[j].mean(); - printf("Loaded dx potential grid `%s'.\n", map.val()); - printf("Grid size %s.\n", part[i].pmf[j].getExtent().toString().val()); - } - } - else if (len >= 4 && map[len-4]=='.' && map[len-3]=='d' && map[len-2]=='e' && map[len-1]=='f') - { - OverlordGrid* over = new OverlordGrid[part[i].numPartGridFiles]; - part[i].meanPmf = new float[part[i].numPartGridFiles]; - for(int j = 0; j < part[i].numPartGridFiles; ++j) - { - map = partGridFile[i][j]; - len = map.length(); - if (!(len >= 4 && map[len-4]=='.' && map[len-3]=='d' && map[len-2]=='e' && map[len-1]=='f')) - { - cout << "currently do not support different format " << endl; - exit(1); - } - - String rootGrid = OverlordGrid::readDefFirst(map); - over[j] = OverlordGrid(rootGrid.val()); - int count = over->readDef(map); - printf("Loaded system def file `%s'.\n", map.val()); - printf("Found %d unique grids.\n", over->getUniqueGridNum()); - printf("Linked %d subgrids.\n", count); - part[i].meanPmf[j] = part[i].pmf[j].mean(); - } - part[i].pmf = static_cast<BaseGrid*>(over); - } - else - { - printf("WARNING: Unrecognized gridFile extension. Must be *.def or *.dx.\n"); - exit(-1); - } + for(int j = 0; j < part[i].numPartGridFiles; ++j) + { + std::string fname(partGridFile[i][j].val(), partGridFile[i][j].length()); + if (part_grid_dictionary.count( fname ) == 0) + { + int len = fname.length(); + if (len >= 3 && fname[len-3]=='.' && fname[len-2]=='d' && fname[len-1]=='x') + { + part_grid_dictionary.insert({fname, BaseGrid(fname.c_str())}); + } + else if (len >= 4 && fname[len-4]=='.' && fname[len-3]=='d' && fname[len-2]=='e' && fname[len-1]=='f') + { + assert(1==2); // Throw exception because this implementation needs to be revisited +/* OverlordGrid* over = new OverlordGrid[part[i].numPartGridFiles]; + part[i].meanPmf = new float[part[i].numPartGridFiles]; + for(int j = 0; j < part[i].numPartGridFiles; ++j) + { + map = partGridFile[i][j]; + len = map.length(); + if (!(len >= 4 && map[len-4]=='.' && map[len-3]=='d' && map[len-2]=='e' && map[len-1]=='f')) + { + cout << "currently do not support different format " << endl; + exit(1); + } + + String rootGrid = OverlordGrid::readDefFirst(map); + over[j] = OverlordGrid(rootGrid.val()); + int count = over->readDef(map); + printf("Loaded system def file `%s'.\n", map.val()); + printf("Found %d unique grids.\n", over->getUniqueGridNum()); + printf("Linked %d subgrids.\n", count); + part[i].meanPmf[j] = part[i].pmf[j].mean(); + } + part[i].pmf = static_cast<BaseGrid*>(over); +*/ + } else { + printf("WARNING: Unrecognized gridFile extension. Must be *.def or *.dx.\n"); + exit(-1); + } + } + } + } + // Then assign grid addresses to particles + for (int i = 0; i < numParts; i++) + { + part[i].pmf = new BaseGrid*[part[i].numPartGridFiles]; + part[i].pmf_scale = new float[part[i].numPartGridFiles]; + part[i].meanPmf = new float[part[i].numPartGridFiles]; + for(int j = 0; j < part[i].numPartGridFiles; ++j) + { + part[i].pmf[j] = &(part_grid_dictionary.find( std::string(partGridFile[i][j]) )->second); + part[i].pmf_scale[j] = partGridFileScale[i][j]; + part[i].meanPmf[j] = part[i].pmf[j]->mean(); // TODO: review how this is used and decide whether to scale + } if (partForceXGridFile[i].length() != 0) { part[i].forceXGrid = new BaseGrid(partForceXGridFile[i].val()); printf("Loaded `%s'.\n", partForceXGridFile[i].val()); @@ -385,7 +418,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : sys = new BaseGrid( Matrix3(basis1,basis2,basis3), origin, 1, 1, 1 ); } else { // TODO: use largest system in x,y,z - sys = part[0].pmf; + sys = *part[0].pmf; } sysDim = sys->getExtent(); @@ -425,13 +458,45 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : excludes = new Exclude[numExcludes]; for (int i = 0; i < oldNumExcludes; i++) excludes[i] = tempExcludes[i]; - delete tempExcludes; + delete [] tempExcludes; } for (int i = oldNumExcludes; i < numExcludes; i++) excludes[i] = newExcludes[i - oldNumExcludes]; printf("Built %d exclusions.\n",numExcludes); } + { // Add exclusions for RB attached particles + std::vector<Exclude> ex; + int start = num; + for (int i = 0; i < numRigidTypes; i++) { // Loop over RB types + RigidBodyType &rbt = rigidBody[i]; + const int nap = rbt.num_attached_particles(); + for (int j = 0; j < rbt.num; ++j) { // Loop over RBs + for (int ai = 0; ai < nap-1; ++ai) { + for (int aj = ai+1; aj < nap; ++aj) { + ex.push_back( Exclude( ai+start, aj+start ) ); + } + } + start += nap; + } + } + // copy + int oldNumExcludes = numExcludes; + numExcludes = numExcludes + ex.size(); + if (excludes == NULL) { + excludes = new Exclude[numExcludes]; + } else if (numExcludes >= excludeCapacity) { + Exclude* tempExcludes = excludes; + excludes = new Exclude[numExcludes]; + for (int i = 0; i < oldNumExcludes; i++) + excludes[i] = tempExcludes[i]; + delete [] tempExcludes; + } + for (int i = oldNumExcludes; i < numExcludes; i++) + excludes[i] = ex[i - oldNumExcludes]; + } + + printf("Built %d exclusions.\n",numExcludes); buildExcludeMap(); // Count number of particles of each type @@ -439,7 +504,7 @@ Configuration::Configuration(const char* config_file, int simNum, bool debug) : for (int i = 0; i < numParts; ++i) { numPartsOfType[i] = 0; } - for (int i = 0; i < num; ++i) { + for (int i = 0; i < num+num_rb_attached_particles; ++i) { ++numPartsOfType[type[i]]; } @@ -594,7 +659,17 @@ Configuration::~Configuration() { } void Configuration::copyToCUDA() { - printf("Copying particle data to GPU %d\n", GPUManager::current()); + printf("Copying particle grids to GPU %d\n", GPUManager::current()); + for (const auto& pair : part_grid_dictionary) + { + // Copy PMF + const BaseGrid& g = pair.second; + BaseGrid *g_d = g.copy_to_cuda(); + part_grid_dictionary_d.insert({pair.first, g_d}); + // printf("Assigning grid for %s to %p (originally %p)\n", pair.first.c_str(), (void *) part_grid_dictionary_d[pair.first], (void *) g_d); + } + + printf("Copying particle data to GPU %d\n", GPUManager::current()); BrownianParticleType **part_addr = new BrownianParticleType*[numParts]; @@ -604,11 +679,31 @@ void Configuration::copyToCUDA() { for (int i = 0; i < numParts; i++) { - BaseGrid *pmf = NULL, *diffusionGrid = NULL; BrownianParticleType *b = new BrownianParticleType(part[i]); - // Copy PMF + // Copy PMF pointers if (part[i].pmf != NULL) { + { + BaseGrid** tmp_d = new BaseGrid*[part[i].numPartGridFiles]; + BaseGrid** tmp = new BaseGrid*[part[i].numPartGridFiles]; + for(int j = 0; j < part[i].numPartGridFiles; ++j) { + // printf("Retrieving grid for %s (at %p)\n", partGridFile[i][j].val(), (void *) part_grid_dictionary_d[std::string(partGridFile[i][j])]); + tmp[j] = part_grid_dictionary_d[std::string(partGridFile[i][j])]; + } + gpuErrchk(cudaMalloc(&tmp_d, sizeof(BaseGrid*)*part[i].numPartGridFiles)); + gpuErrchk(cudaMemcpy(tmp_d, tmp, sizeof(BaseGrid*)*part[i].numPartGridFiles, + cudaMemcpyHostToDevice)); + b->pmf = tmp_d; + } + + { + float *tmp; + gpuErrchk(cudaMalloc(&tmp, sizeof(float)*part[i].numPartGridFiles)); + gpuErrchk(cudaMemcpy(tmp, part[i].pmf_scale, sizeof(float)*part[i].numPartGridFiles, + cudaMemcpyHostToDevice)); + b->pmf_scale = tmp; + } + { float *tmp; gpuErrchk(cudaMalloc(&tmp, sizeof(float)*part[i].numPartGridFiles)); @@ -624,40 +719,17 @@ void Configuration::copyToCUDA() { gpuErrchk(cudaMemcpy(tmp, part[i].pmf_boundary_conditions, s, cudaMemcpyHostToDevice)); b->pmf_boundary_conditions = tmp; } - - gpuErrchk(cudaMalloc(&pmf, sizeof(BaseGrid)*part[i].numPartGridFiles)); - for(int j = 0; j < part[i].numPartGridFiles; ++j) - { - float *val = NULL; - size_t sz = sizeof(float) * part[i].pmf[j].getSize(); - //gpuErrchk(cudaMalloc(pmf, sizeof(BaseGrid))); - gpuErrchk(cudaMalloc(&val, sz)); - gpuErrchk(cudaMemcpyAsync(val, part[i].pmf[j].val, sz, cudaMemcpyHostToDevice)); - BaseGrid *pmf_h = new BaseGrid(part[i].pmf[j]); - pmf_h->val = val; - gpuErrchk(cudaMemcpy(pmf+j, pmf_h, sizeof(BaseGrid), cudaMemcpyHostToDevice)); - pmf_h->val = NULL; - } } - b->pmf = pmf; // Copy the diffusion grid if (part[i].diffusionGrid != NULL) { - float *val = NULL; - size_t sz = sizeof(float) * part[i].diffusionGrid->getSize(); - BaseGrid *diffusionGrid_h = new BaseGrid(*part[i].diffusionGrid); - gpuErrchk(cudaMalloc(&diffusionGrid, sizeof(BaseGrid))); - gpuErrchk(cudaMalloc(&val, sz)); - diffusionGrid_h->val = val; - gpuErrchk(cudaMemcpyAsync(diffusionGrid, diffusionGrid_h, sizeof(BaseGrid), - cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(val, part[i].diffusionGrid->val, sz, cudaMemcpyHostToDevice)); - diffusionGrid_h->val = NULL; + b->diffusionGrid = part[i].diffusionGrid->copy_to_cuda(); + } else { + b->diffusionGrid = NULL; } //b->pmf = pmf; - b->diffusionGrid = diffusionGrid; gpuErrchk(cudaMalloc(&part_addr[i], sizeof(BrownianParticleType))); gpuErrchk(cudaMemcpyAsync(part_addr[i], b, sizeof(BrownianParticleType), cudaMemcpyHostToDevice)); @@ -677,7 +749,7 @@ void Configuration::copyToCUDA() { gpuErrchk(cudaMalloc(&sys_d, sizeof(BaseGrid))); gpuErrchk(cudaMemcpyAsync(sys_d, sys, sizeof(BaseGrid), cudaMemcpyHostToDevice)); /*gpuErrchk(cudaMalloc(&type_d, sizeof(int) * num * simNum)); - gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * num * simNum, cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int+num_rb_attached_particles) * num * simNum, cudaMemcpyHostToDevice)); if (numBonds > 0) { // bonds_d @@ -696,7 +768,7 @@ void Configuration::copyToCUDA() { cudaMemcpyHostToDevice)); // excludeMap_d - gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * num)); + gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * (num)); gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * num, cudaMemcpyHostToDevice)); } @@ -1115,6 +1187,8 @@ int Configuration::readParameters(const char * config_file) { else if (param == String("rotDamping")) rigidBody[currRB].rotDamping = stringToVector3( value ); + else if (param == String("attachedParticles")) + rigidBody[currRB].append_attached_particle_file(value); else if (param == String("densityGrid")) rigidBody[currRB].addDensityGrid(value); else if (param == String("potentialGrid")) @@ -1135,6 +1209,8 @@ int Configuration::readParameters(const char * config_file) { rigidBody[currRB].initAngularMomentum = stringToVector3(value); else if (param == String("inputRBCoordinates")) inputRBCoordinates = value; + else if (param == String("restartRBCoordinates")) + restartRBCoordinates = value; // COMMON else if (param == String("num")) { @@ -1271,6 +1347,7 @@ void Configuration::readAtoms() { for (int i = 0; i < numParts; i++) if (part[i].num == 0) found = false; + // assert(false); // TODO probably relax constraint that particle must be found; could just be in RB if (!found) { printf("ERROR: Number of particles not specified in config file.\n"); exit(1); @@ -1450,9 +1527,11 @@ void Configuration::readGroups() { std::vector<int> tmp; for (int i=1; i < numTokens; ++i) { const int ai = atoi(tokenList[i].val()); - if (ai >= num) { + if (ai >= num+num_rb_attached_particles) { printf("Error: Attempted to include invalid particle in group: %s\n", line); exit(-1); + } else if (ai >= num) { + printf("WARNING: including RB particles in group with line: %s\n", line); } tmp.push_back( ai ); } @@ -1510,7 +1589,8 @@ void Configuration::readBonds() { continue; } - if (ind1 < 0 || ind1 >= num+numGroupSites || ind2 < 0 || ind2 >= num+numGroupSites) { + if (ind1 < 0 || ind1 >= num+num_rb_attached_particles+numGroupSites || + ind2 < 0 || ind2 >= num+num_rb_attached_particles+numGroupSites) { printf("ERROR: Bond file line '%s' includes invalid index\n", line); exit(1); } @@ -1558,8 +1638,8 @@ void Configuration::readBonds() { * bondMap[i].x is the index in the bonds array where the ith particle's bonds begin * bondMap[i].y is the index in the bonds array where the ith particle's bonds end */ - bondMap = new int2[num+numGroupSites]; - for (int i = 0; i < num+numGroupSites; i++) { + bondMap = new int2[num+num_rb_attached_particles+numGroupSites]; + for (int i = 0; i < num+num_rb_attached_particles+numGroupSites; i++) { bondMap[i].x = -1; bondMap[i].y = -1; } @@ -1617,8 +1697,8 @@ void Configuration::readExcludes() } } void Configuration::addExclusion(int ind1, int ind2) { - if (ind1 >= num || ind2 >= num) { - printf("WARNING: Attempted to add an exclusion for an out-of-range particle index (%d or %d >= %d).\n", ind1, ind2, num); + if (ind1 >= num+num_rb_attached_particles || ind2 >= num+num_rb_attached_particles) { + printf("WARNING: Attempted to add an exclusion for an out-of-range particle index (%d or %d >= %d).\n", ind1, ind2, num+num_rb_attached_particles); return; } @@ -1661,8 +1741,8 @@ void Configuration::buildExcludeMap() { * excludeMap[i].x is the index in the excludes array where the ith particle's excludes begin * excludeMap[i].y is the index in the excludes array where the ith particle's excludes end */ - excludeMap = new int2[num]; - for (int i = 0; i < num; i++) { + excludeMap = new int2[num+num_rb_attached_particles]; + for (int i = 0; i < num+num_rb_attached_particles; i++) { excludeMap[i].x = -1; excludeMap[i].y = -1; } @@ -1671,7 +1751,7 @@ void Configuration::buildExcludeMap() { for (int i = 0; i < numExcludes; i++) { if (excludes[i].ind1 != currPart) { currPart = excludes[i].ind1; - assert(currPart < num); + assert(currPart < num+num_rb_attached_particles); excludeMap[currPart].x = i; if (lastPart >= 0) excludeMap[lastPart].y = i; @@ -1720,7 +1800,7 @@ void Configuration::readAngles() { int ind3 = atoi(tokenList[3].val()); String file_name = tokenList[4]; //printf("file_name %s\n", file_name.val()); - if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites or ind3 >= num+numGroupSites) + if (ind1 >= num+num_rb_attached_particles+numGroupSites or ind2 >= num+num_rb_attached_particles+numGroupSites or ind3 >= num+num_rb_attached_particles+numGroupSites) continue; if (numAngles >= capacity) { @@ -1781,8 +1861,10 @@ void Configuration::readDihedrals() { int ind4 = atoi(tokenList[4].val()); String file_name = tokenList[5]; //printf("file_name %s\n", file_name.val()); - if (ind1 >= num+numGroupSites or ind2 >= num+numGroupSites - or ind3 >= num+numGroupSites or ind4 >= num+numGroupSites) + if (ind1 >= num+num_rb_attached_particles+numGroupSites or + ind2 >= num+num_rb_attached_particles+numGroupSites or + ind3 >= num+num_rb_attached_particles+numGroupSites or + ind4 >= num+num_rb_attached_particles+numGroupSites) continue; if (numDihedrals >= capacity) { @@ -1841,7 +1923,7 @@ void Configuration::readRestraints() { float y0 = (float) strtod(tokenList[4].val(), NULL); float z0 = (float) strtod(tokenList[5].val(), NULL); - if (id >= num+numGroupSites) continue; + if (id >= num + num_rb_attached_particles + numGroupSites) continue; if (numRestraints >= capacity) { Restraint* temp = restraints; diff --git a/src/Configuration.h b/src/Configuration.h index debf4b9f6ea4d0ce8eaa9c52d70d6c57f1efdcbb..6199dd8b24bc019dd7044b2a3a6031312142f13d 100644 --- a/src/Configuration.h +++ b/src/Configuration.h @@ -11,6 +11,7 @@ #include <algorithm> // sort #include <vector> +#include <map> #include <cuda.h> #include <cuda_runtime.h> @@ -64,6 +65,7 @@ class Configuration { void readDihedrals(); void readRestraints(); + bool readTableFile(const String& value, int currTab); bool readBondFile(const String& value, int currBond); bool readAngleFile(const String& value, int currAngle); @@ -88,6 +90,15 @@ public: Configuration(const char * config_file, int simNum = 0, bool debug=false); ~Configuration(); + int find_particle_type(const char* s) const { + for (int j = 0; j < numParts; j++) { + // printf("Searching particle %d (%s) =? %s\n", j, part[j].name.val(), s); + if (strcmp(s,part[j].name.val()) == 0) + return j; + } + return -1; + } + void copyToCUDA(); // Output variables @@ -121,7 +132,8 @@ public: Bond* bonds; int numCap; // max number of particles int num; // current number of particles - Vector3* pos; // position of each particle + int num_rb_attached_particles; + Vector3* pos; // position of each particle Vector3* momentum; //momentum of each brownian particles Han-Yi Chou Vector3 COM_Velocity; //center of mass velocity Han-Yi Chou int* type; // type of each particle @@ -148,6 +160,7 @@ public: String inputCoordinates; String inputMomentum; //Han-Yi Chou String inputRBCoordinates; + String restartRBCoordinates; int copyReplicaCoordinates; String restartCoordinates; String restartMomentum; //Han-Yi Chou @@ -208,6 +221,8 @@ public: //float* partGridFileScale; float **partGridFileScale; //int *numPartGridFiles; + std::map<std::string,BaseGrid> part_grid_dictionary; + std::map<std::string,BaseGrid*> part_grid_dictionary_d; std::vector< std::vector<String> > partRigidBodyGrid; String* partDiffusionGridFile; String* partForceXGridFile; diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu index a8576e54ab0e07474cdd18db2c6e138f27caf5da..fe55aad463b5b27ea6f095329ea14ed3917d498e 100644 --- a/src/GrandBrownTown.cu +++ b/src/GrandBrownTown.cu @@ -97,10 +97,11 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, bonds = c.bonds; numCap = c.numCap; // max number of particles num = c.num; // current number of particles + num_rb_attached_particles = c.num_rb_attached_particles; numGroupSites = c.numGroupSites; // Allocate arrays of positions, types and serial numbers - pos = new Vector3[(num+numGroupSites) * numReplicas]; // [HOST] array of particles' positions. + pos = new Vector3[(num+num_rb_attached_particles+numGroupSites) * numReplicas]; // [HOST] array of particles' positions. // Allocate arrays of momentum Han-Yi Chou if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) { @@ -112,8 +113,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, //for debug //force = new Vector3[num * numReplicas]; - type = new int[num * numReplicas]; // [HOST] array of particles' types. - serial = new int[num * numReplicas]; // [HOST] array of particles' serial numbers. + type = new int[(num+num_rb_attached_particles) * numReplicas]; // [HOST] array of particles' types. + serial = new int[(num+num_rb_attached_particles) * numReplicas]; // [HOST] array of particles' serial numbers. // Allocate things for rigid body // RBC = RigidBodyController(c); @@ -127,8 +128,10 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, // Replicate identical initial conditions across all replicas for (int r = 0; r < numReplicas; ++r) { - std::copy(c.type, c.type + num, type + r*num); - std::copy(c.serial, c.serial + num, serial + r*num); + std::copy(c.type, c.type + num+num_rb_attached_particles, + type + r*(num+num_rb_attached_particles)); + std::copy(c.serial, c.serial + num + num_rb_attached_particles, + serial + r*(num+num_rb_attached_particles)); if (c.copyReplicaCoordinates > 0) std::copy(c.pos, c.pos + num, pos + r*num); } @@ -307,8 +310,12 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, auto _get_index = [this](int idx, int replica) { // Convenient lambda function to deal with increasingly complicated indexing - auto num = this->num; auto numReplicas = this->numReplicas; auto numGroupSites = this->numGroupSites; - idx = (idx < num) ? idx + replica*num : (idx-num) + numReplicas*num + replica * numGroupSites; + auto num = this->num; + auto numReplicas = this->numReplicas; + auto num_rb_attached_particles = this->num_rb_attached_particles; + auto numGroupSites = this->numGroupSites; + idx = (idx < num+num_rb_attached_particles) ? idx + replica*(num+num_rb_attached_particles) + : (idx-num-num_rb_attached_particles) + numReplicas*(num+num_rb_attached_particles) + replica * numGroupSites; return idx; }; @@ -367,7 +374,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, } internal->copyBondedListsToGPU(bondList,angleList,dihedralList,dihedralPotList); - forceInternal = new Vector3[(num+numGroupSites)*numReplicas]; + forceInternal = new Vector3[(num+num_rb_attached_particles+numGroupSites)*numReplicas]; if (fullLongRange != 0) printf("No cell decomposition created.\n"); @@ -488,20 +495,20 @@ GrandBrownTown::~GrandBrownTown() { } -//temporary test for Nose-Hoover Langevin dynamics + //Nose Hoover is now implement for particles. -void GrandBrownTown::RunNoseHooverLangevin() +void GrandBrownTown::run() { - //comment this out because this is the origin points Han-Yi Chou + // Open the files for recording ionic currents for (int repID = 0; repID < numReplicas; ++repID) { - - writers[repID]->newFile(pos + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay + writers[repID]->newFile(pos + repID*(num+num_rb_attached_particles), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) momentum_writers[repID]->newFile((momentum + repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay //random_writers[repID]->newFile(random + (repID * num), name, 0.0f, num); } + // Initialize timers (util.*) wkf_timerhandle timer0, timerS; timer0 = wkf_timer_create(); @@ -569,16 +576,16 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+conf.num_rb_attached_particles), sys_d); gpuErrchk(cudaDeviceSynchronize()); } float t; // simulation time - int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1); + int numBlocks = ((num+num_rb_attached_particles) * numReplicas) / NUM_THREADS + (((num+num_rb_attached_particles) * numReplicas) % NUM_THREADS == 0 ? 0 : 1); int tl = temperatureGridFile.length(); Vector3 *force_d; - gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+numGroupSites) * numReplicas)); + gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*(num+num_rb_attached_particles+numGroupSites) * numReplicas)); printf("Configuration: %d particles | %d replicas\n", num, numReplicas); for (int i=0; i< gpuman.gpus.size(); ++i) { @@ -599,11 +606,23 @@ void GrandBrownTown::RunNoseHooverLangevin() internal->clear_force(); internal->clear_energy(); const std::vector<Vector3*>& _pos = internal->getPos_d(); - if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num, numGroupSites, numReplicas); + + if (num_rb_attached_particles > 0) { + #pragma omp parallel for + for(int i = 0; i < numReplicas; ++i) { + RBC[i]->update_attached_particle_positions( + internal->getPos_d()[0]+num+i*(num+num_rb_attached_particles), + internal->getForceInternal_d()[0]+num+i*(num+num_rb_attached_particles), + internal->getEnergy()+num+i*(num+num_rb_attached_particles), + sys_d, num, num_rb_attached_particles, numReplicas); + } + } + + if (numGroupSites > 0) updateGroupSites<<<(numGroupSites/32+1),32>>>(_pos[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas); #ifdef USE_NCCL if (gpuman.gpus.size() > 1) { - gpuman.nccl_broadcast(0, _pos, _pos, (num+numGroupSites)*numReplicas, -1); + gpuman.nccl_broadcast(0, _pos, _pos, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1); } #endif gpuman.sync(); @@ -633,7 +652,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d); } internal -> computeTabulated(get_energy); break; @@ -657,7 +676,7 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d); } internal->compute(get_energy); break; @@ -676,13 +695,32 @@ void GrandBrownTown::RunNoseHooverLangevin() } } }//if inter-particle force + + if (get_energy) { + compute_position_dependent_force_for_rb_attached_particles + <<< numBlocks, NUM_THREADS >>> ( + internal -> getPos_d()[0], internal -> getForceInternal_d()[0], + internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType); + } else { + compute_position_dependent_force_for_rb_attached_particles + <<< numBlocks, NUM_THREADS >>> ( + internal -> getPos_d()[0], + internal -> getForceInternal_d()[0], internal -> getEnergy(), + internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType); + } + + #ifdef _OPENMP omp_set_num_threads(4); #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, - RigidBodyInterpolationType, sys, sys_d); + RBC[i]->updateForces(internal->getPos_d()[0]+i*(num+num_rb_attached_particles), + internal->getForceInternal_d()[0]+i*(num+num_rb_attached_particles), + s, + internal->getEnergy()+i*(num+num_rb_attached_particles), + get_energy, + RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles); if(rigidbody_dynamic == String("Langevin")) { #ifdef _OPENMP @@ -698,28 +736,29 @@ void GrandBrownTown::RunNoseHooverLangevin() #ifdef USE_NCCL if (gpuman.gpus.size() > 1) { const std::vector<Vector3*>& _f = internal->getForceInternal_d(); - gpuman.nccl_reduce(0, _f, _f, (num+numGroupSites)*numReplicas, -1); + gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles+numGroupSites)*numReplicas, -1); } #endif - if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas); + if (numGroupSites > 0) distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas); }//if step == 1 internal->clear_energy(); gpuman.sync(); + if(particle_dynamic == String("Langevin")) - updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, ParticleInterpolationType); + updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal->getPos_d()[0], internal->getMom_d(), internal->getForceInternal_d()[0], internal->getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType); else if(particle_dynamic == String("NoseHooverLangevin")) //kernel for Nose-Hoover Langevin dynamic updateKernelNoseHooverLangevin<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), - internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, + internal -> getRan_d(), internal -> getForceInternal_d()[0], internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, ParticleInterpolationType); ////For Brownian motion else updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getForceInternal_d()[0], internal -> getType_d(), - part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, + part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, ParticleInterpolationType); if(rigidbody_dynamic == String("Langevin")) @@ -748,10 +787,21 @@ void GrandBrownTown::RunNoseHooverLangevin() } } + if (num_rb_attached_particles > 0) { + #pragma omp parallel for + for(int i = 0; i < numReplicas; ++i) { + RBC[i]->update_attached_particle_positions( + internal->getPos_d()[0]+num+i*(num+num_rb_attached_particles), + internal->getForceInternal_d()[0]+num+i*(num+num_rb_attached_particles), + internal->getEnergy()+num+i*(num+num_rb_attached_particles), + sys_d, num, num_rb_attached_particles, numReplicas); + } + } + if (s % outputPeriod == 0) { // Copy particle positions back to CPU gpuErrchk(cudaDeviceSynchronize()); - gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(pos, internal ->getPos_d()[0], sizeof(Vector3) * (num+num_rb_attached_particles) * numReplicas, cudaMemcpyDeviceToHost)); } if (imd_on && clientsock && s % outputPeriod == 0) { @@ -830,9 +880,10 @@ void GrandBrownTown::RunNoseHooverLangevin() for(int i = 0; i < numReplicas; ++i) RBC[i]->clearForceAndTorque(); + if (numGroupSites > 0) { gpuman.sync(); - updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num, numGroupSites, numReplicas); + updateGroupSites<<<(numGroupSites/32+1),32>>>(internal->getPos_d()[0], groupSiteData_d, num + num_rb_attached_particles, numGroupSites, numReplicas); gpuman.sync(); } @@ -844,7 +895,7 @@ void GrandBrownTown::RunNoseHooverLangevin() if (gpuman.gpus.size() > 1) { const std::vector<Vector3*>& _p = internal->getPos_d(); nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream(); - gpuman.nccl_broadcast(0, _p, _p, (num+numGroupSites)*numReplicas, nccl_broadcast_streams); + gpuman.nccl_broadcast(0, _p, _p, (num+num_rb_attached_particles+numGroupSites)*numReplicas, nccl_broadcast_streams); } #endif } @@ -865,13 +916,13 @@ void GrandBrownTown::RunNoseHooverLangevin() #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) - RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d); + RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*(num+num_rb_attached_particles), sys_d); } internal -> computeTabulated(get_energy); #ifdef USE_NCCL if (gpuman.gpus.size() > 1) { const std::vector<Vector3*>& _f = internal->getForceInternal_d(); - gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1); + gpuman.nccl_reduce(0, _f, _f, (num+num_rb_attached_particles)*numReplicas, -1); } #endif break; @@ -912,28 +963,43 @@ void GrandBrownTown::RunNoseHooverLangevin() } } } + + if (get_energy) { + compute_position_dependent_force_for_rb_attached_particles + <<< numBlocks, NUM_THREADS >>> ( + internal -> getPos_d()[0], internal -> getForceInternal_d()[0], + internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType); + } else { + compute_position_dependent_force_for_rb_attached_particles + <<< numBlocks, NUM_THREADS >>> ( + internal -> getPos_d()[0], + internal -> getForceInternal_d()[0], internal -> getEnergy(), + internal -> getType_d(), part_d, electricField, num, num_rb_attached_particles, numReplicas, ParticleInterpolationType); + } + + //compute the force for rigid bodies #ifdef _OPENMP omp_set_num_threads(4); #endif #pragma omp parallel for for(int i = 0; i < numReplicas; ++i) // TODO: Use different buffer for RB particle forces to avoid race condition - RBC[i]->updateForces((internal->getPos_d()[0])+i*num, (internal->getForceInternal_d()[0])+i*num, s, (internal->getEnergy())+i*num, get_energy, - RigidBodyInterpolationType, sys, sys_d); + RBC[i]->updateForces((internal->getPos_d()[0])+i*(num+num_rb_attached_particles), (internal->getForceInternal_d()[0])+i*(num+num_rb_attached_particles), s, (internal->getEnergy())+i*(num+num_rb_attached_particles), get_energy, + RigidBodyInterpolationType, sys, sys_d, num, num_rb_attached_particles); if (numGroupSites > 0) { gpuman.sync(); // if ((s%100) == 0) { - distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas); + distributeGroupSiteForces<true><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas); // } else { - // distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num, numGroupSites, numReplicas); + // distributeGroupSiteForces<false><<<(numGroupSites/32+1),32>>>(internal->getForceInternal_d()[0], groupSiteData_d, num+num_rb_attached_particles, numGroupSites, numReplicas); // } gpuman.sync(); } if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d()[0], internal -> getMom_d(), internal -> getForceInternal_d()[0], - internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, + internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, num_rb_attached_particles, sys_d, randoGen_d, numReplicas, internal->getEnergy(), get_energy, ParticleInterpolationType); //gpuErrchk(cudaDeviceSynchronize()); @@ -957,7 +1023,7 @@ void GrandBrownTown::RunNoseHooverLangevin() { if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) { - gpuErrchk(cudaMemcpy(momentum, internal -> getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(momentum, internal -> getMom_d(), sizeof(Vector3) * (num) * numReplicas, cudaMemcpyDeviceToHost)); } t = s*timestep; // Loop over all replicas @@ -968,12 +1034,12 @@ void GrandBrownTown::RunNoseHooverLangevin() updateNameList(); // no need for it here if particles stay the same // Write the trajectory. - writers[repID]->append(pos + (repID*num), name, serial, t, num); + writers[repID]->append(pos + repID*(num+num_rb_attached_particles), name, serial, t, num); if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) { - momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num); - //force_writers[repID]->append(force + (repID * num), name, serial, t, num); + momentum_writers[repID]->append(momentum + repID * (num+num_rb_attached_particles), name, serial, t, num); + //force_writers[repID]->append(force + repID * (num+num_rb_attached_particles), name, serial, t, num); } } // TODO: Currently, not compatible with replicas. Needs a fix. @@ -1005,7 +1071,7 @@ void GrandBrownTown::RunNoseHooverLangevin() float e = 0.f; float V = 0.f; thrust::device_ptr<float> en_d(internal->getEnergy()); - V = (thrust::reduce(en_d, en_d+num*numReplicas)) / numReplicas; + V = (thrust::reduce(en_d, en_d+(num+num_rb_attached_particles)*numReplicas)) / numReplicas; if(particle_dynamic == String("Langevin") || particle_dynamic == String("NoseHooverLangevin")) { gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost)); @@ -1112,511 +1178,12 @@ void GrandBrownTown::RunNoseHooverLangevin() gpuErrchk(cudaFree(force_d)); } // GrandBrownTown::run() -// Run the Brownian Dynamics steps. -void GrandBrownTown::run() { - - RunNoseHooverLangevin(); -#if 0 - printf("\n\n"); - Vector3 runningNetForce(0.0f); - - // Open the files for recording ionic currents - for (int repID = 0; repID < numReplicas; ++repID) { - writers[repID]->newFile(pos + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay - } - //Han-Yi Chou - if(particle_dynamic == String("Langevin")) - { - for (int repID = 0; repID < numReplicas; ++repID) - { - momentum_writers[repID]->newFile(momentum + (repID * num), name, 0.0f, num); // 'pos + (repID*num)' == array-to-pointer decay - //force_writers[repID]->newFile(force + (repID * num), name, 0.0f, num); - } - } - - // Save (remember) particle positions for later (analysis) - remember(0.0f); - - // Initialize timers (util.*) - wkf_timerhandle timer0, timerS; - timer0 = wkf_timer_create(); - timerS = wkf_timer_create(); - - copyToCUDA(); - if(particle_dynamic == String("Langevin")) - internal -> copyToCUDA(forceInternal, pos,momentum); - else - internal -> copyToCUDA(forceInternal, pos); - - // IMD Stuff - void* sock = NULL; - void* clientsock = NULL; - int length; - if (imd_on) { - printf("Setting up incoming socket\n"); - vmdsock_init(); - sock = vmdsock_create(); - clientsock = NULL; - vmdsock_bind(sock, imd_port); - - printf("Waiting for IMD connection on port %d...\n", imd_port); - vmdsock_listen(sock); - while (!clientsock) { - if (vmdsock_selread(sock, 0) > 0) { - clientsock = vmdsock_accept(sock); - if (imd_handshake(clientsock)) - clientsock = NULL; - } - } - sleep(1); - if (vmdsock_selread(clientsock, 0) != 1 || - imd_recv_header(clientsock, &length) != IMD_GO) { - clientsock = NULL; - } - imdForces = new Vector3[num*numReplicas]; - for (size_t i = 0; i < num; ++i) // clear old forces - imdForces[i] = Vector3(0.0f); - - } // endif (imd_on) - - // Start timers - wkf_timer_start(timer0); - wkf_timer_start(timerS); - - // We haven't done any steps yet. - // Do decomposition if we have to - if (fullLongRange == 0) - { - // cudaSetDevice(0); - internal->decompose(); - gpuErrchk(cudaDeviceSynchronize()); - RBC.updateParticleLists( internal->getPos_d() ); - } - - float t; // simulation time - - int numBlocks = (num * numReplicas) / NUM_THREADS + ((num * numReplicas) % NUM_THREADS == 0 ? 0 : 1); - int tl = temperatureGridFile.length(); - Vector3 *force_d; - gpuErrchk(cudaMalloc((void**)&force_d, sizeof(Vector3)*num * numReplicas)); - - printf("Configuration: %d particles | %d replicas\n", num, numReplicas); - - // Main loop over Brownian dynamics steps - for (long int s = 1; s < steps; s++) - { - // Compute the internal forces. Only calculate the energy when we are about to output. - bool get_energy = ((s % outputEnergyPeriod) == 0); - //At the very first time step, the force is computed - if(s == 1) - { - // 'interparticleForce' - determines whether particles interact with each other - gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3))); - RBC.clearForceAndTorque(); //Han-Yi Chou - - if (interparticleForce) - { - //gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3))); - //RBC.clearForceAndTorque(); //Han-Yi Chou - - // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas - if (tabulatedPotential) - { - // Using tabulated potentials - // 'fullLongRange' - determines whether 'cutoff' is used - // 0 - use cutoff (cell decomposition) [ N*log(N) ] - // 1 - do not use cutoff [ N^2 ] - switch (fullLongRange) - { - case 0: // [ N*log(N) ] interactions, + cutoff | decomposition - //I want to replace this - // by checking how far particles move to decide whether the pair list should be updated - //if(NeedUpdatePairList()) //ToDo, Han-Yi Chou - //{ - //internal-> decompose(); - // RBC.updateParticleLists( internal->getPos_d() ); - //} - if (s % decompPeriod == 0) - { - // cudaSetDevice(0); - internal -> decompose(); - RBC.updateParticleLists( internal->getPos_d() ); - } - - //MLog: added Bond* bondList to the list of passed in variables. - /*energy = internal->computeTabulated(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d, angles_d, dihedrals_d, get_energy);*/ - // energy = internal -> computeTabulated(get_energy); - internal -> computeTabulated(get_energy); - break; - default: - // [ N^2 ] interactions, no cutoff | decompositions - internal->computeTabulatedFull(get_energy); - break; - } - } - else - { - // Not using tabulated potentials. - switch (fullLongRange) - { - case 0: // Use cutoff | cell decomposition. - if (s % decompPeriod == 0) - { - // cudaSetDevice(0); - internal->decompose(); - RBC.updateParticleLists( internal->getPos_d() ); - } - internal->compute(get_energy); - break; - - case 1: // Do not use cutoff - internal->computeFull(get_energy); - break; - - case 2: // Compute only softcore forces. - internal->computeSoftcoreFull(get_energy); - break; - - case 3: // Compute only electrostatic forces. - internal->computeElecFull(get_energy); - break; - } - } - }//if inter-particle force - - gpuErrchk(cudaDeviceSynchronize()); - RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, RigidBodyInterpolationType); - if(rigidbody_dynamic == String("Langevin")) - { - RBC.SetRandomTorques(); - RBC.AddLangevin(); - } - gpuErrchk(cudaDeviceSynchronize()); - }//if step == 1 - - - //Han-Yi Chou - //update the rigid body positions and orientation - //So far only brownian dynamics is used - //RBC.integrate(sys, s); - - if(particle_dynamic == String("Langevin")) - updateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas); - //For Brownian motion - else - updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), - part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas); - - - if(rigidbody_dynamic == String("Langevin")) - { - RBC.integrateDLM(sys,0); - RBC.integrateDLM(sys,1); - } - else - RBC.integrate(sys,s); - - - if (s % outputPeriod == 0) { - // Copy particle positions back to CPU - gpuErrchk(cudaDeviceSynchronize()); - gpuErrchk(cudaMemcpy(pos, internal -> getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); - } - - //compute again the new force with new positions. - //reset the internal force, I hope. Han-Yi Chou - //First clear the old force - if (imd_on && clientsock && s % outputPeriod == 0) - { - float* coords = new float[num*3]; // TODO: move allocation out of run loop - int* atomIds = new int[num]; // TODO: move allocation out of run loop - int length; - - bool paused = false; - while (vmdsock_selread(clientsock, 0) > 0 || paused) - { - switch (imd_recv_header(clientsock, &length)) - { - case IMD_DISCONNECT: - printf("[IMD] Disconnecting...\n"); - imd_disconnect(clientsock); - clientsock = NULL; - sleep(5); - break; - case IMD_KILL: - printf("[IMD] Killing...\n"); - imd_disconnect(clientsock); - clientsock = NULL; - steps = s; // Stop the simulation at this step - sleep(5); - break; - case IMD_PAUSE: - paused = !paused; - break; - case IMD_GO: - printf("[IMD] Caught IMD_GO\n"); - break; - case IMD_MDCOMM: - for (size_t i = 0; i < num; ++i) // clear old forces - imdForces[i] = Vector3(0.0f); - - if (imd_recv_mdcomm(clientsock, length, atomIds, coords)) - { - printf("[IMD] Error receiving forces\n"); - } - else - { - for (size_t j = 0; j < length; ++j) - { - int i = atomIds[j]; - imdForces[i] = Vector3(coords[j*3], coords[j*3+1], coords[j*3+2]) * conf.imdForceScale; - } - } - break; - default: - printf("[IMD] Something weird happened. Disconnecting..\n"); - break; - } - } - if (clientsock) - { - // float* coords = new float[num*3]; // TODO: move allocation out of run loop - for (size_t i = 0; i < num; i++) - { - const Vector3& p = pos[i]; - coords[3*i] = p.x; - coords[3*i+1] = p.y; - coords[3*i+2] = p.z; - } - imd_send_fcoords(clientsock, num, coords); - } - delete[] coords; - delete[] atomIds; - } - if (imd_on && clientsock) - internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD - //else - //{ - //int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1); - //MLog: along with calls to internal (ComputeForce class) this function should execute once per GPU. - //clearInternalForces<<< numBlocks, NUM_THREADS >>>(internal->getForceInternal_d(), num*numReplicas); - //use cudaMemset instead - //gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3))); - //RBC.clearForceAndTorque(); - //} - //compute the new force for particles - RBC.clearForceAndTorque(); - gpuErrchk(cudaMemset((void*)(internal->getForceInternal_d()),0,num*numReplicas*sizeof(Vector3))); - //RBC.clearForceAndTorque(); - - if (interparticleForce) - { - // 'tabulatedPotential' - determines whether interaction is described with tabulated potentials or formulas - if (tabulatedPotential) - { - switch (fullLongRange) - { - case 0: // [ N*log(N) ] interactions, + cutoff | decomposition - if (s % decompPeriod == 0) - { - internal -> decompose(); - RBC.updateParticleLists( internal->getPos_d() ); - } - internal -> computeTabulated(get_energy); - break; - default: // [ N^2 ] interactions, no cutoff | decompositions - internal->computeTabulatedFull(get_energy); - break; - } - } - else - { - // Not using tabulated potentials. - switch (fullLongRange) - { - case 0: // Use cutoff | cell decomposition. - if (s % decompPeriod == 0) - { - internal->decompose(); - RBC.updateParticleLists( internal->getPos_d() ); - } - internal->compute(get_energy); - break; - case 1: // Do not use cutoff - internal->computeFull(get_energy); - break; - - case 2: // Compute only softcore forces. - internal->computeSoftcoreFull(get_energy); - break; - - case 3: // Compute only electrostatic forces. - internal->computeElecFull(get_energy); - break; - } - } - } - - //compute the force for rigid bodies - RBC.updateForces(internal->getPos_d(), internal->getForceInternal_d(), s, RigidBodyInterpolationType); - //Han-Yi Chou - //For BAOAB, the last update is only to update the momentum - // gpuErrchk(cudaDeviceSynchronize()); - - if(particle_dynamic == String("Langevin")) - LastUpdateKernelBAOAB<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getMom_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas); - - //gpuErrchk(cudaDeviceSynchronize()); - - if(rigidbody_dynamic == String("Langevin")) - { - RBC.SetRandomTorques(); - RBC.AddLangevin(); - RBC.integrateDLM(sys,2); - RBC.print(s); - } - - if (s % outputPeriod == 0) - { - if(particle_dynamic == String("Langevin")) - { - // TODO: make async - gpuErrchk(cudaMemcpy(momentum, internal -> getMom_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); - /* - gpuErrchk(cudaMemcpy(force, force_d, sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); - Vector3 f(0.f), p(0.f), r0(0.f); - double total_mass = 0.; - for(int i = 0; i < num * numReplicas; ++i) - { - f = f + force[i]; - p = p + momentum[i]; - total_mass += part[type[i]].mass; - r0 = r0 + part[type[i]].mass * pos[i]; - } - printf("The COM is %f %f %f\n",r0.x / total_mass, r0.y / total_mass, r0.z / total_mass); - printf("The total momentum is %f %f %f\n",p.x, p.y, p.z); - printf("The total force %f %f %f\n",f.x, f.y, f.z); - */ - // Output trajectories (to files) - } - //RBC.print(s); - t = s*timestep; - - // Loop over all replicas - for (int repID = 0; repID < numReplicas; ++repID) - { - - if (numberFluct == 1) - updateNameList(); // no need for it here if particles stay the same - - // Write the trajectory. - writers[repID]->append(pos + (repID*num), name, serial, t, num); - - if(particle_dynamic == String("Langevin")) - { - momentum_writers[repID]->append(momentum + (repID * num), name, serial, t, num); - //force_writers[repID]->append(force + (repID * num), name, serial, t, num); - } - - } - - // TODO: Currently, not compatible with replicas. Needs a fix. - if (numberFluct == 1) - updateReservoirs(); - - remember(t); - } - // Output energy. - if (get_energy) - { - wkf_timer_stop(timerS); - t = s * timestep; - // Simulation progress and statistics. - float percent = (100.0f * s) / steps; - float msPerStep = wkf_timer_time(timerS) * 1000.0f / outputEnergyPeriod; - float nsPerDay = numReplicas * timestep / msPerStep * 864E5f; - - // Nice thousand separator - setlocale(LC_NUMERIC, ""); - - // Do the output - printf("\rStep %ld [%.2f%% complete | %.3f ms/step | %.3f ns/day]",s, percent, msPerStep, nsPerDay); - - // Copy positions from GPU to CPU. - gpuErrchk(cudaMemcpy(pos, internal->getPos_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost)); - if(particle_dynamic == String("Langevin")) - { - //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost)); - gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost)); - float e = KineticEnergy(); - printf(" The kinetic energy is %f \n",e*(2.388458509e-1)); - } - if(rigidbody_dynamic == String("Langevin")) - { - //gpuErrchk(cudaMemcpy(momentum, internal->getMom_d(), sizeof(Vector3)*num*numReplicas,cudaMemcpyDeviceToHost)); - float e = RotKineticEnergy(); - printf(" The Rotational kinetic energy is %f \n",e*(2.388458509e-1)); - } - - // Write restart files for each replica. - for (int repID = 0; repID < numReplicas; ++repID) - writeRestart(repID); - - wkf_timer_start(timerS); - } // s % outputEnergyPeriod - } // done with all Brownian dynamics steps - - // If IMD is on & our socket is still open. - if (imd_on and clientsock) - { - if (vmdsock_selread(clientsock, 0) == 1) - { - int length; - switch (imd_recv_header(clientsock, &length)) - { - case IMD_DISCONNECT: - printf("\n[IMD] Disconnecting...\n"); - imd_disconnect(clientsock); - clientsock = NULL; - sleep(5); - break; - case IMD_KILL: - printf("\n[IMD] Killing...\n"); - imd_disconnect(clientsock); - clientsock = NULL; - sleep(5); - break; - default: - printf("\n[IMD] Something weird happened. Disconnecting..\n"); - break; - } - } - } - // Stop the main timer. - wkf_timer_stop(timer0); - - // Compute performance data. - const float elapsed = wkf_timer_time(timer0); // seconds - int tot_hrs = (int) std::fmod(elapsed / 3600.0f, 60.0f); - int tot_min = (int) std::fmod(elapsed / 60.0f, 60.0f); - float tot_sec = std::fmod(elapsed, 60.0f); - - printf("\nFinal Step: %d\n", (int) steps); - - printf("Total Run Time: "); - if (tot_hrs > 0) printf("%dh%dm%.1fs\n", tot_hrs, tot_min, tot_sec); - else if (tot_min > 0) printf("%dm%.1fs\n", tot_min, tot_sec); - else printf("%.2fs\n", tot_sec); - - gpuErrchk(cudaFree(force_d)); -#endif -} // GrandBrownTown::run() // -------------------------------------------- // Populate lists of types and serial numbers. // void GrandBrownTown::populate() { for (int repID = 0; repID < numReplicas; repID++) { - const int offset = repID * num; + const int offset = repID * (num+num_rb_attached_particles); int pn = 0; int p = 0; for (int i = 0; i < num; i++) { @@ -1636,7 +1203,7 @@ void GrandBrownTown::populate() { void GrandBrownTown::writeRestart(int repID) const { FILE* out = fopen(restartFiles[repID].c_str(), "w"); - const int offset = repID * num; + const int offset = repID * (num+num_rb_attached_particles); for (int i = 0; i < num; ++i) { @@ -1664,14 +1231,14 @@ void GrandBrownTown::writeRestart(int repID) const /*the center is defined by the first pmf*/ void GrandBrownTown::initialCondCen() { for (int i = 0; i < num; i++) - pos[i] = part[ type[i] ].pmf->getCenter(); + pos[i] = part[ type[i] ].pmf[0]->getCenter(); } // Set random initial positions for all particles and replicas void GrandBrownTown::initialCond() { for (int repID = 0; repID < numReplicas; repID++) { - const int offset = repID * num; + const int offset = repID * (num+num_rb_attached_particles); for (int i = 0; i < num; i++) { pos[i + offset] = findPos(type[i + offset]); } @@ -1688,7 +1255,7 @@ Vector3 GrandBrownTown::findPos(int typ) { const float ry = sysDim.y * randoGen->uniform(); const float rz = sysDim.z * randoGen->uniform(); r = sys->wrap( Vector3(rx, ry, rz) ); - } while (pt.pmf->interpolatePotential(r) > *pt.meanPmf); + } while (pt.pmf[0]->interpolatePotential(r) > *pt.meanPmf); return r; } @@ -1701,7 +1268,7 @@ Vector3 GrandBrownTown::findPos(int typ, float minZ) { const float ry = sysDim.y * randoGen->uniform(); const float rz = sysDim.z * randoGen->uniform(); r = sys->wrap( Vector3(rx, ry, rz) ); - } while (pt.pmf->interpolatePotential(r) > *pt.meanPmf and fabs(r.z) > minZ); + } while (pt.pmf[0]->interpolatePotential(r) > *pt.meanPmf and fabs(r.z) > minZ); return r; } @@ -1715,7 +1282,7 @@ float GrandBrownTown::KineticEnergy() gpuErrchk(cudaMalloc((void**)&energy, sizeof(float))); gpuErrchk(cudaMemset((void*)energy,0,sizeof(float))); - BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, numReplicas*num); + BrownParticlesKineticEnergy<64><<<dim3(512),dim3(64)>>>(internal->getMom_d(), internal -> getType_d(), part_d, vec_red, num, num_rb_attached_particles, numReplicas); gpuErrchk(cudaDeviceSynchronize()); Reduction<64><<<dim3(1),dim3(64)>>>(vec_red, energy, 512); @@ -1763,7 +1330,7 @@ void GrandBrownTown::init_cuda_group_sites() } // Create GPU-friendly data structure - // TODO make this work for replicas + assert(numReplicas == 1); // TODO make this work for replicas int* tmp = new int[numGroupSites+1+num_particles]; num_particles = 0; int i = 0; diff --git a/src/GrandBrownTown.cuh b/src/GrandBrownTown.cuh index 4a2211c6c407f16871f93ab7aa9e33f5defa9a2b..44ffe6f82c1c3dde464d8039ac6fef26dd97f84f 100644 --- a/src/GrandBrownTown.cuh +++ b/src/GrandBrownTown.cuh @@ -14,86 +14,50 @@ __device__ Vector3 step(Vector3& r0, float kTlocal, Vector3 force, float diffusion, Vector3 diffGrad, float timestep, BaseGrid *sys, Random *randoGen, int num); -//update both position and momentum for Langevin dynamics -//Han-Yi Chou -#if 0 -__global__ void updateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal, - int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField, - int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas) +inline __device__ +ForceEnergy compute_position_dependent_force( + const Vector3* __restrict__ pos, Vector3* __restrict__ forceInternal, + const int* __restrict__ type, BrownianParticleType** part, + const float electricField, const int scheme, const int idx) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; + int t = type[idx]; + Vector3 r0 = pos[idx]; + const BrownianParticleType& pt = *part[t]; + Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField); - if (idx < num * numReplicas) + ForceEnergy fe(0.f, 0.f); + for(int i = 0; i < pt.numPartGridFiles; ++i) { - int t = type[idx]; - //Vector3 r0(tex1Dfetch(PosTex, idx)); - //Vector3 p0(tex1Dfetch(MomTex, idx)); - Vector3 r0 = pos[idx]; - Vector3 p0 = momentum[idx]; - - const BrownianParticleType& pt = *part[t]; - float mass = pt.mass; - r0 = r0 + (timestep * 5e3 * p0) / mass; - r0 = sys->wrap(r0); - pos[idx] = r0; - -#if 0 - Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField); - - // Compute PMF - // TODO: maybe make periodic and nonPeriodic versions of this kernel - ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0); + ForceEnergy tmp(0.f, 0.f); + if(!scheme) { + BoundaryCondition bc = pt.pmf_boundary_conditions[i]; + INTERPOLATE_FORCE(tmp, pt.pmf[i]->interpolateForceDLinearly, bc, r0) + } else + tmp = pt.pmf[i]->interpolateForceD(r0); + fe.f += tmp.f * pt.pmf_scale[i]; + fe.e += tmp.e * pt.pmf_scale[i]; + } + // if(get_energy) + // energy[idx] += fe.e; #ifndef FORCEGRIDOFF - // Add a force defined via 3D FORCE maps (not 3D potential maps) - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0); -#endif - Vector3 force = forceInternal[idx] + forceExternal + fe.f; - - // Get local kT value - float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */ - - // Update the particle's position using the calculated values for time, force, etc. - float mass = pt.mass; - Vector3 gamma = pt.transDamping; - - if (pt.diffusionGrid != NULL) - { - - Vector3 gridCenter = pt.diffusionGrid->origin + - pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny, - 0.5*pt.diffusionGrid->nz)); - Vector3 p2 = r0 - gridCenter; - p2 = sys->wrapDiff( p2 ) + gridCenter; - ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2); - gamma = Vector3(kTlocal / (mass * diff.e)); - //diffGrad = diff.f; - } - - float tmp = sqrtf(0.50f * kTlocal * mass * timestep); - Vector3 rando = randoGen->gaussian_vector(idx, num * numReplicas); - - #ifdef MDSTEP - force = Vector3(-r0.x, -r0.y, -r0.z); - #endif - - p0.x = (1.0f - 0.50f * timestep * gamma.x) * p0.x + (0.50f * timestep * force.x * Unit1 + tmp * sqrtf(gamma.x) * rando.x * Unit2); - p0.y = (1.0f - 0.50f * timestep * gamma.y) * p0.y + (0.50f * timestep * force.y * Unit1 + tmp * sqrtf(gamma.y) * rando.y * Unit2); - p0.z = (1.0f - 0.50f * timestep * gamma.z) * p0.z + (0.50f * timestep * force.z * Unit1 + tmp * sqrtf(gamma.z) * rando.z * Unit2); - - r0 = r0 + (timestep * 1e4 * p0) / mass; - r0 = sys->wrap(r0); - - //step(r0, p0, kTlocal, force, tmp, -diffGrad, mass, gamma, timestep, sys, randoGen, num * numReplicas); - pos[idx] = r0; - momentum[idx] = p0; -#endif - + // Add a force defined via 3D FORCE maps (not 3D potential maps) + if(!scheme) + { + if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0); + if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0); + if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0); + } + else + { + if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0); + if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0); + if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0); } -} #endif + fe.f = fe.f + forceExternal; + return fe; +} ////The kernel is for Nose-Hoover Langevin dynamics @@ -101,48 +65,24 @@ __global__ void updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__ momentum, float* random, Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField, int tGridLength, float timestep, - int num, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme) + int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < num * numReplicas) { - int t = type[idx]; + idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles); + + ForceEnergy fe = compute_position_dependent_force( + pos, forceInternal, type, part, electricField, scheme, idx ); + int t = type[idx]; Vector3 r0 = pos[idx]; Vector3 p0 = momentum[idx]; float ran = random[idx]; const BrownianParticleType& pt = *part[t]; - Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField); - ForceEnergy fe(0.f, 0.f); - for(int i = 0; i < pt.numPartGridFiles; ++i) - { - ForceEnergy tmp(0.f,0.f); - if(!scheme) { - const BoundaryCondition& bc = pt.pmf_boundary_conditions[i]; - INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, r0) - } else - tmp = pt.pmf[i].interpolateForceD(r0); - fe.f += tmp.f; - } - //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0); -#ifndef FORCEGRIDOFF - // Add a force defined via 3D FORCE maps (not 3D potential maps) - if(!scheme) - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0); - } - else - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0); - } -#endif - Vector3 force = forceInternal[idx] + forceExternal + fe.f; + Vector3 force = forceInternal[idx] + fe.f; #ifdef Debug forceInternal[idx] = -force; #endif @@ -214,50 +154,27 @@ updateKernelNoseHooverLangevin(Vector3* __restrict__ pos, Vector3* __restrict__ //Han-Yi Chou __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, - float electricField,int tGridLength, float timestep, int num, BaseGrid* sys, + float electricField,int tGridLength, float timestep, + int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas, int scheme) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num * numReplicas) { - int t = type[idx]; + idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles); + ForceEnergy fe = compute_position_dependent_force( + pos, forceInternal, type, part, electricField, scheme, idx ); + // if (get_energy) energy[idx] += fe.e; + + int t = type[idx]; Vector3 r0 = pos[idx]; Vector3 p0 = momentum[idx]; - const BrownianParticleType& pt = *part[t]; - Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField); - //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0); - ForceEnergy fe(0.f, 0.f); - for(int i = 0; i < pt.numPartGridFiles; ++i) - { - ForceEnergy tmp(0.f, 0.f); - if(!scheme) { - BoundaryCondition bc = pt.pmf_boundary_conditions[i]; - INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, r0) - } else - tmp = pt.pmf[i].interpolateForceD(r0); - fe.f += tmp.f; - } - -#ifndef FORCEGRIDOFF - // Add a force defined via 3D FORCE maps (not 3D potential maps) - if(!scheme) - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0); - } - else - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0); - } -#endif - Vector3 force = forceInternal[idx] + forceExternal + fe.f; + Vector3 force = forceInternal[idx] + fe.f; #ifdef Debug forceInternal[idx] = -force; #endif @@ -315,52 +232,23 @@ __global__ void updateKernelBAOAB(Vector3* pos, Vector3* momentum, Vector3* __re //update momentum in the last step of BAOAB integrator for the Langevin dynamics. Han-Yi Chou __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, - float electricField, int tGridLength, float timestep, int num, + float electricField, int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas, float* __restrict__ energy, bool get_energy,int scheme) { - const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x); + int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x); if (idx < num * numReplicas) { - int t = type[idx]; + idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles); + + ForceEnergy fe = compute_position_dependent_force( + pos, forceInternal, type, part, electricField, scheme, idx ); + if (get_energy) energy[idx] += fe.e; + Vector3 r0 = pos[idx]; Vector3 p0 = momentum[idx]; - const BrownianParticleType& pt = *part[t]; - Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField); - - //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0); - ForceEnergy fe(0.f, 0.f); - for(int i = 0; i < pt.numPartGridFiles; ++i) - { - ForceEnergy tmp(0.f, 0.f); - if(!scheme) { - BoundaryCondition bc = pt.pmf_boundary_conditions[i]; - INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, r0) - } else - tmp = pt.pmf[i].interpolateForceD(r0); - - fe += tmp; - //fe.e += tmp.e; - } - if(get_energy) - energy[idx] += fe.e; -#ifndef FORCEGRIDOFF - // Add a force defined via 3D FORCE maps (not 3D potential maps) - if(!scheme) - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0); - } - else - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(r0); - } -#endif - Vector3 force = forceInternal[idx] + forceExternal + fe.f; + Vector3 force = forceInternal[idx] + fe.f; #ifdef Debug forceInternal[idx] = -force; #endif @@ -373,82 +261,22 @@ __global__ void LastUpdateKernelBAOAB(Vector3* pos,Vector3* momentum, Vector3* _ momentum[idx] = p0; } } -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -//The last step of BBK integrator to update the momentum for Langevin dynamics. -#if 0 -__global__ void LastUpdateKernelABOBA(Vector3* pos, Vector3* momentum, Vector3* __restrict__ forceInternal, - int type[], BrownianParticleType* part[], float kT, BaseGrid* kTGrid, float electricField, - int tGridLength, float timestep, int num, BaseGrid* sys, Random* randoGen, int numReplicas) -{ - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (idx < num * numReplicas) - { - int t = type[idx]; - Vector3 r0 = pos[idx]; - Vector3 p0 = momentum[idx]; - - const BrownianParticleType& pt = *part[t]; - Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField); - - ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(r0); - -#ifndef FORCEGRIDOFF - // Add a force defined via 3D FORCE maps (not 3D potential maps) - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(r0); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(r0); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(r0); -#endif - Vector3 force = forceInternal[idx] + forceExternal + fe.f; - - float mass = pt.mass; - Vector3 gamma = pt.transDamping; - - float kTlocal = (tGridLength == 0) ? kT : kTGrid->interpolatePotentialLinearly(r0); /* periodic */ - - float tmp = sqrtf(0.50f * kTlocal * mass * timestep); - - if (pt.diffusionGrid != NULL) - { - - Vector3 gridCenter = pt.diffusionGrid->origin + - pt.diffusionGrid->basis.transform( Vector3(0.5*pt.diffusionGrid->nx, 0.5*pt.diffusionGrid->ny, - 0.5*pt.diffusionGrid->nz)); - Vector3 p2 = r0 - gridCenter; - p2 = sys->wrapDiff( p2 ) + gridCenter; - ForceEnergy diff = pt.diffusionGrid->interpolateForceDLinearlyPeriodic(p2); - gamma = Vector3(kTlocal / (mass * diff.e)); - } - - Vector3 rando = randoGen->gaussian_vector(idx, num * numReplicas); - #ifdef MDSTEP - force = Vector3(-r0.x, -r0.y, -r0.z); - #endif - - //step(p0, kTlocal, force, tmp, -diffGrad, mass, gamma, timestep, sys, randoGen, num * numReplicas); - p0.x = (p0.x + 0.50 * timestep * force.x * Unit1 + tmp * sqrtf(gamma.x) * rando.x * Unit2) / (1.0f+0.5f*timestep*gamma.x); - p0.y = (p0.y + 0.50 * timestep * force.y * Unit1 + tmp * sqrtf(gamma.y) * rando.y * Unit2) / (1.0f+0.5f*timestep*gamma.y); - p0.z = (p0.z + 0.50 * timestep * force.z * Unit1 + tmp * sqrtf(gamma.z) * rando.z * Unit2) / (1.0f+0.5f*timestep*gamma.z); - - momentum[idx] = p0; - } -} -#endif //Update kernel for Brownian dynamics __global__ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], BrownianParticleType* part[],float kT, BaseGrid* kTGrid, float electricField, - int tGridLength, float timestep, int num, BaseGrid* sys, + int tGridLength, float timestep, int num, int num_rb_attached_particles, BaseGrid* sys, Random* randoGen, int numReplicas, float* energy, bool get_energy, int scheme) { // Calculate this thread's ID - const int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x); + int idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x); // TODO: Make this a grid-stride loop to make efficient reuse of RNG states // Loop over ALL particles in ALL replicas if (idx < num * numReplicas) { + idx = (idx % num) + (idx/num) * (num+num_rb_attached_particles); const int t = type[idx]; Vector3 p = pos[idx]; @@ -456,44 +284,14 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], /* printf("atom %d: forceInternal: %f %f %f\n", idx, forceInternal[idx].x, forceInternal[idx].y, forceInternal[idx].z); */ - // Compute external force - Vector3 forceExternal = Vector3(0.0f, 0.0f, pt.charge * electricField); - - // Compute PMF - // TODO: maybe make periodic and nonPeriodic versions of this kernel - //ForceEnergy fe = pt.pmf->interpolateForceDLinearlyPeriodic(p); - ForceEnergy fe(0.f, 0.f); - for(int i = 0; i < pt.numPartGridFiles; ++i) - { - ForceEnergy tmp(0.f,0.f); - if(!scheme) { - BoundaryCondition bc = pt.pmf_boundary_conditions[i]; - INTERPOLATE_FORCE(tmp, pt.pmf[i].interpolateForceDLinearly, bc, p) - } else - tmp = pt.pmf[i].interpolateForceD(p); - fe += tmp; - } -#ifndef FORCEGRIDOFF - // Add a force defined via 3D FORCE maps (not 3D potential maps) - if(!scheme) - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotentialLinearly(p); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotentialLinearly(p); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotentialLinearly(p); - } - else - { - if (pt.forceXGrid != NULL) fe.f.x += pt.forceXGrid->interpolatePotential(p); - if (pt.forceYGrid != NULL) fe.f.y += pt.forceYGrid->interpolatePotential(p); - if (pt.forceZGrid != NULL) fe.f.z += pt.forceZGrid->interpolatePotential(p); - } -#endif + ForceEnergy fe = compute_position_dependent_force( + pos, forceInternal, type, part, electricField, scheme, idx ); // Compute total force: // Internal: interaction between particles // External: electric field (now this is basically a constant vector) // forceGrid: ADD force due to PMF or other potentials defined in 3D space - Vector3 force = forceInternal[idx] + forceExternal + fe.f; + Vector3 force = forceInternal[idx] + fe.f; #ifdef Debug forceInternal[idx] = -force; #endif @@ -550,10 +348,12 @@ void updateKernel(Vector3* pos, Vector3* __restrict__ forceInternal, int type[], float en_local = 0.f; for(int i = 0; i < pt.numPartGridFiles; ++i) { + float en_tmp = 0.0f; if(!scheme) - en_local += pt.pmf[i].interpolatePotentialLinearly(tmp); + en_tmp = pt.pmf[i]->interpolatePotentialLinearly(tmp); else - en_local += pt.pmf[i].interpolatePotential(tmp); + en_tmp = pt.pmf[i]->interpolatePotential(tmp); + en_tmp *= pt.pmf_scale[i]; } energy[idx] += en_local; } @@ -703,3 +503,37 @@ __global__ void devicePrint(RigidBodyType* rb[]) { // cudaMalloc( &totalForce_h, sizeof(Vector3) ); // cudaMemcpyToSymbol(totalForce, &totalForce_h, sizeof(totalForce_h)); // } + +__global__ void compute_position_dependent_force_for_rb_attached_particles( + const Vector3* __restrict__ pos, + Vector3* __restrict__ forceInternal, float* __restrict__ energy, + const int* __restrict__ type, BrownianParticleType** __restrict__ part, + const float electricField, const int num, const int num_rb_attached_particles, + const int numReplicas, const int scheme) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_rb_attached_particles * numReplicas) + { + idx = num + (idx % num_rb_attached_particles) + (idx/num_rb_attached_particles) * (num+num_rb_attached_particles); + ForceEnergy fe = compute_position_dependent_force( + pos, forceInternal, type, part, electricField, scheme, idx ); + atomicAdd( &forceInternal[idx], fe.f ); + atomicAdd( &energy[idx], fe.e ); + } +} +__global__ void compute_position_dependent_force_for_rb_attached_particles( + const Vector3* __restrict__ pos, Vector3* __restrict__ forceInternal, + const int* __restrict__ type, BrownianParticleType** __restrict__ part, + const float electricField, const int num, const int num_rb_attached_particles, + const int numReplicas, const int scheme) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < num_rb_attached_particles * numReplicas) + { + idx = num + (idx % num_rb_attached_particles) + (idx/num_rb_attached_particles) * (num+num_rb_attached_particles); + ForceEnergy fe = compute_position_dependent_force( + pos, forceInternal, type, part, electricField, scheme, idx ); + atomicAdd( &forceInternal[idx], fe.f ); + } +} diff --git a/src/GrandBrownTown.h b/src/GrandBrownTown.h index c6479a081fb2b4461493f49269c1e67b29eb7323..56d4c5a3b28c7b765542ba87eb5efa2445768c82 100644 --- a/src/GrandBrownTown.h +++ b/src/GrandBrownTown.h @@ -62,7 +62,6 @@ public: ~GrandBrownTown(); void run(); - void RunNoseHooverLangevin(); static bool DEBUG; private: @@ -230,6 +229,8 @@ private: int numAngles; int numDihedrals; + int num_rb_attached_particles; + int numGroupSites; int* groupSiteData_d; diff --git a/src/Makefile b/src/Makefile index b0356b86521cc4ca5448f5fdb8b0b35ea106b2dc..68598f462bf925e748a759b91ab9bd5ab09d6afc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,7 @@ ### Paths, libraries, includes, options include ./findcudalib.mk +SUFFIX ?= "" ifeq ($(dbg),1) NV_FLAGS += -g -G -O0 @@ -99,13 +100,18 @@ ifeq ($(omp),1) TARGET := $(TARGET)_omp endif -all: $(TARGET) - @echo "Done ->" $(TARGET) +all: $(TARGET)$(SUFFIX) + @echo "Done ->" $(TARGET)$(SUFFIX) -$(TARGET): $(CU_OBJ) $(CC_OBJ) arbd.cpp vmdsock.c imd.c imd.h - $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o +$(TARGET)_link.o: $(CU_OBJ) +# $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o - $(CC) $(CC_FLAGS) $(EX_FLAGS) arbd.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS) -o $(TARGET) + +$(TARGET)$(SUFFIX): $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) arbd.cpp vmdsock.c imd.c imd.h +# $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) $(CC_OBJ) -o $(TARGET)_link.o +# $(NVCC) $(NVLD_FLAGS) $(CU_OBJ) -o $(TARGET)_link.o + $(CC) $(CC_FLAGS) $(EX_FLAGS) arbd.cpp vmdsock.c imd.c $(TARGET)_link.o $(CU_OBJ) $(CC_OBJ) $(LD_FLAGS) -o $(TARGET)$(SUFFIX) + .SECONDEXPANSION: $(CU_OBJ): %.o: %.cu $$(wildcard %.h) $$(wildcard %.cuh) diff --git a/src/RigidBody.cu b/src/RigidBody.cu index 94e726112c97e8fc6eddde9d2585f8aa01157df5..8aeb7aa65711c72fa02149e73cf969a611f4078f 100644 --- a/src/RigidBody.cu +++ b/src/RigidBody.cu @@ -18,10 +18,10 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t } #endif -RigidBody::RigidBody(String name, const Configuration& cref, const RigidBodyType& tref, RigidBodyController* RBCref) - : name(name), c(&cref), t(&tref), RBC(RBCref), impulse_to_momentum(4.1867999435271e4) /*impulse_to_momentum(4.184e8f)*/ { init(); } +RigidBody::RigidBody(String name, const Configuration& cref, const RigidBodyType& tref, RigidBodyController* RBCref, int attached_particle_start, int attached_particle_end) + : name(name), c(&cref), t(&tref), RBC(RBCref), attached_particle_start(attached_particle_start), attached_particle_end(attached_particle_end), impulse_to_momentum(4.1867999435271e4) /*impulse_to_momentum(4.184e8f)*/ { init(); } RigidBody::RigidBody(const RigidBody& rb) - : name(rb.name), c(rb.c), t(rb.t), RBC(rb.RBC), impulse_to_momentum(4.1867999435271e4)/*impulse_to_momentum(4.184e8f)*/ { init(); } + : name(rb.name), c(rb.c), t(rb.t), RBC(rb.RBC), attached_particle_start(rb.attached_particle_start), attached_particle_end(rb.attached_particle_end), impulse_to_momentum(4.1867999435271e4)/*impulse_to_momentum(4.184e8f)*/ { init(); } void RigidBody::init() { // units "(kcal_mol/AA) * ns" "dalton AA/ns" * 4.184e+08 timestep = c->timestep; @@ -69,8 +69,8 @@ void RigidBody::Boltzmann(unsigned long int seed) angularMomentum.x *= sigma[1]; angularMomentum.y *= sigma[2]; angularMomentum.z *= sigma[3]; - printf("%f\n", Temp); - printf("%f\n", Temperature()); + // printf("%f\n", Temp); + // printf("%f\n", Temperature()); } RigidBody::~RigidBody() { @@ -103,6 +103,23 @@ int RigidBody::appendNumParticleBlocks( std::vector<int>* blocks ) { return ret; } +__global__ +void update_particle_positions_kernel(Vector3* __restrict__ pos, const int start, const int num, + const Vector3* __restrict__ pos_rb, + const Vector3 center, const Matrix3 orientation) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < num) { + const int aid = i+start; + pos[aid] = orientation*pos_rb[i] + center; + } +} +void RigidBody::update_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d) { + int num_attached = attached_particle_end - attached_particle_start; + int nb = floor(num_attached/NUMTHREADS) + 1; + update_particle_positions_kernel<<<nb,NUMTHREADS>>>(pos_d, attached_particle_start, num_attached, + t->attached_particle_positions_d, position, orientation); +} + void RigidBody::addForce(Force f) { force += f; } @@ -113,6 +130,141 @@ void RigidBody::addEnergy(float e) { energy += e; } + +// TODO move code snippet to more general location +template<class C, class T> +auto contains(const C& v, const T& x) +-> decltype(end(v), true) +{ + return end(v) != std::find(begin(v), end(v), x); +} +/* +void RigidBody::initialize_possible_particles() +{ + std::vector<int> atomic_ids; + + // Loop over potential grids + for (int i = 0; i < t->numPotGrids; ++i) { + atomic_ids.clear(); + + String& gridName = t->potentialGridKeys[i]; + + // Loop over particle types to count the number of particles + for (int j = 0; j < conf->numParts; ++j) { + if (contains(conf->partRigidBodyGrid[j], gridName)) { + // gridNames contained gridName, so add the particles to atomic_ids + for (int aid = 0; aid < conf->num + conf->num_rb_attached_particles; ++aid) { + if (conf->type[aid] == j && (aid < exclude_start || aid > exclude_end)) { + atomic_ids.push_back(aid); + } + } + } + } + + // Initialize device data + size_t sz = sizeof(int) * atomic_ids.size(); + gpuErrchk(cudaMalloc( &(possible_particles_d[i]), sz )); + gpuErrchk(cudaMemcpyAsync( possible_particles_d[i], &atomic_ids[0], sz, cudaMemcpyHostToDevice)) + + // // Add particles attached to OTHER RBs + // int rb_particle_offset = conf->num; + // for (const auto& rbs: RBC->rigidBodyByType) + // { + // int rb_type + // const RigidBodyType& rb_type = rbs[0].t; + // if (rbs.t != + // for (int& ptype: rb_type->attached_particle_types) { + // // This could be made much more efficient + // if (contains(conf->partRigidBodyGrid[ptype], gridName)) { + // // Add particles + // rb + // rbs[0].sum + // } + // rb_particle_offset += rb_type.attached_particle_types + + // for (int k = 0; k < t->gridNames.size(); ++k) { + // if (t->gridNames[k] == gridName) { + // t->numParticles[i] += conf->numPartsOfType[j]; + // } + + + // // Loop over rigid body grid names associated with particle type + // const std::vector<String>& gridNames = conf->partRigidBodyGrid[j]; + // for (int k = 0; k < gridNames.size(); ++k) { + // if (gridNames[k] == gridName) { + // // Copy type j particles to particles[i] + // memcpy( &(particles[i][pid]), tmp, sizeof(int)*currId ); + // assert(currId == conf->numPartsOfType[j]); + // pid += conf->numPartsOfType[j]; + // } + // } + // } + + // numParticles[i] = 0; + + // Count the particles interacting with potential grid i + // Loop over particle types + for (int j = 0; j < conf->numParts; ++j) { + + // Loop over rigid body grid names associated with particle type + const std::vector<String>& gridNames = conf->partRigidBodyGrid[j]; + for (int k = 0; k < t->gridNames.size(); ++k) { + if (t->gridNames[k] == gridName) { + numParticles[i] += conf->numPartsOfType[j]; + } + } + } + + // Add RB particles + for (const auto& rbv: RBC->rigidBodyByType) + { + int ptype = rbv[0].t->attached_particle_types; + const std::vector<String>& gridNames = conf->partRigidBodyGrid[ptype]; + for (int k = 0; k < t->gridNames.size(); ++k) { + if (t->gridNames[k] == gridName) { + t->numParticles[i] += conf->numPartsOfType[j]; + } + + attached_particle_ + + if (numParticles[i] > 0) { + + // allocate array of particle ids for the potential grid + particles[i] = new int[numParticles[i]]; + int pid = 0; + + // Loop over particle types to count the number of particles + for (int j = 0; j < conf->numParts; ++j) { + + // Build temporary id array of type j particles + int tmp[conf->numPartsOfType[j]]; + int currId = 0; + for (int aid = 0; aid < conf->num; ++aid) { + if (conf->type[aid] == j) + tmp[currId++] = aid; + } + if (currId == 0) continue; + + // Loop over rigid body grid names associated with particle type + const std::vector<String>& gridNames = conf->partRigidBodyGrid[j]; + for (int k = 0; k < gridNames.size(); ++k) { + if (gridNames[k] == gridName) { + // Copy type j particles to particles[i] + memcpy( &(particles[i][pid]), tmp, sizeof(int)*currId ); + assert(currId == conf->numPartsOfType[j]); + pid += conf->numPartsOfType[j]; + } + } + } + + // Initialize device data + size_t sz = sizeof(int) * numParticles[i]; + gpuErrchk(cudaMalloc( &(particles_d[i]), sz )); + gpuErrchk(cudaMemcpyAsync( particles_d[i], particles[i], sz, cudaMemcpyHostToDevice)); + } + } +} +*/ void RigidBody::updateParticleList(Vector3* pos_d, BaseGrid* sys_d) { for (int i = 0; i < t->numPotGrids; ++i) { numParticles[i] = 0; @@ -131,10 +283,12 @@ void RigidBody::updateParticleList(Vector3* pos_d, BaseGrid* sys_d) { int nb = floor(tnp/NUMTHREADS) + 1; #if __CUDA_ARCH__ >= 300 createPartlist<<<nb,NUMTHREADS>>>(pos_d, tnp, t->particles_d[i], + attached_particle_start, attached_particle_end, tmp_d, particles_d[i], gridCenter + position, cutoff*cutoff, sys_d); #else createPartlist<<<nb,NUMTHREADS,NUMTHREADS/WARPSIZE>>>(pos_d, tnp, t->particles_d[i], + attached_particle_start, attached_particle_end, tmp_d, particles_d[i], gridCenter + position, cutoff*cutoff, sys_d); #endif @@ -179,6 +333,20 @@ void RigidBody::callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, in } } +void RigidBody::apply_attached_particle_forces(const Vector3* force) { + const auto &rb_pos = t->attached_particle_positions; + int num = rb_pos.size(); + Vector3 total_force = Vector3(0.0f); + Vector3 torque = Vector3(0.0f); + for (int i = 0; i < num; ++i) { + const int j = i + attached_particle_start; + torque = torque + (orientation*rb_pos[i]).cross(force[j]); + total_force = total_force + force[j]; + } + addForce(total_force); + addTorque(torque); +} + void RigidBody::applyGridParticleForces(BaseGrid* sys, ForceEnergy* forcestorques, const std::vector<int>& forcestorques_offset, int& fto_idx) { // loop over potential grids for (int i = 0; i < t->numPotGrids; ++i) { diff --git a/src/RigidBody.h b/src/RigidBody.h index 5854a082a29f975d11281b9ad45ffd5bc8cc0fbf..e3e81737ae1d7aee45f16e915d44f6aa413e1749 100644 --- a/src/RigidBody.h +++ b/src/RigidBody.h @@ -31,7 +31,9 @@ class RigidBody { // host side representation of rigid bodies | splitting methods for rigid body molecular dynamics". J Chem Phys. (1997) | `––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––*/ public: - RigidBody(String name, const Configuration& c, const RigidBodyType& t, RigidBodyController* RBC); + RigidBody(String name, const Configuration& c, const RigidBodyType& t, RigidBodyController* RBC, + int attached_particle_start, int attached_particle_end); + RigidBody(const RigidBody& rb); // RigidBody(const RigidBody& rb) : RigidBody(rb.name, *rb.c, *rb.t) {}; void init(); @@ -40,6 +42,8 @@ class RigidBody { // host side representation of rigid bodies int appendNumParticleBlocks( std::vector<int>* blocks ); + void update_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d); + HOST DEVICE void addForce(Force f); HOST DEVICE void addTorque(Force t); HOST DEVICE void addEnergy(float e); @@ -76,8 +80,10 @@ class RigidBody { // host side representation of rigid bodies return Vector3( angularMomentum.x, angularMomentum.y, angularMomentum.z); } + void initializeParticleLists(); void updateParticleList(Vector3* pos_d, BaseGrid* sys_d); void callGridParticleForceKernel(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, ForceEnergy* forcestorques_d, const std::vector<int>& forcestorques_offset, int& fto_idx); + void apply_attached_particle_forces(const Vector3* force); void applyGridParticleForces(BaseGrid* sys, ForceEnergy* forcestorques, const std::vector<int>& forcestorques_offset, int& fto_idx); bool langevin; @@ -122,9 +128,12 @@ private: bool isFirstStep; int* numParticles; /* particles affected by potential grids */ + int** possible_particles_d; int** particles_d; const cudaStream_t** particleForceStreams; - + + int attached_particle_start, attached_particle_end; + /*–––––––––––––––––––––––––––––––––––––––––. | units "kcal_mol/AA * ns" "(AA/ns) * amu" | `–––––––––––––––––––––––––––––––––––––––––*/ diff --git a/src/RigidBodyController.cu b/src/RigidBodyController.cu index 5c89ba26f440bbc2659aea47b31671ecf7ce9cd0..13438b8c758384996f3c72309d2bc62071a1f7c7 100644 --- a/src/RigidBodyController.cu +++ b/src/RigidBodyController.cu @@ -45,15 +45,14 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */ construct_grids(); - for (int i = 0; i < conf.numRigidTypes; i++) - conf.rigidBody[i].initializeParticleLists(); int numRB = 0; // grow list of rbs - + int attached_particle_offset = 0; for (int i = 0; i < conf.numRigidTypes; i++) { numRB += conf.rigidBody[i].num; + int attached_particle_in_type = conf.rigidBody[i].num_attached_particles(); std::vector<RigidBody> tmp; // RBTODO: change conf.rigidBody to conf.rigidBodyType const int jmax = conf.rigidBody[i].num; @@ -64,13 +63,17 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre snprintf(stmp, 128, "#%d", j); name.add( stmp ); } - RigidBody r(name, conf, conf.rigidBody[i], this); + RigidBody r(name, conf, conf.rigidBody[i], this, + attached_particle_offset, attached_particle_offset+attached_particle_in_type); + attached_particle_offset += attached_particle_in_type; + int nb = r.appendNumParticleBlocks( &particleForceNumBlocks ); tmp.push_back( r ); } rigidBodyByType.push_back(tmp); } - + attached_particle_forces = new Vector3[attached_particle_offset]; + totalParticleForceNumBlocks = 0; for (int i=0; i < particleForceNumBlocks.size(); ++i) { particleForce_offset.push_back(2*totalParticleForceNumBlocks); @@ -80,30 +83,24 @@ RigidBodyController::RigidBodyController(const Configuration& c, const char* pre gpuErrchk(cudaMallocHost(&(particleForces), sizeof(ForceEnergy) * 2*totalParticleForceNumBlocks)) gpuErrchk(cudaMalloc(&(particleForces_d), sizeof(ForceEnergy) * 2*totalParticleForceNumBlocks)) - if (conf.inputRBCoordinates.length() > 0) - loadRBCoordinates(conf.inputRBCoordinates.val()); + if (conf.restartRBCoordinates.length() > 0) + load_restart_coordinates(conf.restartRBCoordinates.val()); + else if (conf.inputRBCoordinates.length() > 0) + loadRBCoordinates(conf.inputRBCoordinates.val()); random = new RandomCPU(conf.seed + repID + 1); /* +1 to avoid using same seed as RandomCUDA */ + initializeForcePairs(); // Must run after construct_grids() gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: this should be extraneous */ - - //Boltzmann distribution - for (int i = 0; i < rigidBodyByType.size(); i++) - { - for (int j = 0; j < rigidBodyByType[i].size(); j++) - { - RigidBody& rb = rigidBodyByType[i][j]; - // thermostat - rb.Boltzmann(seed); - } - } } RigidBodyController::~RigidBodyController() { for (int i = 0; i < rigidBodyByType.size(); i++) rigidBodyByType[i].clear(); rigidBodyByType.clear(); + + delete [] attached_particle_forces; delete random; } @@ -337,6 +334,9 @@ bool RigidBodyController::loadRBCoordinates(const char* fileName) { rb.momentum = Vector3((float)strtod(tokenList[12],NULL), (float) strtod(tokenList[13],NULL), (float) strtod(tokenList[14],NULL)); rb.angularMomentum = Vector3((float)strtod(tokenList[15],NULL), (float) strtod(tokenList[16],NULL), (float) strtod(tokenList[17],NULL)); } + + if(conf.RigidBodyDynamicType == String("Langevin") && numTokens < 18) + rb.Boltzmann(conf.seed); delete[] tokenList; @@ -352,6 +352,82 @@ bool RigidBodyController::loadRBCoordinates(const char* fileName) { fclose(inp); return true; } +bool RigidBodyController::load_restart_coordinates(const char* filename) { + char line[STRLEN]; + FILE* inp = fopen(filename, "r"); + + if (inp == NULL) { + printf("ARBD: load RB coordinates: File '%s' does not exist\n", filename); + exit(-1); + } + + int imax = rigidBodyByType.size(); + int i = 0; + int jmax = rigidBodyByType[i].size(); + int j = 0; + + while (fgets(line, STRLEN, inp) != NULL) { + // Ignore comments. + int len = strlen(line); + if (line[0] == '#') continue; + if (len < 2) continue; + + String s(line); + int numTokens = s.tokenCount(); + if (numTokens < 2+3+9) { + printf("ARBD: invalid RB restart coordinate line: %s\n", line); + fclose(inp); + exit(-1); + } + if(conf.RigidBodyDynamicType == String("Langevin") && numTokens < 20) + { + std::cout << "Warning the initial momentum set by random number" << std::endl; + } + + String* tokenList = new String[numTokens]; + s.tokenize(tokenList); + if (tokenList == NULL) { + printf("ARBD: invalid RB restart coordinate line: %s\n", line); + fclose(inp); + exit(-1); + } + + RigidBody& rb = rigidBodyByType[i][j]; + printf("Assinging positions %d %d %f\n",i,j, (float) strtod(tokenList[2],NULL)); + rb.position = Vector3( + (float) strtod(tokenList[2],NULL), (float) strtod(tokenList[3],NULL), (float) strtod(tokenList[4],NULL)); + rb.orientation = Matrix3( + (float) strtod(tokenList[5],NULL), (float) strtod(tokenList[6],NULL), (float) strtod(tokenList[7],NULL), + (float) strtod(tokenList[8],NULL), (float) strtod(tokenList[9],NULL), (float) strtod(tokenList[10],NULL), + (float) strtod(tokenList[11],NULL), (float) strtod(tokenList[12],NULL), (float) strtod(tokenList[13],NULL)); + + if(conf.RigidBodyDynamicType == String("Langevin") && numTokens >= 20) + { + rb.momentum = Vector3((float)strtod(tokenList[14],NULL), (float) strtod(tokenList[15],NULL), (float) strtod(tokenList[16],NULL)); + rb.angularMomentum = Vector3((float)strtod(tokenList[17],NULL), (float) strtod(tokenList[18],NULL), (float) strtod(tokenList[19],NULL)); + } + + if(conf.RigidBodyDynamicType == String("Langevin") && numTokens < 20) + rb.Boltzmann(conf.seed); + + delete[] tokenList; + + j++; + if (j == jmax) { + i++; + if (i == imax) + break; + j=0; + jmax = rigidBodyByType[i].size(); + } + } + fclose(inp); + if (i < imax || j < jmax) { + printf("ARBD: RB restart file did not contain the correct number of lines: %s\n", filename); + exit(-1); + } + return true; +} @@ -454,6 +530,17 @@ void RigidBodyController::initializeForcePairs() { } +void RigidBodyController::update_attached_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d, BaseGrid* sys_d, int num, int num_rb_attached_particles, int numReplicas) { + for (int i = 0; i < rigidBodyByType.size(); i++) { + for (int j = 0; j < rigidBodyByType[i].size(); j++) { + rigidBodyByType[i][j].update_particle_positions(pos_d, force_d, energy_d); + } + } + gpuErrchk(cudaMemset((void*) force_d, 0, num_rb_attached_particles*sizeof(Vector3))); + gpuErrchk(cudaMemset((void*) energy_d, 0, num_rb_attached_particles*sizeof(float))); +} + + void RigidBodyController::updateParticleLists(Vector3* pos_d, BaseGrid* sys_d) { for (int i = 0; i < rigidBodyByType.size(); i++) { for (int j = 0; j < rigidBodyByType[i].size(); j++) { @@ -476,7 +563,7 @@ void RigidBodyController::clearForceAndTorque() } } -void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d) +void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, int num, int num_rb_attached_particles) { //if (s <= 1) //gpuErrchk( cudaProfilerStart() ); @@ -493,6 +580,7 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, // RBTODO: launch kernels ahead of time and sync using event and memcpyAsync gpuErrchk( cudaDeviceSynchronize() ); cudaMemcpy(particleForces, particleForces_d, sizeof(ForceEnergy)*2*totalParticleForceNumBlocks, cudaMemcpyDeviceToHost); + cudaMemcpyAsync(attached_particle_forces, &force_d[num], sizeof(Vector3)*(num_rb_attached_particles), cudaMemcpyDeviceToHost); pfo_idx=0; for (int i = 0; i < rigidBodyByType.size(); i++) { @@ -503,6 +591,16 @@ void RigidBodyController::updateForces(Vector3* pos_d, Vector3* force_d, int s, } } + { + gpuErrchk( cudaDeviceSynchronize() ); + for (auto &rbv: rigidBodyByType) { + for (auto &rb: rbv) { + rb.apply_attached_particle_forces( attached_particle_forces ); + } + } + } + + // Grid–Grid forces if ( ((s % conf.rigidBodyGridGridPeriod) == 0 || s == 1 ) && forcePairs.size() > 0) { for (int i=0; i < forcePairs.size(); i++) { @@ -804,10 +902,10 @@ void RigidBodyForcePair::processGPUForces(BaseGrid* sys) { tmpT = tmpT + torques[i][j]; } - // tmpT is the torque calculated about the origin of grid k2 (e.g. c2) + // tmpT is the torque calculated about the origin of density grid // so here we transform torque to be about rb1 - Vector3 o2 = getOrigin2(i); - tmpT = tmpT - sys->wrapDiff(rb1->getPosition() - o2).cross( tmpF.f ); + Vector3 o1 = getOrigin1(i); + tmpT = tmpT - (rb1->getPosition() - o1).cross( tmpF.f ); // clear forces on GPU gpuErrchk(cudaMemset((void*)(forces_d[i]),0,nb*sizeof(ForceEnergy))); diff --git a/src/RigidBodyController.h b/src/RigidBodyController.h index ff99091b3b49f0ca5d68edf07f66538821cd9946..fa2fa4e5d1efd6b8bf43a83e9e34b456d5f67245 100644 --- a/src/RigidBodyController.h +++ b/src/RigidBodyController.h @@ -108,8 +108,9 @@ public: void SetRandomTorques(); void integrate(BaseGrid* sys, int step); void integrateDLM(BaseGrid* sys, int step); - void updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d); + void updateForces(Vector3* pos_d, Vector3* force_d, int s, float* energy, bool get_energy, int scheme, BaseGrid* sys, BaseGrid* sys_d, int num, int num_rb_attached_particles); void updateParticleLists(Vector3* pos_d, BaseGrid* sys_d); + void update_attached_particle_positions(Vector3* pos_d, Vector3* force_d, float* energy_d, BaseGrid* sys_d, int num, int num_rb_attached_particles, int numReplicas); void clearForceAndTorque(); void KineticEnergy(); void print(int step); @@ -117,6 +118,7 @@ public: float getEnergy(float (RigidBody::*get)()); private: bool loadRBCoordinates(const char* fileName); + bool load_restart_coordinates(const char* filename); void initializeForcePairs(); //void print(int step); @@ -150,7 +152,8 @@ private: void construct_grids(); void destruct_grids(); - //float* rb_energy; + //float* rb_energy; + Vector3* attached_particle_forces; ForceEnergy* particleForces; ForceEnergy* particleForces_d; std::vector<int> particleForceNumBlocks; diff --git a/src/RigidBodyType.cu b/src/RigidBodyType.cu index 969a16ad7d6c3183d9a0f69ee0c4bffd142faf0f..2137160943aaf63fa2345f48e0b115373a242704 100644 --- a/src/RigidBodyType.cu +++ b/src/RigidBodyType.cu @@ -81,6 +81,58 @@ void RigidBodyType::setDampingCoeffs(float timestep) { /* MUST ONLY BE CALLED ON } +void RigidBodyType::attach_particles() { + for (const auto& filename: attached_particle_files) { + const size_t line_char = 256; + FILE* inp = fopen(filename.val(), "r"); + char line[line_char]; + + // If the particle file cannot be found, exit the program + if (inp == NULL) { + printf("ERROR: Could not open `%s'.\n", filename.val()); + fclose(inp); + exit(1); + } + + // Get and process all lines of input + while (fgets(line, line_char, inp) != NULL) { + // Lines in the particle file that begin with # are comments + if (line[0] == '#') continue; + + String s(line); + int numTokens = s.tokenCount(); + + // Break the line down into pieces (tokens) so we can process them individually + String* tokenList = new String[numTokens]; + s.tokenize(tokenList); + + // Legitimate GROUP input lines have at least 3 tokens: + // Particle_type | x | y | z + // A line without exactly six tokens should be discarded. + if (numTokens != 4) { + printf("Error: Invalid attached particle file line: %s\n", tokenList[0].val()); + fclose(inp); + exit(-1); + } + + // Make sure the index of this particle is unique. + // NOTE: The particle list is sorted by index. + int type_idx = conf->find_particle_type( tokenList[0].val() ); + if (type_idx < 0) { + printf("Error: Unrecognized particle type: %s\n", line); + fclose(inp); + exit(-1); + } + attached_particle_types.push_back( type_idx ); + attached_particle_positions.push_back( Vector3(atof(tokenList[1].val()), atof(tokenList[2].val()), atof(tokenList[3].val())) ); + } + fclose(inp); + } + size_t sz = sizeof(Vector3)*attached_particle_positions.size(); + gpuErrchk(cudaMalloc( &(attached_particle_positions_d), sz )); + gpuErrchk(cudaMemcpyAsync( attached_particle_positions_d, &attached_particle_positions[0], sz, cudaMemcpyHostToDevice)); +} + void RigidBodyType::addGrid(String s, std::vector<String> &keys, std::vector<String> &files) { // tokenize and return int numTokens = s.tokenCount(); diff --git a/src/RigidBodyType.h b/src/RigidBodyType.h index 575a31348eb24dd305e913bbf46bf0b095bb8649..5cc1442052a102da7198e1e6eeda806c4cdaa8b7 100644 --- a/src/RigidBodyType.h +++ b/src/RigidBodyType.h @@ -17,9 +17,10 @@ class BaseGrid; class RigidBodyGrid; class Configuration; class RigidBodyController; +class RigidBody; class RigidBodyType { - + friend class RigidBody; public: RigidBodyType(const String& name = "", const Configuration* conf = NULL ) : name(name), conf(conf), num(0), @@ -39,6 +40,11 @@ public: /* RigidBodyType& operator=(const RigidBodyType& src); */ void copyGridsToDevice(); + void append_attached_particle_file(String s) { attached_particle_files.push_back(s); } + void attach_particles(); + size_t num_attached_particles() const { return attached_particle_types.size() ;} + const std::vector<int>& get_attached_particle_types() const { return attached_particle_types; } + void addPotentialGrid(String s); void addDensityGrid(String s); void addPMF(String s); @@ -54,6 +60,12 @@ public: String name; private: const Configuration* conf; + std::vector<String> attached_particle_files; + std::vector<int>attached_particle_types; +private: + std::vector<Vector3>attached_particle_positions; + Vector3* attached_particle_positions_d; + public: int num; // number of particles of this type diff --git a/src/TabulatedPotential.h b/src/TabulatedPotential.h index b5a0f1e19e9c3dc43f73bd85c978d2091b27db57..e32068a6fa7dc0b8cbada59cb7f6ea8d40933b22 100644 --- a/src/TabulatedPotential.h +++ b/src/TabulatedPotential.h @@ -179,6 +179,6 @@ private: }; #ifndef delgpuErrchk #undef delgpuErrchk -#undef gpuErrchk +#undef gpuErrchk(code) #endif #endif