diff --git a/ComputeForce.cu b/ComputeForce.cu index 7f9207ab9452bea77de4c8495d16e2a34bce5e22..aee09cdc1f22fc5fff364f690ea5d5b64cd7115e 100644 --- a/ComputeForce.cu +++ b/ComputeForce.cu @@ -129,14 +129,29 @@ ComputeForce::~ComputeForce() { delete tableAngle[j]; delete[] tableAngle; delete[] tableAngle_addr; - gpuErrchk(cudaFree(tableAngle_d)); - gpuErrchk(cudaFree(energies_d)); - - gpuErrchk(cudaFree(sys_d)); + if(type_d != NULL) + { + gpuErrchk(cudaFree(tableAngle_d)); + + gpuErrchk(cudaFree(energies_d)); + + gpuErrchk(cudaFree(sys_d)); + + gpuErrchk( cudaFree(pos_d) ); + gpuErrchk( cudaFree(forceInternal_d) ); + gpuErrchk( cudaFree(type_d) ); + gpuErrchk( cudaFree(bonds_d) ); + gpuErrchk( cudaFree(bondMap_d) ); + gpuErrchk( cudaFree(excludes_d) ); + gpuErrchk( cudaFree(excludeMap_d) ); + gpuErrchk( cudaFree(angles_d) ); + gpuErrchk( cudaFree(dihedrals_d) ); + gpuErrchk( cudaFree(bondList_d) ); + } } -void ComputeForce::updateNumber(Vector3* pos, int type[], int newNum) { +void ComputeForce::updateNumber(int newNum) { if (newNum == num or newNum < 0) return; // Set the new number. @@ -145,7 +160,7 @@ void ComputeForce::updateNumber(Vector3* pos, int type[], int newNum) { // Reallocate the neighbor list. //delete[] neigh; //neigh = new IndexList[num]; - decompose(pos, type); + decompose(); printf("updateNumber() called\n"); // Reallocate CUDA arrays @@ -242,8 +257,8 @@ bool ComputeForce::addTabulatedPotential(String fileName, int type0, int type1) return true; } -bool ComputeForce::addBondPotential(String fileName, int ind, - Bond bonds[], Bond bonds_d[]) { +bool ComputeForce::addBondPotential(String fileName, int ind, Bond bonds[]) +{ if (tableBond[ind] != NULL) { delete tableBond[ind]; gpuErrchk(cudaFree(tableBond_addr[ind])); @@ -285,7 +300,7 @@ bool ComputeForce::addBondPotential(String fileName, int ind, return true; } -bool ComputeForce::addAnglePotential(String fileName, int ind, Angle* angles, Angle* angles_d) { +bool ComputeForce::addAnglePotential(String fileName, int ind, Angle* angles) { if (tableAngle[ind] != NULL) { delete tableAngle[ind]; gpuErrchk(cudaFree(tableAngle_addr[ind])); @@ -319,9 +334,8 @@ bool ComputeForce::addAnglePotential(String fileName, int ind, Angle* angles, An return true; } -bool ComputeForce::addDihedralPotential(String fileName, int ind, - Dihedral dihedrals[], - Dihedral dihedrals_d[]) { +bool ComputeForce::addDihedralPotential(String fileName, int ind, Dihedral dihedrals[]) +{ for (int i = 0; i < numDihedrals; i++) if (dihedrals[i].fileName == fileName) dihedrals[i].tabFileIndex = ind; @@ -356,7 +370,7 @@ bool ComputeForce::addDihedralPotential(String fileName, int ind, return true; } -void ComputeForce::decompose(Vector3* pos, int type[]) { +void ComputeForce::decompose() { gpuErrchk( cudaProfilerStart() ); // Reset the cell decomposition. bool newDecomp = false; @@ -365,7 +379,7 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { else newDecomp = true; - decomp.decompose_d(pos, num); + decomp.decompose_d(pos_d, num); decomp_d = decomp.copyToCUDA(); // Update pairlists using cell decomposition (not sure this is really needed or good) @@ -409,21 +423,14 @@ void ComputeForce::decompose(Vector3* pos, int type[]) { /* numPairs_d, pairListListI_d, pairListListJ_d); */ /* gpuErrchk(cudaDeviceSynchronize()); */ - { - int tmp = 0; - gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp, - sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaDeviceSynchronize()); - } - + int tmp = 0; + gpuErrchk(cudaMemcpyAsync(numPairs_d, &tmp, sizeof(int), cudaMemcpyHostToDevice)); + gpuErrchk(cudaDeviceSynchronize()); float pairlistdist2 = (sqrt(cutoff2) + 2.0f); pairlistdist2 = pairlistdist2*pairlistdist2; - createPairlists<<< 2048, 64 >>>(pos, num, numReplicas, - sys_d, decomp_d, nCells, - numPairs_d, pairLists_d, - numParts, type, pairTabPotType_d, pairlistdist2); + createPairlists<<< 2048, 64 >>>(pos_d, num, numReplicas, sys_d, decomp_d, nCells, numPairs_d, pairLists_d, numParts, type_d, pairTabPotType_d, pairlistdist2); /* createPairlistsOld<<< nBlocks, NUMTHREADS >>>(pos, num, numReplicas, */ /* sys_d, decomp_d, nCells, blocksPerCell, */ /* numPairs_d, pairLists_d, */ @@ -466,14 +473,14 @@ int* ComputeForce::neighborhood(Vector3 r) { return NULL; } -float ComputeForce::computeFull(Vector3* force, Vector3* pos, int* type, bool get_energy) { +float ComputeForce::computeFull(bool get_energy) { float energy = 0.0f; gridSize = (num * numReplicas) / NUM_THREADS + 1; dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeFullKernel<<< numBlocks, numThreads >>>(force, pos, type, tableAlpha_d, + computeFullKernel<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, type_d, tableAlpha_d, tableEps_d, tableRad6_d, num, numParts, sys_d, energies_d, gridSize, numReplicas, get_energy); @@ -487,14 +494,14 @@ float ComputeForce::computeFull(Vector3* force, Vector3* pos, int* type, bool ge return energy; } -float ComputeForce::computeSoftcoreFull(Vector3* force, Vector3* pos, int* type, bool get_energy) { +float ComputeForce::computeSoftcoreFull(bool get_energy) { float energy = 0.0f; gridSize = (num * numReplicas) / NUM_THREADS + 1; dim3 numBlocks(gridSize, 1, 1); dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeSoftcoreFullKernel<<<numBlocks, numThreads>>>(force, pos, type, + computeSoftcoreFullKernel<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, type_d, tableEps_d, tableRad6_d, num, numParts, sys_d, energies_d, gridSize, numReplicas, get_energy); @@ -508,8 +515,7 @@ float ComputeForce::computeSoftcoreFull(Vector3* force, Vector3* pos, int* type, return energy; } -float ComputeForce::computeElecFull(Vector3* force, Vector3* pos, - int* type, bool get_energy) { +float ComputeForce::computeElecFull(bool get_energy) { float energy = 0.0f; gridSize = num/NUM_THREADS + 1; @@ -517,7 +523,7 @@ float ComputeForce::computeElecFull(Vector3* force, Vector3* pos, dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeElecFullKernel<<<numBlocks, numThreads>>>(force, pos, type, + computeElecFullKernel<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, type_d, tableAlpha_d, num, numParts, sys_d, energies_d, gridSize, numReplicas, get_energy); @@ -532,7 +538,7 @@ float ComputeForce::computeElecFull(Vector3* force, Vector3* pos, } -float ComputeForce::compute(Vector3 force[], Vector3 pos[], int type[], bool get_energy) { +float ComputeForce::compute(bool get_energy) { float energy = 0.0f; gridSize = (num * numReplicas) / NUM_THREADS + 1; @@ -540,7 +546,7 @@ float ComputeForce::compute(Vector3 force[], Vector3 pos[], int type[], bool get dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeKernel<<<numBlocks, numThreads>>>(force, pos, type, + computeKernel<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, type_d, tableAlpha_d, tableEps_d, tableRad6_d, num, numParts, sys_d, decomp_d, energies_d, switchStart, switchLen, gridSize, numReplicas, get_energy); @@ -560,9 +566,7 @@ float ComputeForce::compute(Vector3 force[], Vector3 pos[], int type[], bool get /*float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals, bool get_energy, Bond* bondList) {*/ -float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type, - Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, - Angle* angles, Dihedral* dihedrals, bool get_energy, int3* bondList_d) { +float ComputeForce::computeTabulated(bool get_energy) { float energy = 0.0f; gridSize = (num * numReplicas) / NUM_THREADS + 1; @@ -575,32 +579,35 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type, const int nb = 800; // printf("ComputeTabulated\n"); - if (get_energy) { + if (get_energy) + { clearEnergies<<< nb, numThreads >>>(energies_d,num); gpuErrchk(cudaDeviceSynchronize()); - computeTabulatedEnergyKernel<<< nb, numThreads >>>(force, pos, type, - tablePot_d, tableBond_d, sys_d, - bonds, bondMap, numBonds, energies_d, cutoff2, - numPairs_d, pairLists_d, pairTabPotType_d); - } else { - computeTabulatedKernel<<< nb, numThreads >>>(force, pos, type, + computeTabulatedEnergyKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, type_d, tablePot_d, sys_d, energies_d, cutoff2, numPairs_d, pairLists_d, pairTabPotType_d); + } + + else + { + computeTabulatedKernel<<< nb, numThreads >>>(forceInternal_d, pos_d, type_d, tablePot_d, tableBond_d, sys_d, - bonds, bondMap, numBonds, cutoff2, + bonds_d, bondMap_d, numBonds, cutoff2, numPairs_d, pairLists_d, pairTabPotType_d); } /* printPairForceCounter<<<1,32>>>(); */ /* gpuErrchk(cudaDeviceSynchronize()); */ - computeAngles<<<numBlocks, numThreads>>>(force, pos, angles, + computeAngles<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, angles_d, tableAngle_d, numAngles, num, sys_d, energies_d, get_energy); //Mlog: the commented function doesn't use bondList, uncomment for testing. - if(bondMap != NULL && tableBond_d != NULL) + //if(bondMap_d != NULL && tableBond_d != NULL) + if(bondList_d != NULL && tableBond_d != NULL) + { - //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap, numBonds, numReplicas, energies_d, get_energy, tableBond_d); - computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bondList_d, (numBonds/2), numReplicas, energies_d, get_energy, tableBond_d); + //computeTabulatedBonds <<<numBlocks, numThreads>>> ( force, pos, num, numParts, sys_d, bonds, bondMap_d, numBonds, numReplicas, energies_d, get_energy, tableBond_d); + computeTabulatedBonds <<<numBlocks, numThreads>>> ( forceInternal_d, pos_d, num, numParts, sys_d, bondList_d, (numBonds/2), numReplicas, energies_d, get_energy, tableBond_d); } - computeDihedrals<<<numBlocks, numThreads>>>(force, pos, dihedrals, + computeDihedrals<<<numBlocks, numThreads>>>(forceInternal_d, pos_d, dihedrals_d, tableDihedral_d, numDihedrals, num, sys_d, energies_d, get_energy); // Calculate the energy based on the array created by the kernel @@ -613,9 +620,7 @@ float ComputeForce::computeTabulated(Vector3* force, Vector3* pos, int* type, return energy; } -float ComputeForce::computeTabulatedFull(Vector3* force, Vector3* pos, int* type, - Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, - Angle* angles, Dihedral* dihedrals, bool get_energy) { +float ComputeForce::computeTabulatedFull(bool get_energy) { energy = 0.0f; gridSize = (num * numReplicas) / NUM_THREADS + 1; @@ -623,17 +628,14 @@ float ComputeForce::computeTabulatedFull(Vector3* force, Vector3* pos, int* type dim3 numThreads(NUM_THREADS, 1, 1); // Call the kernel to calculate forces - computeTabulatedFullKernel<<< numBlocks, numThreads >>>(force, pos, type, - tablePot_d, tableBond_d, num, numParts, sys_d, bonds, bondMap, numBonds, - excludes, excludeMap, numExcludes, energies_d, gridSize, numReplicas, - get_energy, angles); + computeTabulatedFullKernel<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, type_d, tablePot_d, tableBond_d, num, numParts, sys_d, bonds_d, bondMap_d, numBonds, excludes_d, excludeMap_d, numExcludes, energies_d, gridSize, numReplicas, get_energy, angles_d); gpuErrchk(cudaDeviceSynchronize()); - computeAngles<<< numBlocks, numThreads >>>(force, pos, angles, tableAngle_d, + computeAngles<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, angles_d, tableAngle_d, numAngles, num, sys_d, energies_d, get_energy); gpuErrchk(cudaDeviceSynchronize()); - computeDihedrals<<< numBlocks, numThreads >>>(force, pos, dihedrals, + computeDihedrals<<< numBlocks, numThreads >>>(forceInternal_d, pos_d, dihedrals_d, tableDihedral_d, numDihedrals, num, sys_d, energies_d, get_energy); @@ -646,3 +648,78 @@ float ComputeForce::computeTabulatedFull(Vector3* force, Vector3* pos, int* type return energy; } + +void ComputeForce::copyToCUDA(Vector3* forceInternal, Vector3* pos) +{ + const size_t tot_num = num * numReplicas; + + gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num)); + gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); + + gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas)); + gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); + + gpuErrchk(cudaDeviceSynchronize()); +} + +void ComputeForce::copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals) +{ + // type_d + gpuErrchk(cudaMalloc(&type_d, sizeof(int) * num * simNum)); + gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * num * simNum, cudaMemcpyHostToDevice)); + + if (numBonds > 0) + { + // bonds_d + gpuErrchk(cudaMalloc(&bonds_d, sizeof(Bond) * numBonds)); + gpuErrchk(cudaMemcpyAsync(bonds_d, bonds, sizeof(Bond) * numBonds, cudaMemcpyHostToDevice)); + + // bondMap_d + gpuErrchk(cudaMalloc(&bondMap_d, sizeof(int2) * num)); + gpuErrchk(cudaMemcpyAsync(bondMap_d, bondMap, sizeof(int2) * num, cudaMemcpyHostToDevice)); + } + + if (numExcludes > 0) { + // excludes_d + gpuErrchk(cudaMalloc(&excludes_d, sizeof(Exclude) * numExcludes)); + gpuErrchk(cudaMemcpyAsync(excludes_d, excludes, sizeof(Exclude) * numExcludes, + cudaMemcpyHostToDevice)); + + // excludeMap_d + gpuErrchk(cudaMalloc(&excludeMap_d, sizeof(int2) * num)); + gpuErrchk(cudaMemcpyAsync(excludeMap_d, excludeMap, sizeof(int2) * num, + cudaMemcpyHostToDevice)); + } + + if (numAngles > 0) { + // angles_d + gpuErrchk(cudaMalloc(&angles_d, sizeof(Angle) * numAngles)); + gpuErrchk(cudaMemcpyAsync(angles_d, angles, sizeof(Angle) * numAngles, + cudaMemcpyHostToDevice)); + } + + if (numDihedrals > 0) { + // dihedrals_d + gpuErrchk(cudaMalloc(&dihedrals_d, sizeof(Dihedral) * numDihedrals)); + gpuErrchk(cudaMemcpyAsync(dihedrals_d, dihedrals, + sizeof(Dihedral) * numDihedrals, + cudaMemcpyHostToDevice)); + } + + gpuErrchk(cudaDeviceSynchronize()); +} + +void ComputeForce::createBondList(int3 *bondList) +{ + size_t size = (numBonds / 2) * numReplicas * sizeof(int3); + gpuErrchk( cudaMalloc( &bondList_d, size ) ); + gpuErrchk( cudaMemcpyAsync( bondList_d, bondList, size, cudaMemcpyHostToDevice) ); + + for(int i = 0 ; i < (numBonds / 2) * numReplicas ; i++) + { + cout << "Displaying: bondList_d["<< i <<"].x = " << bondList[i].x << ".\n" + << "Displaying: bondList_d["<< i <<"].y = " << bondList[i].y << ".\n" + << "Displaying: bondList_d["<< i <<"].z = " << bondList[i].z << ".\n"; + + } +} diff --git a/ComputeForce.cuh b/ComputeForce.cuh index 359b879551923b6e1214b902d4a6d4629b48f391..cfb2b9681ab6c7727c16f04c6b6e604b812831e8 100644 --- a/ComputeForce.cuh +++ b/ComputeForce.cuh @@ -470,13 +470,7 @@ __global__ void clearEnergies(float* g_energies, int num) { g_energies[i] = 0.0f; } } -__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos, - int* type, TabulatedPotential** __restrict__ tablePot, - TabulatedPotential** __restrict__ tableBond, - BaseGrid* __restrict__ sys, Bond* __restrict__ bonds, - int2* __restrict__ bondMap, int numBonds, float* g_energies, float cutoff2, - int* __restrict__ g_numPairs, int2* __restrict__ g_pair, - int* __restrict__ g_pairTabPotType) { +__global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict__ pos, int* type, TabulatedPotential** __restrict__ tablePot, BaseGrid* __restrict__ sys, float* g_energies, float cutoff2, int* __restrict__ g_numPairs, int2* __restrict__ g_pair, int* __restrict__ g_pairTabPotType) { const int numPairs = *g_numPairs; // RBTODO: BONDS (handle through an independent kernel call?) @@ -512,15 +506,8 @@ __global__ void computeTabulatedEnergyKernel(Vector3* force, Vector3* __restrict // NOT using cell decomposition // __global__ -void computeTabulatedFullKernel(Vector3 force[], Vector3 pos[], int type[], - TabulatedPotential* tablePot[], - TabulatedPotential* tableBond[], - int num, int numParts, BaseGrid *sys, - Bond bonds[], int2 bondMap[], int numBonds, - Exclude excludes[], int2 excludeMap[], - int numExcludes, float g_energies[], - int gridSize, int numReplicas, bool get_energy, - Angle angles[]) { +void computeTabulatedFullKernel(Vector3 force[], Vector3 pos[], int type[], TabulatedPotential* tablePot[], TabulatedPotential* tableBond[], int num, int numParts, BaseGrid *sys, Bond bonds[], int2 bondMap[], int numBonds, Exclude excludes[], int2 excludeMap[], int numExcludes, float g_energies[], int gridSize, int numReplicas, bool get_energy, Angle angles[]) +{ // Thread's unique ID. const int i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/ComputeForce.h b/ComputeForce.h index d1d0b350bc6078d37ea0528a96e1e5027aac39ea..22fd9ea30a7281f7a7a7305e96fd41d20e552307 100644 --- a/ComputeForce.h +++ b/ComputeForce.h @@ -39,15 +39,15 @@ public: int numDihedrals, int numTabDihedralFiles, int numReplicas = 1); ~ComputeForce(); - void updateNumber(Vector3* pos, int type[], int newNum); + void updateNumber(int newNum); void makeTables(const BrownianParticleType* part); bool addTabulatedPotential(String fileName, int type0, int type1); - bool addBondPotential(String fileName, int ind, Bond* bonds, Bond* bonds_d); - bool addAnglePotential(String fileName, int ind, Angle* angles, Angle* angles_d); - bool addDihedralPotential(String fileName, int ind, Dihedral* dihedrals, Dihedral* dihedrals_d); + bool addBondPotential(String fileName, int ind, Bond* bonds); + bool addAnglePotential(String fileName, int ind, Angle* angles); + bool addDihedralPotential(String fileName, int ind, Dihedral* dihedrals); - void decompose(Vector3* pos, int type[]); + void decompose(); CellDecomposition getDecomp(); IndexList decompDim() const; @@ -57,22 +57,76 @@ public: // Does nothing int* neighborhood(Vector3 r); - float computeSoftcoreFull(Vector3* force, Vector3* pos, int* type, bool get_energy); - float computeElecFull(Vector3* force, Vector3* pos, int* type, bool get_energy); + float computeSoftcoreFull(bool get_energy); + float computeElecFull(bool get_energy); - float compute(Vector3* force, Vector3* pos, int* type, bool get_energy); - float computeFull(Vector3* force, Vector3* pos, int* type, bool get_energy); + float compute(bool get_energy); + float computeFull(bool get_energy); //MLog: the commented function doesn't use bondList, uncomment for testing. /*float computeTabulated(Vector3* force, Vector3* pos, int* type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals, bool get_energy);*/ - float computeTabulated(Vector3* force, Vector3* pos, int* type, - Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, - Angle* angles, Dihedral* dihedrals, bool get_energy, int3* bondList_d); - float computeTabulatedFull(Vector3* force, Vector3* pos, int* type, - Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, - Angle* angles, Dihedral* dihedrals, bool get_energy); + float computeTabulated(bool get_energy); + float computeTabulatedFull(bool get_energy); + + //MLog: new copy function to allocate memory required by ComputeForce class. + void copyToCUDA(Vector3* forceInternal, Vector3* pos); + void copyToCUDA(int simNum, int *type, Bond* bonds, int2* bondMap, Exclude* excludes, int2* excludeMap, Angle* angles, Dihedral* dihedrals); + + void createBondList(int3 *bondList); + + //MLog: because of the move of a lot of private variables, some functions get starved necessary memory access to these variables, below is a list of functions that return the specified private variable. + Vector3* getPos_d() + { + return pos_d; + } + + Vector3* getForceInternal_d() + { + return forceInternal_d; + } + + int* getType_d() + { + return type_d; + } + + Bond* getBonds_d() + { + return bonds_d; + } + + int2* getBondMap_d() + { + return bondMap_d; + } + + Exclude* getExcludes_d() + { + return excludes_d; + } + + int2* getExcludeMap_d() + { + return excludeMap_d; + } + + Angle* getAngles_d() + { + return angles_d; + } + + Dihedral* getDihedrals_d() + { + return dihedrals_d; + } + + int3* getBondList_d() + { + return bondList_d; + } + HOST DEVICE static EnergyForce coulombForce(Vector3 r, float alpha,float start, float len); @@ -120,7 +174,22 @@ private: int2 *pairLists_d; int *pairTabPotType_d; - int *numPairs_d; + int *numPairs_d; + + //MLog: List of variables that need to be moved over to ComputeForce class. Members of this list will be set to static to avoid large alterations in working code, thereby allowing us to access these variables easily. + //BrownianParticleType* part; + //float electricConst; + //int fullLongRange; + Vector3* pos_d; + Vector3* forceInternal_d; + int* type_d; + Bond* bonds_d; + int2* bondMap_d; + Exclude* excludes_d; + int2* excludeMap_d; + Angle* angles_d; + Dihedral* dihedrals_d; + int3* bondList_d; }; #endif diff --git a/Configuration.cpp b/Configuration.cpp index 8373e272fb2f21995b399974cd4dde28753d8653..68b5db65502feb99bb280d3a47b09c15fed1d75f 100644 --- a/Configuration.cpp +++ b/Configuration.cpp @@ -12,14 +12,14 @@ inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) { Configuration::Configuration(const char* config_file, int simNum, bool debug) : simNum(simNum) { // Read the parameters. - type_d = NULL; + //type_d = NULL; kTGrid_d = NULL; - bonds_d = NULL; - bondMap_d = NULL; - excludes_d = NULL; - excludeMap_d = NULL; - angles_d = NULL; - dihedrals_d = NULL; + //bonds_d = NULL; + //bondMap_d = NULL; + //excludes_d = NULL; + //excludeMap_d = NULL; + //angles_d = NULL; + //dihedrals_d = NULL; setDefaults(); readParameters(config_file); if (readPartsFromFile) readAtoms(); @@ -397,18 +397,18 @@ Configuration::~Configuration() { delete[] dihedralTableFile; - if (type_d != NULL) { - gpuErrchk(cudaFree(type_d)); + //if (type_d != NULL) { + //gpuErrchk(cudaFree(type_d)); gpuErrchk(cudaFree(sys_d)); gpuErrchk(cudaFree(kTGrid_d)); gpuErrchk(cudaFree(part_d)); - gpuErrchk(cudaFree(bonds_d)); - gpuErrchk(cudaFree(bondMap_d)); - gpuErrchk(cudaFree(excludes_d)); - gpuErrchk(cudaFree(excludeMap_d)); - gpuErrchk(cudaFree(angles_d)); - gpuErrchk(cudaFree(dihedrals_d)); - } + //gpuErrchk(cudaFree(bonds_d)); + //gpuErrchk(cudaFree(bondMap_d)); + //gpuErrchk(cudaFree(excludes_d)); + //gpuErrchk(cudaFree(excludeMap_d)); + //gpuErrchk(cudaFree(angles_d)); + //gpuErrchk(cudaFree(dihedrals_d)); + //} } void Configuration::copyToCUDA() { @@ -468,11 +468,10 @@ void Configuration::copyToCUDA() { } // type_d and sys_d - gpuErrchk(cudaMalloc(&type_d, sizeof(int) * num * simNum)); - gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * num * simNum, cudaMemcpyHostToDevice)); gpuErrchk(cudaMalloc(&sys_d, sizeof(BaseGrid))); gpuErrchk(cudaMemcpyAsync(sys_d, sys, sizeof(BaseGrid), cudaMemcpyHostToDevice)); - + /*gpuErrchk(cudaMalloc(&type_d, sizeof(int) * num * simNum)); + gpuErrchk(cudaMemcpyAsync(type_d, type, sizeof(int) * num * simNum, cudaMemcpyHostToDevice)); if (numBonds > 0) { // bonds_d @@ -509,7 +508,7 @@ void Configuration::copyToCUDA() { gpuErrchk(cudaMemcpyAsync(dihedrals_d, dihedrals, sizeof(Dihedral) * numDihedrals, cudaMemcpyHostToDevice)); - } + }*/ gpuErrchk(cudaDeviceSynchronize()); } diff --git a/Configuration.h b/Configuration.h index 4829e4036165cedd6af6b3ba833fc1e4e09f57fb..058b0bb31ecb2f4e39a472b42517a313d5b33c1a 100644 --- a/Configuration.h +++ b/Configuration.h @@ -80,15 +80,15 @@ public: bool loadedCoordinates; // Device Variables - int *type_d; + //int *type_d; BrownianParticleType **part_d; BaseGrid *sys_d, *kTGrid_d; - Bond* bonds_d; - int2* bondMap_d; - Exclude* excludes_d; - int2* excludeMap_d; - Angle* angles_d; - Dihedral* dihedrals_d; + //Bond* bonds_d; + //int2* bondMap_d; + //Exclude* excludes_d; + //int2* excludeMap_d; + //Angle* angles_d; + //Dihedral* dihedrals_d; // number of simulations int simNum; diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu index c2db8bf12592c179b8e446f15eb4aeea8a95234a..b459fed9cf271b4a13786b7e402044194dbe7823 100644 --- a/GrandBrownTown.cu +++ b/GrandBrownTown.cu @@ -138,16 +138,16 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, numTabDihedralFiles = c.numTabDihedralFiles; // Device parameters - type_d = c.type_d; + //type_d = c.type_d; part_d = c.part_d; sys_d = c.sys_d; kTGrid_d = c.kTGrid_d; - bonds_d = c.bonds_d; - bondMap_d = c.bondMap_d; - excludes_d = c.excludes_d; - excludeMap_d = c.excludeMap_d; - angles_d = c.angles_d; - dihedrals_d = c.dihedrals_d; + //bonds_d = c.bonds_d; + //bondMap_d = c.bondMap_d; + //excludes_d = c.excludes_d; + //excludeMap_d = c.excludeMap_d; + //angles_d = c.angles_d; + //dihedrals_d = c.dihedrals_d; // Seed random number generator long unsigned int r_seed; @@ -179,6 +179,9 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, internal = new ComputeForce(num, part, numParts, sys, switchStart, switchLen, coulombConst, fullLongRange, numBonds, numTabBondFiles, numExcludes, numAngles, numTabAngleFiles, numDihedrals, numTabDihedralFiles, numReplicas); + + //MLog: I did the other halve of the copyToCUDA function from the Configuration class here, keep an eye on any mistakes that may occur due to the location. + internal -> copyToCUDA(c.simNum, c.type, c.bonds, c.bondMap, c.excludes, c.excludeMap, c.angles, c.dihedrals); // TODO: check for duplicate potentials if (c.tabulatedPotential) { @@ -199,7 +202,8 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, printf("Loading the tabulated bond potentials...\n"); for (int p = 0; p < numTabBondFiles; p++) if (bondTableFile[p].length() > 0) { - internal->addBondPotential(bondTableFile[p].val(), p, bonds, bonds_d); + //MLog: make sure to add to all GPUs + internal->addBondPotential(bondTableFile[p].val(), p, bonds); printf("%s\n",bondTableFile[p].val()); } } @@ -208,14 +212,17 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, printf("Loading the tabulated angle potentials...\n"); for (int p = 0; p < numTabAngleFiles; p++) if (angleTableFile[p].length() > 0) - internal->addAnglePotential(angleTableFile[p].val(), p, angles, angles_d); + { + //MLog: make sure to do this for every GPU + internal->addAnglePotential(angleTableFile[p].val(), p, angles); + } } if (c.readDihedralsFromFile) { printf("Loading the tabulated dihedral potentials...\n"); for (int p = 0; p < numTabDihedralFiles; p++) if (dihedralTableFile[p].length() > 0) - internal->addDihedralPotential(dihedralTableFile[p].val(), p, dihedrals, dihedrals_d); + internal->addDihedralPotential(dihedralTableFile[p].val(), p, dihedrals); } //Mlog: this is where we create the bondList. @@ -237,7 +244,7 @@ GrandBrownTown::GrandBrownTown(const Configuration& c, const char* outArg, } } - createBondList(); + internal -> createBondList(bondList); // TODO: copy data to device (not here) // TODO: use bondList in computeTabulatedBonds kernel @@ -271,10 +278,10 @@ GrandBrownTown::~GrandBrownTown() { for (int i = 0; i < numReplicas; ++i) delete writers[i]; - gpuErrchk(cudaFree(pos_d)); - gpuErrchk(cudaFree(forceInternal_d)); + //gpuErrchk(cudaFree(pos_d)); + //gpuErrchk(cudaFree(forceInternal_d)); gpuErrchk(cudaFree(randoGen_d)); - gpuErrchk( cudaFree(bondList_d) ); + //gpuErrchk( cudaFree(bondList_d) ); } // Run the Brownian Dynamics steps. @@ -298,6 +305,7 @@ void GrandBrownTown::run() { timerS = rt_timer_create(); copyToCUDA(); + internal -> copyToCUDA(forceInternal, pos); // IMD Stuff void* sock = NULL; @@ -333,7 +341,10 @@ void GrandBrownTown::run() { // We haven't done any steps yet. // Do decomposition if we have to if (fullLongRange == 0) - internal->decompose(pos_d, type_d); + { + cudaSetDevice(0); + internal->decompose(); + } float t; // simulation time @@ -360,22 +371,18 @@ void GrandBrownTown::run() { // 1 - do not use cutoff [ N^2 ] switch (fullLongRange) { case 0: // [ N*log(N) ] interactions, + cutoff | decomposition - if (s % decompPeriod == 0) { - internal->decompose(pos_d, type_d); - //internal->updatePairlists(pos_d); // perhaps this way? + if (s % decompPeriod == 0) + { + cudaSetDevice(0); + internal -> decompose(); } //MLog: added Bond* bondList to the list of passed in variables. /*energy = internal->computeTabulated(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d, angles_d, dihedrals_d, get_energy);*/ - energy = internal -> computeTabulated(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d, angles_d, dihedrals_d, get_energy, bondList_d); + energy = internal -> computeTabulated(get_energy); break; default: // [ N^2 ] interactions, no cutoff | decompositions - energy = - internal->computeTabulatedFull(forceInternal_d, pos_d, type_d, - bonds_d, bondMap_d, - excludes_d, excludeMap_d, - angles_d, dihedrals_d, - get_energy); + energy = internal->computeTabulatedFull(get_energy); break; } @@ -386,24 +393,23 @@ void GrandBrownTown::run() { case 0: // Use cutoff | cell decomposition. if (s % decompPeriod == 0) - internal->decompose(pos_d, type_d); - energy = - internal->compute(forceInternal_d, pos_d, type_d, get_energy); + { + cudaSetDevice(0); + internal->decompose(); + } + energy = internal->compute(get_energy); break; case 1: // Do not use cutoff - energy = internal->computeFull(forceInternal_d, - pos_d, type_d, get_energy); + energy = internal->computeFull(get_energy); break; case 2: // Compute only softcore forces. - energy = internal->computeSoftcoreFull(forceInternal_d, - pos_d, type_d, get_energy); + energy = internal->computeSoftcoreFull(get_energy); break; case 3: // Compute only electrostatic forces. - energy = internal->computeElecFull(forceInternal_d, - pos_d, type_d, get_energy); + energy = internal->computeElecFull(get_energy); break; } } @@ -428,11 +434,9 @@ void GrandBrownTown::run() { RBC.updateForces(s); /* update RB forces before update particle positions... */ - // Call the kernel to update the positions of each particle - updateKernel<<< numBlocks, NUM_THREADS >>>(pos_d, forceInternal_d, type_d, - part_d, kT, kTGrid_d, - electricField, tl, timestep, num, - sys_d, randoGen_d, numReplicas); + //MLog: Call the kernel to update the positions of each particle + cudaSetDevice(0); + updateKernel<<< numBlocks, NUM_THREADS >>>(internal -> getPos_d(), internal -> getForceInternal_d(), internal -> getType_d(), part_d, kT, kTGrid_d, electricField, tl, timestep, num, sys_d, randoGen_d, numReplicas); //gpuErrchk(cudaPeekAtLastError()); // Does not work on old GPUs (like mine). TODO: write a better wrapper around Peek /* Time position computations. @@ -472,7 +476,8 @@ void GrandBrownTown::run() { } } if (clientsock) { - gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num, cudaMemcpyDeviceToHost)); + cudaSetDevice(0); + gpuErrchk(cudaMemcpy(pos, internal -> getPos_d(), sizeof(Vector3) * num, cudaMemcpyDeviceToHost)); float* coords = new float[num * 3]; for (size_t i = 0; i < num; i++) { Vector3 p = pos[i]; @@ -488,7 +493,8 @@ void GrandBrownTown::run() { // Output trajectories (to files) if (s % outputPeriod == 0) { // Copy particle positions back to CPU - gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas, + cudaSetDevice(0); + gpuErrchk(cudaMemcpy(pos, internal -> getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); // Nanoseconds computed @@ -522,10 +528,11 @@ void GrandBrownTown::run() { // Output energy. if (get_energy) { // Stop the timer. + cudaSetDevice(0); rt_timer_stop(timerS); // Copy back forces to display (internal only) - gpuErrchk(cudaMemcpy(&force0, forceInternal_d, sizeof(Vector3), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(&force0, internal -> getForceInternal_d(), sizeof(Vector3), cudaMemcpyDeviceToHost)); // Nanoseconds computed t = s * timestep; @@ -546,7 +553,7 @@ void GrandBrownTown::run() { */ // Copy positions from GPU to CPU. - gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas, + gpuErrchk(cudaMemcpy(pos, internal -> getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); // Write restart files for each replica. @@ -559,7 +566,8 @@ void GrandBrownTown::run() { { int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1); - clearInternalForces<<< numBlocks, NUM_THREADS >>>(forceInternal_d, num, numReplicas); + //MLog: along with calls to internal (ComputeForce class) this function should execute once per GPU. + clearInternalForces<<< numBlocks, NUM_THREADS >>>(internal -> getForceInternal_d(), num, numReplicas); } } // done with all Brownian dynamics steps @@ -604,7 +612,8 @@ void GrandBrownTown::run() { else if (tot_min > 0) printf("%dm%.1fs\n", tot_min, tot_sec); else printf("%.2fs\n", tot_sec); - gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); + cudaSetDevice(0); + gpuErrchk(cudaMemcpy(pos, internal -> getPos_d(), sizeof(Vector3) * num * numReplicas, cudaMemcpyDeviceToHost)); // Write the restart file (once again) for (int repID = 0; repID < numReplicas; ++repID) @@ -912,7 +921,7 @@ void GrandBrownTown::addParticles(int n, int typ, Vector3 r0, Vector3 r1, float type[i] = typ; num++; // Update the cell decomposition - internal->updateNumber(pos, type, num); /* RBTODO: unsure if type arg is ok */ + internal->updateNumber(num); /* RBTODO: unsure if type arg is ok */ } } @@ -964,20 +973,20 @@ void GrandBrownTown::updateReservoirs() { } // end particle loop if (numberChange) - internal->updateNumber(pos, type, num); + internal->updateNumber(num); } // ----------------------------------------------------------------------------- // Allocate memory on GPU(s) and copy to device void GrandBrownTown::copyToCUDA() { const size_t tot_num = num * numReplicas; - gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num)); + /*gpuErrchk(cudaMalloc(&pos_d, sizeof(Vector3) * tot_num)); gpuErrchk(cudaMemcpyAsync(pos_d, pos, sizeof(Vector3) * tot_num, cudaMemcpyHostToDevice)); gpuErrchk(cudaMalloc(&forceInternal_d, sizeof(Vector3) * num * numReplicas)); gpuErrchk(cudaMemcpyAsync(forceInternal_d, forceInternal, sizeof(Vector3) * tot_num, - cudaMemcpyHostToDevice)); + cudaMemcpyHostToDevice));*/ gpuErrchk(cudaMalloc(&randoGen_d, sizeof(Random))); gpuErrchk(cudaMemcpyAsync(randoGen_d, randoGen, sizeof(Random), @@ -985,7 +994,7 @@ void GrandBrownTown::copyToCUDA() { gpuErrchk(cudaDeviceSynchronize()); } -void GrandBrownTown::createBondList() +/*void GrandBrownTown::createBondList() { size_t size = (numBonds / 2) * numReplicas * sizeof(int3); gpuErrchk( cudaMalloc( &bondList_d, size ) ); @@ -998,4 +1007,4 @@ void GrandBrownTown::createBondList() << "Displaying: bondList_d["<< i <<"].z = " << bondList[i].z << ".\n"; } -} +}*/ diff --git a/GrandBrownTown.h b/GrandBrownTown.h index dc8c3be9743e153a5b4aa5a2a7b6745002573aa7..57bd2c14f4371eb94d9c1477cb906b2975629623 100644 --- a/GrandBrownTown.h +++ b/GrandBrownTown.h @@ -98,7 +98,6 @@ private: void getDebugForce(); void copyToCUDA(); - void createBondList(); public: // Compute the current in nanoamperes. @@ -155,18 +154,17 @@ private: Vector3* rbPos; // rigid body positions // CUDA device variables - Vector3 *pos_d, *forceInternal_d, *force_d; - int *type_d; + //Vector3 *pos_d, *forceInternal_d, *force_d; + //int *type_d; BrownianParticleType **part_d; BaseGrid *sys_d, *kTGrid_d; Random *randoGen_d; - Bond* bonds_d; - int2* bondMap_d; - Exclude* excludes_d; - int2* excludeMap_d; - Angle* angles_d; - Dihedral* dihedrals_d; - int3 *bondList_d; + //Bond* bonds_d; + //int2* bondMap_d; + //Exclude* excludes_d; + //int2* excludeMap_d; + //Angle* angles_d; + //Dihedral* dihedrals_d; // System parameters String outputName; diff --git a/runBrownTown.cpp b/runBrownTown.cpp index 49b272df357592d2f2ff18f41e0f3b3145de5581..ebd1eea1ef829a64608635826564be134351aa06 100644 --- a/runBrownTown.cpp +++ b/runBrownTown.cpp @@ -119,7 +119,9 @@ int main(int argc, char* argv[]) { Configuration config(configFile, replicas, debug); // GPUManager::set(0); GPUManager::set(gpuID); + //MLog: this copyToCUDA function (along with the one in GrandBrownTown.cpp) was split into pieces to allocate memory into the ComputeForce, due to the location of this call we may get some memory error as a ComputeForce class isn't allocated until later on. config.copyToCUDA(); + GrandBrownTown brown(config, outArg, randomSeed, debug, imd_on, imd_port, replicas);