diff --git a/GrandBrownTown.cu b/GrandBrownTown.cu index aa062e642546343b647e0461c6b2d7df1e43c76b..899bc780f600917d69c3e765ee071ebd65ff7ddb 100644 --- a/GrandBrownTown.cu +++ b/GrandBrownTown.cu @@ -1,5 +1,7 @@ #include "GrandBrownTown.h" #include "GrandBrownTown.cuh" +#include "nvtxEvents.h" + /* #include "ComputeGridGrid.cuh" */ #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } @@ -313,7 +315,14 @@ void GrandBrownTown::run() { printf("Configuration: %d particles | %d replicas\n", num, numReplicas); - // Main loop over Brownian dynamics steps + +#pragma omp parallel sections + { + +/* # pragma omp single nowait */ + { + +// Main loop over Brownian dynamics steps for (long int s = 1; s < steps; s++) { // Compute the internal forces. Only calculate the energy when we are about to output. bool get_energy = ((s % outputEnergyPeriod) == 0); @@ -322,6 +331,10 @@ void GrandBrownTown::run() { // Set the timer rt_timer_start(cputimer); + + // compute forces + nvtxRangeId_t rid0 = nvtxStart("Compute 'atom' forces", 1); + // 'interparticleForce' - determines whether particles interact with each other if (interparticleForce) { @@ -345,8 +358,7 @@ void GrandBrownTown::run() { get_energy); break; default: // [ N^2 ] interactions, no cutoff | decompositions - energy = - internal->computeTabulatedFull(forceInternal_d, pos_d, type_d, + energy = internal->computeTabulatedFull(forceInternal_d, pos_d, type_d, bonds_d, bondMap_d, excludes_d, excludeMap_d, angles_d, dihedrals_d, @@ -401,6 +413,8 @@ void GrandBrownTown::run() { int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1); int tl = temperatureGrid.length(); + nvtxRangeEnd(rid0); + rid0 = nvtxStart("RigidBody and update", 1); RBC.updateForces(s); /* update RB forces before update particle positions... */ // Call the kernel to update the positions of each particle @@ -421,7 +435,11 @@ void GrandBrownTown::run() { /* computeGridGridForce<<< numBlocks, NUM_THREADS >>>(grid1_d, grid2_d); */ // int numBlocks = (numRB ) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1); - + + + nvtxRangeEnd(rid0); + rid0 = nvtxStart("I/O", 1); + Vector3 force0(0.0f); if (imd_on && clientsock && s % outputPeriod == 0) { @@ -460,37 +478,48 @@ void GrandBrownTown::run() { } } + bool posFetched = false; // Output trajectories (to files) if (s % outputPeriod == 0) { + + // wait for previous writes before copying new data +#pragma omp taskwait // Copy particle positions back to CPU - gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas, - cudaMemcpyDeviceToHost)); + posFetched=true; +#pragma omp task untied + { + nvtxRangeId_t rid = nvtxStart("Write output", 1); - // Nanoseconds computed - t = s*timestep; + gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas, + cudaMemcpyDeviceToHost)); + // Nanoseconds computed + t = s*timestep; - // Loop over all replicas - for (int repID = 0; repID < numReplicas; ++repID) { + // Loop over all replicas + for (int repID = 0; repID < numReplicas; ++repID) { - // *Analysis*: ionic current - if (currentSegmentZ <= 0.0f) writeCurrent(repID, t); - else writeCurrentSegment(repID, t, currentSegmentZ); + // *Analysis*: ionic current + if (currentSegmentZ <= 0.0f) writeCurrent(repID, t); + else writeCurrentSegment(repID, t, currentSegmentZ); - // - if (numberFluct == 1) updateNameList(); // no need for it here if particles stay the same - // TODO: doublecheck + // + if (numberFluct == 1) updateNameList(); // no need for it here if particles stay the same + // TODO: doublecheck - // Write the trajectory. - writers[repID]->append(pos + (repID*num), name, serial, t, num); - } + // Write the trajectory. + writers[repID]->append(pos + (repID*num), name, serial, t, num); + } - // Handle particle fluctuations. - // TODO: Currently, not compatible with replicas. Needs a fix. - if (numberFluct == 1) updateReservoirs(); + // Handle particle fluctuations. + // TODO: Currently, not compatible with replicas. Needs a fix. + if (numberFluct == 1) updateReservoirs(); - // Store the current positions. - // We must do this after particles have been added. - remember(t); + // Store the current positions. + // We must do this after particles have been added. + remember(t); + nvtxRangeEnd(rid); + } + } // s % outputPeriod == 0 @@ -500,7 +529,7 @@ void GrandBrownTown::run() { rt_timer_stop(timerS); // Copy back forces to display (internal only) - gpuErrchk(cudaMemcpy(&force0, forceInternal_d, sizeof(Vector3), cudaMemcpyDeviceToHost)); + // gpuErrchk(cudaMemcpy(&force0, forceInternal_d, sizeof(Vector3), cudaMemcpyDeviceToHost)); // Nanoseconds computed t = s * timestep; @@ -520,25 +549,31 @@ void GrandBrownTown::run() { t, energy, force0.x, force0.y, force0.z); */ - // Copy positions from GPU to CPU. - gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas, - cudaMemcpyDeviceToHost)); - // Write restart files for each replica. - for (int repID = 0; repID < numReplicas; ++repID) - writeRestart(repID); + // Copy positions from GPU to CPU. + #pragma omp taskwait + if (!posFetched) { + gpuErrchk(cudaMemcpy(pos, pos_d, sizeof(Vector3) * num * numReplicas, + cudaMemcpyDeviceToHost)); + } + #pragma omp task untied + { // Write restart files for each replica. + nvtxRangeId_t rid = nvtxStart("Write Restart", 1); + for (int repID = 0; repID < numReplicas; ++repID) + writeRestart(repID); + nvtxRangeEnd(rid); + } + // restart the timer rt_timer_start(timerS); - } // s % outputEnergyPeriod - - { - int numBlocks = (num * numReplicas) / NUM_THREADS + (num * numReplicas % NUM_THREADS == 0 ? 0 : 1); - clearInternalForces<<< numBlocks, NUM_THREADS >>>(forceInternal_d, num, numReplicas); } - + + nvtxRangeEnd(rid0); + } // done with all Brownian dynamics steps + } // If IMD is on & our socket is still open. if (imd_on and clientsock) { if (vmdsock_selread(clientsock, 0) == 1) { @@ -562,6 +597,7 @@ void GrandBrownTown::run() { } } } + } // Stop the main timer. rt_timer_stop(timer0); @@ -612,10 +648,9 @@ void GrandBrownTown::populate() { void GrandBrownTown::writeRestart(int repID) const { FILE* out = fopen(restartFiles[repID].c_str(), "w"); const int offset = repID * num; - for (int i = 0; i < num; ++i) { - const int ind = i + offset; - const Vector3& p = pos[ind]; - fprintf(out, "%d %.10g %.10g %.10g\n", type[ind], p.x, p.y, p.z); + for (int i = offset; i < offset+num; ++i) { + const Vector3& p = pos[i]; + fprintf(out, "%d %.10g %.10g %.10g\n", type[i], p.x, p.y, p.z); } fclose(out); } diff --git a/makefile b/makefile index 6ec127e3d6645931c09d1d86b2464603ea4e8870..5c7ecc0dd31b86145a66f325737fa7f28456ade1 100644 --- a/makefile +++ b/makefile @@ -13,9 +13,11 @@ INCLUDE = $(CUDA_PATH)/include # DEBUG = -g -O0 -CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic +CC_FLAGS = -Wall -Wno-write-strings -I$(INCLUDE) $(DEBUG) -std=c++0x -pedantic -fopenmp # NV_FLAGS = --maxrregcount 63 -Xptxas -v # -v,-abi=no -NV_FLAGS = -Xptxas -v # -v,-abi=no +NV_FLAGS = -Xptxas -v # -v,-abi=no +NV_FLAGS += -Xcompiler -fopenmp + ifneq ($(DEBUG),) NV_FLAGS += -g -G #debug EX_FLAGS = -O0 -m$(OS_SIZE) @@ -56,7 +58,7 @@ NV_FLAGS += -gencode arch=compute_$(SM),code=compute_$(SM) NVLD_FLAGS := $(NV_FLAGS) --device-link # NV_FLAGS += -rdc=true -LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY) +LD_FLAGS = -L$(LIBRARY) -lcurand -lcudart -lcudadevrt -lnvToolsExt -Wl,-rpath,$(LIBRARY) LD_FLAGS += -lcuda diff --git a/nvtxEvents.h b/nvtxEvents.h new file mode 100644 index 0000000000000000000000000000000000000000..5c9ffe5640c07968628745e696aa7e4615aed541 --- /dev/null +++ b/nvtxEvents.h @@ -0,0 +1,10 @@ +#pragma once + +#include <nvToolsExt.h> + +// http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvtx +nvtxRangeId_t nvtxStart(const char* name, int colorId) { + return nvtxRangeStartA(name); +} + +