From 98b2a95f81f120911d7b4d0bc9fc83a0aaf9a55e Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Wed, 6 Jan 2021 11:35:51 -0600
Subject: [PATCH] Macros to make NCCL optional

---
 src/ComputeForce.cu   |  2 ++
 src/GPUManager.cpp    |  7 +++++--
 src/GPUManager.h      | 11 +++++++----
 src/GrandBrownTown.cu | 10 ++++++++++
 src/Makefile          | 19 ++++++++++++-------
 src/arbd.cpp          |  7 +++++++
 6 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 5379e31..ce8b6ed 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -581,6 +581,7 @@ void ComputeForce::decompose() {
     gpuKernelCheck();
     gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
 
+    #ifdef USE_NCCL
     if (gpuman.gpus.size() > 1) {
         // Currently we don't use numPairs_d[i] for i > 0... might be able to reduce data transfer with some kind nccl scatter, and in that case we'd prefer to use all numPairs_d[i]
         gpuErrchk(cudaMemcpy(&numPairs, numPairs_d[0], sizeof(int), cudaMemcpyDeviceToHost));
@@ -588,6 +589,7 @@ void ComputeForce::decompose() {
         gpuman.nccl_broadcast(0, pairLists_d, pairLists_d, numPairs, -1);
     }
     gpuman.sync();
+    #endif
 
     //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
     //        pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
diff --git a/src/GPUManager.cpp b/src/GPUManager.cpp
index df56a55..0a7d89c 100644
--- a/src/GPUManager.cpp
+++ b/src/GPUManager.cpp
@@ -15,8 +15,6 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=t
 int GPUManager::nGPUs = 0;
 bool GPUManager::is_safe = true;
 std::vector<GPU> GPUManager::allGpus, GPUManager::gpus, GPUManager::notimeouts;
-ncclComm_t* GPUManager::comms = NULL;
-
 
 GPU::GPU(unsigned int id) : id(id) {
     cudaSetDevice(id);
@@ -120,7 +118,9 @@ void GPUManager::select_gpus(std::vector<unsigned int>& gpu_ids) {
         gpus.push_back( allGpus[*it] );
     }
     init_devices();
+    #ifdef USE_NCCL
     init_comms();
+    #endif
 }
 
 void GPUManager::use(int gpu_id) {
@@ -169,6 +169,8 @@ int GPUManager::getInitialGPU() {
     return 0;
 }
 
+#ifdef USE_NCCL
+ncclComm_t* GPUManager::comms = NULL;
 void GPUManager::init_comms() {
     if (gpus.size() == 1) return;
     int* gpu_ids = new int[gpus.size()];
@@ -181,3 +183,4 @@ void GPUManager::init_comms() {
     }
     NCCLCHECK(ncclCommInitAll(comms, gpus.size(), gpu_ids));
 }
+#endif
diff --git a/src/GPUManager.h b/src/GPUManager.h
index 55b6d63..5847c03 100644
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -8,7 +8,7 @@
 
 #include "useful.h"
 
-// #ifdef USE_NCCL
+#ifdef USE_NCCL
 #include <nccl.h>
 #define NCCLCHECK(cmd) do {                             \
     ncclResult_t r = cmd;                               \
@@ -18,7 +18,7 @@
         exit(EXIT_FAILURE);                             \
     }                                                   \
 } while(0)
-// #endif
+#endif
 
 #ifndef gpuErrchk
 #define delgpuErrchk
@@ -79,12 +79,13 @@ private:
     static int nGPUs;
     static bool is_safe;
 
-    // NCCL
+    #ifdef USE_NCCL
     static void init_comms();
+    static ncclComm_t* comms;
+    #endif
 
 public:
     static size_t allGpuSize() { return allGpus.size(); }
-    static ncclComm_t* comms;
     static std::vector<GPU> gpus;
 
     static bool safe() { return is_safe; }
@@ -130,6 +131,7 @@ public:
         return gpus[0].get_next_stream();
     };
 
+    #ifdef USE_NCCL
     template<typename T>
     void nccl_broadcast(int root, std::vector<T*> send_d, std::vector<T*> recv_d, unsigned int size, int stream_id) {
         if (gpus.size() == 1) return;
@@ -168,6 +170,7 @@
         }
         NCCLCHECK(ncclGroupEnd());
     }
+    #endif // USE_NCCL
 };
 
 #ifndef delgpuErrchk
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 9a21faf..5b87c3a 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -528,8 +528,10 @@ void GrandBrownTown::RunNoseHooverLangevin()
     timer0 = wkf_timer_create();
     timerS = wkf_timer_create();
 
+    #ifdef USE_NCCL
     cudaStream_t* nccl_broadcast_streams = new cudaStream_t[gpuman.gpus.size()];
     for (int i=0; i< gpuman.gpus.size(); ++i) nccl_broadcast_streams[i] = 0;
+    #endif
 
     copyToCUDA();
 
@@ -618,9 +620,11 @@ void GrandBrownTown::RunNoseHooverLangevin()
         internal->clear_force();
         internal->clear_energy();
         const std::vector<Vector3*>& _pos = internal->getPos_d();
+        #ifdef USE_NCCL
         if (gpuman.gpus.size() > 1) {
             gpuman.nccl_broadcast(0, _pos, _pos, num*numReplicas, -1);
         }
+        #endif
         gpuman.sync();
 
 #ifdef _OPENMP
@@ -708,10 +712,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     RBC[i]->AddLangevin();
                 }
             }
+            #ifdef USE_NCCL
             if (gpuman.gpus.size() > 1) {
                 const std::vector<Vector3*>& _f = internal->getForceInternal_d();
                 gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
             }
+            #endif
 
         }//if step == 1
 
@@ -842,11 +848,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
                 internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD // TODO add multigpu support with IMD
             else {
                 internal->clear_force();
+                #ifdef USE_NCCL
                 if (gpuman.gpus.size() > 1) {
                     const std::vector<Vector3*>& _p = internal->getPos_d();
                     nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream();
                     gpuman.nccl_broadcast(0, _p, _p, num*numReplicas, nccl_broadcast_streams);
                 }
+                #endif
             }
 
             if (interparticleForce)
@@ -868,10 +876,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                         RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
                 }
                 internal -> computeTabulated(get_energy);
+                #ifdef USE_NCCL
                 if (gpuman.gpus.size() > 1) {
                     const std::vector<Vector3*>& _f = internal->getForceInternal_d();
                     gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
                 }
+                #endif
                 break;
             default: // [ N^2 ] interactions, no cutoff | decompositions
                 internal->computeTabulatedFull(get_energy);
diff --git a/src/Makefile b/src/Makefile
index ce3114e..8d4b024 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,15 +28,14 @@ endif
 endif
 
 CC_FLAGS += -DVERSION="\"$(VERSION)\"" -DSIGNAL
-CC_FLAGS += -I$(CUDA_PATH)/include -I$(NCCL_PATH)/include
-CC_FLAGS += -Wall -Wno-write-strings -pedantic# TODO: test on Mac OSX and other architectures
+CC_FLAGS += -I$(CUDA_PATH)/include
+CC_FLAGS += -Wall -Wno-write-strings -std=c++14 -pedantic# TODO: test on Mac OSX and other architectures
 
 ifeq ($(dbg),1)
 NV_FLAGS += -lineinfo
 else
 NV_FLAGS += -lineinfo
 endif
-NV_FLAGS += -I$(NCCL_PATH)/include
 
 CUDA_VERSION_GT10 = $(shell expr `nvcc -V | tr 'V' ' ' | tr ' ' '\n' | tail -n1 | cut -f1 -d.` \> 10)
 ifeq "$(CUDA_VERSION_GT10)" "1"
@@ -47,15 +46,14 @@ endif
 NV_FLAGS := $(NV_FLAGS) -std=$(CSTD)
 
 ifneq ($(DARWIN),)
-	LIBRARY = $(CUDA_PATH)/lib
+	CUDALIB = $(CUDA_PATH)/lib
 else
-	LIBRARY = $(CUDA_PATH)/lib64
+	CUDALIB = $(CUDA_PATH)/lib64
 	ifeq "$(CSTD)" "c++11"
 		CSTD := c++1y
 	endif
 endif
 CC_FLAGS += -std=$(CSTD)
-# LIBRARY := $(LIBRARY) $(NCCL_PATH)/lib
 
 # NV_FLAGS += -ftz=true # TODO: test if this preserves accurate simulation
 
 ## Find valid compute capabilities for this machine
@@ -75,7 +73,14 @@ $(foreach SM,$(SMS), $(eval NV_FLAGS += -gencode arch=compute_$(SM),code=sm_$(SM
 $(foreach SM,$(SMPTXS), $(eval NV_FLAGS += -gencode arch=compute_$(SM),code=compute_$(SM)) )
 NVLD_FLAGS := $(NV_FLAGS) --device-link
 
-LD_FLAGS = -L$(LIBRARY) -L$(NCCL_PATH)/lib -lnccl -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY):$(NCCL_PATH)/lib
+LD_FLAGS = -L$(CUDALIB) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(CUDALIB)
+
+ifdef NCCL_PATH
+	CC_FLAGS += -DUSE_NCCL
+	CC_FLAGS += -I$(NCCL_PATH)/include
+	NV_FLAGS += -I$(NCCL_PATH)/include
+	LD_FLAGS += -L$(NCCL_PATH)/lib -lnccl -Wl,-rpath,$(NCCL_PATH)/lib
+endif
 
 ifdef USE_BOOST
 CC_FLAGS += -DUSE_BOOST
diff --git a/src/arbd.cpp b/src/arbd.cpp
index e326d49..9c8a83c 100644
--- a/src/arbd.cpp
+++ b/src/arbd.cpp
@@ -144,6 +144,13 @@ int main(int argc, char* argv[]) {
     if (gpuIDs.size() == 0)
         gpuIDs.push_back( GPUManager::getInitialGPU() );
 
+    #ifndef USE_NCCL
+    if (gpuIDs.size() > 1) {
+        printf("ERROR: more than one GPU requires compilation with USE_NCCL flag\n");
+        return 1;
+    }
+    #endif
+
     GPUManager::select_gpus(gpuIDs);
 
     Configuration config(configFile, replicas, debug);
-- 
GitLab
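
Usage sketch (not part of the patch): with the Makefile hunk above, NCCL support is compiled in only when NCCL_PATH is defined, which adds -DUSE_NCCL plus the NCCL include and link flags; without it, arbd.cpp now rejects runs requesting more than one GPU. The invocations below assume the Makefile sits in src/ and an NCCL install at an illustrative path.

    # Build without NCCL: USE_NCCL is never defined, so only single-GPU runs are allowed
    make -C src
    # Build with NCCL: defining NCCL_PATH on the command line enables the ifdef NCCL_PATH block
    make -C src NCCL_PATH=/usr/local/nccl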