From 98b2a95f81f120911d7b4d0bc9fc83a0aaf9a55e Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Wed, 6 Jan 2021 11:35:51 -0600
Subject: [PATCH] Macros to make NCCL optional

---
 src/ComputeForce.cu   |  2 ++
 src/GPUManager.cpp    |  7 +++++--
 src/GPUManager.h      | 11 +++++++----
 src/GrandBrownTown.cu | 10 ++++++++++
 src/Makefile          | 19 ++++++++++++-------
 src/arbd.cpp          |  7 +++++++
 6 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/src/ComputeForce.cu b/src/ComputeForce.cu
index 5379e31..ce8b6ed 100644
--- a/src/ComputeForce.cu
+++ b/src/ComputeForce.cu
@@ -581,6 +581,7 @@ void ComputeForce::decompose() {
       gpuKernelCheck();
       gpuErrchk(cudaDeviceSynchronize()); /* RBTODO: sync needed here? */
 
+      #ifdef USE_NCCL
       if (gpuman.gpus.size() > 1) {
 	  // Currently we don't use numPairs_d[i] for i > 0... might be able to reduce data transfer with some kind nccl scatter, and in that case we'd prefer to use all numPairs_d[i]
 	  gpuErrchk(cudaMemcpy(&numPairs, numPairs_d[0], sizeof(int), cudaMemcpyDeviceToHost));
@@ -588,6 +589,7 @@ void ComputeForce::decompose() {
 	  gpuman.nccl_broadcast(0, pairLists_d, pairLists_d, numPairs, -1);
       }
       gpuman.sync();
+      #endif
 
     //createPairlists<64,64><<< dim3(256,128,numReplicas),dim3(64,1,1)>>>(pos_d[0], num, numReplicas, sys_d[0], decomp_d, nCells, numPairs_d[0],
     //                                                                  pairLists_d[0], numParts, type_d, pairTabPotType_d[0], excludes_d,
diff --git a/src/GPUManager.cpp b/src/GPUManager.cpp
index df56a55..0a7d89c 100644
--- a/src/GPUManager.cpp
+++ b/src/GPUManager.cpp
@@ -15,8 +15,6 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=t
 int GPUManager::nGPUs = 0;
 bool GPUManager::is_safe = true;
 std::vector<GPU> GPUManager::allGpus, GPUManager::gpus, GPUManager::notimeouts;
-ncclComm_t* GPUManager::comms = NULL;
-
 
 GPU::GPU(unsigned int id) : id(id) {
     cudaSetDevice(id);
@@ -120,7 +118,9 @@ void GPUManager::select_gpus(std::vector<unsigned int>& gpu_ids) {
 	gpus.push_back( allGpus[*it] );
     }
     init_devices();
+    #ifdef USE_NCCL
     init_comms();
+    #endif
 }
 
 void GPUManager::use(int gpu_id) {
@@ -169,6 +169,8 @@ int GPUManager::getInitialGPU() {
     return 0;
 }
 
+#ifdef USE_NCCL
+ncclComm_t* GPUManager::comms = NULL;
 void GPUManager::init_comms() {
     if (gpus.size() == 1) return;
     int* gpu_ids = new int[gpus.size()];
@@ -181,3 +183,4 @@ void GPUManager::init_comms() {
     }
     NCCLCHECK(ncclCommInitAll(comms, gpus.size(), gpu_ids));
 }
+#endif
diff --git a/src/GPUManager.h b/src/GPUManager.h
index 55b6d63..5847c03 100644
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -8,7 +8,7 @@
 
 #include "useful.h"
 
-// #ifdef USE_NCCL
+#ifdef USE_NCCL
 #include <nccl.h>
 #define NCCLCHECK(cmd) do {					\
       ncclResult_t r = cmd;					\
@@ -18,7 +18,7 @@
 	  exit(EXIT_FAILURE);					\
       }								\
   } while(0)
-// #endif
+#endif
 
 #ifndef gpuErrchk
 #define delgpuErrchk
@@ -79,12 +79,13 @@ private:
 	static int nGPUs;
 	static bool is_safe;
 
-	// NCCL
+	#ifdef USE_NCCL
 	static void init_comms();
+	static ncclComm_t* comms;
+	#endif
 
 public:	
 	static size_t allGpuSize() { return allGpus.size(); }
-	static ncclComm_t* comms;
 	static std::vector<GPU> gpus;
 	
 	static bool safe() { return is_safe; }
@@ -130,6 +131,7 @@ public:
 	return gpus[0].get_next_stream();
     };
 
+    #ifdef USE_NCCL
     template<typename T>
     void nccl_broadcast(int root, std::vector<T*> send_d, std::vector<T*> recv_d, unsigned int size, int stream_id) {
 	if (gpus.size() == 1) return;
@@ -168,6 +170,7 @@ public:
 	}
 	NCCLCHECK(ncclGroupEnd());
     }
+    #endif // USE_NCCL
     
 };
 #ifndef delgpuErrchk
diff --git a/src/GrandBrownTown.cu b/src/GrandBrownTown.cu
index 9a21faf..5b87c3a 100644
--- a/src/GrandBrownTown.cu
+++ b/src/GrandBrownTown.cu
@@ -528,8 +528,10 @@ void GrandBrownTown::RunNoseHooverLangevin()
     timer0 = wkf_timer_create();
     timerS = wkf_timer_create();
 
+    #ifdef USE_NCCL
     cudaStream_t* nccl_broadcast_streams = new cudaStream_t[gpuman.gpus.size()];
     for (int i=0; i< gpuman.gpus.size(); ++i) nccl_broadcast_streams[i] = 0;
+    #endif
 
     copyToCUDA();
 
@@ -618,9 +620,11 @@ void GrandBrownTown::RunNoseHooverLangevin()
 	    internal->clear_force();
 	    internal->clear_energy();
 	    const std::vector<Vector3*>& _pos = internal->getPos_d();
+	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
 		gpuman.nccl_broadcast(0, _pos, _pos, num*numReplicas, -1);
 	    }
+	    #endif
 	    gpuman.sync();
 
             #ifdef _OPENMP
@@ -708,10 +712,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                     RBC[i]->AddLangevin();
                 }
             }
+	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _f = internal->getForceInternal_d();
 		gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
 	    }
+	    #endif
 
         }//if step == 1
 
@@ -842,11 +848,13 @@ void GrandBrownTown::RunNoseHooverLangevin()
             internal->setForceInternalOnDevice(imdForces); // TODO ensure replicas are mutually exclusive with IMD // TODO add multigpu support with IMD
 	else {
             internal->clear_force();
+	    #ifdef USE_NCCL
 	    if (gpuman.gpus.size() > 1) {
 		const std::vector<Vector3*>& _p = internal->getPos_d();
 		nccl_broadcast_streams[0] = gpuman.gpus[0].get_next_stream();
 		gpuman.nccl_broadcast(0, _p, _p, num*numReplicas, nccl_broadcast_streams);
 	    }
+	    #endif
     	}
 
         if (interparticleForce)
@@ -868,10 +876,12 @@ void GrandBrownTown::RunNoseHooverLangevin()
                                 RBC[i]->updateParticleLists( (internal->getPos_d()[0])+i*num, sys_d);
                         }
                         internal -> computeTabulated(get_energy);
+			#ifdef USE_NCCL
 			if (gpuman.gpus.size() > 1) {
 			    const std::vector<Vector3*>& _f = internal->getForceInternal_d();
 			    gpuman.nccl_reduce(0, _f, _f, num*numReplicas, -1);
 			}
+			#endif
                         break;
                     default: // [ N^2 ] interactions, no cutoff | decompositions
                         internal->computeTabulatedFull(get_energy);
diff --git a/src/Makefile b/src/Makefile
index ce3114e..8d4b024 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,15 +28,14 @@ endif
 endif
 
 CC_FLAGS += -DVERSION="\"$(VERSION)\"" -DSIGNAL
-CC_FLAGS += -I$(CUDA_PATH)/include -I$(NCCL_PATH)/include
-CC_FLAGS += -Wall -Wno-write-strings -pedantic# TODO: test on Mac OSX and other architectures
+CC_FLAGS += -I$(CUDA_PATH)/include
+CC_FLAGS += -Wall -Wno-write-strings -std=c++14 -pedantic# TODO: test on Mac OSX and other architectures
 
 ifeq ($(dbg),1)
 NV_FLAGS += -lineinfo
 else
 NV_FLAGS += -lineinfo
 endif
-NV_FLAGS += -I$(NCCL_PATH)/include
 
 CUDA_VERSION_GT10 = $(shell expr `nvcc -V | tr 'V' ' ' | tr ' ' '\n' | tail -n1 | cut -f1 -d.` \> 10)
 ifeq "$(CUDA_VERSION_GT10)" "1"
@@ -47,15 +46,14 @@ endif
 NV_FLAGS := $(NV_FLAGS) -std=$(CSTD)
 
 ifneq ($(DARWIN),)
-  LIBRARY = $(CUDA_PATH)/lib
+  CUDALIB = $(CUDA_PATH)/lib
 else
-  LIBRARY = $(CUDA_PATH)/lib64
+  CUDALIB = $(CUDA_PATH)/lib64
   ifeq "$(CSTD)" "c++11"
     CSTD := c++1y
   endif
 endif
 CC_FLAGS += -std=$(CSTD)
-# LIBRARY := $(LIBRARY) $(NCCL_PATH)/lib
 # NV_FLAGS += -ftz=true			# TODO: test if this preserves accurate simulation
 
 ## Find valid compute capabilities for this machine
@@ -75,7 +73,14 @@ $(foreach SM,$(SMS), $(eval NV_FLAGS += -gencode arch=compute_$(SM),code=sm_$(SM
 $(foreach SM,$(SMPTXS), $(eval NV_FLAGS += -gencode arch=compute_$(SM),code=compute_$(SM)) )
 
 NVLD_FLAGS := $(NV_FLAGS) --device-link 
-LD_FLAGS = -L$(LIBRARY) -L$(NCCL_PATH)/lib -lnccl -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(LIBRARY):$(NCCL_PATH)/lib
+LD_FLAGS = -L$(CUDALIB) -lcurand -lcudart -lcudadevrt -Wl,-rpath,$(CUDALIB)
+
+ifdef NCCL_PATH
+	CC_FLAGS += -DUSE_NCCL
+	CC_FLAGS += -I$(NCCL_PATH)/include
+	NV_FLAGS += -I$(NCCL_PATH)/include
+	LD_FLAGS += -L$(NCCL_PATH)/lib -lnccl -Wl,-rpath,$(NCCL_PATH)/lib
+endif
 
 ifdef USE_BOOST
 	CC_FLAGS += -DUSE_BOOST
diff --git a/src/arbd.cpp b/src/arbd.cpp
index e326d49..9c8a83c 100644
--- a/src/arbd.cpp
+++ b/src/arbd.cpp
@@ -144,6 +144,13 @@ int main(int argc, char* argv[]) {
 	if (gpuIDs.size() == 0)
 	    gpuIDs.push_back( GPUManager::getInitialGPU() );
 
+	#ifndef USE_NCCL
+	if (gpuIDs.size() > 1) {
+	    printf("ERROR: more than one GPU requires compilation with USE_NCCL flag\n");
+	    return 1;
+	}
+	#endif
+
 	GPUManager::select_gpus(gpuIDs);
 
 	Configuration config(configFile, replicas, debug);
-- 
GitLab