From 7e2dc96fa99f35bc04e2af99d9dd84c025c3c6ab Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 7 Mar 2017 16:21:49 -0600
Subject: [PATCH] Made GPU assignment more intuitive

---
 src/GPUManager.cpp | 32 ++++++++++++++++++--------------
 src/GPUManager.h   |  9 ++++++---
 src/arbd.cpp       | 28 ++++++++++++++++++----------
 3 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/src/GPUManager.cpp b/src/GPUManager.cpp
index 4061674..665b2dc 100644
--- a/src/GPUManager.cpp
+++ b/src/GPUManager.cpp
@@ -10,7 +10,7 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=t
 
 int GPUManager::nGPUs = 0;
 bool GPUManager::is_safe = true;
-std::vector<int> GPUManager::gpus, GPUManager::timeouts, GPUManager::notimeouts;
+std::vector<int> GPUManager::allGpus, GPUManager::gpus, GPUManager::timeouts, GPUManager::notimeouts;
 std::vector<GPUPeer> GPUManager::peers;
 
 // Currently unused
@@ -20,15 +20,10 @@ std::vector<cudaEvent_t> GPUManager::events;
 
 void GPUManager::init() {
 	load_info();
-	is_safe = true;
-	gpus = notimeouts;
-	// If every GPU times out, use them
-	if (gpus.size() == 0) {
-		printf("WARNING: Using GPUs that may time out\n");
-		is_safe = false;
-		gpus = timeouts;
-	}
-	if (gpus.size() == 0) {
+	is_safe = false;
+	gpus = allGpus;
+
+	if (allGpus.size() == 0) {
 	    fprintf(stderr, "Error: Did not find a GPU\n");
 	    exit(1);
 	}
@@ -46,11 +41,11 @@ void GPUManager::load_info() {
 
 		// Print out properties
 		printf("[%d] %s ", dev, prop.name);
+		allGpus.push_back(dev);
 		if (prop.kernelExecTimeoutEnabled) {
 			printf("(may timeout) ");
-			timeouts.push_back(dev);
 		} else {
-			notimeouts.push_back(dev);
+		    notimeouts.push_back(dev);
 		}
 
 		printf("| SM %d.%d, ", prop.major, prop.minor);
@@ -117,7 +112,16 @@ void GPUManager::safe(bool make_safe) {
 		gpus = notimeouts;
 		is_safe = true;
 	} else {
-		gpus.insert(gpus.end(), timeouts.begin(), timeouts.end());
-		is_safe = false;
+	    gpus = allGpus;
+	    is_safe = false;
 	}
 }
+
+int GPUManager::getInitialGPU() { // @return a position in `gpus` (the loop index, not the device number stored there)
+    // TODO: check the load on the gpus and select an unused one
+    for (uint i = 0; i < gpus.size(); ++i) {
+	if (!properties[gpus[i]].kernelExecTimeoutEnabled) // pick the first listed GPU whose kernelExecTimeoutEnabled flag is off
+	    return i; 
+    }
+    return 0; // no listed GPU has the flag off (or gpus is empty): return index 0
+}
diff --git a/src/GPUManager.h b/src/GPUManager.h
index bf61053..1efeaf3 100644
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -23,6 +23,7 @@ private:
 	static bool is_safe;
 
 public:	
+	static std::vector<int> allGpus;
 	static std::vector<int> gpus;
 	static std::vector<cudaDeviceProp> properties;
 	
@@ -38,14 +39,16 @@ public:
 	// set
 	// Set the GPU
 	static void set(int gpu_id);
+
+	// current
+	// @return the current GPU a thread is using
+	static int current();
 	
 	// safe
 	// @param whether gpus should contain GPUs that may timeout
 	static void safe(bool make_safe);
 	
-	// current
-	// @return the current GPU a thread is using
-	static int current();
+	static int getInitialGPU();
 	
 	// Currently unused
 	static std::vector<GPUPeer> peers;
diff --git a/src/arbd.cpp b/src/arbd.cpp
index 78ae60c..98c93e2 100644
--- a/src/arbd.cpp
+++ b/src/arbd.cpp
@@ -23,8 +23,8 @@ int main(int argc, char* argv[]) {
 		printf("  -g, --gpu=         Index of gpu to use (defaults to 0)\n");
 		printf("  -i, --imd=         IMD port (defaults to %d)\n", kIMDPort);
 		printf("  -d, --debug        Debug mode: allows user to choose which forces are computed\n");
-		printf("  --safe             Do not use GPUs that may timeout (default)\n");
-		printf("  --unsafe           Use GPUs that may timeout\n");
+		printf("  --safe             Do not use GPUs that may timeout\n");
+		printf("  --unsafe           Use GPUs that may timeout (default)\n");
 		printf("  -h, --help         Display this help and exit\n");
 		printf("  --info             Output CPU and GPU information and exit\n");
 		printf("  --version          Output version information and exit\n");
@@ -43,11 +43,17 @@ int main(int argc, char* argv[]) {
     printf("Try '%s --help' for more information.\n", argv[0]);
     return 1;
   }
-	size_t n_gpus = max(GPUManager::gpus.size(), 1lu);
+	// printf("Everything's great when you're...BrownTown\n");
+	printf("  â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“\n");
+	printf("  |    Atomic Resolution Brownian Dynamics    |\n");
+	printf("  â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“â€“\n\n");
+
+	GPUManager::init();
 
-	int gpuID = 0;
+	size_t n_gpus = max(GPUManager::gpus.size(), 1lu);
+	int gpuID = -1;
 	
-	bool debug = false, safe = true;
+	bool debug = false, safe = false;
 	int replicas = 1;
 	unsigned int imd_port = 0;
 	bool imd_on = false;
@@ -66,17 +72,18 @@ int main(int argc, char* argv[]) {
 
 		} else if (strcmp(arg, "-g") == 0 || strcmp(arg, "--gpu") == 0) {
 			unsigned int arg_val = atoi(argv[pos + 1]);
+			safe = false;
 			gpuID = arg_val;
 			num_flags += 2;
 			if (arg_val < 0 || arg_val > n_gpus) {
-				printf("Invalid argument given to %s\n", arg);
+				printf("ERROR: Invalid argument given to %s\n", arg);
 				return 1;
 			}
 			
 		} else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--replicas") == 0) {
 			int arg_val = atoi(argv[pos + 1]);
 			if (arg_val <= 0) {
-				printf("Invalid argument given to %s\n", arg);
+				printf("ERROR: Invalid argument given to %s\n", arg);
 				return 1;
 			}
 			replicas = arg_val;
@@ -111,13 +118,14 @@ int main(int argc, char* argv[]) {
 		configFile = argv[argc - 2];
 		outArg = argv[argc - 1];
 	}
-	
-  printf("Everything's great when you're...BrownTown\n");
 
-	GPUManager::init();
 	GPUManager::safe(safe);
+	if (gpuID == -1)
+	    gpuID = GPUManager::getInitialGPU();
+
 	Configuration config(configFile, replicas, debug);
 	// GPUManager::set(0);
+	printf("Setting gpuID to %d\n",gpuID);
 	GPUManager::set(gpuID);
 	//MLog: this copyToCUDA function (along with the one in GrandBrownTown.cpp) was split into pieces to allocate memory into the ComputeForce, due to the location of this call we may get some memory error as a ComputeForce class isn't allocated until later on.
 	config.copyToCUDA();
-- 
GitLab