Made GPU assignment more intuitive

7e2dc96f · cmaffeo2 · 2284ac9c · 7e2dc96f · 7e2dc96f · 7e2dc96f
Commit 7e2dc96f authored 8 years ago by cmaffeo2
--- a/src/GPUManager.cpp
+++ b/src/GPUManager.cpp
@@ -10,7 +10,7 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=t
 int GPUManager::nGPUs = 0;
 bool GPUManager::is_safe = true;
-std::vector<int> GPUManager::gpus, GPUManager::timeouts, GPUManager::notimeouts;
+std::vector<int> GPUManager::allGpus, GPUManager::gpus, GPUManager::timeouts, GPUManager::notimeouts;
 std::vector<GPUPeer> GPUManager::peers;
 // Currently unused
@@ -20,15 +20,10 @@ std::vector<cudaEvent_t> GPUManager::events;
 void GPUManager::init() {
 	load_info();
-	is_safe = true;
+	is_safe = false;
-	gpus = notimeouts;
+	gpus = allGpus;
-	// If every GPU times out, use them
-	if (gpus.size() == 0) {
+	if (allGpus.size() == 0) {
-		printf("WARNING: Using GPUs that may time out\n");
-		is_safe = false;
-		gpus = timeouts;
-	}
-	if (gpus.size() == 0) {
 	    fprintf(stderr, "Error: Did not find a GPU\n");
 	    exit(1);
 	}
@@ -46,11 +41,11 @@ void GPUManager::load_info() {
 		// Print out properties
 		printf("[%d] %s ", dev, prop.name);
+		allGpus.push_back(dev);
 		if (prop.kernelExecTimeoutEnabled) {
 			printf("(may timeout) ");
-			timeouts.push_back(dev);
 		} else {
-			notimeouts.push_back(dev);
+		    notimeouts.push_back(dev);
 		}
 		printf("| SM %d.%d, ", prop.major, prop.minor);
@@ -117,7 +112,16 @@ void GPUManager::safe(bool make_safe) {
 		gpus = notimeouts;
 		is_safe = true;
 	} else {
-		gpus.insert(gpus.end(), timeouts.begin(), timeouts.end());
+	    gpus = allGpus;
-		is_safe = false;
+	    is_safe = false;
 	}
 }
+int GPUManager::getInitialGPU() {
+    // TODO: check the load on the gpus and select an unused one
+    for (uint i = 0; i < gpus.size(); ++i) {
+	if (!properties[gpus[i]].kernelExecTimeoutEnabled)
+	    return i; 
+    }
+    return 0;
+}
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -23,6 +23,7 @@ private:
 	static bool is_safe;
 public:	
+	static std::vector<int> allGpus;
 	static std::vector<int> gpus;
 	static std::vector<cudaDeviceProp> properties;
@@ -38,14 +39,16 @@ public:
 	// set
 	// Set the GPU
 	static void set(int gpu_id);
+	// current
+	// @return the current GPU a thread is using
+	static int current();
 	// safe
 	// @param whether gpus should contain GPUs that may timeout
 	static void safe(bool make_safe);
-	// current
+	static int getInitialGPU();
-	// @return the current GPU a thread is using
-	static int current();
 	// Currently unused
 	static std::vector<GPUPeer> peers;

--- a/src/arbd.cpp
+++ b/src/arbd.cpp
@@ -23,8 +23,8 @@ int main(int argc, char* argv[]) {
 		printf("  -g, --gpu=         Index of gpu to use (defaults to 0)\n");
 		printf("  -i, --imd=         IMD port (defaults to %d)\n", kIMDPort);
 		printf("  -d, --debug        Debug mode: allows user to choose which forces are computed\n");
-		printf("  --safe             Do not use GPUs that may timeout (default)\n");
+		printf("  --safe             Do not use GPUs that may timeout\n");
-		printf("  --unsafe           Use GPUs that may timeout\n");
+		printf("  --unsafe           Use GPUs that may timeout (default)\n");
 		printf("  -h, --help         Display this help and exit\n");
 		printf("  --info             Output CPU and GPU information and exit\n");
 		printf("  --version          Output version information and exit\n");
@@ -43,11 +43,17 @@ int main(int argc, char* argv[]) {
    printf("Try '%s --help' for more information.\n", argv[0]);
    return 1;
  }
-	size_t n_gpus = max(GPUManager::gpus.size(), 1lu);
+	// printf("Everything's great when you're...BrownTown\n");
+	printf("  –––––––––––––––––––––––––––––––––––––––––––––\n");
+	printf("  |    Atomic Resolution Brownian Dynamics    |\n");
+	printf("  –––––––––––––––––––––––––––––––––––––––––––––\n\n");
+	GPUManager::init();
-	int gpuID = 0;
+	size_t n_gpus = max(GPUManager::gpus.size(), 1lu);
+	int gpuID = -1;
-	bool debug = false, safe = true;
+	bool debug = false, safe = false;
 	int replicas = 1;
 	unsigned int imd_port = 0;
 	bool imd_on = false;
@@ -66,17 +72,18 @@ int main(int argc, char* argv[]) {
 		} else if (strcmp(arg, "-g") == 0 || strcmp(arg, "--gpu") == 0) {
 			unsigned int arg_val = atoi(argv[pos + 1]);
+			safe = false;
 			gpuID = arg_val;
 			num_flags += 2;
 			if (arg_val < 0 || arg_val > n_gpus) {
-				printf("Invalid argument given to %s\n", arg);
+				printf("ERROR: Invalid argument given to %s\n", arg);
 				return 1;
 			}
 		} else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--replicas") == 0) {
 			int arg_val = atoi(argv[pos + 1]);
 			if (arg_val <= 0) {
-				printf("Invalid argument given to %s\n", arg);
+				printf("ERROR: Invalid argument given to %s\n", arg);
 				return 1;
 			}
 			replicas = arg_val;
@@ -111,13 +118,14 @@ int main(int argc, char* argv[]) {
 		configFile = argv[argc - 2];
 		outArg = argv[argc - 1];
 	}
-  printf("Everything's great when you're...BrownTown\n");
-	GPUManager::init();
 	GPUManager::safe(safe);
+	if (gpuID == -1)
+	    gpuID = GPUManager::getInitialGPU();
 	Configuration config(configFile, replicas, debug);
 	// GPUManager::set(0);
+	printf("Setting gpuID to %d\n",gpuID);
 	GPUManager::set(gpuID);
 	//MLog: this copyToCUDA function (along with the one in GrandBrownTown.cpp) was split into pieces to allocate memory into the ComputeForce, due to the location of this call we may get some memory error as a ComputeForce class isn't allocated until later on.
 	config.copyToCUDA();