Skip to content
Snippets Groups Projects
Commit 7e2dc96f authored by cmaffeo2's avatar cmaffeo2
Browse files

Made GPU assignment more intuitive

parent 2284ac9c
No related branches found
No related tags found
No related merge requests found
...@@ -10,7 +10,7 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=t ...@@ -10,7 +10,7 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=t
int GPUManager::nGPUs = 0; int GPUManager::nGPUs = 0;
bool GPUManager::is_safe = true; bool GPUManager::is_safe = true;
std::vector<int> GPUManager::gpus, GPUManager::timeouts, GPUManager::notimeouts; std::vector<int> GPUManager::allGpus, GPUManager::gpus, GPUManager::timeouts, GPUManager::notimeouts;
std::vector<GPUPeer> GPUManager::peers; std::vector<GPUPeer> GPUManager::peers;
// Currently unused // Currently unused
...@@ -20,15 +20,10 @@ std::vector<cudaEvent_t> GPUManager::events; ...@@ -20,15 +20,10 @@ std::vector<cudaEvent_t> GPUManager::events;
void GPUManager::init() { void GPUManager::init() {
load_info(); load_info();
is_safe = true; is_safe = false;
gpus = notimeouts; gpus = allGpus;
// If every GPU times out, use them
if (gpus.size() == 0) { if (allGpus.size() == 0) {
printf("WARNING: Using GPUs that may time out\n");
is_safe = false;
gpus = timeouts;
}
if (gpus.size() == 0) {
fprintf(stderr, "Error: Did not find a GPU\n"); fprintf(stderr, "Error: Did not find a GPU\n");
exit(1); exit(1);
} }
...@@ -46,11 +41,11 @@ void GPUManager::load_info() { ...@@ -46,11 +41,11 @@ void GPUManager::load_info() {
// Print out properties // Print out properties
printf("[%d] %s ", dev, prop.name); printf("[%d] %s ", dev, prop.name);
allGpus.push_back(dev);
if (prop.kernelExecTimeoutEnabled) { if (prop.kernelExecTimeoutEnabled) {
printf("(may timeout) "); printf("(may timeout) ");
timeouts.push_back(dev);
} else { } else {
notimeouts.push_back(dev); notimeouts.push_back(dev);
} }
printf("| SM %d.%d, ", prop.major, prop.minor); printf("| SM %d.%d, ", prop.major, prop.minor);
...@@ -117,7 +112,16 @@ void GPUManager::safe(bool make_safe) { ...@@ -117,7 +112,16 @@ void GPUManager::safe(bool make_safe) {
gpus = notimeouts; gpus = notimeouts;
is_safe = true; is_safe = true;
} else { } else {
gpus.insert(gpus.end(), timeouts.begin(), timeouts.end()); gpus = allGpus;
is_safe = false; is_safe = false;
} }
} }
// getInitialGPU
// Walks `gpus` from the front and stops at the first entry whose device
// properties do not have kernelExecTimeoutEnabled set.
// @return the position of that entry within `gpus` (the index, not the
//         device number stored there); 0 if no such entry exists
int GPUManager::getInitialGPU() {
    // TODO: check the load on the gpus and select an unused one
    unsigned int slot = 0;
    while (slot < gpus.size()) {
        const int device = gpus[slot];
        if (properties[device].kernelExecTimeoutEnabled) {
            // This device may time out; keep looking.
            ++slot;
            continue;
        }
        return slot;
    }
    // Nothing without a timeout was found: fall back to the first slot.
    return 0;
}
...@@ -23,6 +23,7 @@ private: ...@@ -23,6 +23,7 @@ private:
static bool is_safe; static bool is_safe;
public: public:
static std::vector<int> allGpus;
static std::vector<int> gpus; static std::vector<int> gpus;
static std::vector<cudaDeviceProp> properties; static std::vector<cudaDeviceProp> properties;
...@@ -38,14 +39,16 @@ public: ...@@ -38,14 +39,16 @@ public:
// set // set
// Set the GPU // Set the GPU
static void set(int gpu_id); static void set(int gpu_id);
// current
// @return the current GPU a thread is using
static int current();
// safe // safe
// @param whether gpus should contain GPUs that may timeout // @param whether gpus should contain GPUs that may timeout
static void safe(bool make_safe); static void safe(bool make_safe);
// current static int getInitialGPU();
// @return the current GPU a thread is using
static int current();
// Currently unused // Currently unused
static std::vector<GPUPeer> peers; static std::vector<GPUPeer> peers;
......
...@@ -23,8 +23,8 @@ int main(int argc, char* argv[]) { ...@@ -23,8 +23,8 @@ int main(int argc, char* argv[]) {
printf(" -g, --gpu= Index of gpu to use (defaults to 0)\n"); printf(" -g, --gpu= Index of gpu to use (defaults to 0)\n");
printf(" -i, --imd= IMD port (defaults to %d)\n", kIMDPort); printf(" -i, --imd= IMD port (defaults to %d)\n", kIMDPort);
printf(" -d, --debug Debug mode: allows user to choose which forces are computed\n"); printf(" -d, --debug Debug mode: allows user to choose which forces are computed\n");
printf(" --safe Do not use GPUs that may timeout (default)\n"); printf(" --safe Do not use GPUs that may timeout\n");
printf(" --unsafe Use GPUs that may timeout\n"); printf(" --unsafe Use GPUs that may timeout (default)\n");
printf(" -h, --help Display this help and exit\n"); printf(" -h, --help Display this help and exit\n");
printf(" --info Output CPU and GPU information and exit\n"); printf(" --info Output CPU and GPU information and exit\n");
printf(" --version Output version information and exit\n"); printf(" --version Output version information and exit\n");
...@@ -43,11 +43,17 @@ int main(int argc, char* argv[]) { ...@@ -43,11 +43,17 @@ int main(int argc, char* argv[]) {
printf("Try '%s --help' for more information.\n", argv[0]); printf("Try '%s --help' for more information.\n", argv[0]);
return 1; return 1;
} }
size_t n_gpus = max(GPUManager::gpus.size(), 1lu); // printf("Everything's great when you're...BrownTown\n");
printf(" –––––––––––––––––––––––––––––––––––––––––––––\n");
printf(" | Atomic Resolution Brownian Dynamics |\n");
printf(" –––––––––––––––––––––––––––––––––––––––––––––\n\n");
GPUManager::init();
int gpuID = 0; size_t n_gpus = max(GPUManager::gpus.size(), 1lu);
int gpuID = -1;
bool debug = false, safe = true; bool debug = false, safe = false;
int replicas = 1; int replicas = 1;
unsigned int imd_port = 0; unsigned int imd_port = 0;
bool imd_on = false; bool imd_on = false;
...@@ -66,17 +72,18 @@ int main(int argc, char* argv[]) { ...@@ -66,17 +72,18 @@ int main(int argc, char* argv[]) {
} else if (strcmp(arg, "-g") == 0 || strcmp(arg, "--gpu") == 0) { } else if (strcmp(arg, "-g") == 0 || strcmp(arg, "--gpu") == 0) {
unsigned int arg_val = atoi(argv[pos + 1]); unsigned int arg_val = atoi(argv[pos + 1]);
safe = false;
gpuID = arg_val; gpuID = arg_val;
num_flags += 2; num_flags += 2;
if (arg_val < 0 || arg_val > n_gpus) { if (arg_val < 0 || arg_val > n_gpus) {
printf("Invalid argument given to %s\n", arg); printf("ERROR: Invalid argument given to %s\n", arg);
return 1; return 1;
} }
} else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--replicas") == 0) { } else if (strcmp(arg, "-r") == 0 || strcmp(arg, "--replicas") == 0) {
int arg_val = atoi(argv[pos + 1]); int arg_val = atoi(argv[pos + 1]);
if (arg_val <= 0) { if (arg_val <= 0) {
printf("Invalid argument given to %s\n", arg); printf("ERROR: Invalid argument given to %s\n", arg);
return 1; return 1;
} }
replicas = arg_val; replicas = arg_val;
...@@ -111,13 +118,14 @@ int main(int argc, char* argv[]) { ...@@ -111,13 +118,14 @@ int main(int argc, char* argv[]) {
configFile = argv[argc - 2]; configFile = argv[argc - 2];
outArg = argv[argc - 1]; outArg = argv[argc - 1];
} }
printf("Everything's great when you're...BrownTown\n");
GPUManager::init();
GPUManager::safe(safe); GPUManager::safe(safe);
if (gpuID == -1)
gpuID = GPUManager::getInitialGPU();
Configuration config(configFile, replicas, debug); Configuration config(configFile, replicas, debug);
// GPUManager::set(0); // GPUManager::set(0);
printf("Setting gpuID to %d\n",gpuID);
GPUManager::set(gpuID); GPUManager::set(gpuID);
//MLog: this copyToCUDA function (along with the one in GrandBrownTown.cpp) was split into pieces to allocate memory into the ComputeForce, due to the location of this call we may get some memory error as a ComputeForce class isn't allocated until later on. //MLog: this copyToCUDA function (along with the one in GrandBrownTown.cpp) was split into pieces to allocate memory into the ComputeForce, due to the location of this call we may get some memory error as a ComputeForce class isn't allocated until later on.
config.copyToCUDA(); config.copyToCUDA();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment