Something went wrong on our end
GPUManager.cpp 4.93 KiB
#include "GPUManager.h"
#ifndef gpuErrchk
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort=true) {
if (code != cudaSuccess) {
fprintf(stderr,"CUDA Error: %s %s:%d\n", cudaGetErrorString(code), __FILE__, line);
if (abort) exit(code);
}
}
#endif
#define WITH_GPU(id,code) { int wg_curr; cudaGetDevice(&wg_curr); cudaSetDevice(id); code ; cudaSetDevice(wg_curr); }
int GPUManager::nGPUs = 0;
bool GPUManager::is_safe = true;
std::vector<GPU> GPUManager::allGpus, GPUManager::gpus, GPUManager::notimeouts;
GPU::GPU(unsigned int id) : id(id) {
cudaSetDevice(id);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION < 12000)
cudaGetDeviceProperties(&properties, id);
#else
cudaGetDeviceProperties_v2(&properties, id);
#endif
printf("[%d] %s ", id, properties.name);
if (properties.kernelExecTimeoutEnabled) {
printf("(may timeout) ");
may_timeout = true;
} else {
may_timeout = false;
}
printf("| SM %d.%d, ", properties.major, properties.minor);
printf("%.2fGHz, ", (float) properties.clockRate * 10E-7);
printf("%.1fGB RAM\n", (float) properties.totalGlobalMem * 7.45058e-10);
streams_created = false;
// fflush(stdout);
// gpuErrchk( cudaDeviceSynchronize() );
}
GPU::~GPU() {
destroy_streams();
}
void GPU::create_streams() {
int curr;
gpuErrchk( cudaGetDevice(&curr) );
gpuErrchk( cudaSetDevice(id) );
if (streams_created) destroy_streams();
last_stream = -1;
for (int i = 0; i < NUMSTREAMS; i++) {
// printf(" creating stream %d at %p\n", i, (void *) &streams[i]);
gpuErrchk( cudaStreamCreate( &streams[i] ) );
// gpuErrchk( cudaStreamCreateWithFlags( &(streams[i]) , cudaStreamNonBlocking ) );
}
streams_created = true;
gpuErrchk( cudaSetDevice(id) );
cudaSetDevice(curr);
}
void GPU::destroy_streams() {
int curr;
// printf("Destroying streams\n");
if (cudaGetDevice(&curr) == cudaSuccess) { // Avoid errors when program is shutting down
gpuErrchk( cudaSetDevice(id) );
if (streams_created) {
for (int i = 0; i < NUMSTREAMS; i++) {
// printf(" destroying stream %d at %p\n", i, (void *) &streams[i]);
gpuErrchk( cudaStreamDestroy( streams[i] ) );
}
}
gpuErrchk( cudaSetDevice(curr) );
}
streams_created = false;
}
void GPUManager::init() {
gpuErrchk(cudaGetDeviceCount(&nGPUs));
printf("Found %d GPU(s)\n", nGPUs);
for (int dev = 0; dev < nGPUs; dev++) {
GPU g(dev);
allGpus.push_back(g);
if (!g.may_timeout) notimeouts.push_back(g);
}
is_safe = false;
if (allGpus.size() == 0) {
fprintf(stderr, "Error: Did not find a GPU\n");
exit(1);
}
}
void GPUManager::load_info() {
init();
gpus = allGpus;
init_devices();
}
void GPUManager::init_devices() {
printf("Initializing devices... ");
for (unsigned int i = 0; i < gpus.size(); i++) {
if (i != gpus.size() - 1 && gpus.size() > 1)
printf("%d, ", gpus[i].id);
else if (gpus.size() > 1)
printf("and %d\n", gpus[i].id);
else
printf("%d\n", gpus[i].id);
use(i);
cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
gpus[i].create_streams();
}
use(0);
gpuErrchk( cudaDeviceSynchronize() );
}
void GPUManager::select_gpus(std::vector<unsigned int>& gpu_ids) {
gpus.clear();
for (auto it = gpu_ids.begin(); it != gpu_ids.end(); ++it) {
gpus.push_back( allGpus[*it] );
}
init_devices();
#ifdef USE_NCCL
init_comms();
#endif
}
void GPUManager::use(int gpu_id) {
gpu_id = gpu_id % (int) gpus.size();
// printf("Setting device to %d\n",gpus[gpu_id].id);
gpuErrchk( cudaSetDevice(gpus[gpu_id].id) );
// printf("Done setting device\n");
}
void GPUManager::sync(int gpu_id) {
WITH_GPU( gpus[gpu_id].id,
gpuErrchk( cudaDeviceSynchronize() ));
// int wg_curr;
// gpuErrchk( cudaGetDevice(&wg_curr) );
// gpuErrchk( cudaSetDevice(gpus[gpu_id].id) );
// gpuErrchk( cudaSetDevice(wg_curr) );
}
int GPUManager::current() {
int c;
cudaGetDevice(&c);
return c;
}
void GPUManager::safe(bool make_safe) {
if (make_safe == is_safe) return;
if (make_safe) {
if (notimeouts.size() == 0) {
printf("WARNING: No safe GPUs\n");
return;
}
allGpus = notimeouts;
is_safe = true;
} else {
is_safe = false;
}
}
int GPUManager::getInitialGPU() {
// TODO: check the load on the gpus and select an unused one
for (auto it = gpus.begin(); it != gpus.end(); ++it) {
GPU& gpu = *it;
if (!gpu.properties.kernelExecTimeoutEnabled)
return gpu.id;
}
return 0;
}
#ifdef USE_NCCL
ncclComm_t* GPUManager::comms = NULL;
void GPUManager::init_comms() {
if (gpus.size() == 1) return;
int* gpu_ids = new int[gpus.size()];
comms = new ncclComm_t[gpus.size()];
int i = 0;
for (auto &g: gpus) {
gpu_ids[i] = g.id;
++i;
}
NCCLCHECK(ncclCommInitAll(comms, gpus.size(), gpu_ids));
}
#endif