From ea359cfe14bfd5ab1e1d8f492d979bdb071f8ace Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Fri, 19 Jan 2024 10:08:07 -0600 Subject: [PATCH] Checkpointing implementation of Random class; about to redesign away from virtual functions using templates --- src/GPUManager.cpp | 12 +- src/ParticlePatch.cpp | 2 +- src/ParticlePatch.h | 4 +- src/Proxy.h | 206 ++++++++++++++++--- src/Random.cu | 469 +++++++++++++++++++++++++++++++++++++++--- src/Random.h | 69 +++---- src/SignalManager.h | 12 +- src/SimManager.cpp | 4 +- src/Types/Array.h | 10 +- 9 files changed, 666 insertions(+), 122 deletions(-) diff --git a/src/GPUManager.cpp b/src/GPUManager.cpp index 3f8dab9..71e7942 100644 --- a/src/GPUManager.cpp +++ b/src/GPUManager.cpp @@ -27,7 +27,7 @@ GPU::GPU(unsigned int id) : id(id) { } else { may_timeout = false; } - INFO("[{}] {} {}| SM {}.{} {:.2f}GHz, {:.1f}GB RAM", + LOGINFO("[{}] {} {}| SM {}.{} {:.2f}GHz, {:.1f}GB RAM", id, properties.name, timeout_str, properties.major, properties.minor, (float) properties.clockRate * 10E-7, (float) properties.totalGlobalMem * 7.45058e-10); @@ -59,12 +59,12 @@ void GPU::create_streams() { void GPU::destroy_streams() { int curr; - TRACE("Destroying streams"); + LOGTRACE("Destroying streams"); if (cudaGetDevice(&curr) == cudaSuccess) { // Avoid errors when program is shutting down gpuErrchk( cudaSetDevice(id) ); if (streams_created) { for (int i = 0; i < NUMSTREAMS; i++) { - TRACE(" destroying stream {} at {}\n", i, fmt::ptr((void *) &streams[i])); + LOGTRACE(" destroying stream {} at {}\n", i, fmt::ptr((void *) &streams[i])); gpuErrchk( cudaStreamDestroy( streams[i] ) ); } } @@ -76,7 +76,7 @@ void GPU::destroy_streams() { void GPUManager::init() { gpuErrchk(cudaGetDeviceCount(&nGPUs)); - INFO("Found {} GPU(s)", nGPUs); + LOGINFO("Found {} GPU(s)", nGPUs); for (int dev = 0; dev < nGPUs; dev++) { GPU g(dev); allGpus.push_back(g); @@ -96,7 +96,7 @@ void GPUManager::load_info() { } void GPUManager::init_devices() { - INFO("Initializing GPU devices... "); + LOGINFO("Initializing GPU devices... 
"); char msg[256] = ""; for (unsigned int i = 0; i < gpus.size(); i++) { if (i != gpus.size() - 1 && gpus.size() > 1) @@ -109,7 +109,7 @@ void GPUManager::init_devices() { cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 ); gpus[i].create_streams(); } - INFO("Initializing GPUs: {}", msg); + LOGINFO("Initializing GPUs: {}", msg); use(0); gpuErrchk( cudaDeviceSynchronize() ); } diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp index 3611eac..e64231a 100644 --- a/src/ParticlePatch.cpp +++ b/src/ParticlePatch.cpp @@ -118,7 +118,7 @@ size_t Patch::send_particles_filtered( Proxy<Patch>& destination, std::function< // size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, T filter ) { } size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, std::function<bool(size_t,Patch::Data)> filter ) { // Data buffer; // not sure which object should allocate - WARN("Patch::Data::send_particles_filtered() was called but is not implemented"); + LOGWARN("Patch::Data::send_particles_filtered() was called but is not implemented"); // metadata->data.callSync( &Patch::Data::send_particles_filtered, destination, filter ); return 0; }; diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h index fa03ef8..ee217a3 100644 --- a/src/ParticlePatch.h +++ b/src/ParticlePatch.h @@ -195,11 +195,11 @@ public: // [](size_t idx, Patch::Data d)->bool { return true; } ); void clear() { - WARN("Patch::clear() was called but is not implemented"); + LOGWARN("Patch::clear() was called but is not implemented"); } size_t test() { - WARN("Patch::test() was called but is not implemented"); + LOGWARN("Patch::test() was called but is not implemented"); return 1; } diff --git a/src/Proxy.h b/src/Proxy.h index 61dea75..3f3dd04 100644 --- a/src/Proxy.h +++ b/src/Proxy.h @@ -15,13 +15,32 @@ struct Resource { ResourceType type; ///< Type of the resource. size_t id; ///< ID or any other identifier associated with the resource. + // Q: should this return True for GPU that is attached/assigned to current thread? Currently assuming yes. 
HOST DEVICE bool is_local() const { - WARN("Resource::is_local() not implemented; returning true"); - return true; + bool ret = true; +// #ifdef __CUDA_ACC__ +// ret = (type == GPU); +// #else +// ret = (type == CPU); +// #endif + LOGWARN("Resource::is_local() not fully implemented; returning {}",ret); + return ret; }; // HOST DEVICE static bool is_local() { // check if thread/gpu idx matches some global idx }; + + static Resource Local() { + LOGWARN("Resource::Local() not properly implemented"); +#ifdef __CUDA_ACC__ + return Resource{ GPU, 0 }; +#else + return Resource{ CPU, 0 }; +#endif + }; + bool operator==(const Resource& other) const { return type == other.type && id == other.id; }; + }; + // START traits // These ugly bits of code help implement SFINAE in C++14 and should likely be removed if a newer standard is adopted // https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type @@ -43,6 +62,8 @@ template <typename...> using void_t = void; // struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; // END traits + +// Used by Proxy class template <typename T, typename = void> struct Metadata_t { }; template <typename T> @@ -50,50 +71,101 @@ struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata { }; // struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; +// template<typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr> +// struct Proxy { +// /** +// * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. +// */ +// Proxy() : location{Resource{Resource::CPU,0}}, addr{nullptr} {}; +// Proxy(const Resource& r, T* obj) : location{r}, addr{obj} {}; + +// /** +// * @brief Overloaded operator-> returns the address of the underlying object. +// * @return The address of the underlying object. +// */ +// auto operator->() { return addr; } +// auto operator->() const { return addr; } + +// /** +// * @brief The resource associated with the data represented by the proxy. +// */ +// Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. +// T* addr; ///< The address of the underlying object. +// }; + /** * @brief Template class representing a proxy for the underlying data. * @tparam T The type of the underlying data. */ -// C++17 way: template<typename T, typename Metadata = std::void_t<typename T::Metadata>> -// C++14 way: -//template<typename T, typename Metadata = typename std::conditional<has_metadata<T>::value, typename T::Metadata, void>::type> -template<typename T> -class Proxy { - - // Define Metadata types using SFINAE - // template<typename=void> struct Metadata_t { }; - // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; - // template<typename=void> struct Metadata_t { }; - // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; - // using Metadata_t = Metadata_t<T>; + + + // Q: why a pointer? + // Q: rename to Metadata? 
Has + // consequences for Proxy<Proxy<T>> + // that limits specialization but uses + // T::Metadata automatically + // + // A: depends on how Proxy<Proxy<T>> + // objects are used -public: + // void move(const Resource& newloc) { + // LOGTRACE("Moving object from {} to {}", location, newloc); + // Proxy<T> new_proxy; + + // switch (location.type) { + // case Resource::CPU: + // if (location.is_local()) { + // new_proxy = send(location, &addr); + // } else { + // Exception( NotImplementedError, "Proxy::move() non-local CPU calls" ); + // } + // break; + // case Resource::GPU: + // if (location.is_local()) { + // Exception( NotImplementedError, "Proxy::move() local GPU calls" ); + // } else { + // Exception( NotImplementedError, "Proxy::move() non-local GPU calls" ); + // } + // break; + // case Resource::MPI: + // Exception( NotImplementedError, "MPI move (deprecate?)" ); + // break; + // default: + // Exception( ValueError, "Proxy::move(): Unknown resource type" ); + // } + + // // auto Proxy<T>{ location , newaddr } + // // auto tmpmeta = send(newloc, metadata); + + // location = newloc; + // addr = new_proxy.addr; + // metadata = new_proxy.metadata; + // } +// C++17 way: template<typename T, typename Metadata = std::void_t<typename T::Metadata>> +// C++14 way: template<typename T, typename Metadata = typename std::conditional<has_metadata<T>::value, typename T::Metadata, void>::type> +// Neither needed! +template<typename T, typename Enable = void> +struct Proxy { /** * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. */ - Proxy<T>() : location(Resource{Resource::CPU,0}), addr(nullptr) {}; - Proxy<T>(const Resource& r, T* obj) : location(r), addr(obj) {}; + Proxy() : location(Resource{Resource::CPU,0}), addr(nullptr) {}; + Proxy(const Resource& r, T* obj) : location(r), addr(obj) {}; /** * @brief Overloaded operator-> returns the address of the underlying object. * @return The address of the underlying object. */ - auto operator->() { return addr; } - auto operator->() const { return addr; } + auto operator->() { return addr; }; + auto operator->() const { return addr; }; /** * @brief The resource associated with the data represented by the proxy. */ Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. T* addr; ///< The address of the underlying object. - Metadata_t<T>* metadata; // Q: rename to Metadata? Has - // consequences for Proxy<Proxy<T>> - // that limits specialization but uses - // T::Metadata automatically - // - // A: depends on how Proxy<Proxy<T>> - // objects are used + Metadata_t<T>* metadata; ///< T-specific metadata that resides in same memory space as Proxy<T> // Use two template parameter packs as suggested here: https://stackoverflow.com/questions/26994969/inconsistent-parameter-pack-deduction-with-variadic-templates template <typename RetType, typename... Args1, typename... 
Args2> @@ -105,14 +177,17 @@ public: } else { Exception( NotImplementedError, "Proxy::callSync() non-local CPU calls" ); } + break; case Resource::GPU: if (location.is_local()) { Exception( NotImplementedError, "Proxy::callSync() local GPU calls" ); } else { Exception( NotImplementedError, "Proxy::callSync() non-local GPU calls" ); } + break; case Resource::MPI: Exception( NotImplementedError, "MPI sync calls (deprecate?)" ); + break; default: Exception( ValueError, "Proxy::callSync(): Unknown resource type" ); } @@ -129,14 +204,17 @@ public: } else { Exception( NotImplementedError, "Proxy::callAsync() non-local CPU calls" ); } + break; case Resource::GPU: if (location.is_local()) { Exception( NotImplementedError, "Proxy::callAsync() local GPU calls" ); } else { Exception( NotImplementedError, "Proxy::callAsync() non-local GPU calls" ); } + break; case Resource::MPI: Exception( NotImplementedError, "MPI async calls (deprecate?)" ); + break; default: Exception( ValueError, "Proxy::callAsync(): Unknown resource type" ); } @@ -144,6 +222,66 @@ public: } }; +// Specialization for bool/int/float types that do not have member functions +template<typename T> +struct Proxy<T, typename std::enable_if_t<std::is_arithmetic<T>::value>> { + /** + * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. + */ + Proxy() : location{Resource{Resource::CPU,0}}, addr{nullptr} {}; + Proxy(const Resource& r, T* obj) : location{r}, addr{obj} {}; + + /** + * @brief Overloaded operator-> returns the address of the underlying object. + * @return The address of the underlying object. + */ + auto operator->() { return addr; } + auto operator->() const { return addr; } + + /** + * @brief The resource associated with the data represented by the proxy. + */ + Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. + T* addr; ///< The address of the underlying object. +}; + + +// using Proxy<int> = SimpleProxy<int>; + + + +// class Proxy<int> { + +// // Define Metadata types using SFINAE +// // template<typename=void> struct Metadata_t { }; +// // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; +// // template<typename=void> struct Metadata_t { }; +// // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; +// // using Metadata_t = Metadata_t<T>; + +// public: + +// /** +// * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. +// */ +// Proxy<int>() : location(Resource{Resource::CPU,0}), addr(nullptr) {}; +// Proxy<int>(const Resource& r, int* obj) : location(r), addr(obj) {}; + +// /** +// * @brief Overloaded operator-> returns the address of the underlying object. +// * @return The address of the underlying object. +// */ +// auto operator->() { return addr; } +// auto operator->() const { return addr; } + +// /** +// * @brief The resource associated with the data represented by the proxy. +// */ +// Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. +// int* addr; ///< The address of the underlying object. 
+// }; + + // // Partial specialization // template<typename T> // using Proxy<T> = Proxy<T, std::void_t<typename T::Metadata>> @@ -165,7 +303,7 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T case Resource::GPU: if (location.is_local()) { if (dest == nullptr) { // allocate if needed - TRACE(" cudaMalloc for array"); + LOGTRACE(" cudaMalloc for array"); gpuErrchk(cudaMalloc(&dest, sizeof(T))); } gpuErrchk(cudaMemcpy(dest, &obj, sizeof(T), cudaMemcpyHostToDevice)); @@ -176,7 +314,7 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T case Resource::CPU: if (location.is_local()) { if (dest == nullptr) { // allocate if needed - TRACE(" Alloc CPU memory for ", decltype(T)); + LOGTRACE(" Allocate CPU memory for {}", decltype(T)); dest = new T; } memcpy(dest, &obj, sizeof(T)); @@ -203,10 +341,10 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T */ template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr> HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { - TRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); + LOGTRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); // Simple objects can simply be copied without worrying about contained objects and arrays auto ret = _send_ignoring_children(location, obj, dest); - TRACE("...done sending"); + LOGTRACE("...done sending"); // printf("...done\n"); return ret; } @@ -222,11 +360,13 @@ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { */ template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr> HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { - TRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); + LOGTRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location Proxy<T> ret = _send_ignoring_children<T>(location, dummy, dest); - TRACE("... clearing dummy complex object"); + LOGTRACE("... clearing dummy complex object"); dummy.clear(); - TRACE("... done sending"); + LOGTRACE("... done sending"); return ret; } + + diff --git a/src/Random.cu b/src/Random.cu index 620c3f0..42a8d2b 100644 --- a/src/Random.cu +++ b/src/Random.cu @@ -187,10 +187,10 @@ public: }; // Methods for generating random values - HOST inline float gaussian() { + HOST DEVICE inline float gaussian() { return 0; }; - HOST inline float gaussian(size_t idx, size_t thread) { + HOST DEVICE inline float gaussian(size_t idx, size_t thread) { return 0; }; @@ -226,31 +226,448 @@ void RandomImplCUDA_init_kernel(unsigned long seed, curandState_t *state, int nu } } +// template<size_t NUM_THREADS = 128, size_t buffer_size = 1024, typename T=int> +// class RandomImplGPUres : public Random { +// public: + +// RandomImplGPURes(Random::Conf c, Resource& location) : location{location}, seed{c.seed}, num_states{c.num_threads}, num{0} { +// // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel + +// LOGINFO("Creating RandomImplGPUres with seed {}", seed); +// assert( location.type == Resource::GPU ); +// LOGINFO("...Sending data"); + + +// states_d = states_d.send_children(location); +// buffer_d = buffer_d.send_children(location); + +// // LOGINFO("...Cleaning temporary data"); +// // delete [] tmp.states; +// // delete [] tmp.buffer; + +// // gpuErrchk(cudaMalloc(&(data.states), sizeof(curandState_t) * c.num_threads)); +// // int nBlocks = c.num_threads / NUM_THREADS + 1; +// // initKernel<<< nBlocks, NUM_THREADS >>>(seed, data.states, NUM_THREADS); +// // gpuErrchk(cudaDeviceSynchronize()); + +// // TODO consider location whe ncreating generator! +// // Create RNG and set seed +// LOGINFO("...Creating generator"); +// curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW); +// curandSetPseudoRandomGeneratorSeed( generator, seed ); + +// // Set seed +// LOGINFO("...Setting seed"); +// gpuErrchk(cudaDeviceSynchronize()); +// LOGINFO("...synced"); +// size_t num_blocks = 1 + (c.num_threads-1)/NUM_THREADS; +// LOGINFO("...launching"); +// // LOGINFO("...hmmm {}", (void*) metadata.states); +// RandomImplGPUres_init_kernel<<< num_blocks, NUM_THREADS >>>(seed, states_d.values, num_states); +// LOGINFO("...Kernel launched"); +// gpuErrchk(cudaDeviceSynchronize()); +// LOGINFO("...Done"); + +// // Allocate temporary storage + -// GPU-resident? +// // gpuErrchk(cudaMalloc((void**) &(data.buffer), sizeof(size_t) * buffer_size)); + +// }; + +// // struct Data { +// // static_assert( sizeof(float) == sizeof(int) ); +// // static_assert( sizeof(float) == sizeof(T) ); + +// // Data send_children(Resource& location) const { +// // Data tmp{}; +// // tmp.states = states.send_children(location); +// // tmp.buffer = buffer.send_children(location); +// // return tmp; +// // }; +// // void clear() {}; +// // }; + +// private: +// // Proxy<Data> data; +// size_t seed; +// curandGenerator_t generator; +// size_t num_states; +// Array<curandState_t> states_d{NUM_THREADS}; +// Array<T> buffer_d{buffer_size}; // Cast when you need (float) vs int? +// Array<T> local_buffer{buffer_size}; // What about Gaussian (float) vs int? 
+// size_t num; // number of entries left in local_buffer + +// // Methods for maninpulating the state +// void init(size_t num, unsigned long seed, size_t offset) { +// // gpuErrchk(cudaMalloc((void**)&uniform_d, sizeof(float) * RAND_N)); +// // gpuErrchk(cudaMalloc((void**)&integer_d, sizeof(unsigned int) * RAND_N)); +// // gpuErrchk(cudaMalloc((void**)&gaussian_d, sizeof(float) * RAND_N)); +// // integer_h = new unsigned int[RAND_N]; +// // uniform_h = new float[RAND_N]; +// // gaussian_h = new float[RAND_N]; +// // uniform_n = 0; +// // integer_n = 0; +// // gaussian_n = 0; +// // // +// }; + +// public: +// // RandomImplCUDA<NUM_THREADS,buffer_size> send_children(Resource& location) const { +// // RandomImplCUDA<NUM_THREADS,buffer_size> tmp{}; +// // tmp.states = states.send_children(location); +// // tmp.buffer = buffer.send_children(location); +// // return tmp; +// // }; +// // void clear() {}; + +// // TODO: could introduce dual buffer so random numbers can be filled asynchronously +// // Methods for generating random values +// HOST DEVICE inline float gaussian() { +// if (num == 0) { +// cuRandchk(curandGenerateUniform( generator, (float*) buffer_d.values, buffer_size )); +// gpuErrchk(cudaMemcpy(local_buffer.values, buffer_d.values, sizeof(float) * buffer_size, cudaMemcpyDeviceToHost)); +// num = buffer_size; +// } +// return *reinterpret_cast<float*>(&local_buffer[--num]); +// }; +// HOST DEVICE inline float gaussian(size_t idx, size_t thread) { +// return 0; +// }; + +// // HOST DEVICE inline float gaussian(RandomState* state) {}; +// HOST DEVICE inline Vector3 gaussian_vector(size_t idx, size_t thread) { +// return Vector3(); +// }; +// }; + + +using curandState = curandStateXORWOW_t; +template<size_t NUM_THREADS = 128> +class RandomGPU : public Random { +public: + + DEVICE RandomGPU(Random::Conf c, Resource& location) : num_states{c.num_threads}, states{nullptr} { + // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel + // Not sure how this + assert( threadIdx.x + blockIdx.x == 0 ); + + LOGINFO("Creating RandomGPU", seed); + assert( location.type == Resource::GPU ); + states = new curandState[num_states]; + } + DEVICE ~RandomGPU() { + assert( threadIdx.x + blockIdx.x == 0 ); + if (states != nullptr) delete [] states; + states = nullptr; + }; + +private: + size_t num_states; + curandState* states; // RNG states stored in global memory + +public: + // Methods for maninpulating the state + DEVICE void init(size_t num, unsigned long seed, size_t offset = 0) { + // curand_init(unsigned long long seed, + // unsigned long long sequence, + // unsigned long long offset, + // curandStateXORWOW_t *state) + assert( states != nullptr ); + for (size_t idx = threadIdx.x + blockIdx.x*blockDim.x; idx < NUM_THREADS; idx += blockDim.x * gridDim.x) { + curand_init(seed, tidx, offset, curandState &state[idx]); + } + }; + + // // This _might_ be made more generic by using whatever chunk of shm is needed, then returning end of pointer + // DEVICE inline void copy_state_to_shm(curandState *shm) { + // size_t idx = threadIdx.x + blockIdx.x*blockDim.x; + // states_shm = &shm; + // for (; idx < NUM_THREADS; idx += blockDim.x * gridDim.x) { + // (*states_shm)[idx] = states[idx]; + // } + // // __syncthreads(); + + // } + + DEVICE inline float gaussian() { + size_t idx = threadIdx.x + blockIdx.x*blockDim.x; + assert( idx < num_states ); + return curand_normal(&states[idx]); + }; + + DEVICE inline float gaussian(curandState& state) { return curand_normal( &state ); }; + + // Useful for getting the RNG state for the thread from global memory, putting it in local memory for use + DEVICE inline curandState* get_gaussian_state() { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + return (i < num_states) ? &(states[i]) : nullptr; + }; + + DEVICE inline void set_gaussian_state(curandState& state) { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + if (i < num_states) states[i] = state; + }; + + DEVICE inline Vector3 gaussian_vector() { + return Vector3( gaussian(), gaussian(), gaussian() ); + }; + + DEVICE inline Vector3 gaussian_vector( curandState& state) { + return Vector3( gaussian(state), gaussian(state), gaussian(state) ); + }; +}; + + +/* + * Note: this implementation (and probably the interface) should be + * reworked. Generating RNs on the GPU doesn't require or benefit + * from the Data buffers, so probably those should be removed from a + * device-resident object. Ideally this object can implement a + * uniform and lightweight interface that enables efficient use + * through the agnostic API provided by the Random class. + */ template<size_t NUM_THREADS = 128, size_t buffer_size = 1024> class RandomImplCUDA : public Random { +public: + + template<typename T> + struct Data { + Array<float> buffer{buffer_size}; + size_t num; // number of unread entries in buffer + + Data send_children(Resource& location) const { + Data tmp{}; + tmp.buffer = buffer.send_children(location); + LOGTRACE( "Clearing RANDOM::Data::buffer..." ); + buffer.clear(); // Not sure if this should be done here! + return tmp; + }; + + }; + + RandomImplCUDA(Random::Conf c, Resource& location) : location{location}, seed{c.seed}, num_states{c.num_threads}, num{0} { + // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel + + LOGINFO("Creating RandomImplCUDA with seed {}", seed); + assert( location.type == Resource::GPU ); + LOGINFO("...Sending data"); + + // Data tmp; + // data = send(location, tmp); + // metadata.buffer = new size_t[buffer_size]; // TODO handle case if RandomImplCUDA is on location + // metadata.num = 0; + + states_d = states_d.send_children(location); + + // LOGINFO("...Cleaning temporary data"); + // delete [] tmp.states; + // delete [] tmp.buffer; + + // gpuErrchk(cudaMalloc(&(data.states), sizeof(curandState_t) * c.num_threads)); + // int nBlocks = c.num_threads / NUM_THREADS + 1; + // initKernel<<< nBlocks, NUM_THREADS >>>(seed, data.states, NUM_THREADS); + // gpuErrchk(cudaDeviceSynchronize()); + + // TODO consider location whe ncreating generator! + // Create RNG and set seed + LOGINFO("...Creating generator"); + curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW); + curandSetPseudoRandomGeneratorSeed( generator, seed ); + + // Set seed + LOGINFO("...Setting seed"); + gpuErrchk(cudaDeviceSynchronize()); + LOGINFO("...synced"); + size_t num_blocks = 1 + (c.num_threads-1)/NUM_THREADS; + LOGINFO("...launching"); + // LOGINFO("...hmmm {}", (void*) metadata.states); + RandomImplCUDA_init_kernel<<< num_blocks, NUM_THREADS >>>(c.seed, states_d.values, num_states); + LOGINFO("...Kernel launched"); + gpuErrchk(cudaDeviceSynchronize()); + LOGINFO("...Done"); + + // Allocate temporary storage + + + // gpuErrchk(cudaMalloc((void**) &(data.buffer), sizeof(size_t) * buffer_size)); + + }; + + // struct Data { + // static_assert( sizeof(float) == sizeof(int) ); + // static_assert( sizeof(float) == sizeof(T) ); + + // Data send_children(Resource& location) const { + // Data tmp{}; + // tmp.states = states.send_children(location); + // tmp.buffer = buffer.send_children(location); + // return tmp; + // }; + // void clear() {}; + // }; + +private: + Resource location; + size_t seed; + curandGenerator_t generator; + size_t num_states; + Array<curandState_t> states_d; + + // Buffers for generating numbers from CPU-resident + Data<float>* gauss_d; + Data<float>* poisson_d; + Data<float>* uniform_d; + + Data<float> gauss; + Data<float> poisson; + Data<float> uniform; + + + // Methods for maninpulating the state + void init(size_t num, unsigned long seed, size_t offset) { + // gpuErrchk(cudaMalloc((void**)&uniform_d, sizeof(float) * RAND_N)); + // gpuErrchk(cudaMalloc((void**)&integer_d, sizeof(unsigned int) * RAND_N)); + // gpuErrchk(cudaMalloc((void**)&gaussian_d, sizeof(float) * RAND_N)); + // integer_h = new unsigned int[RAND_N]; + // uniform_h = new float[RAND_N]; + // gaussian_h = new float[RAND_N]; + // uniform_n = 0; + // integer_n = 0; + // gaussian_n = 0; + // // + }; + +public: + // RandomImplCUDA<NUM_THREADS,buffer_size> send_children(Resource& location) const { + // RandomImplCUDA<NUM_THREADS,buffer_size> tmp{}; + // tmp.states = states.send_children(location); + // tmp.buffer = buffer.send_children(location); + // return tmp; + // }; + // void clear() {}; + + // TODO: could introduce dual buffer so random numbers can be filled asynchronously + // Methods for generating random values + HOST DEVICE inline float gaussian() { +#ifdef __CUDA__ARCH__ + size_t& i = threadIdx.x + blockSize.x*blockIdx.x + if (i < num_states) { + return curand_normal( &(state_d[i]) ); + } + return 0.0f; +#else + if (num == 0) { + cuRandchk(curandGenerateUniform( generator, (float*) gauss.buffer_d.values, buffer_size )); + 
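 // Host-side refill step: the call above generates a batch on the device, and the
 // copy below brings it into the host-side buffer, which subsequent calls drain one
 // value at a time. Note that curandGenerateUniform() yields uniform samples; a
 // Gaussian refill would presumably call
 // curandGenerateNormal(generator, (float*) gauss.buffer_d.values, buffer_size, 0.0f, 1.0f).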
gpuErrchk(cudaMemcpy(local_buffer.values, buffer_d.values, sizeof(float) * buffer_size, cudaMemcpyDeviceToHost)); + num = buffer_size; + } + return (&local_buffer[--num]); +#endif + }; + + __device__ inline float gaussian() { + }; + + DEVICE inline float gaussian(curandState* state) { return curand_normal( state ); }; + + // Useful for getting the RNG state for the thread from global memory, putting it in local memory for use + DEVICE inline curandState* get_gaussian_state() { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + if (i < num_states) { + return &(state_d[i]); + } + return nullptr; + }; + + DEVICE inline void set_gaussian_state(curandState* state) { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + if (i < num_states) state_d[i] = *state; + }; + + // HOST inline float gaussian(size_t idx, size_t thread) { + // return 0; + // }; + + // HOST DEVICE inline float gaussian(RandomState* state) {}; + HOST inline Vector3 gaussian_vector(size_t idx, size_t thread) { + return Vector3(); + }; + + // unsigned int integer() { + // return 0; + // }; + // unsigned int poisson(float lambda) { + // return 0; + // }; + // float uniform() { + // return 0; + // }; +}; + +/* GPU-resident? +template<size_t NUM_THREADS = 128, size_t buffer_size = 1024> +class RandomImplCUDA_DEPRECATE : public Random { public: struct Data { - Data() : states(nullptr), buffer(nullptr), num(0) {}; - curandState_t *states; - size_t* buffer; // What about Gauss vs int? - size_t num; // Remove? + Data() : states(nullptr), buffer(nullptr), metadata(nullptr) {}; + // Data(Data&& orig) : num_states(orig.states), states(orig.states), buffer(orig.buffer), num(orig.num) {}; + + struct Metadata { + size_t num_states; + size_t num; // Remove? + + }; + Array<curandState_t> states; + size_t* buffer; // What about Gaussian (float) vs int? + // static_assert( sizeof(float) == sizeof(size_t) ); + // static_assert( sizeof(int) == sizeof(size_t) ); + Metadata* metadata; + + Data send_children(Resource& location) const { + const Resource& local = Resource::Local(); + Data tmp{*this}; + tmp.states = nullptr; + tmp.buffer = nullptr; + + if (local.type == Resource::CPU) { + if (location.type == Resource::GPU) { + if (states != nullptr) { + gpuErrchk(cudaMalloc(&(tmp.states), sizeof(curandState_t) * num_states)); + gpuErrchk(cudaMemcpy(tmp.states, states, sizeof(curandState_t) * num_states, cudaMemcpyHostToDevice)); + } + if (buffer != nullptr) { + gpuErrchk(cudaMalloc(&(tmp.buffer), sizeof(size_t) * buffer_size)); + gpuErrchk(cudaMemcpy(tmp.buffer, buffer, sizeof(float) * buffer_size, cudaMemcpyHostToDevice)); + } + } else { + Exception(NotImplementedError, ""); + } + } else { + Exception(NotImplementedError, ""); + } + return tmp; + }; + void clear() { + Exception(NotImplementedError, ""); + }; }; // Metadata stored on host even if Data is on device struct Metadata { curandGenerator_t generator; - Proxy<Data> data; // state data may be found elsewhere - size_t* buffer; - size_t num; + curandState_t *states_d; + size_t* buffer_d; // Device buffer (possibly used) + Proxy<Data> data; // State data that may be found elsewhere + size_t* buffer; // Local buffer + size_t num; // Number of elements in local buffer }; - RandomImplCUDA(Random::Conf c, Resource&& location) { + RandomImplCUDA_DEPRECATE(Random::Conf c, Resource& location) { // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel - INFO("Creating RandomImplCUDA with seed {}", c.seed); + LOGINFO("Creating RandomImplCUDA with seed {}", c.seed); assert( location.type == Resource::GPU ); @@ -261,14 +678,14 @@ public: tmp.buffer = new size_t[buffer_size]; tmp.num = 0; - INFO("...Sending data"); + LOGINFO("...Sending data"); metadata.data = send(location, tmp); metadata.buffer = new size_t[buffer_size]; // TODO handle case if RandomImplCUDA is on location metadata.num = 0; - INFO("...Cleaning temporary data"); + LOGINFO("...Cleaning temporary data"); delete [] tmp.states; delete [] tmp.buffer; @@ -279,19 +696,21 @@ public: // TODO consider location whe ncreating generator! // Create RNG and set seed - INFO("...Creating generator"); + LOGINFO("...Creating generator"); curandCreateGenerator(&(metadata.generator), CURAND_RNG_PSEUDO_XORWOW); curandSetPseudoRandomGeneratorSeed( metadata.generator, c.seed ); // Set seed - INFO("...Setting seed"); + LOGINFO("...Setting seed"); gpuErrchk(cudaDeviceSynchronize()); - INFO("...synced"); + LOGINFO("...synced"); size_t num_blocks = 1 + (c.num_threads-1)/NUM_THREADS; - RandomImplCUDA_init_kernel<<< num_blocks, NUM_THREADS >>>(c.seed, metadata.data.addr->states, c.num_threads); - INFO("...Kernel launched"); + LOGINFO("...launching"); + // LOGINFO("...hmmm {}", (void*) metadata.states); + RandomImplCUDA_init_kernel<<< num_blocks, NUM_THREADS >>>(c.seed, metadata.states, c.num_threads); + LOGINFO("...Kernel launched"); gpuErrchk(cudaDeviceSynchronize()); - INFO("...Done"); + LOGINFO("...Done"); // Allocate temporary storage @@ -346,6 +765,7 @@ private: // void reorder(int *a, int n); }; +// */ #endif Random* Random::GetRandom(Conf&& conf) { @@ -365,14 +785,15 @@ Random* Random::GetRandom(Conf&& conf) { if (inserted) { // Conf not found, so create a new one Random* tmp; - + Resource r{conf.backend == Conf::CUDA ? Resource::GPU : Resource::CPU ,0}; + switch (conf.backend) { case Conf::CUDA: #ifdef USE_CUDA - // TODO: replace Resource below / overload GetRandom() so it can target a resource - tmp = new RandomImplCUDA<128,1024>(conf, Resource{Resource::GPU,0} ); + // TODO: replace Resource below / overload GetRandom() so it can target a resource + tmp = new RandomImplCUDA<128,1024>(conf, r ); #else - WARN("Random::GetRandom(): CUDA disabled, creating CPU random instead"); + LOGWARN("Random::GetRandom(): CUDA disabled, creating CPU random instead"); tmp = new RandomImpl(); #endif break; diff --git a/src/Random.h b/src/Random.h index c5d5786..8081760 100644 --- a/src/Random.h +++ b/src/Random.h @@ -31,23 +31,40 @@ public: }; // size_t thread_idx; - struct State; + struct State; // Not sure whether to use something like this to capture the RNG state... static Random* GetRandom(Conf&& conf); // Methods for maninpulating the state // virtual void init(size_t num, unsigned long seed, size_t offset) = 0; - // Methods for generating random values - virtual inline float gaussian() = 0; - virtual inline float gaussian(size_t idx, size_t thread) = 0; - // HOST DEVICE inline float gaussian(RandomState* state); - virtual inline Vector3 gaussian_vector(size_t idx, size_t thread) = 0; + // // Methods for generating random values + // // Use of this convenience method is discouraged... maybe remove? + // HOST DEVICE virtual inline float gaussian() = 0; + // // virtual inline float gaussian(size_t idx, size_t thread) = 0; + + // How can we redesign to avoid using virtual functions for this low-level stuff? 
Perhaps the GetRandom() approach should be dropped - virtual unsigned int integer() = 0; - virtual unsigned int poisson(float lambda) = 0; + // Here we use void pointers to acheive polymorphism; we lose type + // safety but gain the ability to have a single approach to invoking Random + HOST DEVICE virtual inline float gaussian(void* state) = 0; + + template<typename S> + HOST DEVICE virtual inline S* get_gaussian_state() = 0; + + template<typename S> + HOST DEVICE virtual inline void set_gaussian_state(S* state) = 0; + + + + // HOST DEVICE inline float gaussian(RandomState* state); + // virtual inline Vector3 gaussian_vector(size_t idx, size_t thread) = 0; + virtual float uniform() = 0; + // virtual unsigned int integer() = 0; + // virtual unsigned int poisson(float lambda) = 0; + // void reorder(int *a, int n); protected: @@ -58,39 +75,3 @@ protected: // } }; - -// class RandomImpl : public Random { -// public: -// // Methods for maninpulating the state -// void init(size_t num, unsigned long seed, size_t offset); - -// // // Methods for generating random values -// HOST inline float gaussian(); -// HOST inline float gaussian(size_t idx, size_t thread); -// // HOST DEVICE inline float gaussian(RandomState* state) {}; -// HOST inline Vector3 gaussian_vector(size_t idx, size_t thread); -// unsigned int integer() { return 0; }; -// unsigned int poisson(float lambda) { return 0; }; -// float uniform() { return 0; }; - -// // // void reorder(int *a, int n); - -// }; - -#ifdef USE_CUDA -// class RandomImplCUDA : public Random { -// public: -// // Methods for maninpulating the state -// void init(size_t num, unsigned long seed, size_t offset); - -// // // Methods for generating random values -// HOST inline float gaussian(); -// HOST inline float gaussian(size_t idx, size_t thread); -// // HOST DEVICE inline float gaussian(RandomState* state) {}; -// HOST inline Vector3 gaussian_vector(size_t idx, size_t thread); -// unsigned int integer() { return 0; }; -// unsigned int poisson(float lambda) { return 0; }; -// float uniform() { return 0; }; - -// }; -#endif diff --git a/src/SignalManager.h b/src/SignalManager.h index 1d517d0..704f39b 100644 --- a/src/SignalManager.h +++ b/src/SignalManager.h @@ -21,13 +21,13 @@ #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE #endif -#define TRACE(...) SPDLOG_TRACE(__VA_ARGS__) -#define DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__) +#define LOGTRACE(...) SPDLOG_TRACE(__VA_ARGS__) +#define LOGDEBUG(...) SPDLOG_DEBUG(__VA_ARGS__) // #define DEBUG(...) spdlog::debug(__VA_ARGS__) -#define INFO(...) SPDLOG_INFO(__VA_ARGS__) -#define WARN(...) SPDLOG_WARN(__VA_ARGS__) -#define ERROR(...) SPDLOG_ERROR(__VA_ARGS__) -#define CRITICAL(...) SPDLOG_CRITICAL(__VA_ARGS__) +#define LOGINFO(...) SPDLOG_INFO(__VA_ARGS__) +#define LOGWARN(...) SPDLOG_WARN(__VA_ARGS__) +#define LOGERROR(...) SPDLOG_ERROR(__VA_ARGS__) +#define LOGCRITICAL(...) SPDLOG_CRITICAL(__VA_ARGS__) // spdlog::set_level(spdlog::level::trace); //*/ diff --git a/src/SimManager.cpp b/src/SimManager.cpp index 16e5fb2..11cd324 100644 --- a/src/SimManager.cpp +++ b/src/SimManager.cpp @@ -3,7 +3,7 @@ #include <memory> void SimManager::run() { - INFO("Running!"); + LOGINFO("Running!"); // TODO: process a command queue? 
// SimSystem sys = SimSystem(); @@ -27,7 +27,7 @@ void SimManager::run() { // #endif for (size_t step = 0; step < 10; ++step) { - INFO("Step {}: random {}", step, r->gaussian()); + LOGINFO("Step {}: random {:0.2f}", step, r->gaussian()); p.compute(); #ifdef USE_CUDA cudaDeviceSynchronize(); diff --git a/src/Types/Array.h b/src/Types/Array.h index 91dcbcf..8da11cc 100644 --- a/src/Types/Array.h +++ b/src/Types/Array.h @@ -11,8 +11,7 @@ // Simple templated array object without resizing capabilities template<typename T> -class Array { -public: +struct Array { HOST DEVICE inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this); HOST inline Array<T>(size_t num) : num(num), values(nullptr) { // printf("Constructing Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values); @@ -118,11 +117,11 @@ public: template <typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr> HOST inline Array<T> send_children(const Resource& location) { T* values_d = nullptr; - + // Allocate and copy at values_d if (num > 0) { size_t sz = sizeof(T) * num; - // printf(" cudaMalloc for %d items\n", num); + LOGINFO(" Array<{}>.send_children(...): cudaMalloc for {} items", type_name<T>(), num); switch (location.type) { case Resource::GPU: gpuErrchk(cudaMalloc(&values_d, sz)); @@ -138,12 +137,14 @@ public: Exception( ValueError, "Unknown Resource type" ); } } + LOGINFO(" Array<{}>.send_children(...): done copying", type_name<T>()); // Copy Array with pointers correctly assigned Array<T> tmp(0); tmp.num = num; tmp.values = values_d; // printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d); + LOGINFO(" Array<{}>.send_children(...): done", type_name<T>()); return tmp; } template <typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr> @@ -350,6 +351,7 @@ private: values = nullptr; } +public: size_t num; T* __restrict__ values; }; -- GitLab
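The commit subject anticipates a redesign away from virtual functions using templates. A minimal sketch of one such direction (the RandomBase/RandomCPU names and the kinetic_kick() caller are illustrative assumptions, not taken from this patch) uses CRTP so that gaussian() and the state accessors resolve at compile time; this also sidesteps the fact that the virtual member function templates declared in Random.h above (get_gaussian_state<S>(), set_gaussian_state<S>()) are not legal C++.

#include <random>

// CRTP base: the interface binds at compile time, so host and device backends can
// differ freely and no vtable needs to live in device memory.
template<typename Impl>
struct RandomBase {
    float gaussian() { return static_cast<Impl*>(this)->gaussian_impl(); }
    template<typename S>
    S* get_gaussian_state() { return static_cast<Impl*>(this)->template get_state_impl<S>(); }
};

// Host-side backend built on the C++ standard library.
struct RandomCPU : RandomBase<RandomCPU> {
    explicit RandomCPU(unsigned long seed) : engine(seed), dist(0.0f, 1.0f) {}
    float gaussian_impl() { return dist(engine); }
    template<typename S>
    S* get_state_impl() { return nullptr; }   // no per-thread device state on the host
private:
    std::mt19937 engine;
    std::normal_distribution<float> dist;
};

// Callers are written against the template parameter instead of a Random* base pointer.
template<typename R>
float kinetic_kick(RandomBase<R>& rng) { return 0.5f * rng.gaussian(); }

// Usage: RandomCPU rng(42);  float v = kinetic_kick(rng);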