From ea359cfe14bfd5ab1e1d8f492d979bdb071f8ace Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Fri, 19 Jan 2024 10:08:07 -0600 Subject: [PATCH] Checkpointing implementation of Random class; about to redesign away from virtual functions using templates --- src/GPUManager.cpp | 12 +- src/ParticlePatch.cpp | 2 +- src/ParticlePatch.h | 4 +- src/Proxy.h | 206 ++++++++++++++++--- src/Random.cu | 469 +++++++++++++++++++++++++++++++++++++++--- src/Random.h | 69 +++---- src/SignalManager.h | 12 +- src/SimManager.cpp | 4 +- src/Types/Array.h | 10 +- 9 files changed, 666 insertions(+), 122 deletions(-) diff --git a/src/GPUManager.cpp b/src/GPUManager.cpp index 3f8dab9..71e7942 100644 --- a/src/GPUManager.cpp +++ b/src/GPUManager.cpp @@ -27,7 +27,7 @@ GPU::GPU(unsigned int id) : id(id) { } else { may_timeout = false; } - INFO("[{}] {} {}| SM {}.{} {:.2f}GHz, {:.1f}GB RAM", + LOGINFO("[{}] {} {}| SM {}.{} {:.2f}GHz, {:.1f}GB RAM", id, properties.name, timeout_str, properties.major, properties.minor, (float) properties.clockRate * 10E-7, (float) properties.totalGlobalMem * 7.45058e-10); @@ -59,12 +59,12 @@ void GPU::create_streams() { void GPU::destroy_streams() { int curr; - TRACE("Destroying streams"); + LOGTRACE("Destroying streams"); if (cudaGetDevice(&curr) == cudaSuccess) { // Avoid errors when program is shutting down gpuErrchk( cudaSetDevice(id) ); if (streams_created) { for (int i = 0; i < NUMSTREAMS; i++) { - TRACE(" destroying stream {} at {}\n", i, fmt::ptr((void *) &streams[i])); + LOGTRACE(" destroying stream {} at {}\n", i, fmt::ptr((void *) &streams[i])); gpuErrchk( cudaStreamDestroy( streams[i] ) ); } } @@ -76,7 +76,7 @@ void GPU::destroy_streams() { void GPUManager::init() { gpuErrchk(cudaGetDeviceCount(&nGPUs)); - INFO("Found {} GPU(s)", nGPUs); + LOGINFO("Found {} GPU(s)", nGPUs); for (int dev = 0; dev < nGPUs; dev++) { GPU g(dev); allGpus.push_back(g); @@ -96,7 +96,7 @@ void GPUManager::load_info() { } void GPUManager::init_devices() { - INFO("Initializing GPU devices... "); + LOGINFO("Initializing GPU devices... 
"); char msg[256] = ""; for (unsigned int i = 0; i < gpus.size(); i++) { if (i != gpus.size() - 1 && gpus.size() > 1) @@ -109,7 +109,7 @@ void GPUManager::init_devices() { cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 ); gpus[i].create_streams(); } - INFO("Initializing GPUs: {}", msg); + LOGINFO("Initializing GPUs: {}", msg); use(0); gpuErrchk( cudaDeviceSynchronize() ); } diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp index 3611eac..e64231a 100644 --- a/src/ParticlePatch.cpp +++ b/src/ParticlePatch.cpp @@ -118,7 +118,7 @@ size_t Patch::send_particles_filtered( Proxy<Patch>& destination, std::function< // size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, T filter ) { } size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, std::function<bool(size_t,Patch::Data)> filter ) { // Data buffer; // not sure which object should allocate - WARN("Patch::Data::send_particles_filtered() was called but is not implemented"); + LOGWARN("Patch::Data::send_particles_filtered() was called but is not implemented"); // metadata->data.callSync( &Patch::Data::send_particles_filtered, destination, filter ); return 0; }; diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h index fa03ef8..ee217a3 100644 --- a/src/ParticlePatch.h +++ b/src/ParticlePatch.h @@ -195,11 +195,11 @@ public: // [](size_t idx, Patch::Data d)->bool { return true; } ); void clear() { - WARN("Patch::clear() was called but is not implemented"); + LOGWARN("Patch::clear() was called but is not implemented"); } size_t test() { - WARN("Patch::test() was called but is not implemented"); + LOGWARN("Patch::test() was called but is not implemented"); return 1; } diff --git a/src/Proxy.h b/src/Proxy.h index 61dea75..3f3dd04 100644 --- a/src/Proxy.h +++ b/src/Proxy.h @@ -15,13 +15,32 @@ struct Resource { ResourceType type; ///< Type of the resource. size_t id; ///< ID or any other identifier associated with the resource. + // Q: should this return True for GPU that is attached/assigned to current thread? Currently assuming yes. 
HOST DEVICE bool is_local() const { - WARN("Resource::is_local() not implemented; returning true"); - return true; + bool ret = true; +// #ifdef __CUDA_ACC__ +// ret = (type == GPU); +// #else +// ret = (type == CPU); +// #endif + LOGWARN("Resource::is_local() not fully implemented; returning {}",ret); + return ret; }; // HOST DEVICE static bool is_local() { // check if thread/gpu idx matches some global idx }; + + static Resource Local() { + LOGWARN("Resource::Local() not properly implemented"); +#ifdef __CUDA_ACC__ + return Resource{ GPU, 0 }; +#else + return Resource{ CPU, 0 }; +#endif + }; + bool operator==(const Resource& other) const { return type == other.type && id == other.id; }; + }; + // START traits // These ugly bits of code help implement SFINAE in C++14 and should likely be removed if a newer standard is adopted // https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type @@ -43,6 +62,8 @@ template <typename...> using void_t = void; // struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; // END traits + +// Used by Proxy class template <typename T, typename = void> struct Metadata_t { }; template <typename T> @@ -50,50 +71,101 @@ struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata { }; // struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; +// template<typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr> +// struct Proxy { +// /** +// * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. +// */ +// Proxy() : location{Resource{Resource::CPU,0}}, addr{nullptr} {}; +// Proxy(const Resource& r, T* obj) : location{r}, addr{obj} {}; + +// /** +// * @brief Overloaded operator-> returns the address of the underlying object. +// * @return The address of the underlying object. +// */ +// auto operator->() { return addr; } +// auto operator->() const { return addr; } + +// /** +// * @brief The resource associated with the data represented by the proxy. +// */ +// Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. +// T* addr; ///< The address of the underlying object. +// }; + /** * @brief Template class representing a proxy for the underlying data. * @tparam T The type of the underlying data. */ -// C++17 way: template<typename T, typename Metadata = std::void_t<typename T::Metadata>> -// C++14 way: -//template<typename T, typename Metadata = typename std::conditional<has_metadata<T>::value, typename T::Metadata, void>::type> -template<typename T> -class Proxy { - - // Define Metadata types using SFINAE - // template<typename=void> struct Metadata_t { }; - // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; - // template<typename=void> struct Metadata_t { }; - // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; - // using Metadata_t = Metadata_t<T>; + + + // Q: why a pointer? + // Q: rename to Metadata? 
Has + // consequences for Proxy<Proxy<T>> + // that limits specialization but uses + // T::Metadata automatically + // + // A: depends on how Proxy<Proxy<T>> + // objects are used -public: + // void move(const Resource& newloc) { + // LOGTRACE("Moving object from {} to {}", location, newloc); + // Proxy<T> new_proxy; + + // switch (location.type) { + // case Resource::CPU: + // if (location.is_local()) { + // new_proxy = send(location, &addr); + // } else { + // Exception( NotImplementedError, "Proxy::move() non-local CPU calls" ); + // } + // break; + // case Resource::GPU: + // if (location.is_local()) { + // Exception( NotImplementedError, "Proxy::move() local GPU calls" ); + // } else { + // Exception( NotImplementedError, "Proxy::move() non-local GPU calls" ); + // } + // break; + // case Resource::MPI: + // Exception( NotImplementedError, "MPI move (deprecate?)" ); + // break; + // default: + // Exception( ValueError, "Proxy::move(): Unknown resource type" ); + // } + + // // auto Proxy<T>{ location , newaddr } + // // auto tmpmeta = send(newloc, metadata); + + // location = newloc; + // addr = new_proxy.addr; + // metadata = new_proxy.metadata; + // } +// C++17 way: template<typename T, typename Metadata = std::void_t<typename T::Metadata>> +// C++14 way: template<typename T, typename Metadata = typename std::conditional<has_metadata<T>::value, typename T::Metadata, void>::type> +// Neither needed! +template<typename T, typename Enable = void> +struct Proxy { /** * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. */ - Proxy<T>() : location(Resource{Resource::CPU,0}), addr(nullptr) {}; - Proxy<T>(const Resource& r, T* obj) : location(r), addr(obj) {}; + Proxy() : location(Resource{Resource::CPU,0}), addr(nullptr) {}; + Proxy(const Resource& r, T* obj) : location(r), addr(obj) {}; /** * @brief Overloaded operator-> returns the address of the underlying object. * @return The address of the underlying object. */ - auto operator->() { return addr; } - auto operator->() const { return addr; } + auto operator->() { return addr; }; + auto operator->() const { return addr; }; /** * @brief The resource associated with the data represented by the proxy. */ Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. T* addr; ///< The address of the underlying object. - Metadata_t<T>* metadata; // Q: rename to Metadata? Has - // consequences for Proxy<Proxy<T>> - // that limits specialization but uses - // T::Metadata automatically - // - // A: depends on how Proxy<Proxy<T>> - // objects are used + Metadata_t<T>* metadata; ///< T-specific metadata that resides in same memory space as Proxy<T> // Use two template parameter packs as suggested here: https://stackoverflow.com/questions/26994969/inconsistent-parameter-pack-deduction-with-variadic-templates template <typename RetType, typename... Args1, typename... 
Args2> @@ -105,14 +177,17 @@ public: } else { Exception( NotImplementedError, "Proxy::callSync() non-local CPU calls" ); } + break; case Resource::GPU: if (location.is_local()) { Exception( NotImplementedError, "Proxy::callSync() local GPU calls" ); } else { Exception( NotImplementedError, "Proxy::callSync() non-local GPU calls" ); } + break; case Resource::MPI: Exception( NotImplementedError, "MPI sync calls (deprecate?)" ); + break; default: Exception( ValueError, "Proxy::callSync(): Unknown resource type" ); } @@ -129,14 +204,17 @@ public: } else { Exception( NotImplementedError, "Proxy::callAsync() non-local CPU calls" ); } + break; case Resource::GPU: if (location.is_local()) { Exception( NotImplementedError, "Proxy::callAsync() local GPU calls" ); } else { Exception( NotImplementedError, "Proxy::callAsync() non-local GPU calls" ); } + break; case Resource::MPI: Exception( NotImplementedError, "MPI async calls (deprecate?)" ); + break; default: Exception( ValueError, "Proxy::callAsync(): Unknown resource type" ); } @@ -144,6 +222,66 @@ public: } }; +// Specialization for bool/int/float types that do not have member functions +template<typename T> +struct Proxy<T, typename std::enable_if_t<std::is_arithmetic<T>::value>> { + /** + * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. + */ + Proxy() : location{Resource{Resource::CPU,0}}, addr{nullptr} {}; + Proxy(const Resource& r, T* obj) : location{r}, addr{obj} {}; + + /** + * @brief Overloaded operator-> returns the address of the underlying object. + * @return The address of the underlying object. + */ + auto operator->() { return addr; } + auto operator->() const { return addr; } + + /** + * @brief The resource associated with the data represented by the proxy. + */ + Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. + T* addr; ///< The address of the underlying object. +}; + + +// using Proxy<int> = SimpleProxy<int>; + + + +// class Proxy<int> { + +// // Define Metadata types using SFINAE +// // template<typename=void> struct Metadata_t { }; +// // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; +// // template<typename=void> struct Metadata_t { }; +// // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; +// // using Metadata_t = Metadata_t<T>; + +// public: + +// /** +// * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. +// */ +// Proxy<int>() : location(Resource{Resource::CPU,0}), addr(nullptr) {}; +// Proxy<int>(const Resource& r, int* obj) : location(r), addr(obj) {}; + +// /** +// * @brief Overloaded operator-> returns the address of the underlying object. +// * @return The address of the underlying object. +// */ +// auto operator->() { return addr; } +// auto operator->() const { return addr; } + +// /** +// * @brief The resource associated with the data represented by the proxy. +// */ +// Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. +// int* addr; ///< The address of the underlying object. 
+// }; + + // // Partial specialization // template<typename T> // using Proxy<T> = Proxy<T, std::void_t<typename T::Metadata>> @@ -165,7 +303,7 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T case Resource::GPU: if (location.is_local()) { if (dest == nullptr) { // allocate if needed - TRACE(" cudaMalloc for array"); + LOGTRACE(" cudaMalloc for array"); gpuErrchk(cudaMalloc(&dest, sizeof(T))); } gpuErrchk(cudaMemcpy(dest, &obj, sizeof(T), cudaMemcpyHostToDevice)); @@ -176,7 +314,7 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T case Resource::CPU: if (location.is_local()) { if (dest == nullptr) { // allocate if needed - TRACE(" Alloc CPU memory for ", decltype(T)); + LOGTRACE(" Allocate CPU memory for {}", decltype(T)); dest = new T; } memcpy(dest, &obj, sizeof(T)); @@ -203,10 +341,10 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T */ template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr> HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { - TRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); + LOGTRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); // Simple objects can simply be copied without worrying about contained objects and arrays auto ret = _send_ignoring_children(location, obj, dest); - TRACE("...done sending"); + LOGTRACE("...done sending"); // printf("...done\n"); return ret; } @@ -222,11 +360,13 @@ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { */ template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr> HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { - TRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); + LOGTRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location Proxy<T> ret = _send_ignoring_children<T>(location, dummy, dest); - TRACE("... clearing dummy complex object"); + LOGTRACE("... clearing dummy complex object"); dummy.clear(); - TRACE("... done sending"); + LOGTRACE("... done sending"); return ret; } + + diff --git a/src/Random.cu b/src/Random.cu index 620c3f0..42a8d2b 100644 --- a/src/Random.cu +++ b/src/Random.cu @@ -187,10 +187,10 @@ public: }; // Methods for generating random values - HOST inline float gaussian() { + HOST DEVICE inline float gaussian() { return 0; }; - HOST inline float gaussian(size_t idx, size_t thread) { + HOST DEVICE inline float gaussian(size_t idx, size_t thread) { return 0; }; @@ -226,31 +226,448 @@ void RandomImplCUDA_init_kernel(unsigned long seed, curandState_t *state, int nu } } +// template<size_t NUM_THREADS = 128, size_t buffer_size = 1024, typename T=int> +// class RandomImplGPUres : public Random { +// public: + +// RandomImplGPURes(Random::Conf c, Resource& location) : location{location}, seed{c.seed}, num_states{c.num_threads}, num{0} { +// // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel + +// LOGINFO("Creating RandomImplGPUres with seed {}", seed); +// assert( location.type == Resource::GPU ); +// LOGINFO("...Sending data"); + + +// states_d = states_d.send_children(location); +// buffer_d = buffer_d.send_children(location); + +// // LOGINFO("...Cleaning temporary data"); +// // delete [] tmp.states; +// // delete [] tmp.buffer; + +// // gpuErrchk(cudaMalloc(&(data.states), sizeof(curandState_t) * c.num_threads)); +// // int nBlocks = c.num_threads / NUM_THREADS + 1; +// // initKernel<<< nBlocks, NUM_THREADS >>>(seed, data.states, NUM_THREADS); +// // gpuErrchk(cudaDeviceSynchronize()); + +// // TODO consider location whe ncreating generator! +// // Create RNG and set seed +// LOGINFO("...Creating generator"); +// curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW); +// curandSetPseudoRandomGeneratorSeed( generator, seed ); + +// // Set seed +// LOGINFO("...Setting seed"); +// gpuErrchk(cudaDeviceSynchronize()); +// LOGINFO("...synced"); +// size_t num_blocks = 1 + (c.num_threads-1)/NUM_THREADS; +// LOGINFO("...launching"); +// // LOGINFO("...hmmm {}", (void*) metadata.states); +// RandomImplGPUres_init_kernel<<< num_blocks, NUM_THREADS >>>(seed, states_d.values, num_states); +// LOGINFO("...Kernel launched"); +// gpuErrchk(cudaDeviceSynchronize()); +// LOGINFO("...Done"); + +// // Allocate temporary storage + -// GPU-resident? +// // gpuErrchk(cudaMalloc((void**) &(data.buffer), sizeof(size_t) * buffer_size)); + +// }; + +// // struct Data { +// // static_assert( sizeof(float) == sizeof(int) ); +// // static_assert( sizeof(float) == sizeof(T) ); + +// // Data send_children(Resource& location) const { +// // Data tmp{}; +// // tmp.states = states.send_children(location); +// // tmp.buffer = buffer.send_children(location); +// // return tmp; +// // }; +// // void clear() {}; +// // }; + +// private: +// // Proxy<Data> data; +// size_t seed; +// curandGenerator_t generator; +// size_t num_states; +// Array<curandState_t> states_d{NUM_THREADS}; +// Array<T> buffer_d{buffer_size}; // Cast when you need (float) vs int? +// Array<T> local_buffer{buffer_size}; // What about Gaussian (float) vs int? 
+// size_t num; // number of entries left in local_buffer + +// // Methods for maninpulating the state +// void init(size_t num, unsigned long seed, size_t offset) { +// // gpuErrchk(cudaMalloc((void**)&uniform_d, sizeof(float) * RAND_N)); +// // gpuErrchk(cudaMalloc((void**)&integer_d, sizeof(unsigned int) * RAND_N)); +// // gpuErrchk(cudaMalloc((void**)&gaussian_d, sizeof(float) * RAND_N)); +// // integer_h = new unsigned int[RAND_N]; +// // uniform_h = new float[RAND_N]; +// // gaussian_h = new float[RAND_N]; +// // uniform_n = 0; +// // integer_n = 0; +// // gaussian_n = 0; +// // // +// }; + +// public: +// // RandomImplCUDA<NUM_THREADS,buffer_size> send_children(Resource& location) const { +// // RandomImplCUDA<NUM_THREADS,buffer_size> tmp{}; +// // tmp.states = states.send_children(location); +// // tmp.buffer = buffer.send_children(location); +// // return tmp; +// // }; +// // void clear() {}; + +// // TODO: could introduce dual buffer so random numbers can be filled asynchronously +// // Methods for generating random values +// HOST DEVICE inline float gaussian() { +// if (num == 0) { +// cuRandchk(curandGenerateUniform( generator, (float*) buffer_d.values, buffer_size )); +// gpuErrchk(cudaMemcpy(local_buffer.values, buffer_d.values, sizeof(float) * buffer_size, cudaMemcpyDeviceToHost)); +// num = buffer_size; +// } +// return *reinterpret_cast<float*>(&local_buffer[--num]); +// }; +// HOST DEVICE inline float gaussian(size_t idx, size_t thread) { +// return 0; +// }; + +// // HOST DEVICE inline float gaussian(RandomState* state) {}; +// HOST DEVICE inline Vector3 gaussian_vector(size_t idx, size_t thread) { +// return Vector3(); +// }; +// }; + + +using curandState = curandStateXORWOW_t; +template<size_t NUM_THREADS = 128> +class RandomGPU : public Random { +public: + + DEVICE RandomGPU(Random::Conf c, Resource& location) : num_states{c.num_threads}, states{nullptr} { + // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel + // Not sure how this + assert( threadIdx.x + blockIdx.x == 0 ); + + LOGINFO("Creating RandomGPU", seed); + assert( location.type == Resource::GPU ); + states = new curandState[num_states]; + } + DEVICE ~RandomGPU() { + assert( threadIdx.x + blockIdx.x == 0 ); + if (states != nullptr) delete [] states; + states = nullptr; + }; + +private: + size_t num_states; + curandState* states; // RNG states stored in global memory + +public: + // Methods for maninpulating the state + DEVICE void init(size_t num, unsigned long seed, size_t offset = 0) { + // curand_init(unsigned long long seed, + // unsigned long long sequence, + // unsigned long long offset, + // curandStateXORWOW_t *state) + assert( states != nullptr ); + for (size_t idx = threadIdx.x + blockIdx.x*blockDim.x; idx < NUM_THREADS; idx += blockDim.x * gridDim.x) { + curand_init(seed, tidx, offset, curandState &state[idx]); + } + }; + + // // This _might_ be made more generic by using whatever chunk of shm is needed, then returning end of pointer + // DEVICE inline void copy_state_to_shm(curandState *shm) { + // size_t idx = threadIdx.x + blockIdx.x*blockDim.x; + // states_shm = &shm; + // for (; idx < NUM_THREADS; idx += blockDim.x * gridDim.x) { + // (*states_shm)[idx] = states[idx]; + // } + // // __syncthreads(); + + // } + + DEVICE inline float gaussian() { + size_t idx = threadIdx.x + blockIdx.x*blockDim.x; + assert( idx < num_states ); + return curand_normal(&states[idx]); + }; + + DEVICE inline float gaussian(curandState& state) { return curand_normal( &state ); }; + + // Useful for getting the RNG state for the thread from global memory, putting it in local memory for use + DEVICE inline curandState* get_gaussian_state() { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + return (i < num_states) ? &(states[i]) : nullptr; + }; + + DEVICE inline void set_gaussian_state(curandState& state) { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + if (i < num_states) states[i] = state; + }; + + DEVICE inline Vector3 gaussian_vector() { + return Vector3( gaussian(), gaussian(), gaussian() ); + }; + + DEVICE inline Vector3 gaussian_vector( curandState& state) { + return Vector3( gaussian(state), gaussian(state), gaussian(state) ); + }; +}; + + +/* + * Note: this implementation (and probably the interface) should be + * reworked. Generating RNs on the GPU doesn't require or benefit + * from the Data buffers, so probably those should be removed from a + * device-resident object. Ideally this object can implement a + * uniform and lightweight interface that enables efficient use + * through the agnostic API provided by the Random class. + */ template<size_t NUM_THREADS = 128, size_t buffer_size = 1024> class RandomImplCUDA : public Random { +public: + + template<typename T> + struct Data { + Array<float> buffer{buffer_size}; + size_t num; // number of unread entries in buffer + + Data send_children(Resource& location) const { + Data tmp{}; + tmp.buffer = buffer.send_children(location); + LOGTRACE( "Clearing RANDOM::Data::buffer..." ); + buffer.clear(); // Not sure if this should be done here! + return tmp; + }; + + }; + + RandomImplCUDA(Random::Conf c, Resource& location) : location{location}, seed{c.seed}, num_states{c.num_threads}, num{0} { + // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel + + LOGINFO("Creating RandomImplCUDA with seed {}", seed); + assert( location.type == Resource::GPU ); + LOGINFO("...Sending data"); + + // Data tmp; + // data = send(location, tmp); + // metadata.buffer = new size_t[buffer_size]; // TODO handle case if RandomImplCUDA is on location + // metadata.num = 0; + + states_d = states_d.send_children(location); + + // LOGINFO("...Cleaning temporary data"); + // delete [] tmp.states; + // delete [] tmp.buffer; + + // gpuErrchk(cudaMalloc(&(data.states), sizeof(curandState_t) * c.num_threads)); + // int nBlocks = c.num_threads / NUM_THREADS + 1; + // initKernel<<< nBlocks, NUM_THREADS >>>(seed, data.states, NUM_THREADS); + // gpuErrchk(cudaDeviceSynchronize()); + + // TODO consider location whe ncreating generator! + // Create RNG and set seed + LOGINFO("...Creating generator"); + curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_XORWOW); + curandSetPseudoRandomGeneratorSeed( generator, seed ); + + // Set seed + LOGINFO("...Setting seed"); + gpuErrchk(cudaDeviceSynchronize()); + LOGINFO("...synced"); + size_t num_blocks = 1 + (c.num_threads-1)/NUM_THREADS; + LOGINFO("...launching"); + // LOGINFO("...hmmm {}", (void*) metadata.states); + RandomImplCUDA_init_kernel<<< num_blocks, NUM_THREADS >>>(c.seed, states_d.values, num_states); + LOGINFO("...Kernel launched"); + gpuErrchk(cudaDeviceSynchronize()); + LOGINFO("...Done"); + + // Allocate temporary storage + + + // gpuErrchk(cudaMalloc((void**) &(data.buffer), sizeof(size_t) * buffer_size)); + + }; + + // struct Data { + // static_assert( sizeof(float) == sizeof(int) ); + // static_assert( sizeof(float) == sizeof(T) ); + + // Data send_children(Resource& location) const { + // Data tmp{}; + // tmp.states = states.send_children(location); + // tmp.buffer = buffer.send_children(location); + // return tmp; + // }; + // void clear() {}; + // }; + +private: + Resource location; + size_t seed; + curandGenerator_t generator; + size_t num_states; + Array<curandState_t> states_d; + + // Buffers for generating numbers from CPU-resident + Data<float>* gauss_d; + Data<float>* poisson_d; + Data<float>* uniform_d; + + Data<float> gauss; + Data<float> poisson; + Data<float> uniform; + + + // Methods for maninpulating the state + void init(size_t num, unsigned long seed, size_t offset) { + // gpuErrchk(cudaMalloc((void**)&uniform_d, sizeof(float) * RAND_N)); + // gpuErrchk(cudaMalloc((void**)&integer_d, sizeof(unsigned int) * RAND_N)); + // gpuErrchk(cudaMalloc((void**)&gaussian_d, sizeof(float) * RAND_N)); + // integer_h = new unsigned int[RAND_N]; + // uniform_h = new float[RAND_N]; + // gaussian_h = new float[RAND_N]; + // uniform_n = 0; + // integer_n = 0; + // gaussian_n = 0; + // // + }; + +public: + // RandomImplCUDA<NUM_THREADS,buffer_size> send_children(Resource& location) const { + // RandomImplCUDA<NUM_THREADS,buffer_size> tmp{}; + // tmp.states = states.send_children(location); + // tmp.buffer = buffer.send_children(location); + // return tmp; + // }; + // void clear() {}; + + // TODO: could introduce dual buffer so random numbers can be filled asynchronously + // Methods for generating random values + HOST DEVICE inline float gaussian() { +#ifdef __CUDA__ARCH__ + size_t& i = threadIdx.x + blockSize.x*blockIdx.x + if (i < num_states) { + return curand_normal( &(state_d[i]) ); + } + return 0.0f; +#else + if (num == 0) { + cuRandchk(curandGenerateUniform( generator, (float*) gauss.buffer_d.values, buffer_size )); + 
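 // Host-side refill step: the call above generates a batch on the device, and the
 // copy below brings it into the host-side buffer, which subsequent calls drain one
 // value at a time. Note that curandGenerateUniform() yields uniform samples; a
 // Gaussian refill would presumably call
 // curandGenerateNormal(generator, (float*) gauss.buffer_d.values, buffer_size, 0.0f, 1.0f).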
gpuErrchk(cudaMemcpy(local_buffer.values, buffer_d.values, sizeof(float) * buffer_size, cudaMemcpyDeviceToHost)); + num = buffer_size; + } + return (&local_buffer[--num]); +#endif + }; + + __device__ inline float gaussian() { + }; + + DEVICE inline float gaussian(curandState* state) { return curand_normal( state ); }; + + // Useful for getting the RNG state for the thread from global memory, putting it in local memory for use + DEVICE inline curandState* get_gaussian_state() { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + if (i < num_states) { + return &(state_d[i]); + } + return nullptr; + }; + + DEVICE inline void set_gaussian_state(curandState* state) { + size_t& i = threadIdx.x + blockSize.x*blockIdx.x; + if (i < num_states) state_d[i] = *state; + }; + + // HOST inline float gaussian(size_t idx, size_t thread) { + // return 0; + // }; + + // HOST DEVICE inline float gaussian(RandomState* state) {}; + HOST inline Vector3 gaussian_vector(size_t idx, size_t thread) { + return Vector3(); + }; + + // unsigned int integer() { + // return 0; + // }; + // unsigned int poisson(float lambda) { + // return 0; + // }; + // float uniform() { + // return 0; + // }; +}; + +/* GPU-resident? +template<size_t NUM_THREADS = 128, size_t buffer_size = 1024> +class RandomImplCUDA_DEPRECATE : public Random { public: struct Data { - Data() : states(nullptr), buffer(nullptr), num(0) {}; - curandState_t *states; - size_t* buffer; // What about Gauss vs int? - size_t num; // Remove? + Data() : states(nullptr), buffer(nullptr), metadata(nullptr) {}; + // Data(Data&& orig) : num_states(orig.states), states(orig.states), buffer(orig.buffer), num(orig.num) {}; + + struct Metadata { + size_t num_states; + size_t num; // Remove? + + }; + Array<curandState_t> states; + size_t* buffer; // What about Gaussian (float) vs int? + // static_assert( sizeof(float) == sizeof(size_t) ); + // static_assert( sizeof(int) == sizeof(size_t) ); + Metadata* metadata; + + Data send_children(Resource& location) const { + const Resource& local = Resource::Local(); + Data tmp{*this}; + tmp.states = nullptr; + tmp.buffer = nullptr; + + if (local.type == Resource::CPU) { + if (location.type == Resource::GPU) { + if (states != nullptr) { + gpuErrchk(cudaMalloc(&(tmp.states), sizeof(curandState_t) * num_states)); + gpuErrchk(cudaMemcpy(tmp.states, states, sizeof(curandState_t) * num_states, cudaMemcpyHostToDevice)); + } + if (buffer != nullptr) { + gpuErrchk(cudaMalloc(&(tmp.buffer), sizeof(size_t) * buffer_size)); + gpuErrchk(cudaMemcpy(tmp.buffer, buffer, sizeof(float) * buffer_size, cudaMemcpyHostToDevice)); + } + } else { + Exception(NotImplementedError, ""); + } + } else { + Exception(NotImplementedError, ""); + } + return tmp; + }; + void clear() { + Exception(NotImplementedError, ""); + }; }; // Metadata stored on host even if Data is on device struct Metadata { curandGenerator_t generator; - Proxy<Data> data; // state data may be found elsewhere - size_t* buffer; - size_t num; + curandState_t *states_d; + size_t* buffer_d; // Device buffer (possibly used) + Proxy<Data> data; // State data that may be found elsewhere + size_t* buffer; // Local buffer + size_t num; // Number of elements in local buffer }; - RandomImplCUDA(Random::Conf c, Resource&& location) { + RandomImplCUDA_DEPRECATE(Random::Conf c, Resource& location) { // Note: NUM_THREADS refers to CUDA threads, whereas c.num_threads refers to (?) 
number of random numbers to be generated in parallel - INFO("Creating RandomImplCUDA with seed {}", c.seed); + LOGINFO("Creating RandomImplCUDA with seed {}", c.seed); assert( location.type == Resource::GPU ); @@ -261,14 +678,14 @@ public: tmp.buffer = new size_t[buffer_size]; tmp.num = 0; - INFO("...Sending data"); + LOGINFO("...Sending data"); metadata.data = send(location, tmp); metadata.buffer = new size_t[buffer_size]; // TODO handle case if RandomImplCUDA is on location metadata.num = 0; - INFO("...Cleaning temporary data"); + LOGINFO("...Cleaning temporary data"); delete [] tmp.states; delete [] tmp.buffer; @@ -279,19 +696,21 @@ public: // TODO consider location whe ncreating generator! // Create RNG and set seed - INFO("...Creating generator"); + LOGINFO("...Creating generator"); curandCreateGenerator(&(metadata.generator), CURAND_RNG_PSEUDO_XORWOW); curandSetPseudoRandomGeneratorSeed( metadata.generator, c.seed ); // Set seed - INFO("...Setting seed"); + LOGINFO("...Setting seed"); gpuErrchk(cudaDeviceSynchronize()); - INFO("...synced"); + LOGINFO("...synced"); size_t num_blocks = 1 + (c.num_threads-1)/NUM_THREADS; - RandomImplCUDA_init_kernel<<< num_blocks, NUM_THREADS >>>(c.seed, metadata.data.addr->states, c.num_threads); - INFO("...Kernel launched"); + LOGINFO("...launching"); + // LOGINFO("...hmmm {}", (void*) metadata.states); + RandomImplCUDA_init_kernel<<< num_blocks, NUM_THREADS >>>(c.seed, metadata.states, c.num_threads); + LOGINFO("...Kernel launched"); gpuErrchk(cudaDeviceSynchronize()); - INFO("...Done"); + LOGINFO("...Done"); // Allocate temporary storage @@ -346,6 +765,7 @@ private: // void reorder(int *a, int n); }; +// */ #endif Random* Random::GetRandom(Conf&& conf) { @@ -365,14 +785,15 @@ Random* Random::GetRandom(Conf&& conf) { if (inserted) { // Conf not found, so create a new one Random* tmp; - + Resource r{conf.backend == Conf::CUDA ? Resource::GPU : Resource::CPU ,0}; + switch (conf.backend) { case Conf::CUDA: #ifdef USE_CUDA - // TODO: replace Resource below / overload GetRandom() so it can target a resource - tmp = new RandomImplCUDA<128,1024>(conf, Resource{Resource::GPU,0} ); + // TODO: replace Resource below / overload GetRandom() so it can target a resource + tmp = new RandomImplCUDA<128,1024>(conf, r ); #else - WARN("Random::GetRandom(): CUDA disabled, creating CPU random instead"); + LOGWARN("Random::GetRandom(): CUDA disabled, creating CPU random instead"); tmp = new RandomImpl(); #endif break; diff --git a/src/Random.h b/src/Random.h index c5d5786..8081760 100644 --- a/src/Random.h +++ b/src/Random.h @@ -31,23 +31,40 @@ public: }; // size_t thread_idx; - struct State; + struct State; // Not sure whether to use something like this to capture the RNG state... static Random* GetRandom(Conf&& conf); // Methods for maninpulating the state // virtual void init(size_t num, unsigned long seed, size_t offset) = 0; - // Methods for generating random values - virtual inline float gaussian() = 0; - virtual inline float gaussian(size_t idx, size_t thread) = 0; - // HOST DEVICE inline float gaussian(RandomState* state); - virtual inline Vector3 gaussian_vector(size_t idx, size_t thread) = 0; + // // Methods for generating random values + // // Use of this convenience method is discouraged... maybe remove? + // HOST DEVICE virtual inline float gaussian() = 0; + // // virtual inline float gaussian(size_t idx, size_t thread) = 0; + + // How can we redesign to avoid using virtual functions for this low-level stuff? 
Perhaps the GetRandom() approach should be dropped - virtual unsigned int integer() = 0; - virtual unsigned int poisson(float lambda) = 0; + // Here we use void pointers to acheive polymorphism; we lose type + // safety but gain the ability to have a single approach to invoking Random + HOST DEVICE virtual inline float gaussian(void* state) = 0; + + template<typename S> + HOST DEVICE virtual inline S* get_gaussian_state() = 0; + + template<typename S> + HOST DEVICE virtual inline void set_gaussian_state(S* state) = 0; + + + + // HOST DEVICE inline float gaussian(RandomState* state); + // virtual inline Vector3 gaussian_vector(size_t idx, size_t thread) = 0; + virtual float uniform() = 0; + // virtual unsigned int integer() = 0; + // virtual unsigned int poisson(float lambda) = 0; + // void reorder(int *a, int n); protected: @@ -58,39 +75,3 @@ protected: // } }; - -// class RandomImpl : public Random { -// public: -// // Methods for maninpulating the state -// void init(size_t num, unsigned long seed, size_t offset); - -// // // Methods for generating random values -// HOST inline float gaussian(); -// HOST inline float gaussian(size_t idx, size_t thread); -// // HOST DEVICE inline float gaussian(RandomState* state) {}; -// HOST inline Vector3 gaussian_vector(size_t idx, size_t thread); -// unsigned int integer() { return 0; }; -// unsigned int poisson(float lambda) { return 0; }; -// float uniform() { return 0; }; - -// // // void reorder(int *a, int n); - -// }; - -#ifdef USE_CUDA -// class RandomImplCUDA : public Random { -// public: -// // Methods for maninpulating the state -// void init(size_t num, unsigned long seed, size_t offset); - -// // // Methods for generating random values -// HOST inline float gaussian(); -// HOST inline float gaussian(size_t idx, size_t thread); -// // HOST DEVICE inline float gaussian(RandomState* state) {}; -// HOST inline Vector3 gaussian_vector(size_t idx, size_t thread); -// unsigned int integer() { return 0; }; -// unsigned int poisson(float lambda) { return 0; }; -// float uniform() { return 0; }; - -// }; -#endif diff --git a/src/SignalManager.h b/src/SignalManager.h index 1d517d0..704f39b 100644 --- a/src/SignalManager.h +++ b/src/SignalManager.h @@ -21,13 +21,13 @@ #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE #endif -#define TRACE(...) SPDLOG_TRACE(__VA_ARGS__) -#define DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__) +#define LOGTRACE(...) SPDLOG_TRACE(__VA_ARGS__) +#define LOGDEBUG(...) SPDLOG_DEBUG(__VA_ARGS__) // #define DEBUG(...) spdlog::debug(__VA_ARGS__) -#define INFO(...) SPDLOG_INFO(__VA_ARGS__) -#define WARN(...) SPDLOG_WARN(__VA_ARGS__) -#define ERROR(...) SPDLOG_ERROR(__VA_ARGS__) -#define CRITICAL(...) SPDLOG_CRITICAL(__VA_ARGS__) +#define LOGINFO(...) SPDLOG_INFO(__VA_ARGS__) +#define LOGWARN(...) SPDLOG_WARN(__VA_ARGS__) +#define LOGERROR(...) SPDLOG_ERROR(__VA_ARGS__) +#define LOGCRITICAL(...) SPDLOG_CRITICAL(__VA_ARGS__) // spdlog::set_level(spdlog::level::trace); //*/ diff --git a/src/SimManager.cpp b/src/SimManager.cpp index 16e5fb2..11cd324 100644 --- a/src/SimManager.cpp +++ b/src/SimManager.cpp @@ -3,7 +3,7 @@ #include <memory> void SimManager::run() { - INFO("Running!"); + LOGINFO("Running!"); // TODO: process a command queue? 
// SimSystem sys = SimSystem(); @@ -27,7 +27,7 @@ void SimManager::run() { // #endif for (size_t step = 0; step < 10; ++step) { - INFO("Step {}: random {}", step, r->gaussian()); + LOGINFO("Step {}: random {:0.2f}", step, r->gaussian()); p.compute(); #ifdef USE_CUDA cudaDeviceSynchronize(); diff --git a/src/Types/Array.h b/src/Types/Array.h index 91dcbcf..8da11cc 100644 --- a/src/Types/Array.h +++ b/src/Types/Array.h @@ -11,8 +11,7 @@ // Simple templated array object without resizing capabilities template<typename T> -class Array { -public: +struct Array { HOST DEVICE inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this); HOST inline Array<T>(size_t num) : num(num), values(nullptr) { // printf("Constructing Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values); @@ -118,11 +117,11 @@ public: template <typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr> HOST inline Array<T> send_children(const Resource& location) { T* values_d = nullptr; - + // Allocate and copy at values_d if (num > 0) { size_t sz = sizeof(T) * num; - // printf(" cudaMalloc for %d items\n", num); + LOGINFO(" Array<{}>.send_children(...): cudaMalloc for {} items", type_name<T>(), num); switch (location.type) { case Resource::GPU: gpuErrchk(cudaMalloc(&values_d, sz)); @@ -138,12 +137,14 @@ public: Exception( ValueError, "Unknown Resource type" ); } } + LOGINFO(" Array<{}>.send_children(...): done copying", type_name<T>()); // Copy Array with pointers correctly assigned Array<T> tmp(0); tmp.num = num; tmp.values = values_d; // printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d); + LOGINFO(" Array<{}>.send_children(...): done", type_name<T>()); return tmp; } template <typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr> @@ -350,6 +351,7 @@ private: values = nullptr; } +public: size_t num; T* __restrict__ values; }; -- GitLab
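The commit subject anticipates a redesign away from virtual functions using templates. A minimal sketch of one such direction (the RandomBase/RandomCPU names and the kinetic_kick() caller are illustrative assumptions, not taken from this patch) uses CRTP so that gaussian() and the state accessors resolve at compile time; this also sidesteps the fact that the virtual member function templates declared in Random.h above (get_gaussian_state<S>(), set_gaussian_state<S>()) are not legal C++.

#include <random>

// CRTP base: the interface binds at compile time, so host and device backends can
// differ freely and no vtable needs to live in device memory.
template<typename Impl>
struct RandomBase {
    float gaussian() { return static_cast<Impl*>(this)->gaussian_impl(); }
    template<typename S>
    S* get_gaussian_state() { return static_cast<Impl*>(this)->template get_state_impl<S>(); }
};

// Host-side backend built on the C++ standard library.
struct RandomCPU : RandomBase<RandomCPU> {
    explicit RandomCPU(unsigned long seed) : engine(seed), dist(0.0f, 1.0f) {}
    float gaussian_impl() { return dist(engine); }
    template<typename S>
    S* get_state_impl() { return nullptr; }   // no per-thread device state on the host
private:
    std::mt19937 engine;
    std::normal_distribution<float> dist;
};

// Callers are written against the template parameter instead of a Random* base pointer.
template<typename R>
float kinetic_kick(RandomBase<R>& rng) { return 0.5f * rng.gaussian(); }

// Usage: RandomCPU rng(42);  float v = kinetic_kick(rng);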