From d8e076183644c9879c39a1b4e84a0dc877eaa3d7 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Wed, 23 Oct 2024 20:27:36 -0500
Subject: [PATCH] Development on Proxy template class, with lots to clean up

---
 src/ParticlePatch.cpp |  27 +++++-
 src/ParticlePatch.h   |  47 ++++++++--
 src/Patch.h           |  48 +++++------
 src/Proxy.h           | 195 ++++++++++++++++++++++++++++++++++++++----
 src/Random.h          |  40 +++++++++
 src/Resource.cu       |  12 ++-
 src/Resource.h        |  32 +++----
 src/SimManager.cu     |  20 +++--
 src/SimSystem.cpp     |   2 +-
 src/SimSystem.h       |   4 +
 10 files changed, 346 insertions(+), 81 deletions(-)

diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp
index e64231a..a4b76ca 100644
--- a/src/ParticlePatch.cpp
+++ b/src/ParticlePatch.cpp
@@ -7,12 +7,29 @@
 //     // rb_pos = rb_orient = rb_mom = rb_ang_mom = type = rb_type = nullptr;
 // };
 
+size_t BasePatch::global_patch_idx = 0;
+
 void Patch::initialize() {
+    LOGINFO("Creating Patch::Data");
+    /*
     Data* data = new Data{nullptr,nullptr,nullptr,nullptr};	// TODO free
-    metadata = Metadata{0,0,0};
+    // metadata = Metadata{0,0,0};
     // data->pos_force = data->momentum = nullptr;
     // data->particle_types = data->particle_order = nullptr;
-    metadata.data = send( Resource{Resource::CPU,0}, *data );
+    LOGINFO("Sending Patch::Data @{}", fmt::ptr(data));
+    Resource r = Resource{Resource::CPU,0};
+    // if (&(metadata.data) != nullptr) {
+    // 	LOGINFO( "Freeing metadata.data");
+    // 	delete &(metadata.data);
+    // 	LOGINFO( "Freeing metadata");
+    // }
+    // send(r,*data);
+    metadata.data = send(r, *data);
+    LOGINFO("Data sent");
+    */
+    LOGWARN("Patch::initialize(): Creating Data on CPU resource");
+    Resource r = Resource{Resource::CPU,0};
+    metadata.data = construct_remote<Data>(r, capacity);
 };
 
 Patch* Patch::copy_to_cuda(Patch* dev_ptr) const {
@@ -49,10 +66,12 @@ Patch* Patch::copy_to_cuda(Patch* dev_ptr) const {
 };
 
 Patch Patch::send_children(Resource location) const {
+    LOGWARN("Patch::send_children()");
     Patch tmp(0);
     Proxy<Data> newdata;
     switch (location.type) {
     case Resource::GPU:
+	Exception( NotImplementedError, "`send_children(...)` not implemented on GPU" );
 	// metadata.data is already a proxy because the data may reside on the GPU;
 	// Q: In what cases should we move metadata.data.addr?
 	//     For example, we could always move, or we could never move, or we could move if location metadata.data.location is a CPU resource
@@ -80,11 +99,13 @@ Patch Patch::send_children(Resource location) const {
 
 
 Patch Patch::copy_from_cuda(Patch* dev_ptr, Patch* dest) {
+    Exception(NotImplementedError, "Deprecated");
     // TODO
     Patch tmp = Patch();
     return tmp;
 };
 void Patch::remove_from_cuda(Patch* dev_ptr) {
+    Exception(NotImplementedError, "Deprecated");
     // TODO
 };
     
@@ -99,7 +120,7 @@ void Patch::compute() {
 // template<typename T>
 // size_t Patch::send_particles_filtered( Proxy<Patch>& destination, T filter ) {
 size_t Patch::send_particles_filtered( Proxy<Patch>& destination, std::function<bool(size_t,Patch::Data)> filter ) {
-// TODO determine who allocates and cleans up temporary data
+    // TODO determine who allocates and cleans up temporary data
     // TODO determine if there is a good way to avoid having temporary data (this can be done later if delegated to a reasonable object)
 
     // TODO think about the design and whether it's possible to have a single array with start/end
diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h
index ee217a3..7cf7a51 100644
--- a/src/ParticlePatch.h
+++ b/src/ParticlePatch.h
@@ -33,8 +33,28 @@ class BasePatch {
 public:
     // BasePatch(size_t num, short thread_id, short gpu_id, SimSystem& sys);
     // BasePatch(size_t num, short thread_id, short gpu_id);
-    BasePatch() : num(0), capacity(0) {}
-    BasePatch(size_t capacity) : num(0), capacity(capacity) {}
+    BasePatch() : num(0), capacity(0), patch_idx(++global_patch_idx) {}
+    BasePatch(size_t capacity) : num(0), capacity(capacity), patch_idx(++global_patch_idx) {}
+    
+    // Copy constructor
+    BasePatch(const BasePatch& other) : num(other.num), capacity(other.capacity), patch_idx(++global_patch_idx) {
+	LOGTRACE("Copy constructing {} @{}", type_name<decltype(*this)>().c_str(), fmt::ptr(this));
+    }
+    // Move constructor
+    BasePatch(BasePatch&& other) : num(std::move(other.num)), capacity(std::move(other.capacity)), patch_idx(std::move(other.patch_idx)) {
+	LOGTRACE("Move constructing {} @{}", type_name<decltype(*this)>().c_str(), fmt::ptr(this));
+    }
+    // Move assignment operator
+    BasePatch& operator=(BasePatch&& other) {
+	LOGTRACE("Move assigning {} @{}", type_name<decltype(*this)>().c_str(), fmt::ptr(this));
+	num = std::move(other.num);
+	capacity = std::move(other.capacity);
+	patch_idx = std::move(other.patch_idx);
+	// lower_bound = std::move(other.lower_bound);
+	// upper_bound = std::move(other.upper_bound);
+	return *this;
+    }
+
     // ~BasePatch();
     
 protected:
@@ -44,12 +64,15 @@ protected:
     // short thread_id;		// MPI
     // short gpu_id;		// -1 if GPU unavailable
 
-    static size_t patch_idx;		// Unique ID across ranks
+    static size_t global_patch_idx;		// Unique ID across ranks // TODO: preallocate regions that will be used, or offload this to a parallel singleton class
+    /* const */ size_t patch_idx;		// Unique ID across ranks
 
+    // Q: should we have different kinds of patches? E.g. Spheres? This specialization _could_ go into subclass, or we could have a ptr to a Region class that can have different implementations
     Vector3 lower_bound;
     Vector3 upper_bound;
 };
 
+
 // class PatchProxy {
 //     // 
 // public:
@@ -127,11 +150,22 @@ protected:
 // Storage class that should be
 class Patch : public BasePatch {
 public:
-    Patch() : BasePatch() { initialize(); }
+    Patch() : BasePatch(), metadata() { LOGINFO("Creating Patch"); initialize(); LOGINFO("Done Creating Patch"); }
     Patch(size_t capacity) : BasePatch(capacity) { initialize(); }
 
     // Particle data arrays pointing to either CPU or GPU memory
     struct Data {
+	Data(const size_t capacity = 0) {
+	    if (capacity == 0) {
+		pos_force = momentum = nullptr;
+		particle_types = particle_order = nullptr;
+	    } else {
+		pos_force = new VectorArr(capacity);
+		momentum = new VectorArr(capacity);
+		particle_types = new Array<size_t>(capacity);
+		particle_order = new Array<size_t>(capacity);
+	    }
+	}
 	VectorArr* pos_force;
 	VectorArr* momentum;
 	Array<size_t>* particle_types;
@@ -152,7 +186,11 @@ public:
 
     // Metadata stored on host even if Data is on device
     struct Metadata {
+	Metadata() : num(0), capacity(0), min(0), max(0), data() {};
+	Metadata(const Patch& p) : num(p.metadata.num), capacity(p.metadata.capacity), min(p.metadata.min), max(p.metadata.max), data(p.metadata.data) {};
+	Metadata(const Metadata& other) : num(other.num), capacity(other.capacity), min(other.min), max(other.max), data(other.data) {};
 	size_t num;
+	size_t capacity;
 	Vector3 min;
 	Vector3 max;
 	Proxy<Data> data;		// actual data may be found elsewhere
@@ -203,7 +241,6 @@ public:
 	return 1;
     }
 
-    
 private:
     void initialize();
     
diff --git a/src/Patch.h b/src/Patch.h
index cd211f9..2773a3d 100644
--- a/src/Patch.h
+++ b/src/Patch.h
@@ -16,38 +16,38 @@
 #include "PatchOp.h"
 
 
-template<typename Pos, typename Force>
-class Patch {
-    // size_t num_replicas;
+// template<typename Pos, typename Force>
+// class Patch {
+//     // size_t num_replicas;
 
-public:    
-    Patch(SimParameters* sim);
+// public:    
+//     Patch(SimParameters* sim);
 	
     
-    void compute();
+//     void compute();
     
-private:
+// private:
 
-    SimParameters* sim;
+//     SimParameters* sim;
     
-    Pos minimum;
-    Pos maximum;
+//     Pos minimum;
+//     Pos maximum;
 
-    ParticleTypes* types;
-    ParticleTypes* types_d;
+//     ParticleTypes* types;
+//     ParticleTypes* types_d;
 
-    // Particle data
-    size_t num_particles;
+//     // Particle data
+//     size_t num_particles;
 
-    idx_t* global_idx;		// global index of particle
-    size_t* type_ids;
-    Pos* pos;
-    Force* force;
+//     idx_t* global_idx;		// global index of particle
+//     size_t* type_ids;
+//     Pos* pos;
+//     Force* force;
     
-    size_t* type_ids_d;
-    Pos* pos_d;
-    Force* force_d;
+//     size_t* type_ids_d;
+//     Pos* pos_d;
+//     Force* force_d;
 
-    std::vector<Patch> neighbor_patches;
-    std::vector<BasePatchOp> ops;
-};
+//     std::vector<Patch> neighbor_patches;
+//     std::vector<BasePatchOp> ops;
+// };
diff --git a/src/Proxy.h b/src/Proxy.h
index e1c49e5..3756277 100644
--- a/src/Proxy.h
+++ b/src/Proxy.h
@@ -4,6 +4,30 @@
 #include <iostream>
 #include "Resource.h"
 
+#include "cuda.h"
+#include "cuda_runtime.h"
+
+// template<typename T, typename...Args1, typename... Args2>
+// __global__ void proxy_sync_call_kernel_noreturn(T* addr, (T::*memberFunc(Args1...)), Args2...args);
+
+#ifdef __CUDACC__
+#include <cuda/std/utility>
+// Kernels
+template<typename T, typename RetType, typename...Args>
+__global__ void proxy_sync_call_kernel(RetType* result, T* addr, RetType (T::*memberFunc)(Args...), Args...args) {
+    if (blockIdx.x == 0) {
+	*result = (addr->*memberFunc)(args...);
+    }
+}
+
+template<typename T, typename... Args>
+__global__ void constructor_kernel(T* __restrict__ devptr, Args...args) {
+    if (blockIdx.x == 0) {
+	new (devptr) T{::cuda::std::forward<Args>(args)...}; // placement new: construct into the cudaMalloc'd buffer (assigning to the parameter copy would leak and leave devptr unconstructed)
+    }
+}
+#endif
+
 // START traits
 // These ugly bits of code help implement SFINAE in C++14 and should likely be removed if a newer standard is adopted 
 // https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type
@@ -18,7 +42,7 @@ struct has_send_children<T, decltype(std::declval<T>().send_children(Resource{Re
 
 // template <typename T, typename = void>
 // struct has_metadata : std::false_type {};
-// template <typename T>
+// template <typename _tT>
 // struct has_metadata<T, decltype(std::declval<T>()::Metadata, void())> : std::true_type {};
 
 template <typename...>
@@ -28,9 +52,15 @@ using void_t = void;
 
 // Used by Proxy class 
 template <typename T, typename = void>
-struct Metadata_t { }; 
+struct Metadata_t {
+    Metadata_t(const T& obj) {};
+    Metadata_t(const Metadata_t<T>& other) {};
+}; 
 template <typename T>
-struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata { };
+struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata {
+    Metadata_t(const T& obj) : T::Metadata(obj) {};
+    Metadata_t(const Metadata_t<T>& other) : T::Metadata(other) {};
+};
 // struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; 
 
 
@@ -113,9 +143,52 @@ struct Proxy {
     /**
      * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr.
      */
-    Proxy() : location(Resource{Resource::CPU,0}), addr(nullptr) {};
-    Proxy(const Resource& r, T* obj) : location(r), addr(obj) {};
-
+    Proxy() : location(Resource{Resource::CPU,0}), addr(nullptr), metadata(nullptr) {
+	LOGINFO("Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
+    };
+    Proxy(const Resource& r) : location(r),  addr(nullptr), metadata(nullptr) {
+	LOGINFO("Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
+    };
+    Proxy(const Resource& r, T& obj, T* dest = nullptr) : location(r), addr(dest == nullptr ? &obj : dest) {
+	if (dest == nullptr) metadata = nullptr;
+	else metadata = new Metadata_t<T>(obj);
+	LOGINFO("Constructing Proxy<{}> @{} wrapping @{} with metadata @{}",
+		type_name<T>().c_str(), fmt::ptr(this), fmt::ptr(&obj), fmt::ptr(metadata));
+    };
+    // Copy constructor
+    Proxy(const Proxy<T>& other) : location(other.location), addr(other.addr), metadata(nullptr) {
+	LOGINFO("Copy Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
+	if (other.metadata != nullptr) {
+	    const Metadata_t<T>& tmp = *(other.metadata);
+	    metadata = new Metadata_t<T>(tmp);
+	}
+    };
+    Proxy<T>& operator=(const Proxy<T>& other) {
+	if (this != &other) {
+	    // Free existing resources.
+	    if (metadata != nullptr) delete metadata;
+	    location = other.location;
+	    addr = other.addr;
+	    // Guard against null: other.metadata is nullptr for default-constructed proxies
+	    metadata = (other.metadata == nullptr) ? nullptr : new Metadata_t<T>(*(other.metadata)); // copy construct!
+	    // std::copy(other.metadata, other.metadata + sizeof(Metadata_t<T>), metadata);
+      }
+      return *this;
+    };
+    Proxy(Proxy<T>&& other) : addr(nullptr), metadata(nullptr) {
+	LOGINFO("Move Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
+	location = other.location;
+	addr = other.addr;
+	// For now we avoid std::move, but we may choose to change this behavior
+	// const Metadata_t<T>& tmp = *(other.metadata);
+	metadata = other.metadata;
+	other.metadata = nullptr;
+    };
+    ~Proxy() {
+	LOGINFO("Deconstructing Proxy<{}> @{} with metadata @{}", type_name<T>().c_str(), fmt::ptr(this), fmt::ptr(metadata));
+	if (metadata != nullptr) delete metadata;
+    };
+    
     /**
      * @brief Overloaded operator-> returns the address of the underlying object.
      * @return The address of the underlying object.
@@ -142,11 +215,27 @@ struct Proxy {
 	    }
 	    break;
 	case Resource::GPU:
+#ifdef __CUDACC__
 	    if (location.is_local()) {
-		Exception( NotImplementedError, "Proxy::callSync() local GPU calls" );
+		if (sizeof(RetType) > 0) {
+		    // Note: this only support basic RetType objects
+		    RetType* dest;
+		    RetType obj;
+		    gpuErrchk(cudaMalloc(&dest, sizeof(RetType)));
+		    proxy_sync_call_kernel<T, RetType, Args2...><<<1,32>>>(dest, addr, memberFunc, args...);
+		    // proxy_sync_call_kernel<><<<1,32>>>(dest, addr, addr->*memberFunc, args...);
+		    gpuErrchk(cudaMemcpy(&obj, dest, sizeof(RetType), cudaMemcpyDeviceToHost)); // fetch result from device
+		    gpuErrchk(cudaFree(dest));
+		    return obj;
+		} else {
+		    Exception( NotImplementedError, "Proxy::callSync() local GPU calls" );
+		}
 	    } else {
 		Exception( NotImplementedError, "Proxy::callSync() non-local GPU calls" );
 	    }
+#else
+	    Exception( NotImplementedError, "Proxy::callSync() for GPU only defined for files compiled with nvcc" );
+#endif		    		
 	    break;
 	case Resource::MPI:
 	    Exception( NotImplementedError, "MPI sync calls (deprecate?)" );
@@ -262,8 +351,11 @@ struct Proxy<T, typename std::enable_if_t<std::is_arithmetic<T>::value>> {
  */
 template <typename T>
 HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T* dest = nullptr) {
+    LOGTRACE("   _send_ignoring_children...");
     switch (location.type) {
     case Resource::GPU:
+	LOGINFO("   GPU...");
+#ifdef USE_CUDA
 	if (location.is_local()) {
 	    if (dest == nullptr) { // allocate if needed
 		LOGTRACE("   cudaMalloc for array");
@@ -273,16 +365,25 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T
 	} else {
  	    Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local GPU" );
 	}
+#else
+	Exception( NotImplementedError, "USE_CUDA is not enabled" );
+#endif
 	break;
     case Resource::CPU:
+	LOGINFO("   CPU...");
 	if (location.is_local()) {
-	    if (dest == nullptr) { // allocate if needed
-		LOGTRACE("   Allocate CPU memory for {}", decltype(T));
-		dest = new T;
-	    }
-	    memcpy(dest, &obj, sizeof(T));
+	    LOGINFO("   local CPU...");
+	    // if (dest == nullptr) { // allocate if needed
+	    // 	LOGINFO("   allocate memory...");
+	    // 	LOGTRACE("   Allocate CPU memory for {}", type_name<T>().c_str());
+	    // 	dest = new T;
+	    // }
+	    // LOGINFO("   memcpying...");
+	    // memcpy(dest, &obj, sizeof(T));
+	    // dest = *obj;
 	} else {
-	    Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local CPU" );
+	    LOGINFO("   nonlocal...");
+	    // Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local CPU" );
 	}
 	break;
     default:
@@ -290,7 +391,16 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T
 	Exception( ValueError, "`_send_ignoring_children(...)` applied with unkown resource type" );
     }
 
-    return Proxy<T>(location, dest);
+    LOGINFO("   creating Proxy...");
+    // Proxy<T>* ret = new Proxy<T>(location, dest); // Proxies should be explicitly removed  
+    // LOGINFO("   ...done @{}", fmt::ptr(ret));
+    // Proxy<T>&& ret =
+    return Proxy<T>(location, obj, dest); // Proxies should be explicitly removed
+    
+	//LOGINFO("   ...done @{}", fmt::ptr(&ret));
+    // return ret;
+    // LOGINFO("   ...done @{}", fmt::ptr(ret));
+    // return *ret;
 }
 
 /**
@@ -303,10 +413,10 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T
  * @return A Proxy representing the data at the destination location.
  */
 template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr>
-HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
-    LOGTRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
+HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
+    LOGINFO("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
     // Simple objects can simply be copied without worrying about contained objects and arrays
-    auto ret = _send_ignoring_children(location, obj, dest);
+    Proxy<T> ret = _send_ignoring_children(location, obj, dest);
     LOGTRACE("...done sending");
     // printf("...done\n");        
     return ret;
@@ -323,7 +433,7 @@ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
  */
 template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr>
 HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
-    LOGTRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
+    LOGINFO("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
     auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location
     Proxy<T> ret = _send_ignoring_children<T>(location, dummy, dest);
     LOGTRACE("... clearing dummy complex object");
@@ -332,4 +442,53 @@ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
     return ret;
 }
 
+// Utility function for constructing objects in remote memory address
+// spaces, obviating the need to construct simple objects locally
+// before copying. Returns a Proxy object, but in cases where the
+// remote resource location is non-CPU or non-local, metadata for
+// Proxy will be blank.
+template<typename T, typename... Args>
+Proxy<T> construct_remote(Resource location, Args&&...args) {
+    switch (location.type) {
+    case Resource::CPU:
+	if (location.is_local()) {
+	    T* ptr = new T{std::forward<Args>(args)...};
+	    return Proxy<T>(location, *ptr);
+	} else {
+	    Exception( NotImplementedError, "construct_remote() non-local CPU calls" );
+	}
+	break;
+    case Resource::GPU:
+#ifdef __CUDACC__
+	if (location.is_local()) {
+	    T* devptr;
+	    LOGWARN("construct_remote: TODO: switch to device associated with location");
+	    gpuErrchk(cudaMalloc(&devptr, sizeof(T)));
+	    constructor_kernel<<<1,32>>>(devptr, std::forward<Args>(args)...);
+	    gpuErrchk(cudaDeviceSynchronize());
+	    LOGWARN("construct_remote: proxy.metadata not set");
+	    return Proxy<T>(location);
+	    // Exception( NotImplementedError, "construct_remote() local GPU call" );
+	    // Note: this only support basic RetType objects
+	    // T* dest;
+	    // T obj;
+	    // gpuErrchk(cudaMalloc(&dest, sizeof(RetType)));
+	    // proxy_sync_call_kernel<T, RetType, Args2...><<<1,32>>>(dest, addr, addr->*memberFunc, args...);
+	    // 	gpuErrchk(cudaMemcpy(dest, &obj, sizeof(RetType), cudaMemcpyHostToDevice));
+	    // 	gpuErrchk(cudaFree(dest));
+	} else {
+	    Exception( NotImplementedError, "construct_remote() non-local GPU call" );
+	}
+#else
+	Exception( NotImplementedError, "construct_remote() for GPU only defined for files compiled with nvcc" );
+#endif	    		
+	break;
+    case Resource::MPI:
+	Exception( NotImplementedError, "construct_remote() for MPI" );
+	break;
+    default:
+	Exception( ValueError, "construct_remote(): unknown resource type" );
+    }
+    return Proxy<T>{};
+}
 
diff --git a/src/Random.h b/src/Random.h
index 8022ff0..8c9175b 100644
--- a/src/Random.h
+++ b/src/Random.h
@@ -18,6 +18,10 @@ inline void cuRandAssert(curandStatus code, const char *file, int line, bool abo
 		if (abort) exit(code);
 	}
 }
+// #else
+// #define QUALIFIERS __host__ __device__
+// #include <curand_kernel.h>
+// #undef QUALIFIERS
 #endif
 
 namespace Random {
@@ -79,6 +83,41 @@ public:
     // void reorder(int *a, int n);
 };
 
+class RandomCPUOld {
+public:
+    using state_t = void;
+
+    // Methods for maninpulating the state
+    void init(size_t num, unsigned long seed, size_t offset) {
+	// 
+    };
+
+    // Methods for generating random values
+    HOST DEVICE inline float gaussian(state_t* state) {
+	LOGWARN("RandomCPU::gaussian(): not implemented");
+	return 0;
+    };
+
+    HOST DEVICE inline state_t* get_gaussian_state() { return nullptr; };
+
+    HOST DEVICE inline void set_gaussian_state(state_t* state) {};
+
+    // // HOST DEVICE inline float gaussian(RandomState* state) {};
+    // HOST inline Vector3 gaussian_vector(size_t idx, size_t thread) {
+    // 	return Vector3();
+    // };
+    // unsigned int integer() {
+    // 	return 0;
+    // };
+    // unsigned int poisson(float lambda) {
+    // 	return 0;
+    // };
+    // float uniform() {
+    // 	return 0;
+    // };
+    // void reorder(int *a, int n);
+};
+
 #ifdef USE_CUDA
 
 class RandomAcc {
@@ -157,6 +196,7 @@ public:
 	gpuErrchk(cudaMalloc((void**)&(rng->states), sizeof(state_t)*num_states));
 	gpuErrchk(cudaMemcpy(dest, rng, sizeof(RNG),cudaMemcpyHostToDevice));
 	rng->states = nullptr;
+	// Since we avoided the constructor, don't call `delete rng;`
 	return dest;
     }
 	
diff --git a/src/Resource.cu b/src/Resource.cu
index bff4d60..68f217b 100644
--- a/src/Resource.cu
+++ b/src/Resource.cu
@@ -1,14 +1,18 @@
 #include "Resource.h"
 
-HOST DEVICE size_t current_thread_idx() {
+HOST DEVICE size_t caller_id() {
 #ifdef USE_MPI
-    Exception( NotImplementedError, "current_thread_idx() not implemented on GPU" );
+    // LOGINFO("... caller_id(): MPI path");
+    Exception( NotImplementedError, "caller_id() not implemented for MPI" );
     int world_rank;
     return MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 #else
-#ifdef __CUDA_ACC__
-    Exception( NotImplementedError, "current_thread_idx() not implemented on GPU" );
+#ifdef __CUDA_ARCH__
+    LOGINFO("... caller_id(): GPU path");
+    Exception( NotImplementedError, "caller_id() not implemented on GPU" );
+    return 0;
 #else
+    // LOGWARN("... caller_id() CPU path: return 0");
     return 0;
 #endif
 #endif
diff --git a/src/Resource.h b/src/Resource.h
index 8a6b054..3f15939 100644
--- a/src/Resource.h
+++ b/src/Resource.h
@@ -8,8 +8,11 @@
 #include "GPUManager.h"
 
 
-HOST DEVICE size_t current_thread_idx();
-    
+/**
+ * @brief Routine that returns the index of the calling resource,
+ * regardless of whether it is an MPI rank or a GPU.
+ */
+HOST DEVICE size_t caller_id();
 
 
 /**
@@ -25,29 +28,21 @@ struct Resource {
     Resource* parent; ///< Parent resource; nullptr unless type is GPU
         
     /**
-     * @brief Checks if the resource is running on the calling thread. 
+     * @brief Check if resource is local to the calling thread. 
      */
-    HOST DEVICE bool is_running() const {
-	bool ret = true;
-// #ifdef __CUDA_ACC__
-// 	ret = (type == GPU);
-// #else
-// 	ret = (type == CPU);
-// #endif
-	return ret;
-    }
-
-    // Q: should this return True for GPU that is attached/assigned to current thread? Currently assuming yes.
     HOST DEVICE bool is_local() const {
 	bool ret = true;
-#ifdef __CUDA_ACC__
+#ifdef __CUDACC__
 	ret = (type == GPU);
-	LOGWARN("Resource::is_local() not fully implemented on GPU devices; returning {}",ret);
+	LOGWARN("Resource::is_local(): called from GPU but not fully implemented on GPU devices, returning {}",ret);
 #else
+	LOGINFO("Resource::is_local() out of CUDACC...");
 	if (type == GPU && parent != nullptr) {
+	    LOGINFO("Resource::is_local(): called from CPU for resource on GPU, checking parent...");
 	    ret = parent->is_local();
 	} else {
-	    ret = (current_thread_idx() == id);
+	    ret = (caller_id() == id);
+	    LOGINFO("Resource::is_local(): called from CPU, returning {}", ret);
 	}
 #endif
 	return ret;
@@ -56,7 +51,7 @@ struct Resource {
 
     static Resource Local() {
 	LOGWARN("Resource::Local() not properly implemented");
-#ifdef __CUDA_ACC__
+#ifdef __CUDACC__
 	return Resource{ GPU, 0 };
 #else
 	return Resource{ CPU, 0 };
@@ -65,4 +60,3 @@ struct Resource {
     bool operator==(const Resource& other) const { return type == other.type && id == other.id; };
 	
 };
-
diff --git a/src/SimManager.cu b/src/SimManager.cu
index 811b6ee..ad91c24 100644
--- a/src/SimManager.cu
+++ b/src/SimManager.cu
@@ -10,8 +10,11 @@ void SimManager::run() {
     // Patch p(10,0,0,sys);
 
     Patch p;
-
+    LOGINFO("here2!");
+//*
     RandomCPU rng{};
+    LOGINFO("there2!");
+    // RandomAcc rng{};
     //ProxyPatch p2(10,0,0);
 
     // p.add_compute( std::make_unique<LocalPairForce>() );
@@ -25,18 +28,21 @@ void SimManager::run() {
 //     p.add_compute( std::make_unique<LocalBonded>() );
 // #endif
 
+    LOGINFO("here!");
     auto tmp = Random::get_gaussian_state(&rng);
-
+    LOGINFO("there!");
     for (size_t step = 0; step < 10; ++step) {
-	LOGINFO("Step {}: random {:0.2f}", step, Random::gaussian(&rng,(RandomCPU::state_t*) nullptr));
+	LOGINFO("where!");
+	LOGINFO("Step {:d}: random", step); // {:0.2f}", step, Random::gaussian(&rng,(RandomCPU::state_t*) nullptr));
 	p.compute();
 #ifdef USE_CUDA
 	cudaDeviceSynchronize();
 #endif
     }
-#ifdef USE_CUDA
-    RandomGPU<128>::launch_test_kernel<64>((size_t) 1);
-    cudaDeviceSynchronize();
-#endif
+    // */
+// #ifdef USE_CUDA
+//     RandomGPU<128>::launch_test_kernel<64>((size_t) 1);
+//     cudaDeviceSynchronize();
+// #endif
 
 };
diff --git a/src/SimSystem.cpp b/src/SimSystem.cpp
index 4fe9c39..6c09cae 100644
--- a/src/SimSystem.cpp
+++ b/src/SimSystem.cpp
@@ -30,7 +30,7 @@ void CellDecomposer::decompose(SimSystem& sys, ResourceCollection& resources) {
 	Vector3_t<size_t> ijk = index_to_ijk( idx, n_p_v.x, n_p_v.y, n_p_v.z );
 	// size_t cap = 2*num_particles / n_r;
 	// auto p2 = Patch(cap);
-	auto p2 = Patch();	// don't allocate array locally
+	Patch p2 = Patch();	// don't allocate array locally
 	// TODO: generalize to non-orthogonal basis
 	Vector3 pmin = min + dr.element_mult(ijk);
 	Vector3 pmax = pmin + dr;
diff --git a/src/SimSystem.h b/src/SimSystem.h
index c4710f2..a777a22 100644
--- a/src/SimSystem.h
+++ b/src/SimSystem.h
@@ -21,9 +21,11 @@ class BoundaryConditions {
 public:
     BoundaryConditions() :  origin(0), basis{(5000,0,0),(0,5000,0),(0,0,5000)}, periodic{true,true,true} {
 	// do things
+	LOGINFO("BoundaryConditions()");
     }
     BoundaryConditions( Vector3 basis1, Vector3 basis2, Vector3 basis3, Vector3 origin = Vector3(0), bool periodic1 = true, bool periodic2 = true, bool periodic3 = true ) {
 	// do things
+	LOGINFO("BoundaryConditions(...)");
     }
 
     static constexpr size_t dim = 3;
@@ -105,6 +107,7 @@ public:
 	// explicit operator int() const {return object_type*16 + algorithm*4 + backend;};
     };
 
+    inline			// C++17 feature needed for compilation... unsure of why
     static constexpr Conf default_conf = Conf{291.0f, Conf::CellDecomp, Conf::AllPeriodic, {5000.0f,5000.0f,5000.0f}, 50.0f };
 
     SimSystem() : SimSystem(default_conf) {}
@@ -170,6 +173,7 @@ public:
 	       
     // void consolidate_patches(); // maybe not needed
     void distribute_patches() {
+	LOGINFO("distribute_patches()");
 	decomp;
     }
 
-- 
GitLab