From 3b23fc2442ef219b44e1c9ff214ed5d2fbfea2c8 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 16 Jan 2024 17:40:36 -0600
Subject: [PATCH] Checkpoint

---
 src/CMakeLists.txt    |   1 +
 src/ParticlePatch.cpp |  84 ++++++++++++++++++++++++++++------
 src/ParticlePatch.h   | 103 ++++++++++++++++++++++++++++++++++++++----
 src/Proxy.h           |  54 +++++++++++++++++++---
 src/SimSystem.cpp     |  57 +++++++++++++++++++----
 src/SimSystem.h       |  38 ++++++++--------
 src/Types.h           |  18 ++++++++
 src/Types/Array.h     |  18 ++++----
 src/Types/Vector3.h   |  30 ++++++++++++
 9 files changed, 338 insertions(+), 65 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 742832a..772b770 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -3,6 +3,7 @@ add_library("lib${PROJECT_NAME}"
   ARBDException.cpp
   GPUManager.cpp
   ParticlePatch.cpp
+  SimSystem.cpp
   Integrator.cpp
   Integrator/CPU.cpp
   Integrator/CUDA.cu
diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp
index 151e182..4bf45d8 100644
--- a/src/ParticlePatch.cpp
+++ b/src/ParticlePatch.cpp
@@ -8,24 +8,30 @@
 // };
 
 void Patch::initialize() {
-    data->pos_force = data->momentum = nullptr;
-    data->particle_types = data->particle_order = nullptr;
+    Data* data = new Data{nullptr,nullptr,nullptr,nullptr};	// TODO free
+    metadata = Metadata{0,0,0};
+    // data->pos_force = data->momentum = nullptr;
+    // data->particle_types = data->particle_order = nullptr;
+    metadata.data = send( Resource{Resource::CPU,0}, *data );
 };
 
 Patch* Patch::copy_to_cuda(Patch* dev_ptr) const {
+    Exception(NotImplementedError, "Deprecated");
     
     if (dev_ptr == nullptr) { // allocate if needed
 	gpuErrchk(cudaMalloc(&dev_ptr, sizeof( typeid(this) )));
     }
 
-    Patch* tmp;
-    tmp = new Patch(0);
-    tmp->data->pos_force = data->pos_force->copy_to_cuda();
-    tmp->data->momentum = data->momentum->copy_to_cuda();
-    tmp->data->particle_types = data->particle_types->copy_to_cuda();
-    tmp->data->particle_order = data->particle_order->copy_to_cuda();
+    // Patch* tmp;
+    // tmp = new Patch(0);
+    // tmp->metadata->data->pos_force = data->pos_force->copy_to_cuda();
+    // tmp->data->momentum = data->momentum->copy_to_cuda();
+    // tmp->data->particle_types = data->particle_types->copy_to_cuda();
+    // tmp->data->particle_order = data->particle_order->copy_to_cuda();
+
+    // return tmp;
+    return nullptr;
 
-    return tmp;
     // tmp.pos_force;
     // // Allocate member data
     // pos_force.
@@ -44,12 +50,22 @@ Patch* Patch::copy_to_cuda(Patch* dev_ptr) const {
 
 Patch Patch::send_children(Resource location) const {
     Patch tmp(0);
+    Proxy<Data> newdata;
     switch (location.type) {
     case Resource::GPU:
-	tmp.data->pos_force = data->pos_force->copy_to_cuda();
-	tmp.data->momentum = data->momentum->copy_to_cuda();
-	tmp.data->particle_types = data->particle_types->copy_to_cuda();
-	tmp.data->particle_order = data->particle_order->copy_to_cuda();
+	// metadata.data is already a proxy because the data may reside on the GPU;
+	// Q: In what cases should we move metadata.data.addr?
+	//     For example, we could always move, or we could never move, or we could move if metadata.data.location is a CPU resource
+
+	// if (location != metadata.data.location) 
+	// newdata = metadata.data.callSync( send, addr
+	// data->pos_force = metadata.data.callSync( send,  ->pos_force->copy_to_cuda();
+	// data->momentum = metadata.data->momentum->copy_to_cuda();
+	// data->particle_types = metadata.data->particle_types->copy_to_cuda();
+	// data->particle_order = metadata.data->particle_order->copy_to_cuda();
+
+	// Nothing to do then?
+	
 	break;
     case Resource::CPU:
 	Exception( NotImplementedError, "`send_children(...)` not implemented on CPU" );
@@ -57,7 +73,8 @@ Patch Patch::send_children(Resource location) const {
     default:
 	Exception( ValueError, "`send_children(...)` passed unknown resource type" );
     }
-    tmp.data.location = location;
+    newdata.location = location;
+    tmp.metadata.data = newdata;
     return tmp;
 };
 
@@ -77,3 +94,42 @@ void Patch::compute() {
 	c_p->compute(this);
     }
 };
+
+
+// template<typename T>
+// size_t Patch::send_particles_filtered( Proxy<Patch>& destination, T filter ) {
+size_t Patch::send_particles_filtered( Proxy<Patch>* destination, std::function<bool(size_t,Patch::Data)> filter ) {
+// TODO determine who allocates and cleans up temporary data
+    // TODO determine if there is a good way to avoid having temporary data (this can be done later if delegated to a reasonable object)
+
+    // TODO think about the design and whether it's possible to have a single array with start/end
+    // TODO think about the design for parallel sending; should a receiver do some work?
+
+    // TODO currently everything is synchronous, but that will change; how do we avoid race conditions?
+    // E.g. have a single array allocated with known start and end for each PE/Patch?
+    
+    Data buffer;		// not sure which object should allocate
+    { 
+    using _filter_t = decltype(filter);
+    // using A = decltype(destination->metadata.data);
+    using A = Proxy<Patch::Data>&;
+    // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter );
+    // size_t num_sent = metadata.data.callSync<size_t,A>( static_cast<size_t (Patch::Data::*)(Proxy<Patch::Data>&, _filter_t)>(&Patch::Data::send_particles_filtered), destination->metadata.data, filter );
+
+    // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter );
+    size_t num_sent = metadata.data.callSync( &Patch::Data::send_particles_filtered, &((*destination)->metadata.data), filter );
+
+    }
+    // size_t num_sent = metadata.data.callSync<size_t,Proxy<Patch::Data>&,_filter_t>( &Patch::Data::send_particles_filtered<_filter_t>, destination->metadata.data, filter );
+    return 0;
+};
+
+// template<typename T>
+// size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, T filter ) { }
+size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>* destination, std::function<bool(size_t,Patch::Data)> filter ) { 
+    Data buffer;		// not sure which object should allocate
+    // metadata.data.callSync( &Patch::Data::send_particles_filtered, destination, filter );
+    // x
+    return 0;
+};
+
diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h
index 52e6266..2423565 100644
--- a/src/ParticlePatch.h
+++ b/src/ParticlePatch.h
@@ -10,6 +10,7 @@
 
 #include <vector> // std::vector
 #include <memory> // std::make_unique
+#include <functional>
 
 #ifdef USE_CUDA
 #include <cuda.h>
@@ -23,7 +24,12 @@
 
 #include "PatchOp.h"
 
+class Decomposer;
+class CellDecomposer;
+
 class BasePatch {
+    friend Decomposer;
+    friend CellDecomposer;
 public:
     // BasePatch(size_t num, short thread_id, short gpu_id, SimSystem& sys);
     // BasePatch(size_t num, short thread_id, short gpu_id);
@@ -124,6 +130,33 @@ public:
     Patch() : BasePatch() { initialize(); }
     Patch(size_t capacity) : BasePatch(capacity) { initialize(); }
 
+    // Particle data arrays pointing to either CPU or GPU memory
+    struct Data {
+	VectorArr* pos_force;
+	VectorArr* momentum;
+	Array<size_t>* particle_types;
+	Array<size_t>* particle_order;
+
+	HOST DEVICE inline Vector3& get_pos(size_t i) { return (*pos_force)[i*2]; };
+	HOST DEVICE inline Vector3& get_force(size_t i) { return (*pos_force)[i*2+1]; };
+	HOST DEVICE inline Vector3& get_momentum(size_t i) { return (*momentum)[i]; };
+	HOST DEVICE inline size_t& get_type(size_t i) { return (*particle_types)[i]; };
+	HOST DEVICE inline size_t& get_order(size_t i) { return (*particle_order)[i]; };
+
+	// Replace with auto? Return number of particles sent?
+	// template<typename T>
+	// size_t send_particles_filtered( Proxy<Data>& destination, T filter ); // = [](size_t idx, Patch::Data d)->bool { return true; } );
+	size_t send_particles_filtered( Proxy<Data>* destination, std::function<bool(size_t,Data)> filter ); // = [](size_t idx, Patch::Data d)->bool { return true; } );
+
+    };
+
+    // Metadata stored on host even if Data is on device
+    struct Metadata {
+	size_t num;
+	Vector3 min;
+	Vector3 max;
+	Proxy<Data> data;		// actual data may be found elsewhere
+    };
     
     // void deleteParticles(IndexList& p);
     // void addParticles(size_t n, size_t typ);
@@ -149,6 +182,27 @@ public:
     
     // TODO? emplace_point_particles
     void compute();
+
+    // Communication
+    size_t send_particles( Proxy<Patch>* destination ); // Same as send_children?
+    // void send_particles_filtered( Proxy<Patch> destination, std::function<bool(size_t, Patch::Data)> = [](size_t idx, Patch::Data d)->bool { return true; } );
+
+    // Replace with auto? Return number of particles sent?
+    // template<typename T>
+    // size_t send_particles_filtered( Proxy<Patch>& destination, T filter );
+    // // [](size_t idx, Patch::Data d)->bool { return true; } );
+    size_t send_particles_filtered( Proxy<Patch>* destination, std::function<bool(size_t,Data)> filter );
+    // [](size_t idx, Patch::Data d)->bool { return true; } );
+
+    void clear() {
+	WARN("Patch::clear() was called but is not implemented");
+    }
+
+    size_t test() {
+	WARN("Patch::test() was called but is not implemented");
+	return 1;
+    }
+
     
 private:
     void initialize();
@@ -160,14 +214,7 @@ private:
     std::vector<std::unique_ptr<PatchOp>> local_computes; // Operations that will be performed on this patch each timestep
     std::vector<std::unique_ptr<PatchOp>> nonlocal_computes; // Operations that will be performed on this patch each timestep
 
-    // CPU particle arrays
-    struct Data {
-	VectorArr* pos_force;
-	VectorArr* momentum;
-	Array<size_t>* particle_types;
-	Array<size_t>* particle_order;
-    };
-    Proxy<Data> data;		// actual data may be found elsewhere
+    Metadata metadata;		// Usually associated with proxy, but can use it here too
     
     // size_t num_rb;
     // Vector3* rb_pos;
@@ -187,6 +234,46 @@ private:
 };
 
 
+/* MOVED
+// template<typename T>
+// size_t Patch::send_particles_filtered( Proxy<Patch>& destination, T filter ) {
+size_t Patch::send_particles_filtered( Proxy<Patch>* destination, std::function<bool(size_t,Patch::Data)> filter ) {
+// TODO determine who allocates and cleans up temporary data
+    // TODO determine if there is a good way to avoid having temporary data (this can be done later if delegated to a reasonable object)
+
+    // TODO think about the design and whether it's possible to have a single array with start/end
+    // TODO think about the design for parallel sending; should a receiver do some work?
+
+    // TODO currently everything is synchronous, but that will change; how do we avoid race conditions?
+    // E.g. have a single array allocated with known start and end for each PE/Patch?
+    
+    Data buffer;		// not sure which object should allocate
+    { 
+    using _filter_t = decltype(filter);
+    // using A = decltype(destination->metadata.data);
+    using A = Proxy<Patch::Data>&;
+    // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter );
+    // size_t num_sent = metadata.data.callSync<size_t,A>( static_cast<size_t (Patch::Data::*)(Proxy<Patch::Data>&, _filter_t)>(&Patch::Data::send_particles_filtered), destination->metadata.data, filter );
+
+    // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter );
+    size_t num_sent = metadata.data.callSync( &Patch::Data::send_particles_filtered, &((*destination)->metadata.data), filter );
+
+    }
+    // size_t num_sent = metadata.data.callSync<size_t,Proxy<Patch::Data>&,_filter_t>( &Patch::Data::send_particles_filtered<_filter_t>, destination->metadata.data, filter );
+    return 0;
+};
+
+// template<typename T>
+// size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, T filter ) { }
+size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>* destination, std::function<bool(size_t,Patch::Data)> filter ) { 
+    Data buffer;		// not sure which object should allocate
+    // metadata.data.callSync( &Patch::Data::send_particles_filtered, destination, filter );
+    // x
+    return 0;
+};
+
+*/
+
 // // Patch::Patch(size_t num, short thread_id, short gpu_id) {};
 // #ifndef USE_CUDA
 // void Patch::compute() {
diff --git a/src/Proxy.h b/src/Proxy.h
index 78ee47d..eeac2c1 100644
--- a/src/Proxy.h
+++ b/src/Proxy.h
@@ -18,6 +18,7 @@ struct Resource {
 };
 
 // START traits
+// These ugly bits of code help implement SFINAE in C++14 and should likely be removed if a newer standard is adopted 
 // https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type
 /**
  * @brief Template trait to check if a method 'send_children' exists in a type.
@@ -25,18 +26,42 @@ struct Resource {
 #include <type_traits>
 template <typename T, typename = void>
 struct has_send_children : std::false_type {};
-
 template <typename T>
 struct has_send_children<T, decltype(std::declval<T>().send_children(Resource{Resource::CPU,0}), void())> : std::true_type {};
+
+// template <typename T, typename = void>
+// struct has_metadata : std::false_type {};
+// template <typename T>
+// struct has_metadata<T, decltype(std::declval<T>()::Metadata, void())> : std::true_type {};
+
+template <typename...>
+using void_t = void;
+// struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; 
 // END traits
+template <typename T, typename = void>
+struct Metadata_t { }; 
+template <typename T>
+struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata { };
+// struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; 
 
 
 /**
  * @brief Template class representing a proxy for the underlying data.
  * @tparam T The type of the underlying data.
  */
+// C++17 way: template<typename T, typename Metadata = std::void_t<typename T::Metadata>>
+// C++14 way:
+//template<typename T, typename Metadata = typename std::conditional<has_metadata<T>::value, typename T::Metadata, void>::type>
 template<typename T>
 class Proxy {
+
+    // Define Metadata types using SFINAE
+    // template<typename=void> struct Metadata_t { };
+    // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { };
+    // template<typename=void> struct Metadata_t { };
+    // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { };
+    // using Metadata_t = Metadata_t<T>;
+    
 public:
 
     /**
@@ -57,12 +82,22 @@ public:
      */
     Resource location;	    ///< The device (thread/gpu) holding the data represented by the proxy.
     T* addr;		    ///< The address of the underlying object.
-
+    Metadata_t<T>* metadata;	// Q: rename to Metadata? Has
+				// consequences for Proxy<Proxy<T>>
+				// that limits specialization but uses
+				// T::Metadata automatically
+				//
+				// A: depends on how Proxy<Proxy<T>>
+				// objects are used
+    
     template <typename RetType, typename... Args>
-    RetType callSync(RetType (T::*memberFunc)(Args...), Args... args) {
+    RetType callSync(RetType (T::*memberFunc)(Args...), Args&&... args) {
         switch (location.type) {
             case Resource::CPU:
-                return (addr->*memberFunc)(args...);
+	    // 	return ([&](auto&&... capturedArgs) {
+            //     return (addr->*memberFunc)(std::forward<decltype(capturedArgs)>(capturedArgs)...);
+            // })(std::forward<Args>(args)...);
+		return (addr->*memberFunc)(std::forward<Args>(args)...);
             case Resource::GPU:
                 // Handle GPU-specific logic
                 std::cerr << "Error: GPU not implemented in synchronous call." << std::endl;
@@ -108,6 +143,13 @@ public:
     }
 };
 
+// // Partial specialization
+// template<typename T>
+// using Proxy<T> = Proxy<T, std::void_t<typename T::Metadata>>
+// // template<typename T>
+// // class Proxy<T, typename T::Metadata> { };
+
+
 /**
  * @brief Template function to send data ignoring children to a specified location.
  * @tparam T The type of the data to be sent.
@@ -151,7 +193,7 @@ template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send
 HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
     TRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
     // Simple objects can simply be copied without worrying about contained objects and arrays
-    auto ret = _send_ignoring_children<T>(location, obj, dest);
+    auto ret = _send_ignoring_children(location, obj, dest);
     TRACE("...done sending");
     // printf("...done\n");        
     return ret;
@@ -170,7 +212,7 @@ template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_
 HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
     TRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
     auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location
-    Proxy<T> ret = _send_ignoring_children(location, dummy, dest);
+    Proxy<T> ret = _send_ignoring_children<T>(location, dummy, dest);
     TRACE("... clearing dummy complex object");
     dummy.clear();
     TRACE("... done sending");
diff --git a/src/SimSystem.cpp b/src/SimSystem.cpp
index 74df513..bd5c13d 100644
--- a/src/SimSystem.cpp
+++ b/src/SimSystem.cpp
@@ -7,7 +7,7 @@ void CellDecomposer::decompose(SimSystem& sys, ResourceCollection& resources) {
     // Initialize workers; TODO move this out of decompose
     std::vector<Proxy<Worker>> workers;
     for (auto& r: resources.resources) {
-	Worker w(sys.cutoff, sys.patches);
+	Worker w{sys.cutoff, sys.patches};
 	auto w_p = send(r, w);
 	workers.push_back( w_p );
     }
@@ -20,17 +20,56 @@ void CellDecomposer::decompose(SimSystem& sys, ResourceCollection& resources) {
     Vector3 n_p_v = (dr / cutoff).element_floor();
     size_t n_r = resources.resources.size();
 
-    size_t n_p = static_cast<size_t>(round(n_p_v[0]*n_p_v[1]*n_p_v[2]));
-    for (size_t i = 0; i < n_p; ++i) {
-	auto w_p = workers[i / floor(n_p/n_r)];
-	w_p.create_patch();
-	// other stuff
-    }
+    size_t num_particles = 0;
+    for (auto& p: sys.patches) num_particles += p.metadata->num;
+   
+    std::vector<Proxy<Patch>> new_patches;
+    // std::vector<Patch> new_patches;
+    size_t n_p = static_cast<size_t>(round(n_p_v.x*n_p_v.y*n_p_v.z));
+    for (size_t idx = 0; idx < n_p; ++idx) {
+	Vector3_t<size_t> ijk = index_to_ijk( idx, n_p_v.x, n_p_v.y, n_p_v.z );
+	// size_t cap = 2*num_particles / n_r;
+	// auto p2 = Patch(cap);
+	auto p2 = Patch();	// don't allocate array locally
+	// TODO: generalize to non-orthogonal basis
+	Vector3 pmin = min + dr.element_mult(ijk);
+	Vector3 pmax = pmin + dr;
+
+	p2.lower_bound = pmin;
+	p2.upper_bound = pmax;
+
+	size_t r_idx = idx / ceil(n_p/n_r);
+	
+	//auto p2_p = send(resources.resources[r_idx], p2);
+	Proxy<Patch> p2_p = send(resources.resources[r_idx], p2);
+
+	for (auto& p: sys.patches) {
+	    auto filter_lambda = [pmin, pmax] (size_t i, Patch::Data d)->bool {
+		return d.get_pos(i).x >= pmin.x && d.get_pos(i).x < pmax.x &&
+		    d.get_pos(i).y >= pmin.y && d.get_pos(i).y < pmax.y &&
+		    d.get_pos(i).z >= pmin.z && d.get_pos(i).z < pmax.z; };
+	    auto filter = std::function<bool(size_t, Patch::Data)>(filter_lambda);
+	    using _filter_t = decltype(filter);
+	    // p.callSync<void,Proxy<Patch>,_filter_t>( &Patch::send_particles_filtered<_filter_t>, p2_p, filter );
+	    // p.callSync<size_t, Proxy<Patch>&, _filter_t>(static_cast<size_t (Patch::*)(Proxy<Patch>&, _filter_t)>(&Patch::send_particles_filtered<_filter_t>), p2_p, filter);
+	    p.callSync( &Patch::test ); // DEBUG
+	    // p.callSync<size_t, Proxy<Patch>&>( &Patch::send_particles_filtered, p2_p, filter);
+	    p.callSync( &Patch::send_particles_filtered, &p2_p, filter);
+	    
+	    num_particles += p.metadata->num;
+	}
 	
+	new_patches.push_back( p2_p );
+    }
     
+    // size_t n_p = static_cast<size_t>(round(n_p_v[0]*n_p_v[1]*n_p_v[2]));
+    // for (size_t i = 0; i < n_p; ++i) {
+    // 	auto w_p = workers[i / floor(n_p/n_r)];
+    // 	w_p.create_patch();
+    // 	// other stuff
+    // }
+	
     
-    std::vector<Proxy<Patch>> new_patches;
-
     
     // Count particles in each new_patch
     //   (ultimately this must be distributed, but via what mechanisms? Decomposer is only on controlling thread in current implementation, but perhaps each thread should have its own copy)
diff --git a/src/SimSystem.h b/src/SimSystem.h
index 9f893dd..c4710f2 100644
--- a/src/SimSystem.h
+++ b/src/SimSystem.h
@@ -32,16 +32,15 @@ public:
     bool periodic[dim];    
 };
 
-template<>
-class Proxy<Patch> : BasePatch {
-public:
-    Proxy<Patch>(Resource& r, Patch* obj) : location(&r), addr(obj) { };
-    Resource* location;
-    Patch* addr;
-    size_t num;
-    Vector3 min;
-    Vector3 max;    
-};
+// class ProxyPatch : public Proxy<Patch>, public BasePatch {
+// public:
+//     ProxyPatch(Resource& r, Patch* obj) : location(&r), addr(obj) { };
+//     Resource* location;
+//     Patch* addr;
+//     size_t num;
+//     Vector3 min;
+//     Vector3 max;    
+// };
 
 struct ResourceCollection {
     
@@ -85,6 +84,7 @@ class CellDecomposer : public Decomposer {
 
 class SimSystem {
     friend class Decomposer;
+    friend class CellDecomposer;
     friend class SimManager;
 public:
     struct Conf {
@@ -145,10 +145,10 @@ public:
     const Vector3 get_min() const {
 	Vector3 min(Vector3::highest());
 	for (auto& p: patches) {
-	    if (min.x > p.min.x) min.x = p.min.x;
-	    if (min.y > p.min.y) min.y = p.min.y;
-	    if (min.z > p.min.z) min.z = p.min.z;
-	    if (min.w > p.min.w) min.w = p.min.w;
+	    if (min.x > p.metadata->min.x) min.x = p.metadata->min.x;
+	    if (min.y > p.metadata->min.y) min.y = p.metadata->min.y;
+	    if (min.z > p.metadata->min.z) min.z = p.metadata->min.z;
+	    if (min.w > p.metadata->min.w) min.w = p.metadata->min.w;
 	}
 	return min;
     }
@@ -156,10 +156,10 @@ public:
     const Vector3 get_max() const {
 	Vector3 max(Vector3::lowest());
 	for (auto& p: patches) {
-	    if (max.x < p.max.x) max.x = p.max.x;
-	    if (max.y < p.max.y) max.y = p.max.y;
-	    if (max.z < p.max.z) max.z = p.max.z;
-	    if (max.w < p.max.w) max.w = p.max.w;
+	    if (max.x < p.metadata->max.x) max.x = p.metadata->max.x;
+	    if (max.y < p.metadata->max.y) max.y = p.metadata->max.y;
+	    if (max.z < p.metadata->max.z) max.z = p.metadata->max.z;
+	    if (max.w < p.metadata->max.w) max.w = p.metadata->max.w;
 	}
 	return max;
     }
@@ -173,7 +173,7 @@ public:
 	decomp;
     }
 
-private:
+protected:
     Temperature temperature;
     // std::vector<Interactions> interactions; // not quite..
     
diff --git a/src/Types.h b/src/Types.h
index ff72cbe..9213ebb 100644
--- a/src/Types.h
+++ b/src/Types.h
@@ -27,6 +27,8 @@ inline std::string string_format(const std::string fmt_str, ...) {
     return std::string(formatted.get());
 }
 
+
+// Includes of various types (allows those to be used simply by including Types.h)
 #include "Types/Vector3.h"
 #include "Types/Matrix3.h"
 
@@ -37,3 +39,19 @@ using Matrix3 = Matrix3_t<float,false>;
 
 #include "Types/Array.h"
 using VectorArr = Array<Vector3>;
+
+// Helpful routines
+HOST DEVICE inline Vector3_t<size_t> index_to_ijk(size_t idx, size_t nx, size_t ny, size_t nz) {
+    Vector3_t<size_t> res;
+    res.z = idx % nz;
+    res.y = (idx/nz) % ny;
+    res.x = (idx/(ny*nz)) % nx;
+    return res;
+}
+HOST DEVICE inline Vector3_t<size_t> index_to_ijk(size_t idx, const size_t n[]) {
+    return index_to_ijk(idx, n[0], n[1], n[2]);
+}
+HOST DEVICE inline Vector3_t<size_t> index_to_ijk(size_t idx, const Vector3_t<size_t> n) {
+    return index_to_ijk(idx, n.x, n.y, n.z);
+}
+
diff --git a/src/Types/Array.h b/src/Types/Array.h
index 359505c..91dcbcf 100644
--- a/src/Types/Array.h
+++ b/src/Types/Array.h
@@ -53,7 +53,7 @@ public:
 	    values[i] = a[i];
 	}
 	// printf("Copy-operator for Array<T> %x with values %x\n",this, values);
-	printf("Copy-operator for Array<T>\n");
+	// printf("Copy-operator for Array<T>\n");
 	return *this;
     }
     HOST DEVICE inline Array<T>& operator=(Array<T>&& a) { // move assignment operator
@@ -65,7 +65,7 @@ public:
 	a.num = 0;
 	a.values = nullptr;
 	// printf("Move-operator for Array<T> %x with values %x\n",this, values);
-	printf("Move-operator for Array<T>\n");
+	// printf("Move-operator for Array<T>\n");
 	return *this;
     }
     HOST void clear() {
@@ -143,7 +143,7 @@ public:
 	Array<T> tmp(0);
 	tmp.num = num;
 	tmp.values = values_d;
-	printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d);
+	// printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d);
 	return tmp;
     }
     template <typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr>
@@ -159,7 +159,7 @@ public:
 		gpuErrchk(cudaMalloc(&values_d, sz));
 		// Copy values
 		for (size_t i = 0; i < num; ++i) {
-		    printf("Sending_children for children\n");
+		    // printf("Sending_children for children\n");
 		    auto tmp = values[i].send_children(location);
 		    send(location, tmp, values_d+i);
 		    tmp.clear();
@@ -177,7 +177,7 @@ public:
 	Array<T> tmp(0);
 	tmp.num = num;
 	tmp.values = values_d;
-	printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d);
+	// printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d);
 	return tmp;
     }
     
@@ -208,7 +208,7 @@ public:
 	gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice));
 	tmp.num = 0;
 	tmp.values = nullptr;
-	printf("Copying Array<%s> %x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, dev_ptr);
+	// printf("Copying Array<%s> %x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, dev_ptr);
 	return dev_ptr;
     }
 
@@ -240,7 +240,7 @@ public:
 	gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice));
 	tmp.num = 0;
 	tmp.values = nullptr;
-	printf("Copying Array %x with values %x to device at %x\n",this, values, dev_ptr);
+	// printf("Copying Array %x with values %x to device at %x\n",this, values, dev_ptr);
 	return dev_ptr;
     }
 
@@ -293,7 +293,7 @@ public:
 
     template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
     HOST static void remove_from_cuda(Array<T>* dev_ptr, bool remove_self = true) {
-	printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
+	// printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
 	if (dev_ptr == nullptr) return;
 	Array<T> tmp(0);
 	gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost));
@@ -312,7 +312,7 @@ public:
 
     template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr>
     HOST static void remove_from_cuda(Array<T>* dev_ptr, bool remove_self = true) {
-	printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
+	// printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
 	if (dev_ptr == nullptr) return;
 	Array<T> tmp(0);
 	gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost));
diff --git a/src/Types/Vector3.h b/src/Types/Vector3.h
index 85001a7..70eba08 100644
--- a/src/Types/Vector3.h
+++ b/src/Types/Vector3.h
@@ -168,6 +168,26 @@ public:
 	}
 
 
+	template<typename U>
+	HOST DEVICE inline auto element_mult(const U w[]) {
+	    using TU = typename std::common_type<T,U>::type;
+	    Vector3_t<TU> ret( x*w[0], y*w[1], z*w[2]);
+	    return ret;
+	}
+	template<typename U>
+	HOST DEVICE inline auto element_mult(const Vector3_t<U>&& w) {
+	    using TU = typename std::common_type<T,U>::type;
+	    Vector3_t<TU> ret( x*w.x, y*w.y, z*w.z);
+	    return ret;
+	}
+	template<typename U>
+	HOST DEVICE inline auto element_mult(const Vector3_t<U>& w) {
+	    using TU = typename std::common_type<T,U>::type;
+	    Vector3_t<TU> ret( x*w.x, y*w.y, z*w.z);
+	    return ret;
+	}
+
+
 	template<typename U>
 	HOST DEVICE static inline auto element_mult(const Vector3_t<T>&& v, const Vector3_t<U>&& w) {
 	    using TU = typename std::common_type<T,U>::type;
@@ -178,6 +198,16 @@ public:
 		return ret;
 	}
 
+	template<typename U>
+	HOST DEVICE static inline auto element_mult(const Vector3_t<T>&& v, const U w[]) {
+	    using TU = typename std::common_type<T,U>::type;
+		Vector3_t<TU> ret(
+			v.x*w[0],
+			v.y*w[1],
+			v.z*w[2]);
+		return ret;
+	}
+
 	HOST DEVICE static inline Vector3_t<T> element_sqrt(const Vector3_t<T>&& w) {
 		Vector3_t<T> ret(
 			sqrt(w.x),
-- 
GitLab