From 3b23fc2442ef219b44e1c9ff214ed5d2fbfea2c8 Mon Sep 17 00:00:00 2001 From: Chris Maffeo <cmaffeo2@illinois.edu> Date: Tue, 16 Jan 2024 17:40:36 -0600 Subject: [PATCH] Checkpoint --- src/CMakeLists.txt | 1 + src/ParticlePatch.cpp | 84 ++++++++++++++++++++++++++++------ src/ParticlePatch.h | 103 ++++++++++++++++++++++++++++++++++++++---- src/Proxy.h | 54 +++++++++++++++++++--- src/SimSystem.cpp | 57 +++++++++++++++++++---- src/SimSystem.h | 38 ++++++++-------- src/Types.h | 18 ++++++++ src/Types/Array.h | 18 ++++---- src/Types/Vector3.h | 30 ++++++++++++ 9 files changed, 338 insertions(+), 65 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 742832a..772b770 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,6 +3,7 @@ add_library("lib${PROJECT_NAME}" ARBDException.cpp GPUManager.cpp ParticlePatch.cpp + SimSystem.cpp Integrator.cpp Integrator/CPU.cpp Integrator/CUDA.cu diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp index 151e182..4bf45d8 100644 --- a/src/ParticlePatch.cpp +++ b/src/ParticlePatch.cpp @@ -8,24 +8,30 @@ // }; void Patch::initialize() { - data->pos_force = data->momentum = nullptr; - data->particle_types = data->particle_order = nullptr; + Data* data = new Data{nullptr,nullptr,nullptr,nullptr}; // TODO free + metadata = Metadata{0,0,0}; + // data->pos_force = data->momentum = nullptr; + // data->particle_types = data->particle_order = nullptr; + metadata.data = send( Resource{Resource::CPU,0}, *data ); }; Patch* Patch::copy_to_cuda(Patch* dev_ptr) const { + Exception(NotImplementedError, "Deprecated"); if (dev_ptr == nullptr) { // allocate if needed gpuErrchk(cudaMalloc(&dev_ptr, sizeof( typeid(this) ))); } - Patch* tmp; - tmp = new Patch(0); - tmp->data->pos_force = data->pos_force->copy_to_cuda(); - tmp->data->momentum = data->momentum->copy_to_cuda(); - tmp->data->particle_types = data->particle_types->copy_to_cuda(); - tmp->data->particle_order = data->particle_order->copy_to_cuda(); + // Patch* tmp; + // tmp = new Patch(0); + // tmp->metadata->data->pos_force = data->pos_force->copy_to_cuda(); + // tmp->data->momentum = data->momentum->copy_to_cuda(); + // tmp->data->particle_types = data->particle_types->copy_to_cuda(); + // tmp->data->particle_order = data->particle_order->copy_to_cuda(); + + // return tmp; + return nullptr; - return tmp; // tmp.pos_force; // // Allocate member data // pos_force. @@ -44,12 +50,22 @@ Patch* Patch::copy_to_cuda(Patch* dev_ptr) const { Patch Patch::send_children(Resource location) const { Patch tmp(0); + Proxy<Data> newdata; switch (location.type) { case Resource::GPU: - tmp.data->pos_force = data->pos_force->copy_to_cuda(); - tmp.data->momentum = data->momentum->copy_to_cuda(); - tmp.data->particle_types = data->particle_types->copy_to_cuda(); - tmp.data->particle_order = data->particle_order->copy_to_cuda(); + // metadata.data is already a proxy because the data may reside on the GPU; + // Q: In what cases should we move metadata.data.addr? + // For example, we could always move, or we could never move, or we could move if location metadata.data.location is a CPU resource + + // if (location != metadata.data,location) + // newdata = metadata.data.callSync( send, addr + // data->pos_force = metadata.data.callSync( send, ->pos_force->copy_to_cuda(); + // data->momentum = metadata.data->momentum->copy_to_cuda(); + // data->particle_types = metadata.data->particle_types->copy_to_cuda(); + // data->particle_order = metadata.data->particle_order->copy_to_cuda(); + + // Nothing to do then? 
+ break; case Resource::CPU: Exception( NotImplementedError, "`send_children(...)` not implemented on CPU" ); @@ -57,7 +73,8 @@ Patch Patch::send_children(Resource location) const { default: Exception( ValueError, "`send_children(...)` passed unknown resource type" ); } - tmp.data.location = location; + newdata.location = location; + tmp.metadata.data = newdata; return tmp; }; @@ -77,3 +94,42 @@ void Patch::compute() { c_p->compute(this); } }; + + +// template<typename T> +// size_t Patch::send_particles_filtered( Proxy<Patch>& destination, T filter ) { +size_t Patch::send_particles_filtered( Proxy<Patch>* destination, std::function<bool(size_t,Patch::Data)> filter ) { +// TODO determine who allocates and cleans up temporary data + // TODO determine if there is a good way to avoid having temporary data (this can be done later if delegated to a reasonable object) + + // TODO think about the design and whether it's possible to have a single array with start/end + // TODO think about the design for parallel sending; should a receiver do some work? + + // TODO currently everything is sychronous, but that will change; how do we avoid race conditions? + // E.g. have a single array allocated with known start and end for each PE/Patch? + + Data buffer; // not sure which object should allocate + { + using _filter_t = decltype(filter); + // using A = decltype(destination->metadata.data); + using A = Proxy<Patch::Data>&; + // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter ); + // size_t num_sent = metadata.data.callSync<size_t,A>( static_cast<size_t (Patch::Data::*)(Proxy<Patch::Data>&, _filter_t)>(&Patch::Data::send_particles_filtered), destination->metadata.data, filter ); + + // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter ); + size_t num_sent = metadata.data.callSync( &Patch::Data::send_particles_filtered, &((*destination)->metadata.data), filter ); + + } + // size_t num_sent = metadata.data.callSync<size_t,Proxy<Patch::Data>&,_filter_t>( &Patch::Data::send_particles_filtered<_filter_t>, destination->metadata.data, filter ); + return 0; +}; + +// template<typename T> +// size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, T filter ) { } +size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>* destination, std::function<bool(size_t,Patch::Data)> filter ) { + Data buffer; // not sure which object should allocate + // metadata.data.callSync( &Patch::Data::send_particles_filtered, destination, filter ); + // x + return 0; +}; + diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h index 52e6266..2423565 100644 --- a/src/ParticlePatch.h +++ b/src/ParticlePatch.h @@ -10,6 +10,7 @@ #include <vector> // std::vector #include <memory> // std::make_unique +#include <functional> #ifdef USE_CUDA #include <cuda.h> @@ -23,7 +24,12 @@ #include "PatchOp.h" +class Decomposer; +class CellDecomposer; + class BasePatch { + friend Decomposer; + friend CellDecomposer; public: // BasePatch(size_t num, short thread_id, short gpu_id, SimSystem& sys); // BasePatch(size_t num, short thread_id, short gpu_id); @@ -124,6 +130,33 @@ public: Patch() : BasePatch() { initialize(); } Patch(size_t capacity) : BasePatch(capacity) { initialize(); } + // Particle data arrays pointing to either CPU or GPU memory + struct Data { + VectorArr* pos_force; + VectorArr* momentum; + Array<size_t>* particle_types; + Array<size_t>* particle_order; + + 
HOST DEVICE inline Vector3& get_pos(size_t i) { return (*pos_force)[i*2]; }; + HOST DEVICE inline Vector3& get_force(size_t i) { return (*pos_force)[i*2+1]; }; + HOST DEVICE inline Vector3& get_momentum(size_t i) { return (*momentum)[i]; }; + HOST DEVICE inline size_t& get_type(size_t i) { return (*particle_types)[i]; }; + HOST DEVICE inline size_t& get_order(size_t i) { return (*particle_order)[i]; }; + + // Replace with auto? Return number of particles sent? + // template<typename T> + // size_t send_particles_filtered( Proxy<Data>& destination, T filter ); // = [](size_t idx, Patch::Data d)->bool { return true; } ); + size_t send_particles_filtered( Proxy<Data>* destination, std::function<bool(size_t,Data)> filter ); // = [](size_t idx, Patch::Data d)->bool { return true; } ); + + }; + + // Metadata stored on host even if Data is on device + struct Metadata { + size_t num; + Vector3 min; + Vector3 max; + Proxy<Data> data; // actual data may be found elsewhere + }; // void deleteParticles(IndexList& p); // void addParticles(size_t n, size_t typ); @@ -149,6 +182,27 @@ public: // TODO? emplace_point_particles void compute(); + + // Communication + size_t send_particles( Proxy<Patch>* destination ); // Same as send_children? + // void send_particles_filtered( Proxy<Patch> destination, std::function<bool(size_t, Patch::Data)> = [](size_t idx, Patch::Data d)->bool { return true; } ); + + // Replace with auto? Return number of particles sent? + // template<typename T> + // size_t send_particles_filtered( Proxy<Patch>& destination, T filter ); + // // [](size_t idx, Patch::Data d)->bool { return true; } ); + size_t send_particles_filtered( Proxy<Patch>* destination, std::function<bool(size_t,Data)> filter ); + // [](size_t idx, Patch::Data d)->bool { return true; } ); + + void clear() { + WARN("Patch::clear() was called but is not implemented"); + } + + size_t test() { + WARN("Patch::test() was called but is not implemented"); + return 1; + } + private: void initialize(); @@ -160,14 +214,7 @@ private: std::vector<std::unique_ptr<PatchOp>> local_computes; // Operations that will be performed on this patch each timestep std::vector<std::unique_ptr<PatchOp>> nonlocal_computes; // Operations that will be performed on this patch each timestep - // CPU particle arrays - struct Data { - VectorArr* pos_force; - VectorArr* momentum; - Array<size_t>* particle_types; - Array<size_t>* particle_order; - }; - Proxy<Data> data; // actual data may be found elsewhere + Metadata metadata; // Usually associated with proxy, but can use it here too // size_t num_rb; // Vector3* rb_pos; @@ -187,6 +234,46 @@ private: }; +/* MOVED +// template<typename T> +// size_t Patch::send_particles_filtered( Proxy<Patch>& destination, T filter ) { +size_t Patch::send_particles_filtered( Proxy<Patch>* destination, std::function<bool(size_t,Patch::Data)> filter ) { +// TODO determine who allocates and cleans up temporary data + // TODO determine if there is a good way to avoid having temporary data (this can be done later if delegated to a reasonable object) + + // TODO think about the design and whether it's possible to have a single array with start/end + // TODO think about the design for parallel sending; should a receiver do some work? + + // TODO currently everything is sychronous, but that will change; how do we avoid race conditions? + // E.g. have a single array allocated with known start and end for each PE/Patch? 
+ + Data buffer; // not sure which object should allocate + { + using _filter_t = decltype(filter); + // using A = decltype(destination->metadata.data); + using A = Proxy<Patch::Data>&; + // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter ); + // size_t num_sent = metadata.data.callSync<size_t,A>( static_cast<size_t (Patch::Data::*)(Proxy<Patch::Data>&, _filter_t)>(&Patch::Data::send_particles_filtered), destination->metadata.data, filter ); + + // size_t num_sent = metadata.data.callSync<size_t,A>( &Patch::Data::send_particles_filtered, destination->metadata.data, filter ); + size_t num_sent = metadata.data.callSync( &Patch::Data::send_particles_filtered, &((*destination)->metadata.data), filter ); + + } + // size_t num_sent = metadata.data.callSync<size_t,Proxy<Patch::Data>&,_filter_t>( &Patch::Data::send_particles_filtered<_filter_t>, destination->metadata.data, filter ); + return 0; +}; + +// template<typename T> +// size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>& destination, T filter ) { } +size_t Patch::Data::send_particles_filtered( Proxy<Patch::Data>* destination, std::function<bool(size_t,Patch::Data)> filter ) { + Data buffer; // not sure which object should allocate + // metadata.data.callSync( &Patch::Data::send_particles_filtered, destination, filter ); + // x + return 0; +}; + +*/ + // // Patch::Patch(size_t num, short thread_id, short gpu_id) {}; // #ifndef USE_CUDA // void Patch::compute() { diff --git a/src/Proxy.h b/src/Proxy.h index 78ee47d..eeac2c1 100644 --- a/src/Proxy.h +++ b/src/Proxy.h @@ -18,6 +18,7 @@ struct Resource { }; // START traits +// These ugly bits of code help implement SFINAE in C++14 and should likely be removed if a newer standard is adopted // https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type /** * @brief Template trait to check if a method 'send_children' exists in a type. @@ -25,18 +26,42 @@ struct Resource { #include <type_traits> template <typename T, typename = void> struct has_send_children : std::false_type {}; - template <typename T> struct has_send_children<T, decltype(std::declval<T>().send_children(Resource{Resource::CPU,0}), void())> : std::true_type {}; + +// template <typename T, typename = void> +// struct has_metadata : std::false_type {}; +// template <typename T> +// struct has_metadata<T, decltype(std::declval<T>()::Metadata, void())> : std::true_type {}; + +template <typename...> +using void_t = void; +// struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; // END traits +template <typename T, typename = void> +struct Metadata_t { }; +template <typename T> +struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata { }; +// struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; /** * @brief Template class representing a proxy for the underlying data. * @tparam T The type of the underlying data. 
*/ +// C++17 way: template<typename T, typename Metadata = std::void_t<typename T::Metadata>> +// C++14 way: +//template<typename T, typename Metadata = typename std::conditional<has_metadata<T>::value, typename T::Metadata, void>::type> template<typename T> class Proxy { + + // Define Metadata types using SFINAE + // template<typename=void> struct Metadata_t { }; + // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; + // template<typename=void> struct Metadata_t { }; + // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { }; + // using Metadata_t = Metadata_t<T>; + public: /** @@ -57,12 +82,22 @@ public: */ Resource location; ///< The device (thread/gpu) holding the data represented by the proxy. T* addr; ///< The address of the underlying object. - + Metadata_t<T>* metadata; // Q: rename to Metadata? Has + // consequences for Proxy<Proxy<T>> + // that limits specialization but uses + // T::Metadata automatically + // + // A: depends on how Proxy<Proxy<T>> + // objects are used + template <typename RetType, typename... Args> - RetType callSync(RetType (T::*memberFunc)(Args...), Args... args) { + RetType callSync(RetType (T::*memberFunc)(Args...), Args&&... args) { switch (location.type) { case Resource::CPU: - return (addr->*memberFunc)(args...); + // return ([&](auto&&... capturedArgs) { + // return (addr->*memberFunc)(std::forward<decltype(capturedArgs)>(capturedArgs)...); + // })(std::forward<Args>(args)...); + return (addr->*memberFunc)(std::forward<Args>(args)...); case Resource::GPU: // Handle GPU-specific logic std::cerr << "Error: GPU not implemented in synchronous call." << std::endl; @@ -108,6 +143,13 @@ public: } }; +// // Partial specialization +// template<typename T> +// using Proxy<T> = Proxy<T, std::void_t<typename T::Metadata>> +// // template<typename T> +// // class Proxy<T, typename T::Metadata> { }; + + /** * @brief Template function to send data ignoring children to a specified location. * @tparam T The type of the data to be sent. @@ -151,7 +193,7 @@ template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { TRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); // Simple objects can simply be copied without worrying about contained objects and arrays - auto ret = _send_ignoring_children<T>(location, obj, dest); + auto ret = _send_ignoring_children(location, obj, dest); TRACE("...done sending"); // printf("...done\n"); return ret; @@ -170,7 +212,7 @@ template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { TRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location - Proxy<T> ret = _send_ignoring_children(location, dummy, dest); + Proxy<T> ret = _send_ignoring_children<T>(location, dummy, dest); TRACE("... clearing dummy complex object"); dummy.clear(); TRACE("... 
done sending"); diff --git a/src/SimSystem.cpp b/src/SimSystem.cpp index 74df513..bd5c13d 100644 --- a/src/SimSystem.cpp +++ b/src/SimSystem.cpp @@ -7,7 +7,7 @@ void CellDecomposer::decompose(SimSystem& sys, ResourceCollection& resources) { // Initialize workers; TODO move this out of decompose std::vector<Proxy<Worker>> workers; for (auto& r: resources.resources) { - Worker w(sys.cutoff, sys.patches); + Worker w{sys.cutoff, sys.patches}; auto w_p = send(r, w); workers.push_back( w_p ); } @@ -20,17 +20,56 @@ void CellDecomposer::decompose(SimSystem& sys, ResourceCollection& resources) { Vector3 n_p_v = (dr / cutoff).element_floor(); size_t n_r = resources.resources.size(); - size_t n_p = static_cast<size_t>(round(n_p_v[0]*n_p_v[1]*n_p_v[2])); - for (size_t i = 0; i < n_p; ++i) { - auto w_p = workers[i / floor(n_p/n_r)]; - w_p.create_patch(); - // other stuff - } + size_t num_particles = 0; + for (auto& p: sys.patches) num_particles += p.metadata->num; + + std::vector<Proxy<Patch>> new_patches; + // std::vector<Patch> new_patches; + size_t n_p = static_cast<size_t>(round(n_p_v.x*n_p_v.y*n_p_v.z)); + for (size_t idx = 0; idx < n_p; ++idx) { + Vector3_t<size_t> ijk = index_to_ijk( idx, n_p_v.x, n_p_v.y, n_p_v.z ); + // size_t cap = 2*num_particles / n_r; + // auto p2 = Patch(cap); + auto p2 = Patch(); // don't allocate array locally + // TODO: generalize to non-orthogonal basis + Vector3 pmin = min + dr.element_mult(ijk); + Vector3 pmax = pmin + dr; + + p2.lower_bound = pmin; + p2.upper_bound = pmax; + + size_t r_idx = idx / ceil(n_p/n_r); + + //auto p2_p = send(resources.resources[r_idx], p2); + Proxy<Patch> p2_p = send(resources.resources[r_idx], p2); + + for (auto& p: sys.patches) { + auto filter_lambda = [pmin, pmax] (size_t i, Patch::Data d)->bool { + return d.get_pos(i).x >= pmin.x && d.get_pos(i).x < pmax.x && + d.get_pos(i).y >= pmin.y && d.get_pos(i).y < pmax.y && + d.get_pos(i).z >= pmin.z && d.get_pos(i).z < pmax.z; }; + auto filter = std::function<bool(size_t, Patch::Data)>(filter_lambda); + using _filter_t = decltype(filter); + // p.callSync<void,Proxy<Patch>,_filter_t>( &Patch::send_particles_filtered<_filter_t>, p2_p, filter ); + // p.callSync<size_t, Proxy<Patch>&, _filter_t>(static_cast<size_t (Patch::*)(Proxy<Patch>&, _filter_t)>(&Patch::send_particles_filtered<_filter_t>), p2_p, filter); + p.callSync( &Patch::test ); // DEBUG + // p.callSync<size_t, Proxy<Patch>&>( &Patch::send_particles_filtered, p2_p, filter); + p.callSync( &Patch::send_particles_filtered, &p2_p, filter); + + num_particles += p.metadata->num; + } + new_patches.push_back( p2_p ); + } + // size_t n_p = static_cast<size_t>(round(n_p_v[0]*n_p_v[1]*n_p_v[2])); + // for (size_t i = 0; i < n_p; ++i) { + // auto w_p = workers[i / floor(n_p/n_r)]; + // w_p.create_patch(); + // // other stuff + // } + - std::vector<Proxy<Patch>> new_patches; - // Count particles in each new_patch // (ultimately this must be distributed, but via what mechanisms? 
Decomposer is only on controlling thread in current implementation, but perhaps each thread should have its own copy) diff --git a/src/SimSystem.h b/src/SimSystem.h index 9f893dd..c4710f2 100644 --- a/src/SimSystem.h +++ b/src/SimSystem.h @@ -32,16 +32,15 @@ public: bool periodic[dim]; }; -template<> -class Proxy<Patch> : BasePatch { -public: - Proxy<Patch>(Resource& r, Patch* obj) : location(&r), addr(obj) { }; - Resource* location; - Patch* addr; - size_t num; - Vector3 min; - Vector3 max; -}; +// class ProxyPatch : public Proxy<Patch>, public BasePatch { +// public: +// ProxyPatch(Resource& r, Patch* obj) : location(&r), addr(obj) { }; +// Resource* location; +// Patch* addr; +// size_t num; +// Vector3 min; +// Vector3 max; +// }; struct ResourceCollection { @@ -85,6 +84,7 @@ class CellDecomposer : public Decomposer { class SimSystem { friend class Decomposer; + friend class CellDecomposer; friend class SimManager; public: struct Conf { @@ -145,10 +145,10 @@ public: const Vector3 get_min() const { Vector3 min(Vector3::highest()); for (auto& p: patches) { - if (min.x > p.min.x) min.x = p.min.x; - if (min.y > p.min.y) min.y = p.min.y; - if (min.z > p.min.z) min.z = p.min.z; - if (min.w > p.min.w) min.w = p.min.w; + if (min.x > p.metadata->min.x) min.x = p.metadata->min.x; + if (min.y > p.metadata->min.y) min.y = p.metadata->min.y; + if (min.z > p.metadata->min.z) min.z = p.metadata->min.z; + if (min.w > p.metadata->min.w) min.w = p.metadata->min.w; } return min; } @@ -156,10 +156,10 @@ public: const Vector3 get_max() const { Vector3 max(Vector3::lowest()); for (auto& p: patches) { - if (max.x < p.max.x) max.x = p.max.x; - if (max.y < p.max.y) max.y = p.max.y; - if (max.z < p.max.z) max.z = p.max.z; - if (max.w < p.max.w) max.w = p.max.w; + if (max.x < p.metadata->max.x) max.x = p.metadata->max.x; + if (max.y < p.metadata->max.y) max.y = p.metadata->max.y; + if (max.z < p.metadata->max.z) max.z = p.metadata->max.z; + if (max.w < p.metadata->max.w) max.w = p.metadata->max.w; } return max; } @@ -173,7 +173,7 @@ public: decomp; } -private: +protected: Temperature temperature; // std::vector<Interactions> interactions; // not quite.. diff --git a/src/Types.h b/src/Types.h index ff72cbe..9213ebb 100644 --- a/src/Types.h +++ b/src/Types.h @@ -27,6 +27,8 @@ inline std::string string_format(const std::string fmt_str, ...) 
{ return std::string(formatted.get()); } + +// Includes of various types (allows those to be used simply by including Types.h) #include "Types/Vector3.h" #include "Types/Matrix3.h" @@ -37,3 +39,19 @@ using Matrix3 = Matrix3_t<float,false>; #include "Types/Array.h" using VectorArr = Array<Vector3>; + +// Helpful routines +HOST DEVICE inline Vector3_t<size_t> index_to_ijk(size_t idx, size_t nx, size_t ny, size_t nz) { + Vector3_t<size_t> res; + res.z = idx % nz; + res.y = (idx/nz) % ny; + res.x = (idx/(ny*nz)) % nx; + return res; +} +HOST DEVICE inline Vector3_t<size_t> index_to_ijk(size_t idx, const size_t n[]) { + return index_to_ijk(idx, n[0], n[1], n[2]); +} +HOST DEVICE inline Vector3_t<size_t> index_to_ijk(size_t idx, const Vector3_t<size_t> n) { + return index_to_ijk(idx, n.x, n.y, n.z); +} + diff --git a/src/Types/Array.h b/src/Types/Array.h index 359505c..91dcbcf 100644 --- a/src/Types/Array.h +++ b/src/Types/Array.h @@ -53,7 +53,7 @@ public: values[i] = a[i]; } // printf("Copy-operator for Array<T> %x with values %x\n",this, values); - printf("Copy-operator for Array<T>\n"); + // printf("Copy-operator for Array<T>\n"); return *this; } HOST DEVICE inline Array<T>& operator=(Array<T>&& a) { // move assignment operator @@ -65,7 +65,7 @@ public: a.num = 0; a.values = nullptr; // printf("Move-operator for Array<T> %x with values %x\n",this, values); - printf("Move-operator for Array<T>\n"); + // printf("Move-operator for Array<T>\n"); return *this; } HOST void clear() { @@ -143,7 +143,7 @@ public: Array<T> tmp(0); tmp.num = num; tmp.values = values_d; - printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d); + // printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d); return tmp; } template <typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr> @@ -159,7 +159,7 @@ public: gpuErrchk(cudaMalloc(&values_d, sz)); // Copy values for (size_t i = 0; i < num; ++i) { - printf("Sending_children for children\n"); + // printf("Sending_children for children\n"); auto tmp = values[i].send_children(location); send(location, tmp, values_d+i); tmp.clear(); @@ -177,7 +177,7 @@ public: Array<T> tmp(0); tmp.num = num; tmp.values = values_d; - printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d); + // printf("Array<%s>.send_children() @%x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, values_d); return tmp; } @@ -208,7 +208,7 @@ public: gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice)); tmp.num = 0; tmp.values = nullptr; - printf("Copying Array<%s> %x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, dev_ptr); + // printf("Copying Array<%s> %x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, dev_ptr); return dev_ptr; } @@ -240,7 +240,7 @@ public: gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice)); tmp.num = 0; tmp.values = nullptr; - printf("Copying Array %x with values %x to device at %x\n",this, values, dev_ptr); + // printf("Copying Array %x with values %x to device at %x\n",this, values, dev_ptr); return dev_ptr; } @@ -293,7 +293,7 @@ public: template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr> HOST static void remove_from_cuda(Array<T>* 
dev_ptr, bool remove_self = true) { - printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr); + // printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr); if (dev_ptr == nullptr) return; Array<T> tmp(0); gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost)); @@ -312,7 +312,7 @@ public: template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr> HOST static void remove_from_cuda(Array<T>* dev_ptr, bool remove_self = true) { - printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr); + // printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr); if (dev_ptr == nullptr) return; Array<T> tmp(0); gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost)); diff --git a/src/Types/Vector3.h b/src/Types/Vector3.h index 85001a7..70eba08 100644 --- a/src/Types/Vector3.h +++ b/src/Types/Vector3.h @@ -168,6 +168,26 @@ public: } + template<typename U> + HOST DEVICE inline auto element_mult(const U w[]) { + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> ret( x*w[0], y*w[1], z*w[2]); + return ret; + } + template<typename U> + HOST DEVICE inline auto element_mult(const Vector3_t<U>&& w) { + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> ret( x*w.x, y*w.y, z*w.z); + return ret; + } + template<typename U> + HOST DEVICE inline auto element_mult(const Vector3_t<U>& w) { + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> ret( x*w.x, y*w.y, z*w.z); + return ret; + } + + template<typename U> HOST DEVICE static inline auto element_mult(const Vector3_t<T>&& v, const Vector3_t<U>&& w) { using TU = typename std::common_type<T,U>::type; @@ -178,6 +198,16 @@ public: return ret; } + template<typename U> + HOST DEVICE static inline auto element_mult(const Vector3_t<T>&& v, const U w[]) { + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> ret( + v.x*w[0], + v.y*w[1], + v.z*w[2]); + return ret; + } + HOST DEVICE static inline Vector3_t<T> element_sqrt(const Vector3_t<T>&& w) { Vector3_t<T> ret( sqrt(w.x), -- GitLab
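
A note on the index_to_ijk helpers added to Types.h: they unflatten a linear cell index with z varying fastest, i.e. idx = (i*ny + j)*nz + k, which is the ordering CellDecomposer::decompose assumes when it turns a patch index into a cell coordinate and then into the pmin/pmax bounds. A minimal round-trip check, assuming Types.h is on the include path (the main() harness below is illustrative only, not part of the patch):

#include <cassert>
#include <cstdio>
#include "Types.h"

int main() {
    const size_t nx = 2, ny = 3, nz = 4;
    for (size_t idx = 0; idx < nx*ny*nz; ++idx) {
        Vector3_t<size_t> ijk = index_to_ijk(idx, nx, ny, nz);
        // z varies fastest, so the flattened index reconstructs as (i*ny + j)*nz + k
        assert(idx == (ijk.x*ny + ijk.y)*nz + ijk.z);
    }
    printf("index_to_ijk round-trips on a %zu x %zu x %zu grid\n", nx, ny, nz);
    return 0;
}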
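
The Metadata_t machinery added to Proxy.h is the C++14 detection idiom spelled out with void_t: when the wrapped type declares a nested Metadata struct (as Patch now does), Proxy<T>::metadata exposes it; otherwise the empty primary template is selected. A compile-time sketch of the intended behavior, assuming Proxy.h is included; Plain and WithMeta are made-up types used only for illustration:

#include <cstddef>
#include <type_traits>
#include "Proxy.h"

struct Plain { int x; };                      // no nested Metadata
struct WithMeta {
    struct Metadata { size_t num; };          // picked up by the partial specialization
};

// No nested Metadata -> the empty primary template is chosen.
static_assert(std::is_empty<Metadata_t<Plain>>::value,
              "Plain gets the empty placeholder");
// Nested Metadata -> Metadata_t inherits it, so proxy.metadata->num is usable,
// which is what SimSystem::get_min()/get_max() rely on for Proxy<Patch>.
static_assert(std::is_base_of<WithMeta::Metadata, Metadata_t<WithMeta>>::value,
              "WithMeta::Metadata is detected and inherited");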
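
Finally, the callSync pattern used throughout CellDecomposer::decompose (p.callSync(&Patch::test), p.callSync(&Patch::send_particles_filtered, &p2_p, filter)) dispatches on Proxy::location and, for a CPU resource, invokes the member function directly on the wrapped address. A minimal sketch of that CPU path with a hypothetical Counter type (not in the patch); it assumes Proxy is default-constructible, as the Proxy<Data> newdata in send_children suggests, and passes the extra argument as an rvalue so the Args&&... deduction agrees with the member-function pointer:

#include <cstddef>
#include "Proxy.h"

struct Counter {
    size_t n = 0;
    size_t bump(size_t by) { n += by; return n; }
};

int main() {
    Counter c;
    Proxy<Counter> c_p;                         // default-constructed, fields filled in by hand
    c_p.location = Resource{Resource::CPU, 0};  // selects the CPU branch of the switch in callSync
    c_p.addr = &c;
    size_t total = c_p.callSync(&Counter::bump, size_t{3}); // runs (addr->*memberFunc)(3) on this thread
    return total == 3 ? 0 : 1;
}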