diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp index e64231a3b491d500aacd53fb9016567075b8a6ee..a4b76ca3ce43183ae1fbe8801d1058b0d8c30ff9 100644 --- a/src/ParticlePatch.cpp +++ b/src/ParticlePatch.cpp @@ -7,12 +7,29 @@ // // rb_pos = rb_orient = rb_mom = rb_ang_mom = type = rb_type = nullptr; // }; +size_t BasePatch::global_patch_idx = 0; + void Patch::initialize() { + LOGINFO("Creating Patch::Data"); + /* Data* data = new Data{nullptr,nullptr,nullptr,nullptr}; // TODO free - metadata = Metadata{0,0,0}; + // metadata = Metadata{0,0,0}; // data->pos_force = data->momentum = nullptr; // data->particle_types = data->particle_order = nullptr; - metadata.data = send( Resource{Resource::CPU,0}, *data ); + LOGINFO("Sending Patch::Data @{}", fmt::ptr(data)); + Resource r = Resource{Resource::CPU,0}; + // if (&(metadata.data) != nullptr) { + // LOGINFO( "Freeing metadata.data"); + // delete &(metadata.data); + // LOGINFO( "Freeing metadata"); + // } + // send(r,*data); + metadata.data = send(r, *data); + LOGINFO("Data sent"); + */ + LOGWARN("Patch::initialize(): Creating Data on CPU resource"); + Resource r = Resource{Resource::CPU,0}; + metadata.data = construct_remote<Data>(r, capacity); }; Patch* Patch::copy_to_cuda(Patch* dev_ptr) const { @@ -49,10 +66,12 @@ Patch* Patch::copy_to_cuda(Patch* dev_ptr) const { }; Patch Patch::send_children(Resource location) const { + LOGWARN("Patch::send_children()"); Patch tmp(0); Proxy<Data> newdata; switch (location.type) { case Resource::GPU: + Exception( NotImplementedError, "`send_children(...)` not implemented on GPU" ); // metadata.data is already a proxy because the data may reside on the GPU; // Q: In what cases should we move metadata.data.addr? 
// For example, we could always move, or we could never move, or we could move if location metadata.data.location is a CPU resource @@ -80,11 +99,13 @@ Patch Patch::send_children(Resource location) const { Patch Patch::copy_from_cuda(Patch* dev_ptr, Patch* dest) { + Exception(NotImplementedError, "Deprecated"); // TODO Patch tmp = Patch(); return tmp; }; void Patch::remove_from_cuda(Patch* dev_ptr) { + Exception(NotImplementedError, "Deprecated"); // TODO }; @@ -99,7 +120,7 @@ void Patch::compute() { // template<typename T> // size_t Patch::send_particles_filtered( Proxy<Patch>& destination, T filter ) { size_t Patch::send_particles_filtered( Proxy<Patch>& destination, std::function<bool(size_t,Patch::Data)> filter ) { -// TODO determine who allocates and cleans up temporary data + // TODO determine who allocates and cleans up temporary data // TODO determine if there is a good way to avoid having temporary data (this can be done later if delegated to a reasonable object) // TODO think about the design and whether it's possible to have a single array with start/end diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h index ee217a38bb700fca4af8db7b5b59dbb3b1123331..7cf7a51851b803ca84b45d572b1cd872f40721b1 100644 --- a/src/ParticlePatch.h +++ b/src/ParticlePatch.h @@ -33,8 +33,28 @@ class BasePatch { public: // BasePatch(size_t num, short thread_id, short gpu_id, SimSystem& sys); // BasePatch(size_t num, short thread_id, short gpu_id); - BasePatch() : num(0), capacity(0) {} - BasePatch(size_t capacity) : num(0), capacity(capacity) {} + BasePatch() : num(0), capacity(0), patch_idx(++global_patch_idx) {} + BasePatch(size_t capacity) : num(0), capacity(capacity), patch_idx(++global_patch_idx) {} + + // Copy constructor + BasePatch(const BasePatch& other) : num(other.num), capacity(other.capacity), patch_idx(++global_patch_idx) { + LOGTRACE("Copy constructing {} @{}", type_name<*this>().c_str(), fmt::ptr(this)); +} + // Move constructor + BasePatch(BasePatch&& other) : 
num(std::move(other.num)), capacity(std::move(other.capacity)), patch_idx(std::move(other.patch_idx)) { + LOGTRACE("Move constructing {} @{}", type_name<*this>().c_str(), fmt::ptr(this)); + } + // Move assignment operator + BasePatch& operator=(BasePatch&& other) { + LOGTRACE("Move assigning {} @{}", type_name<*this>().c_str(), fmt::ptr(this)); + num = std::move(other.num); + capacity = std::move(other.capacity); + patch_idx = std::move(other.patch_idx); + // lower_bound = std::move(other.lower_bound); + // upper_bound = std::move(other.upper_bound); + return *this; + } + // ~BasePatch(); protected: @@ -44,12 +64,15 @@ protected: // short thread_id; // MPI // short gpu_id; // -1 if GPU unavailable - static size_t patch_idx; // Unique ID across ranks + static size_t global_patch_idx; // Unique ID across ranks // TODO: preallocate regions that will be used, or offload this to a parallel singleton class + /* const */ size_t patch_idx; // Unique ID across ranks + // Q: should we have different kinds of patches? E.g. Spheres? 
This specialization _could_ go into subclass, or we could have a ptr to a Region class that can have different implementations Vector3 lower_bound; Vector3 upper_bound; }; + // class PatchProxy { // // // public: @@ -127,11 +150,22 @@ protected: // Storage class that should be class Patch : public BasePatch { public: - Patch() : BasePatch() { initialize(); } + Patch() : BasePatch(), metadata() { LOGINFO("Creating Patch"); initialize(); LOGINFO("Done Creating Patch"); } Patch(size_t capacity) : BasePatch(capacity) { initialize(); } // Particle data arrays pointing to either CPU or GPU memory struct Data { + Data(const size_t capacity = 0) { + if (capacity == 0) { + pos_force = momentum = nullptr; + particle_types = particle_order = nullptr; + } else { + pos_force = new VectorArr(capacity); + momentum = new VectorArr(capacity); + particle_types = new Array<size_t>(capacity); + particle_order = new Array<size_t>(capacity); + } + } VectorArr* pos_force; VectorArr* momentum; Array<size_t>* particle_types; @@ -152,7 +186,11 @@ public: // Metadata stored on host even if Data is on device struct Metadata { + Metadata() : num(0), capacity(0), min(0), max(0), data() {}; + Metadata(const Patch& p) : num(p.metadata.num), capacity(p.metadata.capacity), min(p.metadata.min), max(p.metadata.max), data(p.metadata.data) {}; + Metadata(const Metadata& other) : num(other.num), capacity(other.capacity), min(other.min), max(other.max), data(other.data) {}; size_t num; + size_t capacity; Vector3 min; Vector3 max; Proxy<Data> data; // actual data may be found elsewhere @@ -203,7 +241,6 @@ public: return 1; } - private: void initialize(); diff --git a/src/Patch.h b/src/Patch.h index cd211f94457eae5a9dac48c3ec6bfb9d472e099a..2773a3d52bae14ba47daca96b97d03db159b7424 100644 --- a/src/Patch.h +++ b/src/Patch.h @@ -16,38 +16,38 @@ #include "PatchOp.h" -template<typename Pos, typename Force> -class Patch { - // size_t num_replicas; +// template<typename Pos, typename Force> +// class Patch { +// 
// size_t num_replicas; -public: - Patch(SimParameters* sim); +// public: +// Patch(SimParameters* sim); - void compute(); +// void compute(); -private: +// private: - SimParameters* sim; +// SimParameters* sim; - Pos minimum; - Pos maximum; +// Pos minimum; +// Pos maximum; - ParticleTypes* types; - ParticleTypes* types_d; +// ParticleTypes* types; +// ParticleTypes* types_d; - // Particle data - size_t num_particles; +// // Particle data +// size_t num_particles; - idx_t* global_idx; // global index of particle - size_t* type_ids; - Pos* pos; - Force* force; +// idx_t* global_idx; // global index of particle +// size_t* type_ids; +// Pos* pos; +// Force* force; - size_t* type_ids_d; - Pos* pos_d; - Force* force_d; +// size_t* type_ids_d; +// Pos* pos_d; +// Force* force_d; - std::vector<Patch> neighbor_patches; - std::vector<BasePatchOp> ops; -}; +// std::vector<Patch> neighbor_patches; +// std::vector<BasePatchOp> ops; +// }; diff --git a/src/Proxy.h b/src/Proxy.h index e1c49e553f9468cfe8994be5732929b0ce6df73f..3756277f932e8e6217ecf06dfca063670c7bb82f 100644 --- a/src/Proxy.h +++ b/src/Proxy.h @@ -4,6 +4,30 @@ #include <iostream> #include "Resource.h" +#include "cuda.h" +#include "cuda_runtime.h" + +// template<typename T, typename...Args1, typename... Args2> +// __global__ void proxy_sync_call_kernel_noreturn(T* addr, (T::*memberFunc(Args1...)), Args2...args); + +#ifdef __CUDACC__ +#include <cuda/std/utility> +// Kernels +template<typename T, typename RetType, typename...Args> +__global__ void proxy_sync_call_kernel(RetType* result, T* addr, RetType (T::*memberFunc(Args...)), Args...args) { + if (blockIdx.x == 0) { + *result = (addr->*memberFunc)(args...); + } +} + +template<typename T, typename... 
Args> +__global__ void constructor_kernel(T* __restrict__ devptr, Args...args) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + new (devptr) T{::cuda::std::forward<Args>(args)...}; + } +} +#endif + // START traits // These ugly bits of code help implement SFINAE in C++14 and should likely be removed if a newer standard is adopted // https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type @@ -18,7 +42,7 @@ struct has_send_children<T, decltype(std::declval<T>().send_children(Resource{Re // template <typename T, typename = void> // struct has_metadata : std::false_type {}; -// template <typename T> +// template <typename _tT> // struct has_metadata<T, decltype(std::declval<T>()::Metadata, void())> : std::true_type {}; template <typename...> @@ -28,9 +52,15 @@ using void_t = void; // Used by Proxy class template <typename T, typename = void> -struct Metadata_t { }; +struct Metadata_t { + Metadata_t(const T& obj) {}; + Metadata_t(const Metadata_t<T>& other) {}; +}; template <typename T> -struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata { }; +struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata { + Metadata_t(const T& obj) : T::Metadata(obj) {}; + Metadata_t(const Metadata_t<T>& other) : T::Metadata(other) {}; +}; // struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { }; @@ -113,9 +143,52 @@ struct Proxy { /** * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr. 
*/ - Proxy() : location(Resource{Resource::CPU,0}), addr(nullptr) {}; - Proxy(const Resource& r, T* obj) : location(r), addr(obj) {}; - + Proxy() : location(Resource{Resource::CPU,0}), addr(nullptr), metadata(nullptr) { + LOGINFO("Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this)); + }; + Proxy(const Resource& r) : location(r), addr(nullptr), metadata(nullptr) { + LOGINFO("Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this)); + }; + Proxy(const Resource& r, T& obj, T* dest = nullptr) : location(r), addr(dest == nullptr ? &obj : dest) { + if (dest == nullptr) metadata = nullptr; + else metadata = new Metadata_t<T>(obj); + LOGINFO("Constructing Proxy<{}> @{} wrapping @{} with metadata @{}", + type_name<T>().c_str(), fmt::ptr(this), fmt::ptr(&obj), fmt::ptr(metadata)); + }; + // Copy constructor + Proxy(const Proxy<T>& other) : location(other.location), addr(other.addr), metadata(nullptr) { + LOGINFO("Copy Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this)); + if (other.metadata != nullptr) { + const Metadata_t<T>& tmp = *(other.metadata); + metadata = new Metadata_t<T>(tmp); + } + }; + Proxy<T>& operator=(const Proxy<T>& other) { + if (this != &other) { + // Free existing resources. + if (metadata != nullptr) delete metadata; + location = other.location; + addr = other.addr; + const Metadata_t<T>& tmp = *(other.metadata); + metadata = new Metadata_t<T>(tmp); // copy construct! 
+ // std::copy(other.metadata, other.metadata + sizeof(Metadata_t<T>), metadata); + } + return *this; + }; + Proxy(Proxy<T>&& other) : addr(nullptr), metadata(nullptr) { + LOGINFO("Move Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this)); + location = other.location; + addr = other.addr; + // For now we avoid std::move, but we may choose to change this behavior + // const Metadata_t<T>& tmp = *(other.metadata); + metadata = other.metadata; + other.metadata = nullptr; + }; + ~Proxy() { + LOGINFO("Deconstructing Proxy<{}> @{} with metadata @{}", type_name<T>().c_str(), fmt::ptr(this), fmt::ptr(metadata)); + if (metadata != nullptr) delete metadata; + }; + /** * @brief Overloaded operator-> returns the address of the underlying object. * @return The address of the underlying object. @@ -142,11 +215,27 @@ struct Proxy { } break; case Resource::GPU: +#ifdef __CUDACC__ if (location.is_local()) { - Exception( NotImplementedError, "Proxy::callSync() local GPU calls" ); + if (sizeof(RetType) > 0) { + // Note: this only support basic RetType objects + RetType* dest; + RetType obj; + gpuErrchk(cudaMalloc(&dest, sizeof(RetType))); + proxy_sync_call_kernel<T, RetType, Args2...><<<1,32>>>(dest, addr, addr->*memberFunc, args...); + // proxy_sync_call_kernel<><<<1,32>>>(dest, addr, addr->*memberFunc, args...); + gpuErrchk(cudaMemcpy(&obj, dest, sizeof(RetType), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaFree(dest)); + return obj; + } else { + Exception( NotImplementedError, "Proxy::callSync() local GPU calls" ); + } } else { Exception( NotImplementedError, "Proxy::callSync() non-local GPU calls" ); } +#else +Exception( NotImplementedError, "Proxy::callSync() for GPU only defined for files compiled with nvcc" ); +#endif break; case Resource::MPI: Exception( NotImplementedError, "MPI sync calls (deprecate?)" ); @@ -262,8 +351,11 @@ struct Proxy<T, typename std::enable_if_t<std::is_arithmetic<T>::value>> { */ template <typename T> HOST inline Proxy<T>
_send_ignoring_children(const Resource& location, T& obj, T* dest = nullptr) { + LOGTRACE(" _send_ignoring_children..."); switch (location.type) { case Resource::GPU: + LOGINFO(" GPU..."); +#ifdef USE_CUDA if (location.is_local()) { if (dest == nullptr) { // allocate if needed LOGTRACE(" cudaMalloc for array"); @@ -273,16 +365,25 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T } else { Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local GPU" ); } +#else + Exception( NotImplementedError, "USE_CUDA is not enabled" ); +#endif break; case Resource::CPU: + LOGINFO(" CPU..."); if (location.is_local()) { - if (dest == nullptr) { // allocate if needed - LOGTRACE(" Allocate CPU memory for {}", decltype(T)); - dest = new T; - } - memcpy(dest, &obj, sizeof(T)); + LOGINFO(" local CPU..."); + // if (dest == nullptr) { // allocate if needed + // LOGINFO(" allocate memory..."); + // LOGTRACE(" Allocate CPU memory for {}", type_name<T>().c_str()); + // dest = new T; + // } + // LOGINFO(" memcpying..."); + // memcpy(dest, &obj, sizeof(T)); + // dest = *obj; } else { - Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local CPU" ); + LOGINFO(" nonlocal..."); + // Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local CPU" ); } break; default: @@ -290,7 +391,16 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T Exception( ValueError, "`_send_ignoring_children(...)` applied with unknown resource type" ); } - return Proxy<T>(location, dest); + LOGINFO(" creating Proxy..."); + // Proxy<T>* ret = new Proxy<T>(location, dest); // Proxies should be explicitly removed + // LOGINFO(" ...done @{}", fmt::ptr(ret)); + // Proxy<T>&& ret = + return Proxy<T>(location, obj, dest); // Proxies should be explicitly removed + + //LOGINFO(" ...done @{}", fmt::ptr(&ret)); + // return ret; + // LOGINFO(" ...done @{}", fmt::ptr(ret)); + // return *ret; } /** @@ -303,10 
+413,10 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T * @return A Proxy representing the data at the destination location. */ template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr> -HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { - LOGTRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); +HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { + LOGINFO("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); // Simple objects can simply be copied without worrying about contained objects and arrays - auto ret = _send_ignoring_children(location, obj, dest); + Proxy<T> ret = _send_ignoring_children(location, obj, dest); LOGTRACE("...done sending"); // printf("...done\n"); return ret; @@ -323,7 +433,7 @@ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { */ template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr> HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { - LOGTRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); + LOGINFO("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest)); auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location Proxy<T> ret = _send_ignoring_children<T>(location, dummy, dest); LOGTRACE("...
clearing dummy complex object"); @@ -332,4 +442,53 @@ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) { return ret; } +// Utility function for constructing objects in remote memory address +// spaces, obviating the need to construct simple objects locally +// before copying. Returns a Proxy object, but in cases where the +// remote resource location is non-CPU or non-local, metadata for +// Proxy will be blank. +template<typename T, typename... Args> +Proxy<T> construct_remote(Resource location, Args&&...args) { + switch (location.type) { + case Resource::CPU: + if (location.is_local()) { + T* ptr = new T{std::forward<Args>(args)...}; + return Proxy<T>(location, *ptr); + } else { + Exception( NotImplementedError, "construct_remote() non-local CPU calls" ); + } + break; + case Resource::GPU: +#ifdef __CUDACC__ + if (location.is_local()) { + T* devptr; + LOGWARN("construct_remote: TODO: switch to device associated with location"); + gpuErrchk(cudaMalloc(&devptr, sizeof(T))); + constructor_kernel<<<1,32>>>(devptr, std::forward<Args>(args)...); + gpuErrchk(cudaDeviceSynchronize()); + LOGWARN("construct_remote: proxy.metadata not set"); + return Proxy<T>(location); + // Exception( NotImplementedError, "construct_remote() local GPU call" ); + // Note: this only support basic RetType objects + // T* dest; + // T obj; + // gpuErrchk(cudaMalloc(&dest, sizeof(RetType))); + // proxy_sync_call_kernel<T, RetType, Args2...><<<1,32>>>(dest, addr, addr->*memberFunc, args...); + // gpuErrchk(cudaMemcpy(dest, &obj, sizeof(RetType), cudaMemcpyHostToDevice)); + // gpuErrchk(cudaFree(dest)); + } else { + Exception( NotImplementedError, "construct_remote() non-local GPU call" ); + } +#else + Exception( NotImplementedError, "construct_remote() for GPU only defined for files compiled with nvcc" ); +#endif + break; + case Resource::MPI: + Exception( NotImplementedError, "construct_remote() for MPI" ); + break; + default: + Exception( ValueError, 
"construct_remote(): unknown resource type" ); + } + return Proxy<T>{}; +} diff --git a/src/Random.h b/src/Random.h index 8022ff034b91ded22d9a322ecceba7327243f077..8c9175be0dcb700dc9a1a16fa280a4796ddf8cea 100644 --- a/src/Random.h +++ b/src/Random.h @@ -18,6 +18,10 @@ inline void cuRandAssert(curandStatus code, const char *file, int line, bool abo if (abort) exit(code); } } +// #else +// #define QUALIFIERS __host__ __device__ +// #include <curand_kernel.h> +// #undef QUALIFIERS #endif namespace Random { @@ -79,6 +83,41 @@ public: // void reorder(int *a, int n); }; +class RandomCPUOld { +public: + using state_t = void; + + // Methods for maninpulating the state + void init(size_t num, unsigned long seed, size_t offset) { + // + }; + + // Methods for generating random values + HOST DEVICE inline float gaussian(state_t* state) { + LOGWARN("RandomCPU::gaussian(): not implemented"); + return 0; + }; + + HOST DEVICE inline state_t* get_gaussian_state() { return nullptr; }; + + HOST DEVICE inline void set_gaussian_state(state_t* state) {}; + + // // HOST DEVICE inline float gaussian(RandomState* state) {}; + // HOST inline Vector3 gaussian_vector(size_t idx, size_t thread) { + // return Vector3(); + // }; + // unsigned int integer() { + // return 0; + // }; + // unsigned int poisson(float lambda) { + // return 0; + // }; + // float uniform() { + // return 0; + // }; + // void reorder(int *a, int n); +}; + #ifdef USE_CUDA class RandomAcc { @@ -157,6 +196,7 @@ public: gpuErrchk(cudaMalloc((void**)&(rng->states), sizeof(state_t)*num_states)); gpuErrchk(cudaMemcpy(dest, rng, sizeof(RNG),cudaMemcpyHostToDevice)); rng->states = nullptr; + // Since we avoided constructuer, don't call `delete rng;` return dest; } diff --git a/src/Resource.cu b/src/Resource.cu index bff4d609fe55a4fb79d8830a81b729f86b910b92..68f217b71fb6c799f203e9078dbb11e7130baf52 100644 --- a/src/Resource.cu +++ b/src/Resource.cu @@ -1,14 +1,18 @@ #include "Resource.h" -HOST DEVICE size_t current_thread_idx() { 
+HOST DEVICE size_t caller_id() { #ifdef USE_MPI - Exception( NotImplementedError, "current_thread_idx() not implemented on GPU" ); + // LOGINFO("... caller_id(): MPI path"); + Exception( NotImplementedError, "caller_id() not implemented on GPU" ); int world_rank; return MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); #else -#ifdef __CUDA_ACC__ - Exception( NotImplementedError, "current_thread_idx() not implemented on GPU" ); +#ifdef __CUDA_ARCH__ + LOGINFO("... caller_id(): GPU path"); + Exception( NotImplementedError, "caller_id() not implemented on GPU" ); + return 0; #else + // LOGWARN("... caller_id() CPU path: return 0"); return 0; #endif #endif diff --git a/src/Resource.h b/src/Resource.h index 8a6b054af8ea8e2e1f54dc6d0139ea9ae4680f88..3f159399197f72c36f6ec5a11c7c61b7ec8e993d 100644 --- a/src/Resource.h +++ b/src/Resource.h @@ -8,8 +8,11 @@ #include "GPUManager.h" -HOST DEVICE size_t current_thread_idx(); - +/** + * @brief Routine that returns the index of the calling resource, + * regardless of whether it an MPI rank or a GPU. + */ +HOST DEVICE size_t caller_id(); /** @@ -25,29 +28,21 @@ struct Resource { Resource* parent; ///< Parent resource; nullptr unless type is GPU /** - * @brief Checks if the resource is running on the calling thread. + * @brief Check if resource is local to the calling thread. */ - HOST DEVICE bool is_running() const { - bool ret = true; -// #ifdef __CUDA_ACC__ -// ret = (type == GPU); -// #else -// ret = (type == CPU); -// #endif - return ret; - } - - // Q: should this return True for GPU that is attached/assigned to current thread? Currently assuming yes. 
HOST DEVICE bool is_local() const { bool ret = true; -#ifdef __CUDA_ACC__ +#ifdef __CUDACC__ ret = (type == GPU); - LOGWARN("Resource::is_local() not fully implemented on GPU devices; returning {}",ret); + LOGWARN("Resource::is_local(): called from GPU but not fully implemented on GPU devices, returning {}",ret); #else + LOGINFO("Resource::is_local() out of CUDACC..."); if (type == GPU && parent != nullptr) { + LOGINFO("Resource::is_local(): called from CPU for resource on GPU, checking parent..."); ret = parent->is_local(); } else { - ret = (current_thread_idx() == id); + ret = (caller_id() == id); + LOGINFO("Resource::is_local(): called from CPU, returning {}", ret); } #endif return ret; @@ -56,7 +51,7 @@ struct Resource { static Resource Local() { LOGWARN("Resource::Local() not properly implemented"); -#ifdef __CUDA_ACC__ +#ifdef __CUDACC__ return Resource{ GPU, 0 }; #else return Resource{ CPU, 0 }; @@ -65,4 +60,3 @@ struct Resource { bool operator==(const Resource& other) const { return type == other.type && id == other.id; }; }; - diff --git a/src/SimManager.cu b/src/SimManager.cu index 811b6ee1ac4e8eab119e73924b17142fd796f3ae..ad91c2454b91d5ae9b324fef8b6d4ae8ea835226 100644 --- a/src/SimManager.cu +++ b/src/SimManager.cu @@ -10,8 +10,11 @@ void SimManager::run() { // Patch p(10,0,0,sys); Patch p; - + LOGINFO("here2!"); +//* RandomCPU rng{}; + LOGINFO("there2!"); + // RandomAcc rng{}; //ProxyPatch p2(10,0,0); // p.add_compute( std::make_unique<LocalPairForce>() ); @@ -25,18 +28,21 @@ void SimManager::run() { // p.add_compute( std::make_unique<LocalBonded>() ); // #endif + LOGINFO("here!"); auto tmp = Random::get_gaussian_state(&rng); - + LOGINFO("there!"); for (size_t step = 0; step < 10; ++step) { - LOGINFO("Step {}: random {:0.2f}", step, Random::gaussian(&rng,(RandomCPU::state_t*) nullptr)); + LOGINFO("where!"); + LOGINFO("Step {:d}: random", step); // {:0.2f}", step, Random::gaussian(&rng,(RandomCPU::state_t*) nullptr)); p.compute(); #ifdef USE_CUDA 
cudaDeviceSynchronize(); #endif } -#ifdef USE_CUDA - RandomGPU<128>::launch_test_kernel<64>((size_t) 1); - cudaDeviceSynchronize(); -#endif + // */ +// #ifdef USE_CUDA +// RandomGPU<128>::launch_test_kernel<64>((size_t) 1); +// cudaDeviceSynchronize(); +// #endif }; diff --git a/src/SimSystem.cpp b/src/SimSystem.cpp index 4fe9c39df5ffe58a49b3b47153873d111f84fdef..6c09cae45523eb6d83bf3d1226fc3812f006936e 100644 --- a/src/SimSystem.cpp +++ b/src/SimSystem.cpp @@ -30,7 +30,7 @@ void CellDecomposer::decompose(SimSystem& sys, ResourceCollection& resources) { Vector3_t<size_t> ijk = index_to_ijk( idx, n_p_v.x, n_p_v.y, n_p_v.z ); // size_t cap = 2*num_particles / n_r; // auto p2 = Patch(cap); - auto p2 = Patch(); // don't allocate array locally + Patch p2 = Patch(); // don't allocate array locally // TODO: generalize to non-orthogonal basis Vector3 pmin = min + dr.element_mult(ijk); Vector3 pmax = pmin + dr; diff --git a/src/SimSystem.h b/src/SimSystem.h index c4710f2ba57af7508ae7162f7bba6692e7d6896f..a777a2273328ac591a74ad5523d9f1393cdf6b87 100644 --- a/src/SimSystem.h +++ b/src/SimSystem.h @@ -21,9 +21,11 @@ class BoundaryConditions { public: BoundaryConditions() : origin(0), basis{(5000,0,0),(0,5000,0),(0,0,5000)}, periodic{true,true,true} { // do things + LOGINFO("BoundaryConditions()"); } BoundaryConditions( Vector3 basis1, Vector3 basis2, Vector3 basis3, Vector3 origin = Vector3(0), bool periodic1 = true, bool periodic2 = true, bool periodic3 = true ) { // do things + LOGINFO("BoundaryConditions(...)"); } static constexpr size_t dim = 3; @@ -105,6 +107,7 @@ public: // explicit operator int() const {return object_type*16 + algorithm*4 + backend;}; }; + inline // C++17 feature needed for compilation... 
unsure of why static constexpr Conf default_conf = Conf{291.0f, Conf::CellDecomp, Conf::AllPeriodic, {5000.0f,5000.0f,5000.0f}, 50.0f }; SimSystem() : SimSystem(default_conf) {} @@ -170,6 +173,7 @@ public: // void consolidate_patches(); // maybe not needed void distribute_patches() { + LOGINFO("distribute_patches()"); decomp; }