Something went wrong on our end
Proxy.h 18.88 KiB
#pragma once
#include <future>
#include <iostream>
#include "Resource.h"
#include "cuda.h"
#include "cuda_runtime.h"
// template<typename T, typename...Args1, typename... Args2>
// __global__ void proxy_sync_call_kernel_noreturn(T* addr, (T::*memberFunc(Args1...)), Args2...args);
#ifdef __CUDACC__
#include <cuda/std/utility>
// Kernels
template<typename T, typename RetType, typename...Args>
__global__ void proxy_sync_call_kernel(RetType* result, T* addr, RetType (T::*memberFunc(Args...)), Args...args) {
if (blockIdx.x == 0) {
*result = (addr->*memberFunc)(args...);
}
}
template<typename T, typename... Args>
__global__ void constructor_kernel(T* __restrict__ devptr, Args...args) {
if (blockIdx.x == 0) {
devptr = new T{::cuda::std::forward<Args>(args)...};
}
}
#endif
// START traits
// These ugly bits of code help implement SFINAE in C++14 and should likely be removed if a newer standard is adopted
// https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type
/**
* @brief Template trait to check if a method 'send_children' exists in a type.
*/
#include <type_traits>
template <typename T, typename = void>
struct has_send_children : std::false_type {};
template <typename T>
struct has_send_children<T, decltype(std::declval<T>().send_children(Resource{Resource::CPU,0}), void())> : std::true_type {};
// template <typename T, typename = void>
// struct has_metadata : std::false_type {};
// template <typename _tT>
// struct has_metadata<T, decltype(std::declval<T>()::Metadata, void())> : std::true_type {};
template <typename...>
using void_t = void;
// struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { };
// END traits
// Used by Proxy class
template <typename T, typename = void>
struct Metadata_t {
Metadata_t(const T& obj) {};
Metadata_t(const Metadata_t<T>& other) {};
};
template <typename T>
struct Metadata_t<T, void_t<typename T::Metadata>> : T::Metadata {
Metadata_t(const T& obj) : T::Metadata(obj) {};
Metadata_t(const Metadata_t<T>& other) : T::Metadata(other) {};
};
// struct Metadata_t<T, decltype(std::declval<typename T::Metadata>(), void())> : T::Metadata { };
// template<typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
// struct Proxy {
// /**
// * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr.
// */
// Proxy() : location{Resource{Resource::CPU,0}}, addr{nullptr} {};
// Proxy(const Resource& r, T* obj) : location{r}, addr{obj} {};
// /**
// * @brief Overloaded operator-> returns the address of the underlying object.
// * @return The address of the underlying object.
// */
// auto operator->() { return addr; }
// auto operator->() const { return addr; }
// /**
// * @brief The resource associated with the data represented by the proxy.
// */
// Resource location; ///< The device (thread/gpu) holding the data represented by the proxy.
// T* addr; ///< The address of the underlying object.
// };
/**
* @brief Template class representing a proxy for the underlying data.
* @tparam T The type of the underlying data.
*/
// Q: why a pointer?
// Q: rename to Metadata? Has
// consequences for Proxy<Proxy<T>>
// that limits specialization but uses
// T::Metadata automatically
//
// A: depends on how Proxy<Proxy<T>>
// objects are used
// void move(const Resource& newloc) {
// LOGTRACE("Moving object from {} to {}", location, newloc);
// Proxy<T> new_proxy;
// switch (location.type) {
// case Resource::CPU:
// if (location.is_local()) {
// new_proxy = send(location, &addr);
// } else {
// Exception( NotImplementedError, "Proxy::move() non-local CPU calls" );
// }
// break;
// case Resource::GPU:
// if (location.is_local()) {
// Exception( NotImplementedError, "Proxy::move() local GPU calls" );
// } else {
// Exception( NotImplementedError, "Proxy::move() non-local GPU calls" );
// }
// break;
// case Resource::MPI:
// Exception( NotImplementedError, "MPI move (deprecate?)" );
// break;
// default:
// Exception( ValueError, "Proxy::move(): Unknown resource type" );
// }
// // auto Proxy<T>{ location , newaddr }
// // auto tmpmeta = send(newloc, metadata);
// location = newloc;
// addr = new_proxy.addr;
// metadata = new_proxy.metadata;
// }
// C++17 way: template<typename T, typename Metadata = std::void_t<typename T::Metadata>>
// C++14 way: template<typename T, typename Metadata = typename std::conditional<has_metadata<T>::value, typename T::Metadata, void>::type>
// Neither needed!
template<typename T, typename Enable = void>
struct Proxy {
/**
* @brief Default constructor initializes the location to a default CPU resource and the address to nullptr.
*/
Proxy() : location(Resource{Resource::CPU,0}), addr(nullptr), metadata(nullptr) {
LOGINFO("Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
};
Proxy(const Resource& r) : location(r), addr(nullptr), metadata(nullptr) {
LOGINFO("Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
};
Proxy(const Resource& r, T& obj, T* dest = nullptr) : location(r), addr(dest == nullptr ? &obj : dest) {
if (dest == nullptr) metadata = nullptr;
else metadata = new Metadata_t<T>(obj);
LOGINFO("Constructing Proxy<{}> @{} wrapping @{} with metadata @{}",
type_name<T>().c_str(), fmt::ptr(this), fmt::ptr(&obj), fmt::ptr(metadata));
};
// Copy constructor
Proxy(const Proxy<T>& other) : location(other.location), addr(other.addr), metadata(nullptr) {
LOGINFO("Copy Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
if (other.metadata != nullptr) {
const Metadata_t<T>& tmp = *(other.metadata);
metadata = new Metadata_t<T>(tmp);
}
};
Proxy<T>& operator=(const Proxy<T>& other) {
if (this != &other) {
// Free existing resources.
if (metadata != nullptr) delete metadata;
location = other.location;
addr = other.addr;
const Metadata_t<T>& tmp = *(other.metadata);
metadata = new Metadata_t<T>(tmp); // copy construct!
// std::copy(other.metadata, other.metadata + sizeof(Metadata_t<T>), metadata);
}
return *this;
};
Proxy(Proxy<T>&& other) : addr(nullptr), metadata(nullptr) {
LOGINFO("Move Constructing Proxy<{}> @{}", type_name<T>().c_str(), fmt::ptr(this));
location = other.location;
addr = other.addr;
// For now we avoid std::move, but we may choose to change this behavior
// const Metadata_t<T>& tmp = *(other.metadata);
metadata = other.metadata;
other.metadata = nullptr;
};
~Proxy() {
LOGINFO("Deconstructing Proxy<{}> @{} with metadata @{}", type_name<T>().c_str(), fmt::ptr(this), fmt::ptr(metadata));
if (metadata != nullptr) delete metadata;
};
/**
* @brief Overloaded operator-> returns the address of the underlying object.
* @return The address of the underlying object.
*/
auto operator->() { return addr; };
auto operator->() const { return addr; };
/**
* @brief The resource associated with the data represented by the proxy.
*/
Resource location; ///< The device (thread/gpu) holding the data represented by the proxy.
T* addr; ///< The address of the underlying object.
Metadata_t<T>* metadata; ///< T-specific metadata that resides in same memory space as Proxy<T>
// Use two template parameter packs as suggested here: https://stackoverflow.com/questions/26994969/inconsistent-parameter-pack-deduction-with-variadic-templates
template <typename RetType, typename... Args1, typename... Args2>
RetType callSync(RetType (T::*memberFunc)(Args1...), Args2&&... args) {
switch (location.type) {
case Resource::CPU:
if (location.is_local()) {
return (addr->*memberFunc)(std::forward<Args2>(args)...);
} else {
Exception( NotImplementedError, "Proxy::callSync() non-local CPU calls" );
}
break;
case Resource::GPU:
#ifdef __CUDACC__
if (location.is_local()) {
if (sizeof(RetType) > 0) {
// Note: this only support basic RetType objects
RetType* dest;
RetType obj;
gpuErrchk(cudaMalloc(&dest, sizeof(RetType)));
proxy_sync_call_kernel<T, RetType, Args2...><<<1,32>>>(dest, addr, addr->*memberFunc, args...);
// proxy_sync_call_kernel<><<<1,32>>>(dest, addr, addr->*memberFunc, args...);
gpuErrchk(cudaMemcpy(dest, &obj, sizeof(RetType), cudaMemcpyHostToDevice));
gpuErrchk(cudaFree(dest));
return obj;
} else {
Exception( NotImplementedError, "Proxy::callSync() local GPU calls" );
}
} else {
Exception( NotImplementedError, "Proxy::callSync() non-local GPU calls" );
}
#else
Exception( NotImplementedError, "Proxy::callSync() for GPU only defined for files compiled with nvvc" );
#endif
break;
case Resource::MPI:
Exception( NotImplementedError, "MPI sync calls (deprecate?)" );
break;
default:
Exception( ValueError, "Proxy::callSync(): Unknown resource type" );
}
return RetType{};
}
// TODO generalize to handle void RetType
template <typename RetType, typename... Args1, typename... Args2>
std::future<RetType> callAsync(RetType (T::*memberFunc)(Args1...), Args2&&... args) {
switch (location.type) {
case Resource::CPU:
if (location.is_local()) {
return (addr->*memberFunc)(std::forward<Args2>(args)...);
} else {
Exception( NotImplementedError, "Proxy::callAsync() non-local CPU calls" );
}
break;
case Resource::GPU:
if (location.is_local()) {
Exception( NotImplementedError, "Proxy::callAsync() local GPU calls" );
} else {
Exception( NotImplementedError, "Proxy::callAsync() non-local GPU calls" );
}
break;
case Resource::MPI:
Exception( NotImplementedError, "MPI async calls (deprecate?)" );
break;
default:
Exception( ValueError, "Proxy::callAsync(): Unknown resource type" );
}
return std::async(std::launch::async, [] { return RetType{}; });
}
};
// Specialization for bool/int/float types that do not have member functions
template<typename T>
struct Proxy<T, typename std::enable_if_t<std::is_arithmetic<T>::value>> {
/**
* @brief Default constructor initializes the location to a default CPU resource and the address to nullptr.
*/
Proxy() : location{Resource{Resource::CPU,0}}, addr{nullptr} {};
Proxy(const Resource& r, T* obj) : location{r}, addr{obj} {};
/**
* @brief Overloaded operator-> returns the address of the underlying object.
* @return The address of the underlying object.
*/
auto operator->() { return addr; }
auto operator->() const { return addr; }
/**
* @brief The resource associated with the data represented by the proxy.
*/
Resource location; ///< The device (thread/gpu) holding the data represented by the proxy.
T* addr; ///< The address of the underlying object.
};
// using Proxy<int> = SimpleProxy<int>;
// class Proxy<int> {
// // Define Metadata types using SFINAE
// // template<typename=void> struct Metadata_t { };
// // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { };
// // template<typename=void> struct Metadata_t { };
// // template<> struct Metadata_t<void_t<T::Metadata>> : T::Metadata { };
// // using Metadata_t = Metadata_t<T>;
// public:
// /**
// * @brief Default constructor initializes the location to a default CPU resource and the address to nullptr.
// */
// Proxy<int>() : location(Resource{Resource::CPU,0}), addr(nullptr) {};
// Proxy<int>(const Resource& r, int* obj) : location(r), addr(obj) {};
// /**
// * @brief Overloaded operator-> returns the address of the underlying object.
// * @return The address of the underlying object.
// */
// auto operator->() { return addr; }
// auto operator->() const { return addr; }
// /**
// * @brief The resource associated with the data represented by the proxy.
// */
// Resource location; ///< The device (thread/gpu) holding the data represented by the proxy.
// int* addr; ///< The address of the underlying object.
// };
// // Partial specialization
// template<typename T>
// using Proxy<T> = Proxy<T, std::void_t<typename T::Metadata>>
// // template<typename T>
// // class Proxy<T, typename T::Metadata> { };
/**
* @brief Template function to send data ignoring children to a specified location.
* @tparam T The type of the data to be sent.
* @param location The destination resource for the data.
* @param obj The data to be sent.
* @param dest Optional parameter to provide a pre-allocated destination. If not provided, memory is allocated.
* @return A Proxy representing the data at the destination location.
*/
template <typename T>
HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T* dest = nullptr) {
LOGTRACE(" _send_ignoring_children...");
switch (location.type) {
case Resource::GPU:
LOGINFO(" GPU...");
#ifdef USE_CUDA
if (location.is_local()) {
if (dest == nullptr) { // allocate if needed
LOGTRACE(" cudaMalloc for array");
gpuErrchk(cudaMalloc(&dest, sizeof(T)));
}
gpuErrchk(cudaMemcpy(dest, &obj, sizeof(T), cudaMemcpyHostToDevice));
} else {
Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local GPU" );
}
#else
Exception( NotImplementedError, "USE_CUDA is not enabled" );
#endif
break;
case Resource::CPU:
LOGINFO(" CPU...");
if (location.is_local()) {
LOGINFO(" local CPU...");
// if (dest == nullptr) { // allocate if needed
// LOGINFO(" allocate memory...");
// LOGTRACE(" Allocate CPU memory for {}", type_name<T>().c_str());
// dest = new T;
// }
// LOGINFO(" memcpying...");
// memcpy(dest, &obj, sizeof(T));
// dest = *obj;
} else {
LOGINFO(" nonlocal...");
// Exception( NotImplementedError, "`_send_ignoring_children(...)` on non-local CPU" );
}
break;
default:
// error
Exception( ValueError, "`_send_ignoring_children(...)` applied with unkown resource type" );
}
LOGINFO(" creating Proxy...");
// Proxy<T>* ret = new Proxy<T>(location, dest); // Proxies should be explicitly removed
// LOGINFO(" ...done @{}", fmt::ptr(ret));
// Proxy<T>&& ret =
return Proxy<T>(location, obj, dest); // Proxies should be explicitly removed
//LOGINFO(" ...done @{}", fmt::ptr(&ret));
// return ret;
// LOGINFO(" ...done @{}", fmt::ptr(ret));
// return *ret;
}
/**
* @brief Template function to send simple objects to a specified location without considering child objects.
* This version will be selected upon send(location, obj) if obj.send_children does not exist (C++14-compatible SFINAE)
* @tparam T The type of the data to be sent.
* @param location The destination resource for the data.
* @param obj The data to be sent.
* @param dest Optional parameter to provide a pre-allocated destination. If not provided, memory is allocated.
* @return A Proxy representing the data at the destination location.
*/
template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr>
HOST inline Proxy<T>& send(const Resource& location, T& obj, T* dest = nullptr) {
LOGINFO("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
// Simple objects can simply be copied without worrying about contained objects and arrays
Proxy<T>&& ret = _send_ignoring_children(location, obj, dest);
LOGTRACE("...done sending");
// printf("...done\n");
return ret;
}
/**
* @brief Template function to send more complex objects to a specified location.
* This version will be selected upon send(location, obj) if obj.send_children exists (C++14-compatible SFINAE)
* @tparam T The type of the data to be sent.
* @param location The destination resource for the data.
* @param obj The data to be sent.
* @param dest Optional parameter to provide a pre-allocated destination. If not provided, memory is allocated on the GPU.
* @return A Proxy representing the data at the destination location.
*/
template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr>
HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
LOGINFO("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location
Proxy<T> ret = _send_ignoring_children<T>(location, dummy, dest);
LOGTRACE("... clearing dummy complex object");
dummy.clear();
LOGTRACE("... done sending");
return ret;
}
// Utility function for constructing objects in remote memory address
// spaces, obviating the need to construct simple objects locally
// before copying. Returns a Proxy object, but in cases where the
// remote resource location is non-CPU or non-local, metadata for
// Proxy will be blank.
template<typename T, typename... Args>
Proxy<T> construct_remote(Resource location, Args&&...args) {
switch (location.type) {
case Resource::CPU:
if (location.is_local()) {
T* ptr = new T{std::forward<Args>(args)...};
return Proxy<T>(location, *ptr);
} else {
Exception( NotImplementedError, "construct_remote() non-local CPU calls" );
}
break;
case Resource::GPU:
#ifdef __CUDACC__
if (location.is_local()) {
T* devptr;
LOGWARN("construct_remote: TODO: switch to device associated with location");
gpuErrchk(cudaMalloc(&devptr, sizeof(T)));
constructor_kernel<<<1,32>>>(devptr, std::forward<Args>(args)...);
gpuErrchk(cudaDeviceSynchronize());
LOGWARN("construct_remote: proxy.metadata not set");
return Proxy<T>(location);
// Exception( NotImplementedError, "cunstruct_remote() local GPU call" );
// Note: this only support basic RetType objects
// T* dest;
// T obj;
// gpuErrchk(cudaMalloc(&dest, sizeof(RetType)));
// proxy_sync_call_kernel<T, RetType, Args2...><<<1,32>>>(dest, addr, addr->*memberFunc, args...);
// gpuErrchk(cudaMemcpy(dest, &obj, sizeof(RetType), cudaMemcpyHostToDevice));
// gpuErrchk(cudaFree(dest));
} else {
Exception( NotImplementedError, "cunstruct_remote() non-local GPU call" );
}
#else
Exception( NotImplementedError, "construct_remote() for GPU only defined for files compiled with nvvc" );
#endif
break;
case Resource::MPI:
Exception( NotImplementedError, "construct_remote() for MPI" );
break;
default:
Exception( ValueError, "construct_remote(): unknown resource type" );
}
return Proxy<T>{};
}