diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c3fa1e1bfafd21b082c7bfcdbea63bed93a9038 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,11 @@ +add_library("lib${PROJECT_NAME}" + GPUManager.cpp + ParticlePatch.cpp + Integrator.cpp + Integrator/CPU.cpp + Integrator/CUDA.cu + Interaction.cpp + Interaction/CPU.cpp + Interaction/CUDA.cu + SignalManager.cpp +) diff --git a/src/Integrator/kernels.h b/src/Integrator/kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..29c7412bba618081d7c66ae7cba08acac2d37081 --- /dev/null +++ b/src/Integrator/kernels.h @@ -0,0 +1,25 @@ +#pragma once + +// #include "../useful.h" +#include "../Types.h" + +#ifdef __CUDACC__ + #define HOST __host__ + #define DEVICE __device__ +#else + #define HOST + #define DEVICE +#endif + +namespace IntegratorKernels { + HOST DEVICE void __inline__ BDIntegrate() { + // std::cout << "Computes::BDIntegrate_inline" << std::endl; + printf("Integrator::BDIntegrate\n"); + }; + + HOST DEVICE void __inline__ BDIntegrate(Vector3* __restrict__ pos, const Vector3 * const __restrict__ force, const int& idx, float& root_Dt, Vector3& normal_sample_3D) { + printf("Integrator::BDIntegrate\n"); + pos[idx] = pos[idx] + force[idx] * root_Dt + normal_sample_3D; + }; + +} diff --git a/src/Interaction.cpp b/src/Interaction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0ec6e2d72bbb7b5db747f83f5281c250f7cb3cc6 --- /dev/null +++ b/src/Interaction.cpp @@ -0,0 +1,70 @@ +#include "Interaction.h" + +bool operator<(const LocalInteraction::Conf x, const LocalInteraction::Conf y) { return (int(x) < int(y)); }; +std::map<LocalInteraction::Conf, LocalInteraction*> LocalInteraction::_interactions; + +LocalInteraction* LocalInteraction::GetInteraction(Conf& conf) { + // Checks _interactions for a matching configuration, returns one if found, otherwise creates + if (conf.backend == Conf::Default) { +#ifdef USE_CUDA + conf.backend = Conf::CUDA; +#else + conf.backend = Conf::CPU; +#endif + } + + // Insert configuration into map, if it exists + auto emplace_result = LocalInteraction::_interactions.emplace(conf, nullptr); + auto& it = emplace_result.first; + bool& inserted = emplace_result.second; + if (inserted) { + // Conf not found, so create a new one + LocalInteraction* tmp; + + switch (conf.object_type) { + case Conf::Particle: + switch (conf.dof) { + case Conf::Bond: + switch (conf.form) { + case Conf::Harmonic: + switch (conf.backend) { + case Conf::CUDA: +#ifdef USE_CUDA + tmp = new LocalBondedCUDA(); +#else + std::cerr << "WARNING: LocalInteraction::GetInteraction: " + << "CUDA disabled, creating CPU interaction instead" << std::endl; + tmp = new LocalBonded(); +#endif + break; + case Conf::CPU: + tmp = new LocalBonded(); + break; + default: + std::cerr << "Error: LocalInteraction::GetInteraction: " + << "Unrecognized backend; exiting" << std::endl; + } + default: + std::cerr << "Error: LocalInteraction::GetInteraction: " + << "Unrecognized form; exiting" << std::endl; + assert(false); + } + break; + default: + std::cerr << "Error: Interaction::GetInteraction: " + << "Unrecognized algorithm type; exiting" << std::endl; + assert(false); + } + break; + case Conf::RigidBody: + assert(false); + break; + default: + std::cerr << "Error: Interaction::GetInteraction: " + << "Unrecognized object type; exiting" << std::endl; + assert(false); + } + it->second = tmp; + } + return it->second; +} diff --git a/src/Interaction/CPU.cpp b/src/Interaction/CPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..39d4bd1b192ec82e1efda4a2fdad8e6d47c8f480 --- /dev/null +++ b/src/Interaction/CPU.cpp @@ -0,0 +1,6 @@ +#include "CPU.h" + +void LocalBonded::compute(Patch* p) { + std::cout << "LocalBonded::compute()" << std::endl; + InteractionKernels::HarmonicBonds(); +}; diff --git a/src/Interaction/CPU.h b/src/Interaction/CPU.h new file mode 100644 index 0000000000000000000000000000000000000000..777a59bc11ea4d627fe57c5db379f0c786734dc6 --- /dev/null +++ b/src/Interaction/CPU.h @@ -0,0 +1,8 @@ +#pragma once +#include "../Interaction.h" + +class LocalBonded : public LocalInteraction { +public: + void compute(Patch* patch); + int num_patches() const { return 1; }; +}; diff --git a/src/Interaction/CUDA.cu b/src/Interaction/CUDA.cu new file mode 100644 index 0000000000000000000000000000000000000000..a89fdd8dc88caca6a6453199d55b4c80867ea416 --- /dev/null +++ b/src/Interaction/CUDA.cu @@ -0,0 +1,15 @@ +#include "CUDA.h" + +#ifdef USE_CUDA +__global__ void HarmonicBonds_kernel() { + if (threadIdx.x == 0) { + printf("HarmonicBonds_kernel()\n"); + InteractionKernels::HarmonicBonds(); + } +}; + +void LocalBondedCUDA::compute(Patch* p) { + printf("HarmonicBondsCUDA::compute()\n"); + HarmonicBonds_kernel<<<1,32>>>(); +}; +#endif diff --git a/src/Interaction/CUDA.h b/src/Interaction/CUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..7a3e9e53bcd82b054ba8d584856987d2f5af388e --- /dev/null +++ b/src/Interaction/CUDA.h @@ -0,0 +1,10 @@ +#pragma once +#include "../Interaction.h" + +#ifdef USE_CUDA +class LocalBondedCUDA : public LocalInteraction { +public: + void compute(Patch* patch); + int num_patches() const { return 1; }; +}; +#endif diff --git a/src/Interaction/kernels.h b/src/Interaction/kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..5ee970cd974ea8104b87ee0e319e66a300d17bed --- /dev/null +++ b/src/Interaction/kernels.h @@ -0,0 +1,24 @@ +#pragma once + +// #include "../useful.h" +#include "../Types.h" + +#ifdef __CUDACC__ + #define HOST __host__ + #define DEVICE __device__ +#else + #define HOST + #define DEVICE +#endif + +namespace InteractionKernels { + HOST DEVICE void __inline__ HarmonicBonds() { + // std::cout << "Computes::BDIntegrate_inline" << std::endl; + printf("Interaction::HarmonicBondsDummy()\n"); + }; + + HOST DEVICE void __inline__ HarmonicBonds(Vector3* __restrict__ pos, const Vector3 * const __restrict__ force) { + printf("Interaction::HarmonicBonds\n"); + // pos[idx] = pos[idx] + force[idx] * root_Dt + normal_sample_3D; + }; +} diff --git a/src/Tests/CMakeLists.txt b/src/Tests/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f85436c0f56a0ca5bedb5692e6049d12a85105d3 --- /dev/null +++ b/src/Tests/CMakeLists.txt @@ -0,0 +1,31 @@ +find_package(Catch2 3 REQUIRED) + +# include(doctest) +include(Catch) # contains some cmake macros that will help with discovery of tests +include(CTest) + +## TODO, add a macro to add these projects, build with CUDA +add_executable("arbd_tests" +vector3_precision.cu +) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda") +set_property(TARGET arbd_tests PROPERTY CXX_STANDARD 14) +set_property(TARGET arbd_tests PROPERTY CUDA_STANDARD 14) +# target_link_libraries("arbd_tests" PRIVATE "lib${PROJECT_NAME}" Catch2::Catch2) +target_link_libraries("arbd_tests" PRIVATE "lib${PROJECT_NAME}" Catch2::Catch2WithMain) + +## Add optional libraries +if(USE_CUDA) + target_link_libraries(arbd_tests PUBLIC curand) +endif() +if(USE_NCCL) + target_link_libraries(arbd_tests PUBLIC nccl) +endif() +if(USE_NVTX) + target_link_libraries(arbd_tests PUBLIC nvToolsExt) +endif() + +## catch_discover_tests("${PROJECT_NAME}_TESTS") + +catch_discover_tests(arbd_tests) +install(TARGETS arbd_tests) diff --git a/src/Tests/type_name.h b/src/Tests/type_name.h new file mode 100644 index 0000000000000000000000000000000000000000..90e83093f160191ed082236f36f57801a746e3c5 --- /dev/null +++ b/src/Tests/type_name.h @@ -0,0 +1,33 @@ +#include <type_traits> +#include <typeinfo> +#ifndef _MSC_VER +#include <cxxabi.h> +#endif +#include <memory> +#include <string> +#include <cstdlib> + +template <typename T> +std::string type_name() { + using TR = typename std::remove_reference<T>::type; + std::unique_ptr<char, void(*)(void*)> own + ( +#ifndef _MSC_VER + abi::__cxa_demangle(typeid(TR).name(), nullptr, + nullptr, nullptr), +#else + nullptr, +#endif + std::free + ); + std::string r = own != nullptr ? own.get() : typeid(TR).name(); + if (std::is_const<TR>::value) + r += " const"; + if (std::is_volatile<TR>::value) + r += " volatile"; + if (std::is_lvalue_reference<T>::value) + r += "&"; + else if (std::is_rvalue_reference<T>::value) + r += "&&"; + return r; +} diff --git a/src/Tests/vector3_precision.cu b/src/Tests/vector3_precision.cu new file mode 100644 index 0000000000000000000000000000000000000000..8fba1f9580e6f5d181fbc2250940bc8a6b918262 --- /dev/null +++ b/src/Tests/vector3_precision.cu @@ -0,0 +1,194 @@ +#include <float.h> +#include <iostream> +#include <cstdio> + +// #include "useful.h" +#include "../SignalManager.h" +#include "../Types.h" +#include <cuda.h> + +#include <catch2/catch_test_macros.hpp> +#include <catch2/matchers/catch_matchers_floating_point.hpp> + +#include "type_name.h" + + +// Test kernel function +// namespace TestVectorOps { + +// ChatGPT("Add doxygen to each function, class and struct, where R represents a return parameter, T and U") + +template<typename R, typename T, typename U> +HOST DEVICE R cross_vectors(T&& v1, U&& v2) { + return v1.cross(v2); +} + +template<typename F, typename R, typename T, typename U> +__global__ void binary_op_test_kernel( F fn, R* result, T in1, U in2 ) { + if (blockIdx.x == 0) { + *result = (*fn)(in1,in2); + } +} + +template<typename T, typename U> +void check_vectors_equal( T&& cpu, U&& gpu) { + CHECK( type_name<decltype(cpu)>() == type_name<decltype(gpu)>() ); // should be unneccesary + CHECK( cpu.x == gpu.x ); + CHECK( cpu.y == gpu.y ); + CHECK( cpu.z == gpu.z ); + CHECK( cpu.w == gpu.w ); +} + +template<typename R, typename T, typename U> +struct TestOp { + std::string name; + typename R (*)(T,U) func; +}; + + +TEST_CASE( "Check that Vector3_t binary operations are identical on GPU and CPU", "[Vector3Dcross]" ) { + // INFO("Test case start"); + using T = Vector3_t<float>; + using U = Vector3_t<double>; + using R = std::common_type_t<T,U>; + + T v1(1,0,0); + U v2(0,2,0); + R *gpu_result_d, gpu_result, cpu_result; + + + std::vector<TestOp> ops; + ops.push_back(TestOp("add", [] __host__ __device__ (T a, U b){ return static_cast<R>(b+a); }; )); + ops.push_back(TestOp("cross", [] __host__ __device__ (T a, U b){ return a.cross(b); }; )); + // auto sub = [] __host__ __device__ (T& a, U& b){ return a.cross(b); }; + // auto dot = [] __host__ __device__ (T& a, U& b){ return a.cross(b); }; + auto cross = [] __host__ __device__ (T a, U b){ return a.cross(b); }; + + using L = decltype(add); + std::vector<R(*)(T,U)> lambdas; + lambdas.push_back(add); + lambdas.push_back(cross); + + // Get_result + cudaMalloc((void **)&gpu_result_d, sizeof(R)); + + for (auto& op: ops) { + INFO(op.name); + binary_op_test_kernel<<<1,1>>>(op.func, gpu_result_d, v1, v2); + cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + + // Get cpu_result + cpu_result = fn(v1,v2); // rvalue version failed on cpu, but works on gpu + + // Check consistency + check_vectors_equal(cpu_result, gpu_result); + } +} + +// int main(int argc, char* argv[]) { +// return 0; +// } + +// int main(int argc, char* argv[]) { +// SignalManager::manage_segfault(); +// Vector3_t<float> v1(1,0,0); +// Vector3_t<double> v2(0,2,0); +// v2.y = 6.62606957e-44; + +// std::cout << "test_vector3.cpp" << std::endl; +// // // std::cout << v1.to_char() << std::endl; + +// // std::cout << v1.to_char() << " x " << v2.to_char() << +// // " = " << (v1.cross(v2)).to_char() << std::endl; +// // std::cout << (-v2).to_char() << " x " << v1.to_char() << +// // " = " << (-v2.cross(v1)).to_char() << std::endl; + +// // std::cout.precision(17); +// // std::cout << sizeof(v2.cross(v1).z) << " " << sizeof(v1.cross(v2).z) << std::endl; +// // std::cout << (v2.cross(v1).z) << " " << (v1.cross(v2).z) << std::endl; +// // std::cout << float(6.62606957e-44) << std::endl; +// // std::cout << " ... done" << std::endl; + +// // test_kernel<<<1,1>>>(v1,v2); +// // cudaDeviceSynchronize(); +// return 0; +// } + +// #include <iostream> + +// // Since C++ 11 +// // template<typename T> +// // using func_t = T (*) (T, T); + +// template<typename R, typename T, typename U> +// using func_t = R (*) (T, U); + +// template <typename R, typename T, typename U> +// __host__ __device__ R add_func (T x, U y) +// { +// return x + y; +// } + +// template <typename T> +// __host__ __device__ T mul_func (T x, T y) +// { +// return x * y; +// } + +// // Required for functional pointer argument in kernel function +// // Static pointers to device functions +// template <typename R, typename T, typename U> +// __device__ func_t<R,T,U> p_add_func = add_func<R,T,U>; +// // template <typename T> +// // __device__ func_t<T> p_mul_func = mul_func<T>; + + +// template <typename R, typename T, typename U> +// __global__ void kernel(func_t<R,T,U> op, T * d_x, U * d_y, R * result) +// { +// *result = (*op)(*d_x, *d_y); +// } + +// template <typename T, typename U> +// void mytest(T x, U y) +// { +// using R = std::common_type_t<T,U>; +// func_t<R,T,U> h_add_func; +// // func_t<T> h_mul_func; + +// T * d_x; +// U * d_y; +// cudaMalloc(&d_x, sizeof(T)); +// cudaMalloc(&d_y, sizeof(U)); +// cudaMemcpy(d_x, &x, sizeof(T), cudaMemcpyHostToDevice); +// cudaMemcpy(d_y, &y, sizeof(U), cudaMemcpyHostToDevice); + +// R result; +// R * d_result, * h_result; +// cudaMalloc(&d_result, sizeof(R)); +// h_result = &result; + +// // Copy device function pointer to host side +// cudaMemcpyFromSymbol(&h_add_func, p_add_func<R,T,U>, sizeof(func_t<R,T,U>)); +// // cudaMemcpyFromSymbol(&h_mul_func, p_mul_func<T>, sizeof(func_t<T>)); + +// kernel<R,T,U><<<1,1>>>(h_add_func, d_x, d_y, d_result); +// cudaDeviceSynchronize(); +// cudaMemcpy(h_result, d_result, sizeof(R), cudaMemcpyDeviceToHost); +// CHECK( result == add_func<R,T,U>(x,y) ); + +// // kernel<T><<<1,1>>>(h_mul_func, d_x, d_y, d_result); +// // cudaDeviceSynchronize(); +// // cudaMemcpy(h_result, d_result, sizeof(T), cudaMemcpyDeviceToHost); +// // CHECK( result == mul_func(x,y) ); +// } + +// TEST_CASE( "Check that Vector3_t binary operations are identical on GPU and CPU", "[Vector3Dcross]" ) { +// INFO("TEST START"); +// mytest<int,float>(2.05, 10.00); +// // mytest<float>(2.05, 10.00); +// // mytest<double>(2.05, 10.00); +// } + +// */ diff --git a/src/Types.h b/src/Types.h new file mode 100644 index 0000000000000000000000000000000000000000..6cfa33eda7c5868d6452029cadccb4b4c3202de5 --- /dev/null +++ b/src/Types.h @@ -0,0 +1,47 @@ +#pragma once + +#ifdef __CUDACC__ + #define HOST __host__ + #define DEVICE __device__ +#else + #define HOST + #define DEVICE +#endif + +#if defined(__CUDACC__) // NVCC + #define MY_ALIGN(n) __align__(n) +#elif defined(__GNUC__) // GCC + #define MY_ALIGN(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) // MSVC + #define MY_ALIGN(n) __declspec(align(n)) +#else + #error "Please provide a definition for MY_ALIGN macro for your host compiler!" +#endif + +#ifdef USE_CUDA +#include <cuda_runtime.h> +#else + +struct MY_ALIGN(16) float4 { + float4() : x(0), y(0), z(0), w(0) {}; + float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}; + float4 operator+(const float4&& o) { + return float4(x+o.x,y+o.y,z+o.z,w+o.w); + // float4 r; + // r.x = x+o.x; r.y = y+o.y; r.z = z+o.z; r.w = w+o.w; + // return r; + }; + float4 operator*(const float&& s) { + return float4(x*s,y*s,z*s,w*s); + }; + + float x,y,z,w; +}; +#endif + + +#include "Types/Vector3.h" +#include "Types/Vector3.h" + +using Vector3 = Vector3_t<float>; +using Matrix3 = Vector3_t<float>; /* TODO: FIX */ diff --git a/src/Types/Vector3.h b/src/Types/Vector3.h new file mode 100644 index 0000000000000000000000000000000000000000..890de64ea4938eb70b53d0ac0ddb0a9626e6f389 --- /dev/null +++ b/src/Types/Vector3.h @@ -0,0 +1,219 @@ +/********************************************************************* + * @file Vector3.h + * + * @brief Declaration of templated Vector3_t class. + *********************************************************************/ +#pragma once +#include <type_traits> // for std::common_type<T,U> + +/** + * 3D vector utility class with common operations implemented on CPU and GPU. + * + * Implemented with 4D data storage for better GPU alignment; extra + * data can be stored in fourth varaible this->w + * + * @tparam T the type of data stored in the four fields, x,y,z,w; T + * Should usually be float or double. +g */ +template<typename T> +class MY_ALIGN(4*sizeof(T)) Vector3_t { +public: + HOST DEVICE inline Vector3_t<T>() : x(0), y(0), z(0), w(0) {} + HOST DEVICE inline Vector3_t<T>(T s):x(s), y(s), z(s), w(s) {} + HOST DEVICE inline Vector3_t<T>(const Vector3_t<T>& v):x(v.x), y(v.y), z(v.z), w(v.w) {} + HOST DEVICE inline Vector3_t<T>(T x0, T y0, T z0) : x(x0), y(y0), z(z0), w(0) {} + HOST DEVICE inline Vector3_t<T>(const T* d) : x(d[0]), y(d[1]), z(d[2]), w(0) {} + HOST DEVICE inline Vector3_t<T>(const float4 a) : x(a.x), y(a.y), z(a.z), w(a.w) {} + +#if __cplusplus >= 201703L + // TODO: test if c++17 constexpr performs better than std::common_type, otherwise get rid of this #if block + template <typename U> + HOST DEVICE inline Vector3_t<typename std::common_type<T,U>::type> cross(const Vector3_t<U>& w) const { + if constexpr(sizeof(U) < sizeof(T)) { + Vector3_t<T> v; + v.x = y*w.z - z*w.y; + v.y = z*w.x - x*w.z; + v.z = x*w.y - y*w.x; + return v; + } else { + Vector3_t<U> v; + v.x = y*w.z - z*w.y; + v.y = z*w.x - x*w.z; + v.z = x*w.y - y*w.x; + return v; + } + } +#else + template <typename U> + HOST DEVICE inline Vector3_t<typename std::common_type<T,U>::type> cross(const Vector3_t<U>& w) const { + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> v; + v.x = y*w.z - z*w.y; + v.y = z*w.x - x*w.z; + v.z = x*w.y - y*w.x; + return v; + } +#endif + + HOST DEVICE inline Vector3_t<T>& operator=(const Vector3_t<T>&& v) { + x = v.x; + y = v.y; + z = v.z; + return *this; + } + + HOST DEVICE inline Vector3_t<T>& operator+=(const Vector3_t<T>&& v) { + x += v.x; + y += v.y; + z += v.z; + return *this; + } + + HOST DEVICE inline Vector3_t<T>& operator-=(const Vector3_t<T>&& v) { + x -= v.x; + y -= v.y; + z -= v.z; + return *this; + } + + HOST DEVICE inline Vector3_t<T>& operator*=(float s) { + x *= s; + y *= s; + z *= s; + return *this; + } + + HOST DEVICE inline Vector3_t<T>& operator/=(float s) { + const float sinv = 1.0f/s; + x *= sinv; + y *= sinv; + z *= sinv; + return *this; + } + + HOST DEVICE inline Vector3_t<T> operator-() const { + Vector3_t<T> v; + v.x = -x; + v.y = -y; + v.z = -z; + return v; + } + + template<typename U> + HOST DEVICE inline Vector3_t<T> operator+(const Vector3_t<U>& w ) const { + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> v; + v.x = x + w.x; + v.y = y + w.y; + v.z = z + w.z; + return v; + } + + HOST DEVICE inline Vector3_t<T> operator-(const Vector3_t<T>& w ) const { + Vector3_t<T> v; + v.x = x - w.x; + v.y = y - w.y; + v.z = z - w.z; + return v; + } + + template<typename U> + HOST DEVICE inline Vector3_t<T> operator*(U&& s) const { + // TODO, throw exception if int + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> v; + v.x = s*x; + v.y = s*y; + v.z = s*z; + return v; + } + + template<typename U> + HOST DEVICE inline Vector3_t<T> operator/(U&& s) const { + const U inv = static_cast<U>(1.0)/s; + return (*this)*inv; + } + + template<typename U> + HOST DEVICE inline float dot(const Vector3_t<U>& w) const { return x*w.x + y*w.y + z*w.z; } + + HOST DEVICE inline float length() const { return sqrtf(x*x + y*y + z*z); } + HOST DEVICE inline float length2() const { return x*x + y*y + z*z; } + HOST DEVICE inline float rLength() const { + float l = length(); + if (l != 0.0f) + return 1.0f / l; + return 0.0f; + } + + HOST DEVICE inline float rLength2() const { + float l2 = length2(); + if (l2 != 0.0f) return 1.0f / l2; + return 0.0f; + } + + template<typename U> + HOST DEVICE static inline auto element_mult(const Vector3_t<T>&& v, const Vector3_t<U>&& w) { + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> ret( + v.x*w.x, + v.y*w.y, + v.z*w.z); + return ret; + } + + HOST DEVICE static inline Vector3_t<T> element_sqrt(const Vector3_t<T>&& w) { + Vector3_t<T> ret( + sqrt(w.x), + sqrt(w.y), + sqrt(w.z)); + return ret; + } + + HOST DEVICE inline void print() const { + printf("%0.3f %0.3f %0.3f\n", x,y,z); + } + + T x, y, z, w; //append a member w + + char* to_char() const { + char* s = new char[128]; + sprintf(s, "%.10g %.10g %.10g", x, y, z); + s[127] = 0; + return s; + } +}; + + + +/* template<typename T> */ +/* HOST DEVICE inline Vector3_t<T> operator*(float s, Vector3_t<T> v) { */ +/* v.x *= s; */ +/* v.y *= s; */ +/* v.z *= s; */ +/* return v; */ +/* } */ + +// template<typename T> +// HOST DEVICE inline Vector3_t<T> operator/(Vector3_t<T> v, float s) { +// const float sinv = 1.0f/s; +// return v*sinv; +// } + +template<typename T, typename U> +HOST DEVICE inline auto operator/(U&& s, Vector3_t<T>&& v) { + // TODO, throw exception if int + using TU = typename std::common_type<T,U>::type; + Vector3_t<TU> ret; + ret.x = s / v.x; + ret.y = s / v.y; + ret.z = s / v.z; + return ret; +} + +namespace std { + template<typename T,typename U> + struct common_type<Vector3_t<T>, Vector3_t<U>> { + using type = Vector3_t<common_type_t<T,U>>; + }; +}