diff --git a/src/GPUManager.h b/src/GPUManager.h index d4b8d91d654beb68a4041e8e6c063289cb9ecc36..159a3ac0117d4208f34ffef6c8edee251756cec8 100644 --- a/src/GPUManager.h +++ b/src/GPUManager.h @@ -1,10 +1,45 @@ #pragma once +#ifdef __CUDACC__ + #define HOST __host__ + #define DEVICE __device__ +#else + #define HOST + #define DEVICE +#endif + +#if defined(__CUDACC__) // NVCC + #define MY_ALIGN(n) __align__(n) +#elif defined(__GNUC__) // GCC + #define MY_ALIGN(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) // MSVC + #define MY_ALIGN(n) __declspec(align(n)) +#else + #error "Please provide a definition for MY_ALIGN macro for your host compiler!" +#endif + +#ifndef USE_CUDA +struct MY_ALIGN(16) float4 { + float4() : x(0), y(0), z(0), w(0) {}; + float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}; + float4 operator+(const float4&& o) { + return float4(x+o.x,y+o.y,z+o.z,w+o.w); + }; + float4 operator*(const float&& s) { + return float4(x*s,y*s,z*s,w*s); + }; + + float x,y,z,w; +}; +#endif + + #ifdef USE_CUDA #include <cstdio> #include <vector> #include <cuda.h> #include <cuda_runtime_api.h> +#include <cuda_runtime.h> // #include "useful.h" diff --git a/src/Integrator/CPU.h b/src/Integrator/CPU.h index 876062f4c62db50e89775506412d2b5ca6429471..7fa452c50391ec371c1dadeb2e3e40a34bc2b4e7 100644 --- a/src/Integrator/CPU.h +++ b/src/Integrator/CPU.h @@ -6,3 +6,9 @@ public: void compute(Patch* patch); int num_patches() const { return 1; }; }; + +class LangevinIntegrate : public Integrator { +public: + void compute(Patch* patch); + int num_patches() const { return 1; }; +}; diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp index a4464acaff0c9d677521770f70eb16154012b532..27a32d1ab83c7ff8f0a1a6992c5f54e1fd7ad588 100644 --- a/src/ParticlePatch.cpp +++ b/src/ParticlePatch.cpp @@ -1,5 +1,12 @@ #include "ParticlePatch.h" +Patch::Patch() : BasePatch() { + pos_force = momentum = nullptr; + // num_rb = num_rb_attached_particles = 0; + // rb_pos = rb_orient = rb_mom = rb_ang_mom = type = rb_type = nullptr; +}; + + void Patch::compute() { for (auto& c_p: local_computes) { c_p->compute(this); diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h index 2efcda21b9598d6a17d8bee71c93afeb30e3bd2e..1ad3f5fc8daf6b5a990caa6c5e603d36b86341d7 100644 --- a/src/ParticlePatch.h +++ b/src/ParticlePatch.h @@ -27,22 +27,24 @@ class BasePatch { public: // BasePatch(size_t num, short thread_id, short gpu_id, SimSystem& sys); // BasePatch(size_t num, short thread_id, short gpu_id); - // BasePatch(); + BasePatch() : num(0), capacity(0) {} // ~BasePatch(); - + private: size_t capacity; size_t num; - short thread_id; // MPI - short gpu_id; // -1 if GPU unavailable - int patch_idx; // Unique ID across ranks + // short thread_id; // MPI + // short gpu_id; // -1 if GPU unavailable + + static size_t patch_idx; // Unique ID across ranks Vector3 lower_bound; Vector3 upper_bound; }; -class PatchProxy : public BasePatch { +class PatchProxy { + // public: // ??? private: @@ -51,50 +53,53 @@ private: class Patch : public BasePatch { public: - Patch(size_t num, short thread_id, short gpu_id) {}; + Patch(); + // void deleteParticles(IndexList& p); - // void addParticles(int n, int typ); + // void addParticles(size_t n, size_t typ); // template<class T> // void add_compute(std::unique_ptr<T>&& p) { // std::unique_ptr<BasePatchOp> base_p = static_cast<std::unique_ptr<BasePatchOp>>(p); // local_computes.emplace_back(p); // }; + void add_compute(std::unique_ptr<BasePatchOp>&& p) { local_computes.emplace_back(std::move(p)); }; + void add_point_particles(size_t num_added); + void add_point_particles(size_t num_added, Vector3* positions, Vector3* momenta = nullptr); + + // TODO? emplace_point_particles void compute(); private: + + // void randomize_positions(size_t start = 0, size_t num = -1); + // std::vector<PatchProxy> neighbors; std::vector<std::unique_ptr<BasePatchOp>> local_computes; // Operations that will be performed on this patch each timestep std::vector<std::unique_ptr<BasePatchOp>> nonlocal_computes; // Operations that will be performed on this patch each timestep - static int patch_idx; // Unique ID across ranks - // CPU particle arrays - Vector3* pos; + Vector3* pos_force; Vector3* momentum; - Vector3* rb_pos; - Matrix3* rb_orient; - Vector3* rb_mom; - Vector3* rb_amom; - - int* type; // particle types: 0, 1, ... -> num * numReplicas - - int num_rb_attached_particles; - int num_group_sites; - int* groupSiteData_d; - - // Device arrays - Vector3* pos_d; - Vector3* momentum_d; - Vector3* rb_pos_d; - Matrix3* rb_orient_d; - Vector3* rb_mom_d; - Vector3* rb_amom_d; - int* type_d; + // size_t num_rb; + // Vector3* rb_pos; + // Matrix3* rb_orient; + // Vector3* rb_mom; + // Vector3* rb_ang_mom; + + // size_t* type; // particle types: 0, 1, ... -> num * numReplicas + // size_t* rb_type; // rigid body types: 0, 1, ... -> num * numReplicas + + // size_t num_rb_attached_particles; + + // TODO: add a rb_attached bitmask + + size_t num_group_sites; + }; // // Patch::Patch(size_t num, short thread_id, short gpu_id) {}; diff --git a/src/ParticlePatch/CPU.cpp b/src/ParticlePatch/CPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..02c4510e82a949c3ffe9537b54493e97f83f1a7f --- /dev/null +++ b/src/ParticlePatch/CPU.cpp @@ -0,0 +1,6 @@ +#include "CPU.h" + +void BDIntegrate::compute(Patch* p) { + std::cout << "BDIntegrate::compute()" << std::endl; + IntegratorKernels::BDIntegrate(); +}; diff --git a/src/ParticlePatch/CPU.h b/src/ParticlePatch/CPU.h new file mode 100644 index 0000000000000000000000000000000000000000..6971727ad1a20870efcf9cb5c9b238f5c601aa22 --- /dev/null +++ b/src/ParticlePatch/CPU.h @@ -0,0 +1,17 @@ +#pragma once +#include "../ParticlePatch.h" + +#ifdef USE_CUDA +class PatchCPU : public Patch { +public: + void compute(); + + void add_compute(std::unique_ptr<BasePatchOp>&& p) { + local_computes.emplace_back(std::move(p)); + }; + + void add_point_particles(size_t num_added); + void add_point_particles(size_t num_added, Vector3* positions, Vector3* momenta = nullptr); + +}; +#endif diff --git a/src/ParticlePatch/CUDA.cu b/src/ParticlePatch/CUDA.cu new file mode 100644 index 0000000000000000000000000000000000000000..ae9c6902850d24fd7b6192d41899941c3e0c3e1c --- /dev/null +++ b/src/ParticlePatch/CUDA.cu @@ -0,0 +1,19 @@ +#include "CUDA.h" + +#ifdef USE_CUDA +// __global__ void BDIntegrate_kernel() { +// if (threadIdx.x == 0) { +// printf("BDIntegrate_kernel()\n"); +// IntegratorKernels::BDIntegrate(); +// } +// }; + +// void BDIntegrateCUDA::compute(Patch* p) { +// printf("BDIntegrateCUDA::compute()\n"); +// BDIntegrate_kernel<<<1,32>>>(); +// }; + +PatchCUDA::PatchCUDA() : Patch() { + pos_force_d = momentum_d = rb_pos_d = rb_orient_d = rb_mom_d = rb_ang_mom_d = type_d = rb_type_d = nullptr; +} +#endif diff --git a/src/ParticlePatch/CUDA.h b/src/ParticlePatch/CUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..1f88139bd0e747fda2bdfd4eeb62f2ea35ae195d --- /dev/null +++ b/src/ParticlePatch/CUDA.h @@ -0,0 +1,26 @@ +#pragma once +#include "../GPUManager.h" +#include "../ParticlePatch.h" + +class PatchCUDA : public Patch { +public: + + void compute(); + // void assign_gpu(GPU); + +private: + + GPU my_gpu; + + // Device arrays + size_t* groupSiteData_d; + Vector3* pos_force_d; + Vector3* momentum_d; + Vector3* rb_pos_d; + Matrix3* rb_orient_d; + Vector3* rb_mom_d; + Vector3* rb_ang_mom_d; + size_t* type_d; + size_t* rb_type_d; + +}; diff --git a/src/ParticlePatch/kernels.h b/src/ParticlePatch/kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..29c7412bba618081d7c66ae7cba08acac2d37081 --- /dev/null +++ b/src/ParticlePatch/kernels.h @@ -0,0 +1,25 @@ +#pragma once + +// #include "../useful.h" +#include "../Types.h" + +#ifdef __CUDACC__ + #define HOST __host__ + #define DEVICE __device__ +#else + #define HOST + #define DEVICE +#endif + +namespace IntegratorKernels { + HOST DEVICE void __inline__ BDIntegrate() { + // std::cout << "Computes::BDIntegrate_inline" << std::endl; + printf("Integrator::BDIntegrate\n"); + }; + + HOST DEVICE void __inline__ BDIntegrate(Vector3* __restrict__ pos, const Vector3 * const __restrict__ force, const int& idx, float& root_Dt, Vector3& normal_sample_3D) { + printf("Integrator::BDIntegrate\n"); + pos[idx] = pos[idx] + force[idx] * root_Dt + normal_sample_3D; + }; + +} diff --git a/src/PatchOp.cu b/src/PatchOp.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ecc4b54077e314c14c63016141a10bdc0539ba9 --- /dev/null +++ b/src/PatchOp.cu @@ -0,0 +1,3 @@ +#include "PatchOp.cuh" // TODO: refactor this file + + diff --git a/src/SimManager.cpp b/src/SimManager.cpp index defd93cbe810281df01ac503389b26fbee47a4f0..24074df38dfca7619cf1e44e720e195185a79e19 100644 --- a/src/SimManager.cpp +++ b/src/SimManager.cpp @@ -7,7 +7,7 @@ void SimManager::run() { // SimSystem sys = SimSystem(); // Patch p(10,0,0,sys); - Patch p(10,0,0); + Patch p; //ProxyPatch p2(10,0,0); @@ -28,6 +28,5 @@ void SimManager::run() { #ifdef USE_CUDA cudaDeviceSynchronize(); #endif - } - + } }; diff --git a/src/Tests/CMakeLists.txt b/src/Tests/CMakeLists.txt index 15fb2f215afe02575ac82743df6db0b5b54a9cb0..26f5dcacaf5b298fc45707f8adbbd1c93ea84f38 100644 --- a/src/Tests/CMakeLists.txt +++ b/src/Tests/CMakeLists.txt @@ -8,6 +8,7 @@ include(CTest) add_executable("arbd_tests" matrix3.cu vector3_precision.cu +bitmask.cu ) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda") set_property(TARGET arbd_tests PROPERTY CXX_STANDARD 14) diff --git a/src/Tests/bitmask.cu b/src/Tests/bitmask.cu new file mode 100644 index 0000000000000000000000000000000000000000..7703e033e693928e9ae0b1b1d05ad687dcf5bfad --- /dev/null +++ b/src/Tests/bitmask.cu @@ -0,0 +1,48 @@ +#include "catch_boiler.h" +#include "../Types.h" + +#include <catch2/catch_tostring.hpp> +namespace Catch { + template <> + struct StringMaker<Bitmask> { + static std::string convert( Bitmask const& value ) { + return value.to_string(); + } + }; +} + +DEF_RUN_TRIAL + +namespace Tests::Bitmask { + + void run_tests() { + using T = ::Bitmask; + // std::ostream test_info; + INFO( "Testing " << type_name<T>() << " functionally" ); + { + using R = T; + int i = 10; + T b = T(i); // bitmask of length i + for (int j: {0,3,10,19}) { + if (j < i) b.set_mask(j,1); + } + + REQUIRE( b.to_string() == "1001000000" ); + + T* b_d = b.copy_to_cuda(); + cudaDeviceSynchronize(); + + T b2 = b.retrieve_from_cuda(b_d); + cudaDeviceSynchronize(); + + REQUIRE( b == b2 ); + + } + } + + TEST_CASE( "Testing Bitmask for GPU/CPU consistency", "[Tests::Bitmask]" ) { + const bool check_diag = true; + run_tests(); + } + +} diff --git a/src/Tests/catch_boiler.h b/src/Tests/catch_boiler.h new file mode 100644 index 0000000000000000000000000000000000000000..898a8a7d5a7d7607351c7edb7a793183ed74a93a --- /dev/null +++ b/src/Tests/catch_boiler.h @@ -0,0 +1,75 @@ +#include <iostream> +#include <cstdio> + +#include "../SignalManager.h" +/* #include "../Types.h" */ +#include "../GPUManager.h" +#include <cuda.h> +#include <nvfunctional> + +#include "type_name.h" + +/* #include <catch2/catch_tostring.hpp> */ +/* namespace Catch { */ +/* template<typename T, bool b1, bool b2> */ +/* struct StringMaker<Matrix3_t<T,b1,b2>> { */ +/* static std::string convert( Matrix3_t<T,b1,b2> const& value ) { */ +/* return value.to_string(); */ +/* } */ +/* }; */ +/* } */ +#include <catch2/catch_test_macros.hpp> +#include <catch2/matchers/catch_matchers_floating_point.hpp> + +namespace Tests { + template<typename Op_t, typename R, typename ...T> + __global__ void op_kernel(R* result, T...in) { + if (blockIdx.x == 0) { + *result = Op_t::op(in...); + } + } +} + +#define DEF_RUN_TRIAL \ +namespace Tests {\ +template<typename Op_t, typename R, typename ...T>\ + void run_trial( std::string name, R expected_result, T...args) {\ + R *gpu_result_d, gpu_result, cpu_result;\ + cpu_result = Op_t::op(args...);\ + cudaMalloc((void **)&gpu_result_d, sizeof(R));\ + \ + op_kernel<Op_t, R, T...><<<1,1>>>(gpu_result_d, args...);\ + cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);\ + cudaDeviceSynchronize();\ +\ + INFO( name );\ +\ + CAPTURE( cpu_result );\ + CAPTURE( expected_result );\ + \ + REQUIRE( cpu_result == expected_result );\ + CHECK( cpu_result == gpu_result );\ + }\ +} + +namespace Tests::Unary { + template<typename R, typename T> + struct NegateOp { HOST DEVICE static R op(T in) { return static_cast<R>(-in); } }; + + template<typename R, typename T> + struct NormalizedOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } }; + +} + +namespace Tests::Binary { + // R is return type, T and U are types of operands + template<typename R, typename T, typename U> + struct AddOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a+b); } }; + template<typename R, typename T, typename U> + struct SubOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a-b); } }; + template<typename R, typename T, typename U> + struct MultOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a*b); } }; + template<typename R, typename T, typename U> + struct DivOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a/b); } }; + +} diff --git a/src/Tests/matrix3.cu b/src/Tests/matrix3.cu index 4b5430654b30633432516bb873e6760984c88a60..8f89488a2aede5ce98fa3e2b544901dba3625f58 100644 --- a/src/Tests/matrix3.cu +++ b/src/Tests/matrix3.cu @@ -1,12 +1,5 @@ -#include <iostream> -#include <cstdio> - -#include "../SignalManager.h" +#include "catch_boiler.h" #include "../Types.h" -#include <cuda.h> -#include <nvfunctional> - -#include "type_name.h" #include <catch2/catch_tostring.hpp> namespace Catch { @@ -17,144 +10,21 @@ namespace Catch { } }; } -#include <catch2/catch_test_macros.hpp> -#include <catch2/matchers/catch_matchers_floating_point.hpp> - - - -namespace Tests { - - template<typename Op_t, typename R, typename ...T> - __global__ void op_kernel(R* result, T...in) { - if (blockIdx.x == 0) { - *result = Op_t::op(in...); - } - } - - // In C++14, how can I unpack a tuple so its elements are passed to a function as arguments? - template <typename F, typename Tuple, size_t... I> - decltype(auto) apply_impl(F&& f, Tuple&& t, std::index_sequence<I...>) - { - return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...); - } - template <typename F, typename Tuple> - decltype(auto) apply(F&& f, Tuple&& t) - { - using Indices = std::make_index_sequence<std::tuple_size<std::decay_t<Tuple>>::value>; - return apply_impl(std::forward<F>(f), std::forward<Tuple>(t), Indices{}); - } +DEF_RUN_TRIAL - template <typename F, typename Tuple, size_t... I> - decltype(auto) apply_each_impl(F&& f, Tuple&& t, std::index_sequence<I...>) - { - return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...); - } - template <typename F, typename Tuple> - decltype(auto) apply_each(F&& f, Tuple&& t) - { - using Indices = std::make_index_sequence<std::tuple_size<std::decay_t<Tuple>>::value>; - return apply_each_impl(std::forward<F>(f), std::forward<Tuple>(t), Indices{}); - } - - template<typename T> - void call_info(T t) { - // INFO( type_name(t) ); - // UNSCOPED_INFO( type_name<T>() << " " << t ); - } - // template<> - // void call_info(double t) { - // UNSCOPED_INFO( "double " << t ); - // } - // template<> - // void call_info(float t) { - // UNSCOPED_INFO( "double " << t ); - // // INFO( type_name(t) ); - // INFO( "float" ); - // } - // template<> - // void call_info(int t) { - // // INFO( type_name(t) ); - // INFO( "int" ); - // } - - template<typename Op_t, typename R, typename ...T> - void run_trial( std::string name, R expected_result, T...args) { - R *gpu_result_d, gpu_result, cpu_result; - cpu_result = Op_t::op(args...); - cudaMalloc((void **)&gpu_result_d, sizeof(R)); - - op_kernel<Op_t, R, T...><<<1,1>>>(gpu_result_d, args...); - cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); - - INFO( name ); - // INFO( type_name<T...>() ); - - // auto tmp = std::make_tuple(args...); - // for (int i = 0; i < sizeof(T...); ++i) { - // auto& arg = std::get<i>(tmp); - // CAPTURE( arg ); - // } - // auto fn = [](auto a) { CAPTURE( a ); }; - - // auto fn = [](auto a) { INFO( a->to_string() ); }; - // using expander = int[]; - // (void)expander{0, (void(call_info<T>(std::forward<T>(args))), 0)...}; - - CAPTURE( cpu_result ); - CAPTURE( expected_result ); - - REQUIRE( cpu_result == expected_result ); - CHECK( cpu_result == gpu_result ); - } -} - -namespace Tests::Unary { +namespace Tests::Unary::Matrix3 { template<typename R, typename T> - struct NegateOp { HOST DEVICE static R op(T in) { return static_cast<R>(-in); } }; + struct DeterminantOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.det()); } }; template<typename R, typename T> - struct NormalizedOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } }; - - namespace Matrix3 { - template<typename R, typename T> - struct DeterminantOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.det()); } }; - - template<typename R, typename T> - struct NormalizeDetOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized().det()); } }; - - // template<typename R, typename T> - // struct NormalizeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } }; - - + struct NormalizeDetOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized().det()); } }; - template<typename R, typename T> - struct InverseOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.inverse()); } }; - - template<typename R, typename T> - struct TransposeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.transpose()); } }; - } -} - -namespace Tests::Binary { - // R is return type, T and U are types of operands - template<typename R, typename T, typename U> - struct AddOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a+b); } }; - template<typename R, typename T, typename U> - struct SubOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a-b); } }; - template<typename R, typename T, typename U> - struct MultOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a*b); } }; - template<typename R, typename T, typename U> - struct DivOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a/b); } }; - - namespace Matrix3 { - template<typename R, typename T, typename U> - struct CrossOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a.cross(b)); } }; - } -} + template<typename R, typename T> + struct InverseOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.inverse()); } }; -namespace Tests::Unary::Matrix3 { + template<typename R, typename T> + struct TransposeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.transpose()); } }; template<typename A, bool is_diag=false, bool check_diag=false> void run_tests() { @@ -204,7 +74,11 @@ namespace Tests::Unary::Matrix3 { } namespace Tests::Binary::Matrix3 { + // Custom Operations for Matrix3 + template<typename R, typename T, typename U> + struct CrossOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a.cross(b)); } }; + // Run template<typename A, typename B, bool is_diag=false, bool check_diag=false> void run_tests() { using T = Matrix3_t<A, is_diag, check_diag>; diff --git a/src/Types.h b/src/Types.h index 765b1abb662a6821a3d35a641dffb20b93eb5de3..b1510301015b6e5114bd578210605e02c4677e23 100644 --- a/src/Types.h +++ b/src/Types.h @@ -1,46 +1,13 @@ #pragma once -#ifdef __CUDACC__ - #define HOST __host__ - #define DEVICE __device__ -#else - #define HOST - #define DEVICE -#endif - -#if defined(__CUDACC__) // NVCC - #define MY_ALIGN(n) __align__(n) -#elif defined(__GNUC__) // GCC - #define MY_ALIGN(n) __attribute__((aligned(n))) -#elif defined(_MSC_VER) // MSVC - #define MY_ALIGN(n) __declspec(align(n)) -#else - #error "Please provide a definition for MY_ALIGN macro for your host compiler!" -#endif - -#ifdef USE_CUDA -#include <cuda_runtime.h> -#else -struct MY_ALIGN(16) float4 { - float4() : x(0), y(0), z(0), w(0) {}; - float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}; - float4 operator+(const float4&& o) { - return float4(x+o.x,y+o.y,z+o.z,w+o.w); - }; - float4 operator*(const float&& s) { - return float4(x*s,y*s,z*s,w*s); - }; - - float x,y,z,w; -}; -#endif - +#include "GPUManager.h" #include <stdarg.h> // For va_start, etc. #include <memory> // For std::unique_ptr #include <cstring> -// from: https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf/8098080#8098080 +// Utility function used by types to return std::string using format syntax inline std::string string_format(const std::string fmt_str, ...) { + // from: https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf/8098080#8098080 int final_n, n = ((int)fmt_str.size()) * 2; /* Reserve two times as much as the length of the fmt_str */ std::unique_ptr<char[]> formatted; va_list ap; @@ -63,3 +30,5 @@ inline std::string string_format(const std::string fmt_str, ...) { using Vector3 = Vector3_t<float>; using Matrix3 = Matrix3_t<float,false>; + +#include "Types/Bitmask.h" diff --git a/src/Types/Bitmask.h b/src/Types/Bitmask.h new file mode 100644 index 0000000000000000000000000000000000000000..3e4ca4aaa3ab55d9665ebccbffed830eb7f2ea5d --- /dev/null +++ b/src/Types/Bitmask.h @@ -0,0 +1,228 @@ +#pragma once +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cassert> +#include <initializer_list> + + +class BitmaskBase { +public: + BitmaskBase(const size_t len) : len(len) {} + HOST DEVICE + virtual void set_mask(size_t i, bool value) = 0; + HOST DEVICE + virtual bool get_mask(size_t i) const = 0; + size_t get_len() const { return len; } + virtual void print() { + for (int i = 0; i < len; ++i) { + printf("%d", static_cast<int>(get_mask(i)) ); + } + printf("\n"); + } + +protected: + size_t len; +}; + +// Don't use base because virtual member functions require device malloc +class Bitmask { + typedef size_t idx_t; + typedef unsigned int data_t; +public: + Bitmask(const idx_t len) : len(len) { + idx_t tmp = get_array_size(); + // printf("len %lld\n",len); + // printf("tmp %d\n",tmp); + assert(tmp * data_stride >= len); + mask = (tmp > 0) ? new data_t[tmp] : nullptr; + for (int i = 0; i < tmp; ++i) mask[i] = data_t(0); + } + ~Bitmask() { if (mask != nullptr) delete[] mask; } + + HOST DEVICE + idx_t get_len() const { return len; } + + HOST DEVICE + void set_mask(idx_t i, bool value) { + // return; + assert(i < len); + idx_t ci = i/data_stride; + data_t change_bit = (data_t(1) << (i-ci*data_stride)); +#ifdef __CUDA_ARCH__ + if (value) { + atomicOr( &mask[ci], change_bit ); + } else { + atomicAnd( &mask[ci], ~change_bit ); + } +#else + if (value) { + mask[ci] = mask[ci] | change_bit; + } else { + mask[ci] = mask[ci] & (~change_bit); + } +#endif + } + + HOST DEVICE + bool get_mask(const idx_t i) const { + // return false; + assert(i < len); + const idx_t ci = i/data_stride; + return mask[ci] & (data_t(1) << (i-ci*data_stride)); + } + + HOST DEVICE inline bool operator==(Bitmask& b) const { + // Inefficient but straightforward approach; directly comparing underlying data would be fine, but then we need to deal with data strides + if (len != b.len) return false; + for (idx_t i = 0; i < len; ++i) { + if (get_mask(i) != b.get_mask(i)) return false; + } + return true; + } + +#ifdef USE_CUDA + HOST + Bitmask* copy_to_cuda() const { + Bitmask* tmp_obj_d = nullptr; + Bitmask obj_tmp(0); + data_t* mask_d = nullptr; + size_t sz = sizeof(data_t) * get_array_size(); + gpuErrchk(cudaMalloc(&tmp_obj_d, sizeof(Bitmask))); + if (sz > 0) { + gpuErrchk(cudaMalloc(&mask_d, sz)); + gpuErrchk(cudaMemcpy(mask_d, mask, sz, cudaMemcpyHostToDevice)); + } + // printf("Bitmask::copy_to_cuda() len(%lld) mask(%x)\n", len, mask_d); + obj_tmp.len = len; + obj_tmp.mask = mask_d; + gpuErrchk(cudaMemcpy(tmp_obj_d, &obj_tmp, sizeof(Bitmask), cudaMemcpyHostToDevice)); + obj_tmp.mask = nullptr; + return tmp_obj_d; + } + + HOST + static Bitmask retrieve_from_cuda(Bitmask* obj_d) { + Bitmask obj_tmp(0); + gpuErrchk(cudaMemcpy(&obj_tmp, obj_d, sizeof(Bitmask), cudaMemcpyDeviceToHost)); + printf("TEST: %d\n", obj_tmp.len); + if (obj_tmp.len > 0) { + size_t array_size = obj_tmp.get_array_size(); + size_t sz = sizeof(data_t) * array_size; + data_t *data_addr = obj_tmp.mask; + obj_tmp.mask = new data_t[array_size]; + gpuErrchk(cudaMemcpy(obj_tmp.mask, data_addr, sz, cudaMemcpyDeviceToHost)); + } else { + obj_tmp.mask = nullptr; + } + return obj_tmp; + } + + HOST + static void remove_from_cuda(Bitmask* obj_d) { + Bitmask obj_tmp(0); + gpuErrchk(cudaMemcpy(&obj_tmp, obj_d, sizeof(Bitmask), cudaMemcpyDeviceToHost)); + if (obj_tmp.len > 0) { + gpuErrchk(cudaFree(obj_tmp.mask)); + } + obj_tmp.mask = nullptr; + gpuErrchk(cudaMemset((void*) &(obj_d->mask), 0, sizeof(data_t*))); // set nullptr on to device + gpuErrchk(cudaFree(obj_d)); + obj_d = nullptr; + } +#endif + + HOST + auto to_string() const { + std::string s; + s.reserve(len); + for (size_t i = 0; i < len; ++i) { + s += get_mask(i) ? '1' : '0'; + } + return s; + } + + HOST DEVICE + void print() { + for (int i = 0; i < len; ++i) { + printf("%d", (int) get_mask(i)); + } + printf("\n"); + } + +private: + idx_t get_array_size() const { return (len == 0) ? 1 : (len-1)/data_stride + 1; } + idx_t len; + const static idx_t data_stride = CHAR_BIT * sizeof(data_t)/sizeof(char); + data_t* __restrict__ mask; +}; + +/* +template<size_t chunk_size> +class SparseBitmask : public BitmaskBase { +public: + SparseBitmask(const size_t len) : BitmaskBase(len) { + assert( sparse_size > 0 ); + meta_len = (len-1)/data_stride + 1; + meta_mask = Bitmask(meta_len); + } + ~Bitmask() { delete[] mask; } + + void set_mask(size_t i, bool value) { + assert(i < len); + size_t meta_i = i/data_stride; + // TODO: keep track of + + // bool is_set = get_mask(i); + // if ((!is_set) && value) { + // mask[ci] = mask[ci] + (1 << (i-ci*data_stride)); + // } else if (is_set && (!value)) { + // mask[ci] = mask[ci] - (1 << (i-ci*data_stride)); + // } + } + size_t meta_idx_to_char_idx(size_t meta_idx) { + return meta_idx * CHAR_BIT; + } + + bool get_mask(size_t i) { + // assert(i < len); + // size_t ci = i/data_stride; + // return mask[ci] & (1 << (i-ci*data_stride)); + } + +private: + const static size_t data_stride = CHAR_BIT*spare_size; + size_t meta_len; + Bitmask meta_mask; + std::vector<char> mask; +}; +*/ + +/* +bool test_Bitmask() { + for (int i: {8,9,20}) { + Bitmask b = Bitmask(i); + for (int j: {0,3,10,19}) { + if (j < i) b.set_mask(j,1); + } + printf("Testing Bitmask(%d)\n", i); + b.print(); + b.set_mask(3,1); + b.print(); + b.set_mask(3,1); + b.print(); + b.set_mask(3,0); + b.print(); + b.set_mask(2,0); + b.print(); + } + return true; +} +// */ + +/* +int main(int argc, char* argv[]) { + test_Bitmask(); + return 1; +} +//*/