diff --git a/src/GPUManager.h b/src/GPUManager.h
index d4b8d91d654beb68a4041e8e6c063289cb9ecc36..159a3ac0117d4208f34ffef6c8edee251756cec8 100644
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -1,10 +1,48 @@
 #pragma once
 
+#ifdef __CUDACC__
+    #define HOST __host__
+    #define DEVICE __device__
+#else
+    #define HOST
+    #define DEVICE
+#endif
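+// HOST and DEVICE (above) mark functions for compilation on both CPU and GPU
+// under nvcc; with a plain host compiler they expand to nothing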
+
+#if defined(__CUDACC__) // NVCC
+   #define MY_ALIGN(n) __align__(n)
+#elif defined(__GNUC__) // GCC
+  #define MY_ALIGN(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER) // MSVC
+  #define MY_ALIGN(n) __declspec(align(n))
+#else
+  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
+#endif
+
+#ifndef USE_CUDA
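+// Minimal stand-in for CUDA's built-in float4 when building without CUDA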
+struct MY_ALIGN(16) float4 {
+    float4() : x(0), y(0), z(0), w(0) {}
+    float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    float4 operator+(const float4& o) const {
+	return float4(x+o.x, y+o.y, z+o.z, w+o.w);
+    }
+    float4 operator*(float s) const {
+	return float4(x*s, y*s, z*s, w*s);
+    }
+    
+    float x,y,z,w;
+};
+#endif
+
+
 #ifdef USE_CUDA
 #include <cstdio>
 #include <vector>
 #include <cuda.h>
 #include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
 
 // #include "useful.h"
 
diff --git a/src/Integrator/CPU.h b/src/Integrator/CPU.h
index 876062f4c62db50e89775506412d2b5ca6429471..7fa452c50391ec371c1dadeb2e3e40a34bc2b4e7 100644
--- a/src/Integrator/CPU.h
+++ b/src/Integrator/CPU.h
@@ -6,3 +6,10 @@ public:
     void compute(Patch* patch);
     int num_patches() const { return 1; };
 };
+
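+// Langevin-dynamics integrator; like the integrator above, it operates on a single patch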
+class LangevinIntegrate : public Integrator {
+public:
+    void compute(Patch* patch);
+    int num_patches() const { return 1; };
+};
diff --git a/src/ParticlePatch.cpp b/src/ParticlePatch.cpp
index a4464acaff0c9d677521770f70eb16154012b532..27a32d1ab83c7ff8f0a1a6992c5f54e1fd7ad588 100644
--- a/src/ParticlePatch.cpp
+++ b/src/ParticlePatch.cpp
@@ -1,5 +1,12 @@
 #include "ParticlePatch.h"
 
+Patch::Patch() : BasePatch(),
+		 pos_force(nullptr), momentum(nullptr), num_group_sites(0) {
+    // num_rb = num_rb_attached_particles = 0;
+    // rb_pos = rb_orient = rb_mom = rb_ang_mom = type = rb_type = nullptr;
+}
+
+
 void Patch::compute() {
     for (auto& c_p: local_computes) {
 	c_p->compute(this);
diff --git a/src/ParticlePatch.h b/src/ParticlePatch.h
index 2efcda21b9598d6a17d8bee71c93afeb30e3bd2e..1ad3f5fc8daf6b5a990caa6c5e603d36b86341d7 100644
--- a/src/ParticlePatch.h
+++ b/src/ParticlePatch.h
@@ -27,22 +27,24 @@ class BasePatch {
 public:
     // BasePatch(size_t num, short thread_id, short gpu_id, SimSystem& sys);
     // BasePatch(size_t num, short thread_id, short gpu_id);
-    // BasePatch();
+    BasePatch() : capacity(0), num(0) {}
     // ~BasePatch();
-
+
 private:
     size_t capacity;
     size_t num;
-    short thread_id;		// MPI
-    short gpu_id;		// -1 if GPU unavailable
 
-    int patch_idx;		// Unique ID across ranks
+    // short thread_id;		// MPI
+    // short gpu_id;		// -1 if GPU unavailable
+
+    static size_t patch_idx;		// Counter used to hand out unique patch IDs across ranks
 
     Vector3 lower_bound;
     Vector3 upper_bound;
 };
 
-class PatchProxy : public BasePatch {
+class PatchProxy {
+    // 
 public:
     // ???
 private:
@@ -51,50 +53,55 @@
 
 class Patch : public BasePatch {
 public:
-    Patch(size_t num, short thread_id, short gpu_id) {};
+    Patch();
+
     // void deleteParticles(IndexList& p);
-    // void addParticles(int n, int typ);
+    // void addParticles(size_t n, size_t typ);
     // template<class T>
     // void add_compute(std::unique_ptr<T>&& p) {
     // 	std::unique_ptr<BasePatchOp> base_p = static_cast<std::unique_ptr<BasePatchOp>>(p);
     // 	local_computes.emplace_back(p);
     // };
+    
     void add_compute(std::unique_ptr<BasePatchOp>&& p) {
 	local_computes.emplace_back(std::move(p));
     };
 
+    void add_point_particles(size_t num_added);
+    void add_point_particles(size_t num_added, Vector3* positions, Vector3* momenta = nullptr);
+    
+    // TODO? emplace_point_particles
     void compute();
     
 private:
+
+    // void randomize_positions(size_t start = 0, size_t num = -1);
+
     // std::vector<PatchProxy> neighbors;    
     std::vector<std::unique_ptr<BasePatchOp>> local_computes; // Operations that will be performed on this patch each timestep
     std::vector<std::unique_ptr<BasePatchOp>> nonlocal_computes; // Operations that will be performed on this patch each timestep
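+    // local_computes act only on this patch's own particles; nonlocal_computes
+    // are expected to couple this patch to its neighbors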
     
-    static int patch_idx;		// Unique ID across ranks
-
     // CPU particle arrays
-    Vector3* pos;
+    Vector3* pos_force;
     Vector3* momentum;
 
-    Vector3* rb_pos;
-    Matrix3* rb_orient;
-    Vector3* rb_mom;
-    Vector3* rb_amom;
-
-    int* type;	     // particle types: 0, 1, ... -> num * numReplicas
-
-    int num_rb_attached_particles;
-    int num_group_sites;
-    int* groupSiteData_d;
-
-    // Device arrays
-    Vector3* pos_d;
-    Vector3* momentum_d;
-    Vector3* rb_pos_d;
-    Matrix3* rb_orient_d;
-    Vector3* rb_mom_d;
-    Vector3* rb_amom_d;
-    int* type_d;
+    // size_t num_rb;
+    // Vector3* rb_pos;
+    // Matrix3* rb_orient;
+    // Vector3* rb_mom;
+    // Vector3* rb_ang_mom;
+
+    // size_t* type;	     // particle types: 0, 1, ... -> num * numReplicas
+    // size_t* rb_type;	     // rigid body types: 0, 1, ... -> num * numReplicas
+
+    // size_t num_rb_attached_particles;
+
+    // TODO: add a rb_attached bitmask
+    
+    size_t num_group_sites;
+
 };
 
 // // Patch::Patch(size_t num, short thread_id, short gpu_id) {};
diff --git a/src/ParticlePatch/CPU.cpp b/src/ParticlePatch/CPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..02c4510e82a949c3ffe9537b54493e97f83f1a7f
--- /dev/null
+++ b/src/ParticlePatch/CPU.cpp
@@ -0,0 +1,12 @@
+#include "CPU.h"
+#include <iostream>
+
+// Assumed include paths: BDIntegrate is declared in Integrator/CPU.h and the
+// shared kernels in ParticlePatch/kernels.h; adjust if they live elsewhere
+#include "../Integrator/CPU.h"
+#include "kernels.h"
+
+void BDIntegrate::compute(Patch* p) {
+    std::cout << "BDIntegrate::compute()" << std::endl;
+    IntegratorKernels::BDIntegrate();
+}
diff --git a/src/ParticlePatch/CPU.h b/src/ParticlePatch/CPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..6971727ad1a20870efcf9cb5c9b238f5c601aa22
--- /dev/null
+++ b/src/ParticlePatch/CPU.h
@@ -0,0 +1,16 @@
+#pragma once
+#include "../ParticlePatch.h"
+
+#ifdef USE_CUDA
+class PatchCPU : public Patch {
+public:
+    void compute();
+
+    // add_compute() is inherited from Patch; redefining it here would not
+    // compile because Patch::local_computes is private
+
+    void add_point_particles(size_t num_added);
+    void add_point_particles(size_t num_added, Vector3* positions, Vector3* momenta = nullptr);
+
+};
+#endif
diff --git a/src/ParticlePatch/CUDA.cu b/src/ParticlePatch/CUDA.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ae9c6902850d24fd7b6192d41899941c3e0c3e1c
--- /dev/null
+++ b/src/ParticlePatch/CUDA.cu
@@ -0,0 +1,23 @@
+#include "CUDA.h"
+
+#ifdef USE_CUDA
+// __global__ void BDIntegrate_kernel() {
+//     if (threadIdx.x == 0) {
+// 	printf("BDIntegrate_kernel()\n");
+// 	IntegratorKernels::BDIntegrate();
+//     }
+// };
+
+// void BDIntegrateCUDA::compute(Patch* p) {
+//     printf("BDIntegrateCUDA::compute()\n");
+//     BDIntegrate_kernel<<<1,32>>>();
+// };
+
+PatchCUDA::PatchCUDA() : Patch() {
+    // group nullptr initializations by pointee type (a single chain would mix pointer types)
+    pos_force_d = momentum_d = rb_pos_d = rb_mom_d = rb_ang_mom_d = nullptr;
+    rb_orient_d = nullptr;
+    type_d = rb_type_d = nullptr;
+    groupSiteData_d = nullptr;
+}
+#endif
diff --git a/src/ParticlePatch/CUDA.h b/src/ParticlePatch/CUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f88139bd0e747fda2bdfd4eeb62f2ea35ae195d
--- /dev/null
+++ b/src/ParticlePatch/CUDA.h
@@ -0,0 +1,27 @@
+#pragma once
+#include "../GPUManager.h"
+#include "../ParticlePatch.h"
+
+class PatchCUDA : public Patch {
+public:
+    PatchCUDA();
+
+    void compute();
+    // void assign_gpu(GPU);
+    
+private:
+
+    GPU my_gpu;
+    
+    // Device arrays
+    size_t* groupSiteData_d;
+    Vector3* pos_force_d;
+    Vector3* momentum_d;
+    Vector3* rb_pos_d;
+    Matrix3* rb_orient_d;
+    Vector3* rb_mom_d;
+    Vector3* rb_ang_mom_d;
+    size_t* type_d;
+    size_t* rb_type_d;
+
+};
diff --git a/src/ParticlePatch/kernels.h b/src/ParticlePatch/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..29c7412bba618081d7c66ae7cba08acac2d37081
--- /dev/null
+++ b/src/ParticlePatch/kernels.h
@@ -0,0 +1,29 @@
+#pragma once
+
+// #include "../useful.h"
+#include "../Types.h"
+
+#ifdef __CUDACC__
+    #define HOST __host__
+    #define DEVICE __device__
+#else
+    #define HOST
+    #define DEVICE
+#endif
+
+namespace IntegratorKernels {
+    HOST DEVICE inline void BDIntegrate() {
+	// std::cout << "Computes::BDIntegrate_inline" << std::endl;
+	printf("Integrator::BDIntegrate\n");
+    };
+
+    HOST DEVICE inline void BDIntegrate(Vector3* __restrict__ pos, const Vector3* const __restrict__ force, const int& idx, const float& root_Dt, Vector3& normal_sample_3D) {
+	printf("Integrator::BDIntegrate\n");
+	pos[idx] = pos[idx] + force[idx] * root_Dt + normal_sample_3D;
+    };
+
+}
diff --git a/src/PatchOp.cu b/src/PatchOp.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5ecc4b54077e314c14c63016141a10bdc0539ba9
--- /dev/null
+++ b/src/PatchOp.cu
@@ -0,0 +1,3 @@
+#include "PatchOp.cuh"		// TODO: refactor this file
+
+
diff --git a/src/SimManager.cpp b/src/SimManager.cpp
index defd93cbe810281df01ac503389b26fbee47a4f0..24074df38dfca7619cf1e44e720e195185a79e19 100644
--- a/src/SimManager.cpp
+++ b/src/SimManager.cpp
@@ -7,7 +7,7 @@ void SimManager::run() {
     // SimSystem sys = SimSystem();
     // Patch p(10,0,0,sys);
 
-    Patch p(10,0,0);
+    Patch p;
 
     //ProxyPatch p2(10,0,0);
 
@@ -28,6 +28,5 @@ void SimManager::run() {
 #ifdef USE_CUDA
 	cudaDeviceSynchronize();
 #endif
-    }
-    
+    }
 };
diff --git a/src/Tests/CMakeLists.txt b/src/Tests/CMakeLists.txt
index 15fb2f215afe02575ac82743df6db0b5b54a9cb0..26f5dcacaf5b298fc45707f8adbbd1c93ea84f38 100644
--- a/src/Tests/CMakeLists.txt
+++ b/src/Tests/CMakeLists.txt
@@ -8,6 +8,7 @@ include(CTest)
 add_executable("arbd_tests"
 matrix3.cu
 vector3_precision.cu
+bitmask.cu
 )
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 set_property(TARGET arbd_tests PROPERTY CXX_STANDARD 14)
diff --git a/src/Tests/bitmask.cu b/src/Tests/bitmask.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7703e033e693928e9ae0b1b1d05ad687dcf5bfad
--- /dev/null
+++ b/src/Tests/bitmask.cu
@@ -0,0 +1,49 @@
+#include "catch_boiler.h"
+#include "../Types.h"
+
+#include <catch2/catch_tostring.hpp>
+namespace Catch {
+    template <>
+    struct StringMaker<Bitmask> {
+        static std::string convert( Bitmask const& value ) {
+            return value.to_string();
+        }
+    };
+}
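+// (the StringMaker specialization above lets Catch2 print Bitmask values in test output)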
+
+DEF_RUN_TRIAL
+
+namespace Tests::Bitmask {
+
+    void run_tests() {
+	using T = ::Bitmask;
+	// std::ostream test_info;
+	INFO( "Testing " << type_name<T>() << " functionally" );
+	{
+	    using R = T;
+	    int i = 10;
+	    T b = T(i);	// bitmask of length i
+	    for (int j: {0,3,10,19}) {
+		if (j < i) b.set_mask(j,1);
+	    }
+
+	    REQUIRE( b.to_string() == "1001000000" );
+
+	    T* b_d = b.copy_to_cuda();
+	    cudaDeviceSynchronize();
+
+	    T b2 = T::retrieve_from_cuda(b_d);
+	    cudaDeviceSynchronize();
+
+	    REQUIRE( b == b2 );
+
+	    T::remove_from_cuda(b_d);	// free the device-side copy
+	}
+    }
+
+    TEST_CASE( "Testing Bitmask for GPU/CPU consistency", "[Tests::Bitmask]" ) {
+	run_tests();
+    }
+
+}
diff --git a/src/Tests/catch_boiler.h b/src/Tests/catch_boiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..898a8a7d5a7d7607351c7edb7a793183ed74a93a
--- /dev/null
+++ b/src/Tests/catch_boiler.h
@@ -0,0 +1,79 @@
+#include <iostream>
+#include <cstdio>
+
+#include "../SignalManager.h"
+/* #include "../Types.h" */
+#include "../GPUManager.h"
+#include <cuda.h>
+#include <nvfunctional>
+
+#include "type_name.h"
+
+/* #include <catch2/catch_tostring.hpp> */
+/* namespace Catch { */
+/*     template<typename T, bool b1, bool b2> */
+/*     struct StringMaker<Matrix3_t<T,b1,b2>> { */
+/*         static std::string convert( Matrix3_t<T,b1,b2> const& value ) { */
+/*             return value.to_string(); */
+/*         } */
+/*     }; */
+/* } */
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+namespace Tests {
+    template<typename Op_t, typename R, typename ...T>
+    __global__ void op_kernel(R* result, T...in) {
+	if (blockIdx.x == 0) {
+	    *result = Op_t::op(in...);
+	}
+    }
+}
+
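+// DEF_RUN_TRIAL expands to Tests::run_trial(), which evaluates Op_t::op()
+// once on the host and once in a one-thread CUDA kernel, then requires both
+// results to match the expected value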
+#define DEF_RUN_TRIAL \
+namespace Tests {\
+template<typename Op_t, typename R, typename ...T>\
+    void run_trial( std::string name, R expected_result, T...args) {\
+	R *gpu_result_d, gpu_result, cpu_result;\
+	cpu_result = Op_t::op(args...);\
+	cudaMalloc((void **)&gpu_result_d, sizeof(R));\
+	\
+	op_kernel<Op_t, R, T...><<<1,1>>>(gpu_result_d, args...);\
+	cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);\
+	cudaDeviceSynchronize();\
+\
+	INFO( name );\
+\
+	CAPTURE( cpu_result );\
+	CAPTURE( expected_result );\
+	\
+	REQUIRE( cpu_result == expected_result );\
+	CHECK( cpu_result == gpu_result );\
+	cudaFree(gpu_result_d);\
+    }\
+}
+
+namespace Tests::Unary {
+    template<typename R, typename T>
+    struct NegateOp { HOST DEVICE static R op(T in) { return static_cast<R>(-in); } };
+
+    template<typename R, typename T>
+    struct NormalizedOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } };
+
+}
+
+namespace Tests::Binary {
+    // R is return type, T and U are types of operands
+    template<typename R, typename T, typename U> 
+    struct AddOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a+b); } };
+    template<typename R, typename T, typename U> 
+    struct SubOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a-b); } };
+    template<typename R, typename T, typename U> 
+    struct MultOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a*b); } };
+    template<typename R, typename T, typename U> 
+    struct DivOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a/b); } };
+
+}
diff --git a/src/Tests/matrix3.cu b/src/Tests/matrix3.cu
index 4b5430654b30633432516bb873e6760984c88a60..8f89488a2aede5ce98fa3e2b544901dba3625f58 100644
--- a/src/Tests/matrix3.cu
+++ b/src/Tests/matrix3.cu
@@ -1,12 +1,5 @@
-#include <iostream>
-#include <cstdio>
-
-#include "../SignalManager.h"
+#include "catch_boiler.h"
 #include "../Types.h"
-#include <cuda.h>
-#include <nvfunctional>
-
-#include "type_name.h"
 
 #include <catch2/catch_tostring.hpp>
 namespace Catch {
@@ -17,144 +10,21 @@ namespace Catch {
         }
     };
 }
-#include <catch2/catch_test_macros.hpp>
-#include <catch2/matchers/catch_matchers_floating_point.hpp>
-
-
-
-namespace Tests {
-
-    template<typename Op_t, typename R, typename ...T>
-    __global__ void op_kernel(R* result, T...in) {
-	if (blockIdx.x == 0) {
-	    *result = Op_t::op(in...);
-	}
-    }
-
 
-    // In C++14, how can I unpack a tuple so its elements are passed to a function as arguments?
-    template <typename F, typename Tuple, size_t... I>
-    decltype(auto) apply_impl(F&& f, Tuple&& t, std::index_sequence<I...>)
-    {
-	return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
-    }
-    template <typename F, typename Tuple>
-    decltype(auto) apply(F&& f, Tuple&& t)
-    {
-	using Indices = std::make_index_sequence<std::tuple_size<std::decay_t<Tuple>>::value>;
-	return apply_impl(std::forward<F>(f), std::forward<Tuple>(t), Indices{});
-    }
+DEF_RUN_TRIAL
 
-    template <typename F, typename Tuple, size_t... I>
-    decltype(auto) apply_each_impl(F&& f, Tuple&& t, std::index_sequence<I...>)
-    {
-	return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
-    }
-    template <typename F, typename Tuple>
-    decltype(auto) apply_each(F&& f, Tuple&& t)
-    {
-	using Indices = std::make_index_sequence<std::tuple_size<std::decay_t<Tuple>>::value>;
-	return apply_each_impl(std::forward<F>(f), std::forward<Tuple>(t), Indices{});
-    }
-
-    template<typename T>
-    void call_info(T t) {
-	// INFO( type_name(t) );
-	// UNSCOPED_INFO( type_name<T>() << " " << t );
-    }
-    // template<>
-    // void call_info(double t) {
-    // 	UNSCOPED_INFO( "double " << t );
-    // }
-    // template<>
-    // void call_info(float t) {
-    // 	UNSCOPED_INFO( "double " << t );
-    // 	// INFO( type_name(t) );
-    // 	INFO( "float" );
-    // }
-    // template<>
-    // void call_info(int t) {
-    // 	// INFO( type_name(t) );
-    // 	INFO( "int" );
-    // }
-    
-    template<typename Op_t, typename R, typename ...T>
-    void run_trial( std::string name, R expected_result, T...args) {
-	R *gpu_result_d, gpu_result, cpu_result;
-	cpu_result = Op_t::op(args...);
-	cudaMalloc((void **)&gpu_result_d, sizeof(R));
-	
-	op_kernel<Op_t, R, T...><<<1,1>>>(gpu_result_d, args...);
-	cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
-	
-	INFO( name );
-	// INFO( type_name<T...>() );
-	
-	// auto tmp = std::make_tuple(args...);
-	// for (int i = 0; i < sizeof(T...); ++i) {
-	//     auto& arg = std::get<i>(tmp);
-	//     CAPTURE( arg );
-	// }
-	// auto fn = [](auto a) { CAPTURE( a ); };
-
-	// auto fn = [](auto a) { INFO( a->to_string() ); };
-	// using expander = int[];
-	// (void)expander{0, (void(call_info<T>(std::forward<T>(args))), 0)...};
-
-	CAPTURE( cpu_result );
-	CAPTURE( expected_result );
-	
-	REQUIRE( cpu_result == expected_result );
-	CHECK( cpu_result == gpu_result );
-    }
-}
-
-namespace Tests::Unary {
+namespace Tests::Unary::Matrix3 {
     template<typename R, typename T>
-    struct NegateOp { HOST DEVICE static R op(T in) { return static_cast<R>(-in); } };
+    struct DeterminantOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.det()); } };
 
     template<typename R, typename T>
-    struct NormalizedOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } };
-
-    namespace Matrix3 {
-	template<typename R, typename T>
-	struct DeterminantOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.det()); } };
-
-	template<typename R, typename T>
-	struct NormalizeDetOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized().det()); } };
-
-	// template<typename R, typename T>
-	// struct NormalizeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } };
-
-
+    struct NormalizeDetOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized().det()); } };
 	
-	template<typename R, typename T>
-	struct InverseOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.inverse()); } };
-
-	template<typename R, typename T>
-	struct TransposeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.transpose()); } };
-    }
-}
-
-namespace Tests::Binary {
-    // R is return type, T and U are types of operands
-    template<typename R, typename T, typename U> 
-    struct AddOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a+b); } };
-    template<typename R, typename T, typename U> 
-    struct SubOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a-b); } };
-    template<typename R, typename T, typename U> 
-    struct MultOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a*b); } };
-    template<typename R, typename T, typename U> 
-    struct DivOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a/b); } };
-
-    namespace Matrix3 {
-	template<typename R, typename T, typename U> 
-	struct CrossOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a.cross(b)); } };
-    }
-}
+    template<typename R, typename T>
+    struct InverseOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.inverse()); } };
 
-namespace Tests::Unary::Matrix3 {
+    template<typename R, typename T>
+    struct TransposeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.transpose()); } };
 
     template<typename A, bool is_diag=false, bool check_diag=false>
     void run_tests() {
@@ -204,7 +74,11 @@ namespace Tests::Unary::Matrix3 {
 }
 
 namespace Tests::Binary::Matrix3 {
+    // Custom Operations for Matrix3
+    template<typename R, typename T, typename U> 
+    struct CrossOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a.cross(b)); } };
 
+    // Run the Matrix3 binary-operation tests
     template<typename A, typename B, bool is_diag=false, bool check_diag=false>
     void run_tests() {
 	using T = Matrix3_t<A, is_diag, check_diag>;
diff --git a/src/Types.h b/src/Types.h
index 765b1abb662a6821a3d35a641dffb20b93eb5de3..b1510301015b6e5114bd578210605e02c4677e23 100644
--- a/src/Types.h
+++ b/src/Types.h
@@ -1,46 +1,14 @@
 #pragma once
 
-#ifdef __CUDACC__
-    #define HOST __host__
-    #define DEVICE __device__
-#else
-    #define HOST
-    #define DEVICE
-#endif
-
-#if defined(__CUDACC__) // NVCC
-   #define MY_ALIGN(n) __align__(n)
-#elif defined(__GNUC__) // GCC
-  #define MY_ALIGN(n) __attribute__((aligned(n)))
-#elif defined(_MSC_VER) // MSVC
-  #define MY_ALIGN(n) __declspec(align(n))
-#else
-  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
-#endif
-
-#ifdef USE_CUDA
-#include <cuda_runtime.h>
-#else
-struct MY_ALIGN(16) float4 {
-    float4() : x(0), y(0), z(0), w(0) {};
-    float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {};
-    float4 operator+(const float4&& o) {
-	return float4(x+o.x,y+o.y,z+o.z,w+o.w);
-    };
-    float4 operator*(const float&& s) {
-	return float4(x*s,y*s,z*s,w*s);
-    };
-    
-    float x,y,z,w;
-};
-#endif
-
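+// HOST/DEVICE, MY_ALIGN, and the float4 fallback now live in GPUManager.h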
+#include "GPUManager.h"
 #include <stdarg.h>  // For va_start, etc.
 #include <memory>    // For std::unique_ptr
 #include <cstring>
 
-// from: https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf/8098080#8098080
+// Utility function used by types to return std::string using format syntax
 inline std::string string_format(const std::string fmt_str, ...) {
+    // from: https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf/8098080#8098080
     int final_n, n = ((int)fmt_str.size()) * 2; /* Reserve two times as much as the length of the fmt_str */
     std::unique_ptr<char[]> formatted;
     va_list ap;
@@ -63,3 +30,5 @@ inline std::string string_format(const std::string fmt_str, ...) {
 
 using Vector3 = Vector3_t<float>;
 using Matrix3 = Matrix3_t<float,false>;
+
+#include "Types/Bitmask.h"
diff --git a/src/Types/Bitmask.h b/src/Types/Bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e4ca4aaa3ab55d9665ebccbffed830eb7f2ea5d
--- /dev/null
+++ b/src/Types/Bitmask.h
@@ -0,0 +1,235 @@
+#pragma once
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <initializer_list>
+#include <string>	// for Bitmask::to_string
+
+
+class BitmaskBase {
+public:
+    BitmaskBase(const size_t len) : len(len) {}
+    HOST DEVICE
+    virtual void set_mask(size_t i, bool value) = 0;
+    HOST DEVICE
+    virtual bool get_mask(size_t i) const = 0;
+    size_t get_len() const { return len; }
+    virtual void print() {
+	for (size_t i = 0; i < len; ++i) {
+	    printf("%d", static_cast<int>(get_mask(i)) );
+	}
+	printf("\n");
+    }
+    
+protected:
+    size_t len;
+};
+
+// Don't use base because virtual member functions require device malloc
+class Bitmask {
+    typedef size_t idx_t;
+    typedef unsigned int data_t;
+public:
+    Bitmask(const idx_t len) : len(len) {
+	idx_t tmp = get_array_size();
+	// printf("len %lld\n",len);
+	// printf("tmp %d\n",tmp);
+	assert(tmp * data_stride >= len);
+	mask = (tmp > 0) ? new data_t[tmp] : nullptr;
+	for (idx_t i = 0; i < tmp; ++i) mask[i] = data_t(0);
+    }
+    ~Bitmask() { if (mask != nullptr) delete[] mask; }
+
+    // The default shallow copy would double-free mask; permit moves only
+    Bitmask(const Bitmask&) = delete;
+    Bitmask& operator=(const Bitmask&) = delete;
+    Bitmask(Bitmask&& o) : len(o.len), mask(o.mask) { o.mask = nullptr; o.len = 0; }
+
+    HOST DEVICE
+    idx_t get_len() const { return len; }
+
+    HOST DEVICE
+    void set_mask(idx_t i, bool value) {
+	// return;
+	assert(i < len);
+	idx_t ci = i/data_stride;
+	data_t change_bit = (data_t(1) << (i-ci*data_stride));
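+	// on the GPU, concurrent threads may set bits within the same word,
+	// so the word must be updated atomically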
+#ifdef __CUDA_ARCH__
+	if (value) {
+	    atomicOr(  &mask[ci], change_bit );
+	} else {
+	    atomicAnd( &mask[ci], ~change_bit );
+	}
+#else
+	if (value) {
+	    mask[ci] = mask[ci] | change_bit;
+	} else {
+	    mask[ci] = mask[ci] & (~change_bit);
+	}
+#endif
+    }
+    
+    HOST DEVICE
+    bool get_mask(const idx_t i) const {
+	// return false;
+	assert(i < len);
+	const idx_t ci = i/data_stride;
+	return mask[ci] & (data_t(1) << (i-ci*data_stride));
+    }
+
+    HOST DEVICE inline bool operator==(const Bitmask& b) const {
+	// Inefficient but straightforward approach; directly comparing underlying data would be fine, but then we need to deal with data strides
+	if (len != b.len) return false;
+	for (idx_t i = 0; i < len; ++i) {
+	    if (get_mask(i) != b.get_mask(i)) return false;
+	}
+	return true;
+    }
+
+#ifdef USE_CUDA
+    HOST
+    Bitmask* copy_to_cuda() const {
+	Bitmask* tmp_obj_d = nullptr;
+	Bitmask obj_tmp(0);
+	data_t* mask_d = nullptr;
+	size_t sz = sizeof(data_t) * get_array_size();
+	gpuErrchk(cudaMalloc(&tmp_obj_d, sizeof(Bitmask)));
+	if (sz > 0) {
+	    gpuErrchk(cudaMalloc(&mask_d, sz));
+	    gpuErrchk(cudaMemcpy(mask_d, mask, sz, cudaMemcpyHostToDevice));
+	}
+	// printf("Bitmask::copy_to_cuda() len(%lld) mask(%x)\n", len, mask_d);
+	obj_tmp.len = len;
+	obj_tmp.mask = mask_d;
+	gpuErrchk(cudaMemcpy(tmp_obj_d, &obj_tmp, sizeof(Bitmask), cudaMemcpyHostToDevice));
+	obj_tmp.mask = nullptr;
+	return tmp_obj_d;
+    }
+
+    HOST
+    static Bitmask retrieve_from_cuda(Bitmask* obj_d) {
+	Bitmask obj_tmp(0);
+	gpuErrchk(cudaMemcpy(&obj_tmp, obj_d, sizeof(Bitmask), cudaMemcpyDeviceToHost));
+	printf("TEST: %d\n", obj_tmp.len);
+	if (obj_tmp.len > 0) {
+	    size_t array_size = obj_tmp.get_array_size();
+	    size_t sz = sizeof(data_t) * array_size;
+	    data_t *data_addr = obj_tmp.mask;
+	    obj_tmp.mask = new data_t[array_size];
+	    gpuErrchk(cudaMemcpy(obj_tmp.mask, data_addr, sz, cudaMemcpyDeviceToHost));
+	} else {
+	    obj_tmp.mask = nullptr;
+	}
+	return obj_tmp;
+    }
+
+    HOST
+    static void remove_from_cuda(Bitmask* obj_d) {
+	Bitmask obj_tmp(0);
+	gpuErrchk(cudaMemcpy(&obj_tmp, obj_d, sizeof(Bitmask), cudaMemcpyDeviceToHost));
+	if (obj_tmp.len > 0) {
+	    gpuErrchk(cudaFree(obj_tmp.mask));
+	}
+	obj_tmp.mask = nullptr;
+	gpuErrchk(cudaMemset((void*) &(obj_d->mask), 0, sizeof(data_t*))); // null out the mask pointer in the device object before freeing it
+	gpuErrchk(cudaFree(obj_d));
+    }
+#endif
+
+    HOST
+    auto to_string() const {
+	std::string s;
+	s.reserve(len);
+	for (size_t i = 0; i < len; ++i) {
+	    s += get_mask(i) ? '1' : '0';
+	}
+	return s;
+    }
+    
+    HOST DEVICE
+    void print() {
+	for (idx_t i = 0; i < len; ++i) {
+	    printf("%d", (int) get_mask(i));
+	}
+	printf("\n");
+    }
+
+private:
+    idx_t get_array_size() const { return (len == 0) ? 0 : (len-1)/data_stride + 1; } // an empty mask owns no storage
+    idx_t len;
+    const static idx_t data_stride = CHAR_BIT * sizeof(data_t)/sizeof(char);
+    data_t* __restrict__ mask;
+};
+
+/*
+template<size_t chunk_size>
+class SparseBitmask : public BitmaskBase {
+public:
+    SparseBitmask(const size_t len) : BitmaskBase(len) {
+	assert( sparse_size > 0 );
+	meta_len = (len-1)/data_stride + 1;
+	meta_mask = Bitmask(meta_len);
+    }
+    ~Bitmask() { delete[] mask; }
+
+    void set_mask(size_t i, bool value) {
+	assert(i < len);
+	size_t meta_i = i/data_stride;
+	// TODO: keep track of
+	
+	// bool is_set = get_mask(i);
+	// if ((!is_set) && value) {
+	//     mask[ci] = mask[ci] + (1 << (i-ci*data_stride));
+	// } else if (is_set && (!value)) {
+	//     mask[ci] = mask[ci] - (1 << (i-ci*data_stride));
+	// }
+    }
+    size_t meta_idx_to_char_idx(size_t meta_idx) {
+	return meta_idx * CHAR_BIT;
+    }
+
+    bool get_mask(size_t i) {
+	// assert(i < len);
+	// size_t ci = i/data_stride;
+	// return mask[ci] & (1 << (i-ci*data_stride));
+    }
+    
+private:
+    const static size_t data_stride = CHAR_BIT*spare_size;
+    size_t meta_len;
+    Bitmask meta_mask;
+    std::vector<char> mask;
+};
+*/
+
+/*
+bool test_Bitmask() {
+    for (int i: {8,9,20}) {
+	Bitmask b = Bitmask(i);
+	for (int j: {0,3,10,19}) {
+	    if (j < i) b.set_mask(j,1);
+	}
+	printf("Testing Bitmask(%d)\n", i);
+	b.print();
+	b.set_mask(3,1);
+	b.print();
+	b.set_mask(3,1);
+	b.print();
+	b.set_mask(3,0);
+	b.print();
+	b.set_mask(2,0);
+	b.print();
+    }
+    return true;
+}
+// */
+
+/*
+int main(int argc, char* argv[]) {
+    test_Bitmask();
+    return 1;
+}
+//*/