diff --git a/src/GPUManager.h b/src/GPUManager.h
index 159a3ac0117d4208f34ffef6c8edee251756cec8..fe7741e2ac2275ee75262a1fdb73691186ad3743 100644
--- a/src/GPUManager.h
+++ b/src/GPUManager.h
@@ -33,6 +33,27 @@ struct MY_ALIGN(16) float4 {
 };
 #endif
 
+// START traits
+// https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type
+#include <type_traits>
+// template<class ...Ts>
+// struct voider{
+//     using type = void;
+// };
+
+// template<class T, class = void>
+// struct has_copy_to_cuda : std::false_type{};
+
+// template<class T>
+// struct has_copy_to_cuda<T, typename voider<decltype(std::declval<T>().copy_to_cuda())>::type> : std::true_type{};
+
+template <typename T, typename = void>
+struct has_copy_to_cuda : std::false_type {};
+
+template <typename T>
+struct has_copy_to_cuda<T, decltype(std::declval<T>().copy_to_cuda(), void())> : std::true_type {};
+// END traits
+
 
 #ifdef USE_CUDA
 #include <cstdio>
diff --git a/src/Tests/CMakeLists.txt b/src/Tests/CMakeLists.txt
index 26f5dcacaf5b298fc45707f8adbbd1c93ea84f38..4b2055025b40384e6a98abc2f04296bf1b582805 100644
--- a/src/Tests/CMakeLists.txt
+++ b/src/Tests/CMakeLists.txt
@@ -9,6 +9,7 @@ add_executable("arbd_tests"
 matrix3.cu
 vector3_precision.cu
 bitmask.cu
+array.cu
 )
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 set_property(TARGET arbd_tests PROPERTY CXX_STANDARD 14)
diff --git a/src/Tests/array.cu b/src/Tests/array.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3a2ec001c4eca0497cc91fa4451fa1b61cf9463e
--- /dev/null
+++ b/src/Tests/array.cu
@@ -0,0 +1,230 @@
+#include <float.h>
+#include <iostream>
+#include <cstdio>
+
+// #include "useful.h"
+#include "../SignalManager.h"
+#include "../Types.h"
+#include <cuda.h>
+#include <nvfunctional>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+namespace Tests::TestArray {
+    // enum BinaryOp_t { ADD, CROSS, DOT, SUB, FINAL };
+    // BinaryOp_t& operator++(BinaryOp_t& op) { return op = static_cast<BinaryOp_t>( 1+static_cast<int>(op) ); }
+
+    // std::string get_binary_op_name( BinaryOp_t op ) {
+    // 	switch (op) {
+    // 	case ADD:
+    // 	    return "add";
+    // 	case SUB:
+    // 	    return "subtract";
+    // 	case CROSS:
+    // 	    return "cross";
+    // 	case DOT:
+    // 	    return "dot";
+    // 	}
+    // 	return std::string(""); // (static_cast<int>(op)));
+    // }
+
+    // template<typename R, typename T, typename U>
+    // __host__ __device__ nvstd::function<R(T,U)> get_binary_op_func( BinaryOp_t op) {
+    // 	switch (op) {
+    // 	case ADD:
+    // 	    return [] (T a, U b) {return static_cast<R>(b+a);};
+    // 	case SUB:
+    // 	    return [] (T a, U b) {return static_cast<R>(b-a);};
+    // 	case CROSS:
+    // 	    return [] (T a, U b) {return static_cast<R>(b.cross(a));};
+    // 	case DOT:
+    // 	    return [] (T a, U b) {return static_cast<R>(b.dot(a));};
+    // 	default:
+    // 	    assert(false);
+    // 	}
+    // 	return [] (T a, U b) {return static_cast<R>(b+a);};
+    // }
+
+    // template<typename R, typename T, typename U>
+    // __global__ void binary_op_test_kernel( BinaryOp_t op, R* result, T in1, U in2 ) {
+    // 	nvstd::function<R(T,U)> fn = get_binary_op_func<R,T,U>(op);
+    // 	if (blockIdx.x == 0) {
+    // 	    *result = fn(in1,in2);
+    // 	}
+    // }
+
+    // template<typename T, typename U>
+    // void check_vectors_equal( T&& cpu, U&& gpu) {
+    // 	CHECK( type_name<decltype(cpu)>() == type_name<decltype(gpu)>() ); // should be unneccesary
+    // 	CHECK( cpu.x == gpu.x );
+    // 	CHECK( cpu.y == gpu.y );
+    // 	CHECK( cpu.z == gpu.z );
+    // 	CHECK( cpu.w == gpu.w );
+    // }
+
+    // template<typename A, typename B>
+    // void run_tests() {
+    // 	using T = Vector3_t<A>;
+    // 	using U = Vector3_t<B>;
+    // 	using R = std::common_type_t<T,U>;
+    
+    // 	T v1(1,1.005,0);
+    // 	U v2(0,2,0);
+    // 	R *gpu_result_d, gpu_result, cpu_result;
+    // 	cudaMalloc((void **)&gpu_result_d, sizeof(R));
+
+    // 	for (BinaryOp_t op = ADD; op < FINAL; ++op) {
+    // 	    INFO( get_binary_op_name( op ) );
+    // 	    binary_op_test_kernel<R,T,U><<<1,1>>>(op, gpu_result_d, v1, v2);
+    // 	    cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);
+    // 	    cudaDeviceSynchronize();
+	
+    // 	    // Get cpu_result
+    // 	    cpu_result = (get_binary_op_func<R,T,U>(op))(v1,v2);
+
+    // 	    // Check consistency
+    // 	    check_vectors_equal(cpu_result, gpu_result);
+    // 	}
+    // 	cudaFree(gpu_result_d);
+    // }
+
+    // template <typename T>
+    // void print_enable_if_value_helper(std::true_type) {
+    // 	std::cout << "has_copy_to_cuda is true" << std::endl;
+    // }
+
+    // template <typename T>
+    // void print_enable_if_value_helper(std::false_type) {
+    // 	std::cout << "has_copy_to_cuda is false" << std::endl;
+    // }
+
+    // template <typename T>
+    // void print_enable_if_value_helper(std::true_type) {
+    // 	std::cout << "has_copy_to_cuda is true" << std::endl;
+    // }
+
+    // template <typename T>
+    // void print_enable_if_value_helper(std::false_type) {
+    // 	std::cout << "has_copy_to_cuda is false" << std::endl;
+    // }
+
+    // template <typename T>
+    // void print_enable_if_value() {
+    // 	print_enable_if_value_helper<has_copy_to_cuda<T>>(typename has_copy_to_cuda<T>::type{});
+    // }
+
+    template <typename T>
+    void print_enable_if_value() {
+	if (has_copy_to_cuda<T>::value) {
+	    std::cout << "has_copy_to_cuda is true" << std::endl;
+	} else {
+	    std::cout << "has_copy_to_cuda is false" << std::endl;
+	}
+    }
+
+    template <typename T>
+    Array<T> create_array(size_t num) {
+	Array<T> arr(num);
+	return arr;
+    }
+    TEST_CASE( "Test Array assignment and copy_to_cuda", "[Array]" ) {
+	{
+	    // Creation and copy assignment
+	    Array<Vector3> a = create_array<Vector3>(10);
+	}
+
+	{
+	    // Allocation and deallocation
+	    VectorArr a(10);
+	    a[0] = Vector3(1);
+	    // a[0].print();
+	    // a[1].print();
+	    a[3] = Vector3(3);
+	    // a[3].print();
+
+	    VectorArr* a_d = a.copy_to_cuda();
+	    VectorArr b(0);
+	    VectorArr* b_d = b.copy_to_cuda();
+	    VectorArr a_d_h = a_d->copy_from_cuda(a_d);
+	    VectorArr b_d_h = b_d->copy_from_cuda(b_d);
+		    
+	    // a_d_h[0].print();
+	    // a_d_h[1].print();
+	    // a_d_h[3].print();
+
+	    REQUIRE( a[1] == a_d_h[1] );
+	    REQUIRE( a[3] == a_d_h[3] );
+
+	    VectorArr::remove_from_cuda(a_d);
+	    VectorArr::remove_from_cuda(b_d);
+
+	    print_enable_if_value<int>();  // expect: has_copy_to_cuda is false
+	    print_enable_if_value<Vector3>();  // expect: has_copy_to_cuda is false
+	    print_enable_if_value<VectorArr>();  // expect: has_copy_to_cuda is true
+	    print_enable_if_value<Array<VectorArr>>();  // expect: has_copy_to_cuda is true
+	    
+	    // b_d_h[0].print();
+	}
+    }
+    TEST_CASE( "Test Assigment and copying of Arrays of Arrays and copy_to_cuda", "[Array]" ) {
+	{
+	    // Allocation and deallocation
+	    // printf("Creating v1(10)\n");
+	    VectorArr v1(10);
+	    for (int i = 0; i < v1.size(); ++i) {
+		v1[i] = Vector3(i+1);
+	    }
+ 	    // printf("Creating v2(20)\n");
+	    VectorArr v2(20);
+	    for (int i = 0; i < v2.size(); ++i) {
+		v2[i] = Vector3(10*i+1);
+	    }
+	    
+	    // printf("Creating a(2)\n");
+	    Array<VectorArr> a(3);
+	    a[0] = v1;
+	    a[1] = v2;
+	    // a[1] = std::move(v2);
+
+	    Array<VectorArr>* a_d = a.copy_to_cuda();
+	    Array<VectorArr> a_d_h = a_d->copy_from_cuda(a_d);
+	    
+	    
+	    REQUIRE( a[0][1] == a_d_h[0][1] );
+	    // REQUIRE( a[0][5] == a_d_h[0][5] );
+
+	    a_d->remove_from_cuda(a_d);
+	}
+    }
+    TEST_CASE( "Test Assigment and copying of Arrays of Arrays of Arrays", "[Array]" ) {
+	{
+	    // Allocation and deallocation
+	    // printf("Creating v1(10)\n");
+	    VectorArr v1(10);
+	    for (int i = 0; i < v1.size(); ++i) {
+		v1[i] = Vector3(i+1);
+	    }
+ 	    // printf("Creating v2(20)\n");
+	    VectorArr v2(20);
+	    for (int i = 0; i < v2.size(); ++i) {
+		v2[i] = Vector3(10*i+1);
+	    }
+	    
+	    // printf("Creating a(3)\n");
+	    Array<VectorArr> a(3);
+	    a[0] = v1;
+	    a[1] = v2;
+
+	    Array<Array<VectorArr>> b(3);
+	    b[0] = a;
+	    b[2] = std::move(a);
+
+	    Array<Array<VectorArr>>* b_d = b.copy_to_cuda();
+	    Array<Array<VectorArr>> b_d_h = b_d->copy_from_cuda(b_d);
+	    	    
+	    REQUIRE( b[0][0][0] == b_d_h[0][0][0] );
+	    b_d->remove_from_cuda(b_d);
+	}
+    }
+}
diff --git a/src/Tests/bitmask.cu b/src/Tests/bitmask.cu
index 7703e033e693928e9ae0b1b1d05ad687dcf5bfad..85089823169618e7d7b613c8dad3d899970f7225 100644
--- a/src/Tests/bitmask.cu
+++ b/src/Tests/bitmask.cu
@@ -32,11 +32,13 @@ namespace Tests::Bitmask {
 	    T* b_d = b.copy_to_cuda();
 	    cudaDeviceSynchronize();
 
-	    T b2 = b.retrieve_from_cuda(b_d);
+	    T b2 = b.copy_from_cuda(b_d);
 	    cudaDeviceSynchronize();
-
 	    REQUIRE( b == b2 );
 
+	    b.remove_from_cuda(b_d);
+	    cudaDeviceSynchronize();
+
 	}
     }
 
diff --git a/src/Tests/catch_boiler.h b/src/Tests/catch_boiler.h
index 898a8a7d5a7d7607351c7edb7a793183ed74a93a..571424f62e6af61b25e50a61575b4ba05dcd5fdf 100644
--- a/src/Tests/catch_boiler.h
+++ b/src/Tests/catch_boiler.h
@@ -7,7 +7,7 @@
 #include <cuda.h>
 #include <nvfunctional>
 
-#include "type_name.h"
+#include "../type_name.h"
 
 /* #include <catch2/catch_tostring.hpp> */
 /* namespace Catch { */
diff --git a/src/Tests/matrix3.cu b/src/Tests/matrix3.cu
index 8f89488a2aede5ce98fa3e2b544901dba3625f58..3be6f5843c79ee002f72c4a2762e382984f84586 100644
--- a/src/Tests/matrix3.cu
+++ b/src/Tests/matrix3.cu
@@ -11,6 +11,8 @@ namespace Catch {
     };
 }
 
+#include "../type_name.h"
+
 DEF_RUN_TRIAL
 
 namespace Tests::Unary::Matrix3 {
diff --git a/src/Tests/vector3_precision.cu b/src/Tests/vector3_precision.cu
index a8b93db041a933919275add24674f2acc417986a..ec706e080d3791b66b54f9b1b0768b65048d96c7 100644
--- a/src/Tests/vector3_precision.cu
+++ b/src/Tests/vector3_precision.cu
@@ -11,7 +11,7 @@
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/matchers/catch_matchers_floating_point.hpp>
 
-#include "type_name.h"
+#include "../type_name.h"
 
 namespace Tests::Vector3 {
     enum BinaryOp_t { ADD, CROSS, DOT, SUB, FINAL };
diff --git a/src/Types.h b/src/Types.h
index b1510301015b6e5114bd578210605e02c4677e23..ff72cbe3d249c66ed6eb943c5ca2dbe743459429 100644
--- a/src/Types.h
+++ b/src/Types.h
@@ -5,6 +5,8 @@
 #include <memory>    // For std::unique_ptr
 #include <cstring>
 
+#include "type_name.h"
+
 // Utility function used by types to return std::string using format syntax
 inline std::string string_format(const std::string fmt_str, ...) {
     // from: https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf/8098080#8098080
@@ -32,3 +34,6 @@ using Vector3 = Vector3_t<float>;
 using Matrix3 = Matrix3_t<float,false>;
 
 #include "Types/Bitmask.h"
+
+#include "Types/Array.h"
+using VectorArr = Array<Vector3>;
diff --git a/src/Types/Array.h b/src/Types/Array.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f8d5510ae1eb321e62b4eec70fae6493552567f
--- /dev/null
+++ b/src/Types/Array.h
@@ -0,0 +1,247 @@
+/*********************************************************************
+ * @file  Array.h
+ * 
+ * @brief Declaration of templated Array class.
+ *********************************************************************/
+#pragma once
+#include <memory>
+#include <type_traits> // for std::common_type<T,U>
+#include <sstream>
+
+// Simple templated array object without resizing capabilities 
+template<typename T>
+class Array {
+public:
+    HOST inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this);
+    HOST inline Array<T>(size_t num) : num(num), values(nullptr) {
+	// printf("Constructing Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values);
+	host_allocate();
+	// printf("Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values);
+    }
+    HOST inline Array<T>(size_t num, const T* inp ) : num(num), values(nullptr) {
+	// printf("Constructing Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values);
+	host_allocate();
+	for (size_t i = 0; i < num; ++i) {
+	    values[i] = inp[i];
+	}
+	// printf("Created Array3 %x with values %x\n",this, values);
+    }
+    HOST inline Array<T>(const Array<T>& a) { // copy constructor
+	// printf("Copy-constructing Array<T> %x from %x with values %x\n",this, &a, a.values);
+	num = a.num;
+	host_allocate();
+	for (size_t i = 0; i < num; ++i) {
+	    values[i] = a[i];
+	}
+	// printf("Copy-constructed Array<T> %x with values %x\n",this, values);
+    }
+    HOST inline Array<T>(Array<T>&& a) { // move constructor
+	// printf("Move-constructing Array<T> from %x with values %x\n", &a, a.values);
+	num = a.num;
+	values = a.values;
+	a.values = nullptr;
+	a.num = 0;		// not needed?
+	// printf("Move-constructed Array<T> with values %x\n",  values);
+    }
+    HOST inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator. NOTE(review): self-assignment (x = x) frees `values` via host_allocate() before copying — guard with `if (this != &a)`
+	num = a.num;
+	host_allocate();
+	for (size_t i = 0; i < num; ++i) {
+	    values[i] = a[i];
+	}
+	printf("Copy-operator for Array<T> %x with values %x\n",this, values); // TODO(review): leftover debug output (%x with pointer args is UB on 64-bit) — comment out like the other printfs
+	return *this;
+    }
+    HOST inline Array<T>& operator=(Array<T>&& a) { // move assignment operator
+	host_deallocate();
+	num = a.num;
+	values = a.values;
+	a.num = 0;
+	a.values = nullptr;
+	printf("Move-operator for Array<T> %x with values %x\n",this, values); // TODO(review): leftover debug output — comment out like the other printfs
+	return *this;
+    }
+    HOST DEVICE inline T& operator[](size_t i) {
+	assert( i < num );
+	return values[i];
+    }
+    HOST DEVICE inline const T& operator[](size_t i) const {
+	assert( i < num );
+	return values[i];
+    }
+    HOST inline ~Array<T>() {
+	// printf("Destroying Array %x with values %x\n",this, values);
+	host_deallocate();
+    }
+    
+#ifdef USE_CUDA
+    // This ugly template allows overloading copy_to_cuda, depending on whether T.copy_to_cuda exists using C++14-compatible SFINAE
+    template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST inline Array<T>* copy_to_cuda(Array<T>* dev_ptr = nullptr) const {
+	if (dev_ptr == nullptr) { // allocate if needed
+	    // printf("   cudaMalloc for array\n");
+	    gpuErrchk(cudaMalloc(&dev_ptr, sizeof(Array<T>)));
+	}
+
+	// Allocate values_d
+	T* values_d = nullptr;
+	if (num > 0) {
+	    // printf("   cudaMalloc for %d items\n", num);
+	    size_t sz = sizeof(T) * num;
+	    gpuErrchk(cudaMalloc(&values_d, sz));
+
+	    // Copy values
+	    gpuErrchk(cudaMemcpy(values_d, values, sz, cudaMemcpyHostToDevice));
+	}
+	    
+	// Copy Array with pointers correctly assigned
+	Array<T> tmp(0);
+	tmp.num = num;
+	tmp.values = values_d;
+	gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice));
+	tmp.num = 0;
+	tmp.values = nullptr;
+	// printf("Copying Array<%s> %x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, dev_ptr);
+	return dev_ptr;
+    }
+
+    template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST inline Array<T>* copy_to_cuda(Array<T>* dev_ptr = nullptr) const {
+	// enable_if<!has_copy_to_cuda<T>::value, T>::type* = 0) const {
+	if (dev_ptr == nullptr) { // allocate if needed
+	    // printf("   cudaMalloc for array\n");
+	    gpuErrchk(cudaMalloc(&dev_ptr, sizeof(Array<T>)));
+	}
+
+	// Allocate values_d
+	T* values_d = nullptr;
+	if (num > 0) { 
+	    size_t sz = sizeof(T) * num;
+	    // printf("   cudaMalloc for %d items\n", num);
+	    gpuErrchk(cudaMalloc(&values_d, sz));
+
+	    // Copy values
+	    for (size_t i = 0; i < num; ++i) {
+		values[i].copy_to_cuda(values_d + i);
+	    }
+	}
+
+	// Copy Array with pointers correctly assigned
+	Array<T> tmp(0);
+	tmp.num = num;
+	tmp.values = values_d;
+	gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice));
+	tmp.num = 0;
+	tmp.values = nullptr;
+	// printf("Copying Array %x with values %x to device at %x\n",this, values, dev_ptr);
+	return dev_ptr;
+    }
+
+    template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static Array<T> copy_from_cuda(Array<T>* dev_ptr) {
+	// Create host object, copy raw device data over
+	Array<T> tmp(0);
+	if (dev_ptr != nullptr) {
+	    gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost));
+
+	    if (tmp.num > 0) {
+		T* values_d = tmp.values;
+		tmp.values = new T[tmp.num];
+	    	    
+		// Copy values
+		size_t sz = sizeof(T) * tmp.num;
+		gpuErrchk(cudaMemcpy(tmp.values, values_d, sz, cudaMemcpyDeviceToHost));
+	    } else {
+		tmp.values = nullptr;
+	    }
+	}
+	// printf("Copying device Array %x to host %x with values %x\n", dev_ptr, &tmp, tmp.values);
+	return tmp;
+    }
+
+    template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static Array<T> copy_from_cuda(Array<T>* dev_ptr) {
+	// Create host object, copy raw device data over
+	Array<T> tmp(0);
+
+	if (dev_ptr != nullptr) {
+	    gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost));
+
+	    if (tmp.num > 0) {
+		T* values_d = tmp.values;
+		tmp.values = new T[tmp.num];
+	    	    
+		// Copy values
+		for (size_t i = 0; i < tmp.num; ++i) {
+		    tmp.values[i] = T::copy_from_cuda(values_d + i);
+		}
+	    } else {
+		tmp.values = nullptr;
+	    }
+	}
+	// printf("Copying device Array %x to host %x with values %x\n", dev_ptr, &tmp, tmp.values);
+	return tmp;
+    }
+
+    template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static void remove_from_cuda(Array<T>* dev_ptr, bool remove_self = true) {
+	// printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
+	if (dev_ptr == nullptr) return;
+	Array<T> tmp(0);
+	gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost));
+	if (tmp.num > 0) {
+	    // Remove values
+	    gpuErrchk(cudaFree(tmp.values));
+	}
+	tmp.values = nullptr;
+	gpuErrchk(cudaMemset((void*) &(dev_ptr->values), 0, sizeof(T*))); // set nullptr on device
+	if (remove_self) {
+	    gpuErrchk(cudaFree(dev_ptr));
+	    dev_ptr = nullptr;
+	}
+	// printf("...done removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
+    }
+
+    template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static void remove_from_cuda(Array<T>* dev_ptr, bool remove_self = true) {
+	// printf("Removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
+	if (dev_ptr == nullptr) return;
+	Array<T> tmp(0);
+	gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost));
+	if (tmp.num > 0) {
+	    // Remove values
+	    for (size_t i = 0; i < tmp.num; ++i) {
+		T::remove_from_cuda(tmp.values+i, false);
+	    }
+	}
+	tmp.values = nullptr;
+	gpuErrchk(cudaMemset((void*) &(dev_ptr->values), 0, sizeof(T*))); // set nullptr on device
+	if (remove_self) {
+	    gpuErrchk(cudaFree(dev_ptr));
+	    dev_ptr = nullptr;	// NOTE(review): no effect — dev_ptr is a by-value parameter copy
+	}
+	// printf("...done removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
+    }
+#endif
+    HOST DEVICE size_t size() const { return num; }
+    
+private:
+    HOST void host_allocate() {
+	host_deallocate();
+	if (num > 0) {
+	    values = new T[num];
+	} else {
+	    values = nullptr;
+	}
+	// printf("Array<%s>.host_allocate() %d values at %x\n", typeid(T).name(), num, values);
+
+    }
+    HOST void host_deallocate() {
+	// printf("Array<%s>.host_deallocate() %d values at %x\n", typeid(T).name(), num, values);
+	if (values != nullptr) delete[] values;
+	values = nullptr;
+    }
+    
+    size_t num;
+    T* values;
+};
diff --git a/src/Types/Bitmask.h b/src/Types/Bitmask.h
index 3e4ca4aaa3ab55d9665ebccbffed830eb7f2ea5d..1ca22ffb384519e2a53ce5ffd4a25185b2d14416 100644
--- a/src/Types/Bitmask.h
+++ b/src/Types/Bitmask.h
@@ -83,12 +83,13 @@ public:
 
 #ifdef USE_CUDA
     HOST
-    Bitmask* copy_to_cuda() const {
-	Bitmask* tmp_obj_d = nullptr;
+    Bitmask* copy_to_cuda(Bitmask* tmp_obj_d = nullptr) const {
 	Bitmask obj_tmp(0);
 	data_t* mask_d = nullptr;
 	size_t sz = sizeof(data_t) * get_array_size();
-	gpuErrchk(cudaMalloc(&tmp_obj_d, sizeof(Bitmask)));
+	if (tmp_obj_d == nullptr) {
+	    gpuErrchk(cudaMalloc(&tmp_obj_d, sizeof(Bitmask)));
+	}
 	if (sz > 0) {
 	    gpuErrchk(cudaMalloc(&mask_d, sz));
 	    gpuErrchk(cudaMemcpy(mask_d, mask, sz, cudaMemcpyHostToDevice));
@@ -102,7 +103,7 @@ public:
     }
 
     HOST
-    static Bitmask retrieve_from_cuda(Bitmask* obj_d) {
+    static Bitmask copy_from_cuda(Bitmask* obj_d) {
 	Bitmask obj_tmp(0);
 	gpuErrchk(cudaMemcpy(&obj_tmp, obj_d, sizeof(Bitmask), cudaMemcpyDeviceToHost));
 	printf("TEST: %d\n", obj_tmp.len);
diff --git a/src/Types/Vector3.h b/src/Types/Vector3.h
index f53f3bf6cd5c947f6aa6827b5a073436bcb40c65..4718273eb5a25622ccdbda8c08475babc37a0c61 100644
--- a/src/Types/Vector3.h
+++ b/src/Types/Vector3.h
@@ -6,6 +6,7 @@
 #pragma once
 #include <memory>
 #include <type_traits> // for std::common_type<T,U>
+#include <sstream>
 
 /**
  * 3D vector utility class with common operations implemented on CPU and GPU.
@@ -22,8 +23,8 @@ public:
     HOST DEVICE inline Vector3_t<T>() : x(T(0)), y(T(0)), z(T(0)), w(T(0)) {}
 	HOST DEVICE inline Vector3_t<T>(T s):x(s), y(s), z(s), w(s) {}
 	HOST DEVICE inline Vector3_t<T>(const Vector3_t<T>& v):x(v.x), y(v.y), z(v.z), w(v.w)  {}
-	HOST DEVICE inline Vector3_t<T>(T x0, T y0, T z0) : x(x0), y(y0), z(z0), w(0) {}
-	HOST DEVICE inline Vector3_t<T>(T x0, T y0, T z0, T w0) : x(x0), y(y0), z(z0), w(w0) {}
+	HOST DEVICE inline Vector3_t<T>(T x, T y, T z) : x(x), y(y), z(z), w(0) {}
+	HOST DEVICE inline Vector3_t<T>(T x, T y, T z, T w) : x(x), y(y), z(z), w(w) {}
 	// HOST DEVICE inline Vector3_t<T>(const T* d) : x(d[0]), y(d[1]), z(d[2]), w(0) {}
         HOST DEVICE inline Vector3_t<T>(const float4 a) : x(a.x), y(a.y), z(a.z), w(a.w) {}
 
@@ -57,6 +58,12 @@ public:
 	}
 #endif
 
+	HOST DEVICE inline Vector3_t<T>& operator=(const Vector3_t<T>& v) {
+		x = v.x;
+		y = v.y;
+		z = v.z;	// NOTE(review): w is not copied — confirm intentional (move-assignment below also skips w)
+		return *this;
+	}
 	HOST DEVICE inline Vector3_t<T>& operator=(const Vector3_t<T>&& v) {
 		x = v.x;
 		y = v.y;
@@ -177,12 +184,17 @@ public:
 		printf("%0.3f %0.3f %0.3f\n", x,y,z);
 	}
 
-	auto to_string() const {
+	auto to_string_old() const {
 	    char s[128];
 	    sprintf(s, "%.10g %.10g %.10g (%.10g)", x, y, z, w);
 	    s[127] = 0;
 	    return std::string(s);
 	}
+	auto to_string() const {
+	    std::ostringstream oss;
+	    oss << x << " " << y << " " << z << " (" << w << ")";
+	    return oss.str();
+	}
 
 	template<typename U>
 	    HOST DEVICE inline bool operator==(U b) const {
diff --git a/src/Tests/type_name.h b/src/type_name.h
similarity index 95%
rename from src/Tests/type_name.h
rename to src/type_name.h
index f64e440bef08d1df44fafd73fb64367d8542327e..8ad2039003f4a1900685be63404a8d6af210d18e 100644
--- a/src/Tests/type_name.h
+++ b/src/type_name.h
@@ -1,3 +1,5 @@
+#pragma once
+
 #include <type_traits>
 #include <typeinfo>
 #ifndef _MSC_VER
@@ -9,6 +11,7 @@
 
 template <typename T, typename ...Extras>
 std::string type_name() {
+    return typeid(T).name(); // FIXME(review): early return leaves the demangling code below unreachable
     using TR = typename std::remove_reference<T>::type;
     std::unique_ptr<char, void(*)(void*)> own
            (