+// START traits
+// https://stackoverflow.com/questions/55191505/c-compile-time-check-if-method-exists-in-template-type
+#include <type_traits>
+// template<class ...Ts>
+// struct voider{
+//     using type = void;
+// };
+// template<class T, class = void>
+// struct has_copy_to_cuda : std::false_type{};
+// template<class T>
+// struct has_copy_to_cuda<T, typename voider<decltype(std::declval<T>().copy_to_cuda())>::type> : std::true_type{};
+template <typename T, typename = void>
+struct has_copy_to_cuda : std::false_type {}; // Primary template: used whenever the specialization below is not viable, so ::value is false
+template <typename T>
+struct has_copy_to_cuda<T, decltype(std::declval<T>().copy_to_cuda(), void())> : std::true_type {}; // Selected when calling copy_to_cuda() with no arguments on a T is well-formed; the trailing ", void()" is there so the decltype names void and matches the defaulted second parameter
+// END traits
 #ifdef USE_CUDA
 #include <cstdio>
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
 set_property(TARGET arbd_tests PROPERTY CXX_STANDARD 14)
+#include <float.h>
+#include <iostream>
+#include <cstdio>
+// #include "useful.h"
+#include "../SignalManager.h"
+#include "../Types.h"
+#include <cuda.h>
+#include <nvfunctional>
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+namespace Tests::TestArray {
+    // template <typename T>
+    // void print_enable_if_value() {
+    // 	print_enable_if_value_helper<has_copy_to_cuda<T>>(typename has_copy_to_cuda<T>::type{});
+    // }
+    // Report on stdout whether has_copy_to_cuda<T>::value is true or false.
+    template <typename T>
+    void print_enable_if_value() {
+	const char* verdict = has_copy_to_cuda<T>::value
+	    ? "has_copy_to_cuda is true"
+	    : "has_copy_to_cuda is false";
+	std::cout << verdict << std::endl;
+    }
+    // Helper for the tests below: builds an Array<T> through its size constructor
+    // and hands it back by value.
+    template <typename T>
+    Array<T> create_array(size_t num) {
+	Array<T> fresh(num);
+	return fresh;
+    }
+    TEST_CASE( "Test Array assignment and copy_to_cuda", "[Array]" ) {
+	{
+	    // Construct an Array from the value returned by create_array (this is initialization from a returned object, not assignment)
+	    Array<Vector3> a = create_array<Vector3>(10);
+	}
+	{
+	    // Round trip: copy host arrays to the device, copy them back, compare elements, then release the device copies
+	    VectorArr a(10);
+	    a[0] = Vector3(1);
+	    // a[0].print();
+	    // a[1].print();
+	    a[3] = Vector3(3);
+	    // a[3].print();
+	    VectorArr* a_d = a.copy_to_cuda();
+	    VectorArr b(0); // empty array: exercises the num == 0 paths
+	    VectorArr* b_d = b.copy_to_cuda();
+	    VectorArr a_d_h = a_d->copy_from_cuda(a_d); // copy_from_cuda is a static member, here invoked through the pointer
+	    VectorArr b_d_h = b_d->copy_from_cuda(b_d);
+	    // a_d_h[0].print();
+	    // a_d_h[1].print();
+	    // a_d_h[3].print();
+	    REQUIRE( a[1] == a_d_h[1] );
+	    REQUIRE( a[3] == a_d_h[3] );
+	    VectorArr::remove_from_cuda(a_d);
+	    VectorArr::remove_from_cuda(b_d);
+	    print_enable_if_value<int>();  // prints the has_copy_to_cuda trait for int
+	    print_enable_if_value<Vector3>();  // prints the has_copy_to_cuda trait for Vector3
+	    print_enable_if_value<VectorArr>();  // prints the has_copy_to_cuda trait for VectorArr (Array<Vector3>)
+	    print_enable_if_value<Array<VectorArr>>();  // prints the has_copy_to_cuda trait for the nested Array<VectorArr>
+	    // b_d_h[0].print();
+	}
+    }
+    TEST_CASE( "Test Assigment and copying of Arrays of Arrays and copy_to_cuda", "[Array]" ) {
+	{
+	    // Build a two-level Array<VectorArr>, round-trip it through the device and compare one inner element
+	    // printf("Creating v1(10)\n");
+	    VectorArr v1(10);
+	    for (int i = 0; i < v1.size(); ++i) { // NOTE(review): int index compared against the size_t returned by size()
+		v1[i] = Vector3(i+1);
+	    }
+ 	    // printf("Creating v2(20)\n");
+	    VectorArr v2(20);
+	    for (int i = 0; i < v2.size(); ++i) {
+		v2[i] = Vector3(10*i+1);
+	    }
+	    // a is created with three slots; only a[0] and a[1] are assigned below, a[2] is left as constructed
+	    Array<VectorArr> a(3);
+	    a[0] = v1; // copy assignment: v1 and v2 remain usable afterwards
+	    a[1] = v2;
+	    // a[1] = std::move(v2);
+	    Array<VectorArr>* a_d = a.copy_to_cuda();
+	    Array<VectorArr> a_d_h = a_d->copy_from_cuda(a_d); // static member invoked through the pointer
+	    REQUIRE( a[0][1] == a_d_h[0][1] );
+	    // REQUIRE( a[0][5] == a_d_h[0][5] );
+	    a_d->remove_from_cuda(a_d); // static member as well; called with the default remove_self = true
+	}
+    }
+    TEST_CASE( "Test Assigment and copying of Arrays of Arrays of Arrays", "[Array]" ) {
+	{
+	    // Build a three-level Array<Array<VectorArr>>, round-trip it through the device and compare one innermost element
+	    // printf("Creating v1(10)\n");
+	    VectorArr v1(10);
+	    for (int i = 0; i < v1.size(); ++i) { // NOTE(review): int index compared against the size_t returned by size()
+		v1[i] = Vector3(i+1);
+	    }
+ 	    // printf("Creating v2(20)\n");
+	    VectorArr v2(20);
+	    for (int i = 0; i < v2.size(); ++i) {
+		v2[i] = Vector3(10*i+1);
+	    }
+	    // a is created with three slots; only a[0] and a[1] are assigned below
+	    Array<VectorArr> a(3);
+	    a[0] = v1;
+	    a[1] = v2;
+	    Array<Array<VectorArr>> b(3);
+	    b[0] = a; // copy assignment: a is still intact for the next line
+	    b[2] = std::move(a); // move assignment: a must not be relied on after this; b[1] is never assigned
+	    Array<Array<VectorArr>>* b_d = b.copy_to_cuda();
+	    Array<Array<VectorArr>> b_d_h = b_d->copy_from_cuda(b_d); // static member invoked through the pointer
+	    REQUIRE( b[0][0][0] == b_d_h[0][0][0] );
+	    b_d->remove_from_cuda(b_d); // static member as well; called with the default remove_self = true
+	}
+    }
 	    T* b_d = b.copy_to_cuda();
-	    T b2 = b.retrieve_from_cuda(b_d);
+	    T b2 = b.copy_from_cuda(b_d);
 	    REQUIRE( b == b2 );
+	    b.remove_from_cuda(b_d);
+	    cudaDeviceSynchronize();
 #include <cuda.h>
 #include <nvfunctional>
-#include "type_name.h"
+#include "../type_name.h"
 /* #include <catch2/catch_tostring.hpp> */
 /* namespace Catch { */
+#include "../type_name.h"
 namespace Tests::Unary::Matrix3 {
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/matchers/catch_matchers_floating_point.hpp>
-#include "type_name.h"
+#include "../type_name.h"
 namespace Tests::Vector3 {
     enum BinaryOp_t { ADD, CROSS, DOT, SUB, FINAL }; // NOTE(review): FINAL looks like an end-of-list sentinel rather than an operation — confirm against the code that iterates these values
 #include <memory>    // For std::unique_ptr
 #include <cstring>
+#include "type_name.h"
 // Utility function used by types to return std::string using format syntax
 inline std::string string_format(const std::string fmt_str, ...) {
     // from: https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf/8098080#8098080
@@ -32,3 +34,6 @@ using Vector3 = Vector3_t<float>;
 using Matrix3 = Matrix3_t<float,false>;
 #include "Types/Bitmask.h"
+#include "Types/Array.h"
+using VectorArr = Array<Vector3>;
+ * @file  Array.h
+ * 
+ * @brief Declaration of templated Array class.
+ *********************************************************************/
+#pragma once
+#include <memory>
+#include <type_traits> // for std::common_type<T,U>
+#include <sstream>
+// Simple templated array object without resizing capabilities 
+template<typename T>
+class Array {
+    HOST inline Array<T>() : num(0), values(nullptr) {} // Default constructor: empty array (num == 0), no host buffer allocated
+    HOST inline Array<T>(size_t num) : num(num), values(nullptr) {
+	// Size constructor: values starts as nullptr so that host_allocate() (which frees any existing buffer first) has nothing to release
+	host_allocate();
+	// printf("Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values);
+    }
+    // Construct an array of `num` elements and copy-assign each one from inp[0..num-1].
+    // `inp` is only read when num > 0.
+    HOST inline Array<T>(size_t num, const T* inp ) : num(num), values(nullptr) {
+	host_allocate();
+	size_t idx = 0;
+	while (idx < num) {
+	    values[idx] = inp[idx];
+	    ++idx;
+	}
+    }
+    // Copy constructor: allocates a fresh host buffer of a.num elements and
+    // copy-assigns every element of `a` into it.
+    //
+    // `values` must be initialized to nullptr before host_allocate() is called:
+    // host_allocate() begins with host_deallocate(), which delete[]s any non-null
+    // `values`. Leaving the member uninitialized would hand an indeterminate
+    // pointer to delete[].
+    HOST inline Array<T>(const Array<T>& a) : num(a.num), values(nullptr) { // copy constructor
+	host_allocate();
+	for (size_t i = 0; i < num; ++i) {
+	    values[i] = a[i];
+	}
+    }
+    // Move constructor: adopts a's element count and host buffer, then leaves
+    // `a` empty (num == 0, values == nullptr) so its destructor frees nothing.
+    HOST inline Array<T>(Array<T>&& a) : num(a.num), values(a.values) { // move constructor
+	a.num = 0;
+	a.values = nullptr;
+    }
+    // Copy assignment: resizes this array to a.num elements (the old buffer is
+    // released by host_allocate()) and copy-assigns every element of `a`.
+    //
+    // Self-assignment is skipped: host_allocate() frees `values` before the copy
+    // loop runs, so `x = x` would otherwise read from the buffer it just
+    // replaced and lose the contents.
+    // The debug trace prints the addresses with %p; passing pointers to %x is
+    // undefined behaviour for printf.
+    HOST inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator
+	if (this != &a) {
+	    num = a.num;
+	    host_allocate();
+	    for (size_t i = 0; i < num; ++i) {
+		values[i] = a[i];
+	    }
+	}
+	printf("Copy-operator for Array<T> %p with values %p\n", static_cast<const void*>(this), static_cast<const void*>(values));
+	return *this;
+    }
+    // Move assignment: frees this array's host buffer, adopts a's count and
+    // buffer, and leaves `a` empty (num == 0, values == nullptr).
+    //
+    // Self-move is a no-op; without the guard the buffer would be freed and the
+    // array silently emptied.
+    // The debug trace prints the addresses with %p; passing pointers to %x is
+    // undefined behaviour for printf.
+    HOST inline Array<T>& operator=(Array<T>&& a) { // move assignment operator
+	if (this != &a) {
+	    host_deallocate();
+	    num = a.num;
+	    values = a.values;
+	    a.num = 0;
+	    a.values = nullptr;
+	}
+	printf("Move-operator for Array<T> %p with values %p\n", static_cast<const void*>(this), static_cast<const void*>(values));
+	return *this;
+    }
+    // Mutable element access; asserts that i < num.
+    HOST DEVICE inline T& operator[](size_t i) {
+	assert( i < num );
+	T* const slot = values + i;
+	return *slot;
+    }
+    // Read-only element access; asserts that i < num.
+    HOST DEVICE inline const T& operator[](size_t i) const {
+	assert( i < num );
+	const T* const slot = values + i;
+	return *slot;
+    }
+    HOST inline ~Array<T>() {
+	// Destructor: releases the host buffer via host_deallocate(); no CUDA call is made here, so device copies must be released with remove_from_cuda()
+	host_deallocate();
+    }
+#ifdef USE_CUDA
+    // Overload picked by C++14-compatible SFINAE when T has no copy_to_cuda(): the elements are sent to the device with one raw cudaMemcpy. Returns the device pointer to the Array<T>, which is cudaMalloc'ed here when dev_ptr is nullptr.
+    template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST inline Array<T>* copy_to_cuda(Array<T>* dev_ptr = nullptr) const {
+	if (dev_ptr == nullptr) { // allocate if needed
+	    // printf("   cudaMalloc for array\n");
+	    gpuErrchk(cudaMalloc(&dev_ptr, sizeof(Array<T>)));
+	}
+	// Allocate values_d (it stays nullptr when the array is empty)
+	T* values_d = nullptr;
+	if (num > 0) {
+	    // printf("   cudaMalloc for %d items\n", num);
+	    size_t sz = sizeof(T) * num;
+	    gpuErrchk(cudaMalloc(&values_d, sz));
+	    // Copy values
+	    gpuErrchk(cudaMemcpy(values_d, values, sz, cudaMemcpyHostToDevice));
+	}
+	// Copy Array with pointers correctly assigned: tmp is a host-side image of the device object whose values member temporarily holds the device pointer
+	Array<T> tmp(0);
+	tmp.num = num;
+	tmp.values = values_d;
+	gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice));
+	tmp.num = 0;
+	tmp.values = nullptr; // detach the device pointer so tmp's destructor does not delete[] it
+	// printf("Copying Array<%s> %x with %d values %x to device at %x\n", type_name<T>().c_str(), this, num, values, dev_ptr);
+	return dev_ptr;
+    }
+    template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST inline Array<T>* copy_to_cuda(Array<T>* dev_ptr = nullptr) const {
+	// Overload picked when T provides copy_to_cuda(): each element is copied by its own copy_to_cuda() into its slot of the device block. Returns the device pointer to the Array<T>.
+	if (dev_ptr == nullptr) { // allocate if needed
+	    // printf("   cudaMalloc for array\n");
+	    gpuErrchk(cudaMalloc(&dev_ptr, sizeof(Array<T>)));
+	}
+	// Allocate values_d (it stays nullptr when the array is empty)
+	T* values_d = nullptr;
+	if (num > 0) { 
+	    size_t sz = sizeof(T) * num;
+	    // printf("   cudaMalloc for %d items\n", num);
+	    gpuErrchk(cudaMalloc(&values_d, sz));
+	    // Copy values: element i is written to the pre-allocated device slot values_d + i
+	    for (size_t i = 0; i < num; ++i) {
+		values[i].copy_to_cuda(values_d + i);
+	    }
+	}
+	// Copy Array with pointers correctly assigned: tmp is a host-side image of the device object whose values member temporarily holds the device pointer
+	Array<T> tmp(0);
+	tmp.num = num;
+	tmp.values = values_d;
+	gpuErrchk(cudaMemcpy(dev_ptr, &tmp, sizeof(Array<T>), cudaMemcpyHostToDevice));
+	tmp.num = 0;
+	tmp.values = nullptr; // detach the device pointer so tmp's destructor does not delete[] it
+	// printf("Copying Array %x with values %x to device at %x\n",this, values, dev_ptr);
+	return dev_ptr;
+    }
+    template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static Array<T> copy_from_cuda(Array<T>* dev_ptr) {
+	// Overload for T without copy_to_cuda(): create host object, copy raw device data over. A nullptr dev_ptr yields an empty Array.
+	Array<T> tmp(0);
+	if (dev_ptr != nullptr) {
+	    gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost)); // after this, tmp.values holds the device pointer
+	    if (tmp.num > 0) {
+		T* values_d = tmp.values;
+		tmp.values = new T[tmp.num]; // replace the device pointer with a host buffer owned by tmp
+		// Copy values
+		size_t sz = sizeof(T) * tmp.num;
+		gpuErrchk(cudaMemcpy(tmp.values, values_d, sz, cudaMemcpyDeviceToHost));
+	    } else {
+		tmp.values = nullptr;
+	    }
+	}
+	// printf("Copying device Array %x to host %x with values %x\n", dev_ptr, &tmp, tmp.values);
+	return tmp;
+    }
+    template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static Array<T> copy_from_cuda(Array<T>* dev_ptr) {
+	// Overload for T that provides copy_to_cuda(): create host object, then rebuild each element with T::copy_from_cuda. A nullptr dev_ptr yields an empty Array.
+	Array<T> tmp(0);
+	if (dev_ptr != nullptr) {
+	    gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost)); // after this, tmp.values holds the device pointer
+	    if (tmp.num > 0) {
+		T* values_d = tmp.values;
+		tmp.values = new T[tmp.num]; // replace the device pointer with a host buffer owned by tmp
+		// Copy values: element i is fetched from the device slot values_d + i
+		for (size_t i = 0; i < tmp.num; ++i) {
+		    tmp.values[i] = T::copy_from_cuda(values_d + i);
+		}
+	    } else {
+		tmp.values = nullptr;
+	    }
+	}
+	// printf("Copying device Array %x to host %x with values %x\n", dev_ptr, &tmp, tmp.values);
+	return tmp;
+    }
+    template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static void remove_from_cuda(Array<T>* dev_ptr, bool remove_self = true) {
+	// Overload for T without copy_to_cuda(): frees the device element block and, when remove_self is true, the device Array<T> itself. A nullptr dev_ptr is a no-op.
+	if (dev_ptr == nullptr) return;
+	Array<T> tmp(0);
+	gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost)); // fetch num and the device values pointer
+	if (tmp.num > 0) {
+	    // Remove values
+	    gpuErrchk(cudaFree(tmp.values));
+	}
+	tmp.values = nullptr; // tmp.values held a device pointer; clear it so tmp's destructor does not delete[] it
+	gpuErrchk(cudaMemset((void*) &(dev_ptr->values), 0, sizeof(T*))); // set nullptr on the device
+	if (remove_self) {
+	    gpuErrchk(cudaFree(dev_ptr));
+	    dev_ptr = nullptr; // only clears the local copy of the pointer; the caller's pointer is unchanged
+	}
+	// printf("...done removing device Array<%s> %x\n", typeid(T).name(), dev_ptr);
+    }
+    // Overload for T that provides copy_to_cuda(): releases a device Array<T>
+    // whose elements own device allocations of their own.
+    //   dev_ptr     - device pointer to the Array<T>; nullptr is a no-op
+    //   remove_self - when true (default) the Array<T> object at dev_ptr is
+    //                 freed too; pass false when the object lives inside a
+    //                 larger device block that the caller frees
+    template <typename Dummy = void, typename std::enable_if_t<has_copy_to_cuda<T>::value, Dummy>* = nullptr>
+    HOST static void remove_from_cuda(Array<T>* dev_ptr, bool remove_self = true) {
+	if (dev_ptr == nullptr) return;
+	// Fetch num and the device values pointer to learn what has to be freed
+	Array<T> tmp(0);
+	gpuErrchk(cudaMemcpy(&tmp, dev_ptr, sizeof(Array<T>), cudaMemcpyDeviceToHost));
+	if (tmp.num > 0) {
+	    // Release what each element owns. The elements themselves sit inside the
+	    // block at tmp.values, hence remove_self = false for them.
+	    for (size_t i = 0; i < tmp.num; ++i) {
+		T::remove_from_cuda(tmp.values+i, false);
+	    }
+	    // Free the element block itself. copy_to_cuda() obtains it with its own
+	    // cudaMalloc, and nothing else releases it.
+	    gpuErrchk(cudaFree(tmp.values));
+	}
+	tmp.values = nullptr; // tmp.values held a device pointer; clear it so tmp's destructor does not delete[] it
+	gpuErrchk(cudaMemset((void*) &(dev_ptr->values), 0, sizeof(T*))); // set nullptr on device
+	if (remove_self) {
+	    gpuErrchk(cudaFree(dev_ptr));
+	}
+    }
+    HOST DEVICE size_t size() const { return num; } // Returns num, the element count of this array
+    // (Re)allocate the host buffer for the current value of num. Any existing
+    // buffer is released first; num == 0 leaves values as nullptr.
+    HOST void host_allocate() {
+	host_deallocate();
+	values = (num > 0) ? new T[num] : nullptr;
+    }
+    // Release the host buffer, if any, and reset values to nullptr. num is left
+    // untouched.
+    HOST void host_deallocate() {
+	delete[] values;  // delete[] on a null pointer does nothing, so no explicit check is needed
+	values = nullptr;
+    }
+    size_t num;
+    T* values;
 #ifdef USE_CUDA
-    Bitmask* copy_to_cuda() const {
-	Bitmask* tmp_obj_d = nullptr;
+    Bitmask* copy_to_cuda(Bitmask* tmp_obj_d = nullptr) const {
 	Bitmask obj_tmp(0);
 	data_t* mask_d = nullptr;
 	size_t sz = sizeof(data_t) * get_array_size();
-	gpuErrchk(cudaMalloc(&tmp_obj_d, sizeof(Bitmask)));
+	if (tmp_obj_d == nullptr) {
+	    gpuErrchk(cudaMalloc(&tmp_obj_d, sizeof(Bitmask)));
+	}
 	if (sz > 0) {
 	    gpuErrchk(cudaMalloc(&mask_d, sz));
 	    gpuErrchk(cudaMemcpy(mask_d, mask, sz, cudaMemcpyHostToDevice));
@@ -102,7 +103,7 @@ public:
-    static Bitmask retrieve_from_cuda(Bitmask* obj_d) {
+    static Bitmask copy_from_cuda(Bitmask* obj_d) {
 	Bitmask obj_tmp(0);
 	gpuErrchk(cudaMemcpy(&obj_tmp, obj_d, sizeof(Bitmask), cudaMemcpyDeviceToHost));
 	printf("TEST: %d\n", obj_tmp.len);
 #pragma once
 #include <memory>
 #include <type_traits> // for std::common_type<T,U>
+#include <sstream>
  * 3D vector utility class with common operations implemented on CPU and GPU.
@@ -22,8 +23,8 @@ public:
     HOST DEVICE inline Vector3_t<T>() : x(T(0)), y(T(0)), z(T(0)), w(T(0)) {} // default: all four components zero
 	HOST DEVICE inline Vector3_t<T>(T s):x(s), y(s), z(s), w(s) {} // scalar: s is written to x, y, z and also w
 	HOST DEVICE inline Vector3_t<T>(const Vector3_t<T>& v):x(v.x), y(v.y), z(v.z), w(v.w)  {} // copy: all four components, including w
-	HOST DEVICE inline Vector3_t<T>(T x0, T y0, T z0) : x(x0), y(y0), z(z0), w(0) {}
-	HOST DEVICE inline Vector3_t<T>(T x0, T y0, T z0, T w0) : x(x0), y(y0), z(z0), w(w0) {}
+	HOST DEVICE inline Vector3_t<T>(T x, T y, T z) : x(x), y(y), z(z), w(0) {} // three components, w set to 0; the parameters shadow the members, and in x(x) the outer name is the member while the argument is the parameter
+	HOST DEVICE inline Vector3_t<T>(T x, T y, T z, T w) : x(x), y(y), z(z), w(w) {} // four components; same member/parameter shadowing as the three-argument form
 	// HOST DEVICE inline Vector3_t<T>(const T* d) : x(d[0]), y(d[1]), z(d[2]), w(0) {}
         HOST DEVICE inline Vector3_t<T>(const float4 a) : x(a.x), y(a.y), z(a.z), w(a.w) {} // from a float4: copies its x, y, z and w fields
@@ -57,6 +58,12 @@ public:
+	// Copy assignment: copies all four components and returns *this.
+	// w is copied as well so that assignment yields the same object state as
+	// the copy constructor, which initializes w from v.w.
+	HOST DEVICE inline Vector3_t<T>& operator=(const Vector3_t<T>& v) {
+		x = v.x;
+		y = v.y;
+		z = v.z;
+		w = v.w;
+		return *this;
+	}
 	HOST DEVICE inline Vector3_t<T>& operator=(const Vector3_t<T>&& v) {
 		x = v.x;
 		y = v.y;
@@ -177,12 +184,17 @@ public:
 		printf("%0.3f %0.3f %0.3f\n", x,y,z);
-	auto to_string() const {
+	auto to_string_old() const {
 	    char s[128];
 	    sprintf(s, "%.10g %.10g %.10g (%.10g)", x, y, z, w);
 	    s[127] = 0;
 	    return std::string(s);
+	// Format the vector as "x y z (w)", each component written with the
+	// stream's default formatting.
+	auto to_string() const {
+	    std::ostringstream text;
+	    text << x << " " << y << " " << z;
+	    text << " (" << w << ")";
+	    return text.str();
+	}
 	template<typename U>
 	    HOST DEVICE inline bool operator==(U b) const {
+#pragma once
 #include <type_traits>
 #include <typeinfo>
 #ifndef _MSC_VER
@@ -9,6 +11,7 @@
 template <typename T, typename ...Extras>
 std::string type_name() {
+    return typeid(T).name();
     using TR = typename std::remove_reference<T>::type;
     std::unique_ptr<char, void(*)(void*)> own