diff --git a/src/Tests/array.cu b/src/Tests/array.cu
index 3a2ec001c4eca0497cc91fa4451fa1b61cf9463e..26ed23d57953e61868e75ec0af698e7bd8af1293 100644
--- a/src/Tests/array.cu
+++ b/src/Tests/array.cu
@@ -9,6 +9,7 @@
 #include <nvfunctional>
 
 #include <catch2/catch_test_macros.hpp>
+#include <catch2/benchmark/catch_benchmark.hpp>
 #include <catch2/matchers/catch_matchers_floating_point.hpp>
 
 namespace Tests::TestArray {
@@ -114,6 +115,22 @@ namespace Tests::TestArray {
     // 	print_enable_if_value_helper<has_copy_to_cuda<T>>(typename has_copy_to_cuda<T>::type{});
     // }
 
+    
+    template<typename T> __host__ __device__ 
+    void print_it(T x) { printf("Unsupported type\n"); }
+    template<> __host__ __device__
+    void print_it(const int x) { printf("int %d\n", x); }
+    template<> __host__ __device__
+    void print_it(const long int x) { printf("long int %ld\n", x); }
+    template<> __host__ __device__
+    void print_it(const float x) { printf("float %f\n", x); }
+    template<> __host__ __device__
+    void print_it(const double x) { printf("double %lf\n", x); }
+    template<> __host__ __device__
+    void print_it(const Vector3&& x) { x.print(); }
+    template<> __host__ __device__
+    void print_it(const Vector3& x) { x.print(); }
+    
     template <typename T>
     void print_enable_if_value() {
 	if (has_copy_to_cuda<T>::value) {
@@ -123,15 +140,70 @@ namespace Tests::TestArray {
 	}
     }
 
-    template <typename T>
-    Array<T> create_array(size_t num) {
+    template<typename T>
+    Array<T> allocate_array_host(size_t num) {
 	Array<T> arr(num);
 	return arr;
     }
+
+    template<typename T>
+    Array<T>* allocate_array_device(size_t num) {
+	Array<T> arr(num);
+	return arr.copy_to_cuda();
+    }
+
+    template<typename T>
+    T* allocate_plain_array_host(size_t num) {
+	T* arr = new T[num];
+	return arr;
+    }
+    template<typename T>
+    T* allocate_plain_array_device(size_t num) {
+	T* arr = allocate_plain_array_host<T>(num);
+	T* arr_d;
+	size_t sz = sizeof(T)*num;
+	gpuErrchk(cudaMalloc(&arr_d, sz));
+	gpuErrchk(cudaMemcpy(arr_d, arr, sz, cudaMemcpyHostToDevice));
+	delete[] arr;
+	return arr_d;
+    }
+    
+    template<typename T>
+    HOST DEVICE void inline _copy_helper(size_t& idx, T* __restrict__ out, const T* __restrict__ inp) {
+	out[idx] = inp[idx];
+    }
+    // HOST DEVICE void inline _copy_helper(size_t& idx, float* __restrict__ out, const float* __restrict__ inp) {
+    // 	out[idx] = inp[idx];
+    // }
+    template<typename T>
+    HOST DEVICE void inline _copy_helper(size_t& idx, Array<T>* __restrict__ out, const Array<T>* __restrict__ inp) {
+	(*out)[idx] = (*inp)[idx];
+    }
+
+    
+    template<typename T>
+    __global__ void copy_kernel(size_t num, T* __restrict__ out, const T* __restrict__ inp) {
+	for (size_t i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
+	    _copy_helper(i, out, inp);
+	}
+    }
+
+    template<typename T>
+    void call_copy_kernel(size_t num, T* __restrict__ out, const T* __restrict__ inp, size_t block_size=256) {
+	copy_kernel<<<block_size,1,0>>>(num, out, inp);
+	gpuErrchk( cudaDeviceSynchronize() );
+    }
+    
+    // Array<T> _copy_array_cuda(size_t num) {
+    // 	Array<T> arr(num);
+    // 	return arr;
+    // }
+
+    
     TEST_CASE( "Test Array assignment and copy_to_cuda", "[Array]" ) {
 	{
 	    // Creation and copy assignment
-	    Array<Vector3> a = create_array<Vector3>(10);
+	    Array<Vector3> a = allocate_array_host<Vector3>(10);
 	}
 
 	{
@@ -227,4 +299,47 @@ namespace Tests::TestArray {
 	    b_d->remove_from_cuda(b_d);
 	}
     }
+
+    //Benchmark showing that Array<Vector3> performs similarly to plain array for device copy, at least 
+    /*
+    TEST_CASE( "Test performance copying Array vs plain arrays", "[Array]" ) {
+	size_t num = 100000;
+	float* inp3 = allocate_plain_array_device<float>(3*num);
+	float* out3 = allocate_plain_array_device<float>(3*num);
+
+	float* inp4 = allocate_plain_array_device<float>(4*num);
+	float* out4 = allocate_plain_array_device<float>(4*num);
+
+	float4* inpF4 = allocate_plain_array_device<float4>(num);
+	float4* outF4 = allocate_plain_array_device<float4>(num);
+
+	Array<Vector3>* inpV = allocate_array_device<Vector3>(num);
+	Array<Vector3>* outV = allocate_array_device<Vector3>(num);
+
+	// call_copy_kernel(3*num, out3, inp3);
+	// call_copy_kernel(4*num, out4, inp4);
+	// call_copy_kernel(num, outV, inpV);
+	BENCHMARK("Call 3x num float copy") {
+	    call_copy_kernel(3*num, out3, inp3);
+	};
+	BENCHMARK("Call num Vector3 copy") {
+	    call_copy_kernel(num, outV, inpV);
+	};
+	BENCHMARK("Call num float4 copy") {
+	    call_copy_kernel(num, outF4, inpF4);
+	};
+	BENCHMARK("Call 3x num float copy (repeat)") {
+	    call_copy_kernel(3*num, out3, inp3);
+	};
+	BENCHMARK("Call 4x num float copy") {
+	    call_copy_kernel(4*num, out4, inp4);
+	};
+	BENCHMARK("Call num Vector3 copy (repeat)") {
+	    call_copy_kernel(num, outV, inpV);
+	};
+	BENCHMARK("Call num float4 copy (repeat)") {
+	    call_copy_kernel(num, outF4, inpF4);
+	};
+	// */
+    }
 }
diff --git a/src/Types/Array.h b/src/Types/Array.h
index 4f8d5510ae1eb321e62b4eec70fae6493552567f..c5549a9c53de8b871d05df8ca66d59507af01859 100644
--- a/src/Types/Array.h
+++ b/src/Types/Array.h
@@ -12,7 +12,7 @@
 template<typename T>
 class Array {
 public:
-    HOST inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this);
+    HOST DEVICE inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this);
     HOST inline Array<T>(size_t num) : num(num), values(nullptr) {
 	// printf("Constructing Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values);
 	host_allocate();
@@ -43,22 +43,28 @@ public:
 	a.num = 0;		// not needed?
 	// printf("Move-constructed Array<T> with values %x\n",  values);
     }
-    HOST inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator
+    HOST DEVICE inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator
 	num = a.num;
+#ifndef __CUDA_ARCH__
 	host_allocate();
+#endif
 	for (size_t i = 0; i < num; ++i) {
 	    values[i] = a[i];
 	}
-	printf("Copy-operator for Array<T> %x with values %x\n",this, values);
+	// printf("Copy-operator for Array<T> %x with values %x\n",this, values);
+	printf("Copy-operator for Array<T>\n");
 	return *this;
     }
-    HOST inline Array<T>& operator=(Array<T>&& a) { // move assignment operator
+    HOST DEVICE inline Array<T>& operator=(Array<T>&& a) { // move assignment operator
+#ifndef __CUDA_ARCH__
 	host_deallocate();
+#endif
 	num = a.num;
 	values = a.values;
 	a.num = 0;
 	a.values = nullptr;
-	printf("Move-operator for Array<T> %x with values %x\n",this, values);
+	// printf("Move-operator for Array<T> %x with values %x\n",this, values);
+	printf("Move-operator for Array<T>\n");
 	return *this;
     }
     HOST DEVICE inline T& operator[](size_t i) {
@@ -139,6 +145,7 @@ public:
 
     template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
     HOST static Array<T> copy_from_cuda(Array<T>* dev_ptr) {
+	// TODO add argument: dest = nullptr 
 	// Create host object, copy raw device data over
 	Array<T> tmp(0);
 	if (dev_ptr != nullptr) {
@@ -243,5 +250,5 @@ private:
     }
     
     size_t num;
-    T* values;
+    T* __restrict__ values;
 };