diff --git a/src/Tests/array.cu b/src/Tests/array.cu index 3a2ec001c4eca0497cc91fa4451fa1b61cf9463e..26ed23d57953e61868e75ec0af698e7bd8af1293 100644 --- a/src/Tests/array.cu +++ b/src/Tests/array.cu @@ -9,6 +9,7 @@ #include <nvfunctional> #include <catch2/catch_test_macros.hpp> +#include <catch2/benchmark/catch_benchmark.hpp> #include <catch2/matchers/catch_matchers_floating_point.hpp> namespace Tests::TestArray { @@ -114,6 +115,22 @@ namespace Tests::TestArray { // print_enable_if_value_helper<has_copy_to_cuda<T>>(typename has_copy_to_cuda<T>::type{}); // } + + template<typename T> __host__ __device__ + void print_it(T x) { printf("Unsupported type\n"); } + template<> __host__ __device__ + void print_it(const int x) { printf("int %d\n", x); } + template<> __host__ __device__ + void print_it(const long int x) { printf("long int %ld\n", x); } + template<> __host__ __device__ + void print_it(const float x) { printf("float %f\n", x); } + template<> __host__ __device__ + void print_it(const double x) { printf("double %lf\n", x); } + template<> __host__ __device__ + void print_it(const Vector3&& x) { x.print(); } + template<> __host__ __device__ + void print_it(const Vector3& x) { x.print(); } + template <typename T> void print_enable_if_value() { if (has_copy_to_cuda<T>::value) { @@ -123,15 +140,70 @@ namespace Tests::TestArray { } } - template <typename T> - Array<T> create_array(size_t num) { + template<typename T> + Array<T> allocate_array_host(size_t num) { Array<T> arr(num); return arr; } + + template<typename T> + Array<T>* allocate_array_device(size_t num) { + Array<T> arr(num); + return arr.copy_to_cuda(); + } + + template<typename T> + T* allocate_plain_array_host(size_t num) { + T* arr = new T[num]; + return arr; + } + template<typename T> + T* allocate_plain_array_device(size_t num) { + T* arr = allocate_plain_array_host<T>(num); + T* arr_d; + size_t sz = sizeof(T)*num; + gpuErrchk(cudaMalloc(&arr_d, sz)); + gpuErrchk(cudaMemcpy(arr_d, arr, sz, cudaMemcpyHostToDevice)); + delete[] arr; + return arr_d; + } + + template<typename T> + HOST DEVICE void inline _copy_helper(size_t& idx, T* __restrict__ out, const T* __restrict__ inp) { + out[idx] = inp[idx]; + } + // HOST DEVICE void inline _copy_helper(size_t& idx, float* __restrict__ out, const float* __restrict__ inp) { + // out[idx] = inp[idx]; + // } + template<typename T> + HOST DEVICE void inline _copy_helper(size_t& idx, Array<T>* __restrict__ out, const Array<T>* __restrict__ inp) { + (*out)[idx] = (*inp)[idx]; + } + + + template<typename T> + __global__ void copy_kernel(size_t num, T* __restrict__ out, const T* __restrict__ inp) { + for (size_t i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) { + _copy_helper(i, out, inp); + } + } + + template<typename T> + void call_copy_kernel(size_t num, T* __restrict__ out, const T* __restrict__ inp, size_t block_size=256) { + copy_kernel<<<block_size,1,0>>>(num, out, inp); + gpuErrchk( cudaDeviceSynchronize() ); + } + + // Array<T> _copy_array_cuda(size_t num) { + // Array<T> arr(num); + // return arr; + // } + + TEST_CASE( "Test Array assignment and copy_to_cuda", "[Array]" ) { { // Creation and copy assignment - Array<Vector3> a = create_array<Vector3>(10); + Array<Vector3> a = allocate_array_host<Vector3>(10); } { @@ -227,4 +299,47 @@ namespace Tests::TestArray { b_d->remove_from_cuda(b_d); } } + + //Benchmark showing that Array<Vector3> performs similarly to plain array for device copy, at least + /* + TEST_CASE( "Test performance copying Array vs plain arrays", "[Array]" ) { + size_t num = 100000; + float* inp3 = allocate_plain_array_device<float>(3*num); + float* out3 = allocate_plain_array_device<float>(3*num); + + float* inp4 = allocate_plain_array_device<float>(4*num); + float* out4 = allocate_plain_array_device<float>(4*num); + + float4* inpF4 = allocate_plain_array_device<float4>(num); + float4* outF4 = allocate_plain_array_device<float4>(num); + + Array<Vector3>* inpV = allocate_array_device<Vector3>(num); + Array<Vector3>* outV = allocate_array_device<Vector3>(num); + + // call_copy_kernel(3*num, out3, inp3); + // call_copy_kernel(4*num, out4, inp4); + // call_copy_kernel(num, outV, inpV); + BENCHMARK("Call 3x num float copy") { + call_copy_kernel(3*num, out3, inp3); + }; + BENCHMARK("Call num Vector3 copy") { + call_copy_kernel(num, outV, inpV); + }; + BENCHMARK("Call num float4 copy") { + call_copy_kernel(num, outF4, inpF4); + }; + BENCHMARK("Call 3x num float copy (repeat)") { + call_copy_kernel(3*num, out3, inp3); + }; + BENCHMARK("Call 4x num float copy") { + call_copy_kernel(4*num, out4, inp4); + }; + BENCHMARK("Call num Vector3 copy (repeat)") { + call_copy_kernel(num, outV, inpV); + }; + BENCHMARK("Call num float4 copy (repeat)") { + call_copy_kernel(num, outF4, inpF4); + }; + // */ + } } diff --git a/src/Types/Array.h b/src/Types/Array.h index 4f8d5510ae1eb321e62b4eec70fae6493552567f..c5549a9c53de8b871d05df8ca66d59507af01859 100644 --- a/src/Types/Array.h +++ b/src/Types/Array.h @@ -12,7 +12,7 @@ template<typename T> class Array { public: - HOST inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this); + HOST DEVICE inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this); HOST inline Array<T>(size_t num) : num(num), values(nullptr) { // printf("Constructing Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values); host_allocate(); @@ -43,22 +43,28 @@ public: a.num = 0; // not needed? // printf("Move-constructed Array<T> with values %x\n", values); } - HOST inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator + HOST DEVICE inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator num = a.num; +#ifndef __CUDA_ARCH__ host_allocate(); +#endif for (size_t i = 0; i < num; ++i) { values[i] = a[i]; } - printf("Copy-operator for Array<T> %x with values %x\n",this, values); + // printf("Copy-operator for Array<T> %x with values %x\n",this, values); + printf("Copy-operator for Array<T>\n"); return *this; } - HOST inline Array<T>& operator=(Array<T>&& a) { // move assignment operator + HOST DEVICE inline Array<T>& operator=(Array<T>&& a) { // move assignment operator +#ifndef __CUDA_ARCH__ host_deallocate(); +#endif num = a.num; values = a.values; a.num = 0; a.values = nullptr; - printf("Move-operator for Array<T> %x with values %x\n",this, values); + // printf("Move-operator for Array<T> %x with values %x\n",this, values); + printf("Move-operator for Array<T>\n"); return *this; } HOST DEVICE inline T& operator[](size_t i) { @@ -139,6 +145,7 @@ public: template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr> HOST static Array<T> copy_from_cuda(Array<T>* dev_ptr) { + // TODO add argument: dest = nullptr // Create host object, copy raw device data over Array<T> tmp(0); if (dev_ptr != nullptr) { @@ -243,5 +250,5 @@ private: } size_t num; - T* values; + T* __restrict__ values; };