Skip to content
Snippets Groups Projects
Commit ea0ddc48 authored by cmaffeo2's avatar cmaffeo2
Browse files

Basic Array benchmarking

parent f3f3feae
No related branches found
No related tags found
No related merge requests found
......@@ -9,6 +9,7 @@
#include <nvfunctional>
#include <catch2/catch_test_macros.hpp>
#include <catch2/benchmark/catch_benchmark.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
namespace Tests::TestArray {
......@@ -114,6 +115,22 @@ namespace Tests::TestArray {
// print_enable_if_value_helper<has_copy_to_cuda<T>>(typename has_copy_to_cuda<T>::type{});
// }
template<typename T> __host__ __device__
void print_it(T x) { printf("Unsupported type\n"); }
template<> __host__ __device__
void print_it(const int x) { printf("int %d\n", x); }
template<> __host__ __device__
void print_it(const long int x) { printf("long int %ld\n", x); }
template<> __host__ __device__
void print_it(const float x) { printf("float %f\n", x); }
template<> __host__ __device__
void print_it(const double x) { printf("double %lf\n", x); }
template<> __host__ __device__
void print_it(const Vector3&& x) { x.print(); }
template<> __host__ __device__
void print_it(const Vector3& x) { x.print(); }
template <typename T>
void print_enable_if_value() {
if (has_copy_to_cuda<T>::value) {
......@@ -123,15 +140,70 @@ namespace Tests::TestArray {
}
}
template <typename T>
Array<T> create_array(size_t num) {
template<typename T>
Array<T> allocate_array_host(size_t num) {
Array<T> arr(num);
return arr;
}
template<typename T>
Array<T>* allocate_array_device(size_t num) {
Array<T> arr(num);
return arr.copy_to_cuda();
}
template<typename T>
T* allocate_plain_array_host(size_t num) {
T* arr = new T[num];
return arr;
}
template<typename T>
T* allocate_plain_array_device(size_t num) {
T* arr = allocate_plain_array_host<T>(num);
T* arr_d;
size_t sz = sizeof(T)*num;
gpuErrchk(cudaMalloc(&arr_d, sz));
gpuErrchk(cudaMemcpy(arr_d, arr, sz, cudaMemcpyHostToDevice));
delete[] arr;
return arr_d;
}
template<typename T>
HOST DEVICE void inline _copy_helper(size_t& idx, T* __restrict__ out, const T* __restrict__ inp) {
out[idx] = inp[idx];
}
// HOST DEVICE void inline _copy_helper(size_t& idx, float* __restrict__ out, const float* __restrict__ inp) {
// out[idx] = inp[idx];
// }
template<typename T>
HOST DEVICE void inline _copy_helper(size_t& idx, Array<T>* __restrict__ out, const Array<T>* __restrict__ inp) {
(*out)[idx] = (*inp)[idx];
}
template<typename T>
__global__ void copy_kernel(size_t num, T* __restrict__ out, const T* __restrict__ inp) {
for (size_t i = threadIdx.x+blockIdx.x*blockDim.x; i < num; i+=blockDim.x*gridDim.x) {
_copy_helper(i, out, inp);
}
}
template<typename T>
void call_copy_kernel(size_t num, T* __restrict__ out, const T* __restrict__ inp, size_t block_size=256) {
copy_kernel<<<block_size,1,0>>>(num, out, inp);
gpuErrchk( cudaDeviceSynchronize() );
}
// Array<T> _copy_array_cuda(size_t num) {
// Array<T> arr(num);
// return arr;
// }
TEST_CASE( "Test Array assignment and copy_to_cuda", "[Array]" ) {
{
// Creation and copy assignment
Array<Vector3> a = create_array<Vector3>(10);
Array<Vector3> a = allocate_array_host<Vector3>(10);
}
{
......@@ -227,4 +299,47 @@ namespace Tests::TestArray {
b_d->remove_from_cuda(b_d);
}
}
//Benchmark showing that Array<Vector3> performs similarly to plain array for device copy, at least
/*
TEST_CASE( "Test performance copying Array vs plain arrays", "[Array]" ) {
size_t num = 100000;
float* inp3 = allocate_plain_array_device<float>(3*num);
float* out3 = allocate_plain_array_device<float>(3*num);
float* inp4 = allocate_plain_array_device<float>(4*num);
float* out4 = allocate_plain_array_device<float>(4*num);
float4* inpF4 = allocate_plain_array_device<float4>(num);
float4* outF4 = allocate_plain_array_device<float4>(num);
Array<Vector3>* inpV = allocate_array_device<Vector3>(num);
Array<Vector3>* outV = allocate_array_device<Vector3>(num);
// call_copy_kernel(3*num, out3, inp3);
// call_copy_kernel(4*num, out4, inp4);
// call_copy_kernel(num, outV, inpV);
BENCHMARK("Call 3x num float copy") {
call_copy_kernel(3*num, out3, inp3);
};
BENCHMARK("Call num Vector3 copy") {
call_copy_kernel(num, outV, inpV);
};
BENCHMARK("Call num float4 copy") {
call_copy_kernel(num, outF4, inpF4);
};
BENCHMARK("Call 3x num float copy (repeat)") {
call_copy_kernel(3*num, out3, inp3);
};
BENCHMARK("Call 4x num float copy") {
call_copy_kernel(4*num, out4, inp4);
};
BENCHMARK("Call num Vector3 copy (repeat)") {
call_copy_kernel(num, outV, inpV);
};
BENCHMARK("Call num float4 copy (repeat)") {
call_copy_kernel(num, outF4, inpF4);
};
// */
}
}
......@@ -12,7 +12,7 @@
template<typename T>
class Array {
public:
HOST inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this);
HOST DEVICE inline Array<T>() : num(0), values(nullptr) {} // printf("Creating Array1 %x\n",this);
HOST inline Array<T>(size_t num) : num(num), values(nullptr) {
// printf("Constructing Array<%s> %x with values %x\n", type_name<T>().c_str(), this, values);
host_allocate();
......@@ -43,22 +43,28 @@ public:
a.num = 0; // not needed?
// printf("Move-constructed Array<T> with values %x\n", values);
}
HOST inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator
HOST DEVICE inline Array<T>& operator=(const Array<T>& a) { // copy assignment operator
num = a.num;
#ifndef __CUDA_ARCH__
host_allocate();
#endif
for (size_t i = 0; i < num; ++i) {
values[i] = a[i];
}
printf("Copy-operator for Array<T> %x with values %x\n",this, values);
// printf("Copy-operator for Array<T> %x with values %x\n",this, values);
printf("Copy-operator for Array<T>\n");
return *this;
}
HOST inline Array<T>& operator=(Array<T>&& a) { // move assignment operator
HOST DEVICE inline Array<T>& operator=(Array<T>&& a) { // move assignment operator
#ifndef __CUDA_ARCH__
host_deallocate();
#endif
num = a.num;
values = a.values;
a.num = 0;
a.values = nullptr;
printf("Move-operator for Array<T> %x with values %x\n",this, values);
// printf("Move-operator for Array<T> %x with values %x\n",this, values);
printf("Move-operator for Array<T>\n");
return *this;
}
HOST DEVICE inline T& operator[](size_t i) {
......@@ -139,6 +145,7 @@ public:
template <typename Dummy = void, typename std::enable_if_t<!has_copy_to_cuda<T>::value, Dummy>* = nullptr>
HOST static Array<T> copy_from_cuda(Array<T>* dev_ptr) {
// TODO add argument: dest = nullptr
// Create host object, copy raw device data over
Array<T> tmp(0);
if (dev_ptr != nullptr) {
......@@ -243,5 +250,5 @@ private:
}
size_t num;
T* values;
T* __restrict__ values;
};
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment