diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h index b3bfb5a6d76e1f50698328f3d6ed6f0624b69cc6..413e2f2bbf6cec280b522ffe2e42af7c5622bcea 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h @@ -19,14 +19,12 @@ void *handleTensorFftApproximationTuples( case GPUNodeConfiguration::APPROX::FP32: { void *t_out; RC->resume_profiler(); - t_out = tensorFft(input); // TODO: correct name here + t_out = tensorFft(input); RC->pause_profiler(); std::pair<double, double> pinfo = RC->get_time_energy(); RC->reset_profiler(); - RC->addToCurrentIterationComputeTime( - "tensorFft", pinfo.first); // and here - RC->addToCurrentIterationComputeEnergy( - "tensorFft", pinfo.second); // and here + RC->addToCurrentIterationComputeTime("tensorFft", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorFft", pinfo.second); return t_out; } default: @@ -55,29 +53,36 @@ void *handleTensorReduceApproximationTuples( case GPUNodeConfiguration::APPROX::FP32: { void *t_out; RC->resume_profiler(); - t_out = tensorReduce(input, axis, func); // TODO: correct name here + t_out = tensorReduce(input, axis, func, false); RC->pause_profiler(); std::pair<double, double> pinfo = RC->get_time_energy(); RC->reset_profiler(); - RC->addToCurrentIterationComputeTime( - "tensorReduce", pinfo.first); // and here - RC->addToCurrentIterationComputeEnergy( - "tensorReduce", pinfo.second); // and here + RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorReduce(input, axis, func, true); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorReduceHalf", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorReduceHalf", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING: { void *t_out; RC->resume_profiler(); - t_out = tensorReductionSamplingReduce(input, axis, func, param); // TODO: correct name here + t_out = tensorReductionSamplingReduce(input, axis, func, param); RC->pause_profiler(); std::pair<double, double> pinfo = RC->get_time_energy(); RC->reset_profiler(); RC->addToCurrentIterationComputeTime( - "tensorReductionSamplingReduce", - pinfo.first); // and here + "tensorReductionSamplingReduce", pinfo.first); RC->addToCurrentIterationComputeEnergy( - "tensorReductionSamplingReduce", - pinfo.second); // and here + "tensorReductionSamplingReduce", pinfo.second); return t_out; } default: @@ -106,14 +111,12 @@ void *handleTensorProjectiveTApproximationTuples( case GPUNodeConfiguration::APPROX::FP32: { void *t_out; RC->resume_profiler(); - t_out = tensorProjectiveT(input, transformation); // TODO: correct name here + t_out = tensorProjectiveT(input, transformation); RC->pause_profiler(); std::pair<double, double> pinfo = RC->get_time_energy(); RC->reset_profiler(); - RC->addToCurrentIterationComputeTime( - "tensorProjectiveT", pinfo.first); // and here - RC->addToCurrentIterationComputeEnergy( - "tensorProjectiveT", pinfo.second); // and here + RC->addToCurrentIterationComputeTime("tensorProjectiveT", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorProjectiveT", pinfo.second); return t_out; } default: @@ -142,14 +145,23 @@ void *handleTensorMap1ApproximationTuples( case GPUNodeConfiguration::APPROX::FP32: { void *t_out; RC->resume_profiler(); - t_out = tensorMap1(func, input); // TODO: correct name here + t_out = tensorMap1(func, input, false); RC->pause_profiler(); std::pair<double, double> pinfo = RC->get_time_energy(); RC->reset_profiler(); - RC->addToCurrentIterationComputeTime( - "tensorMap1", pinfo.first); // and here - RC->addToCurrentIterationComputeEnergy( - "tensorMap1", pinfo.second); // and here + RC->addToCurrentIterationComputeTime("tensorMap1", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorMap1", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorMap1(func, input, true); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorMap1Half", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorMap1Half", pinfo.second); return t_out; } default: @@ -178,14 +190,23 @@ void *handleTensorMap2ApproximationTuples( case GPUNodeConfiguration::APPROX::FP32: { void *t_out; RC->resume_profiler(); - t_out = tensorMap2(func, input1, input2); // TODO: correct name here + t_out = tensorMap2(func, input1, input2, false); RC->pause_profiler(); std::pair<double, double> pinfo = RC->get_time_energy(); RC->reset_profiler(); - RC->addToCurrentIterationComputeTime( - "tensorMap2", pinfo.first); // and here - RC->addToCurrentIterationComputeEnergy( - "tensorMap2", pinfo.second); // and here + RC->addToCurrentIterationComputeTime("tensorMap2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorMap2", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorMap2(func, input1, input2, true); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorMap2Half", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorMap2Half", pinfo.second); return t_out; } default: @@ -214,14 +235,23 @@ void *handleTensorMap3ApproximationTuples( case GPUNodeConfiguration::APPROX::FP32: { void *t_out; RC->resume_profiler(); - t_out = tensorMap3(func, input1, input2, input3); // TODO: correct name here + t_out = tensorMap3(func, input1, input2, input3, false); RC->pause_profiler(); std::pair<double, double> pinfo = RC->get_time_energy(); RC->reset_profiler(); - RC->addToCurrentIterationComputeTime( - "tensorMap3", pinfo.first); // and here - RC->addToCurrentIterationComputeEnergy( - "tensorMap3", pinfo.second); // and here + RC->addToCurrentIterationComputeTime("tensorMap3", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorMap3", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorMap3(func, input1, input2, input3, true); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorMap3Half", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorMap3Half", pinfo.second); return t_out; } default: diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h index 813351da7b0b1fa53252c097bde3d2630efe012c..ac92b0fece1999557e959117fa30195a4949c6be 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h @@ -48,10 +48,14 @@ void _checkCUBLAS(cublasStatus_t error, const char *file, int line); void _checkCUFFT(cufftResult error, const char *file, int line); +void _checkCUDA(cudaError_t err, const char *file, int line); + #define checkCUBLAS(err) _checkCUBLAS(err, __FILE__, __LINE__) #define checkCUFFT(err) _checkCUFFT(err, __FILE__, __LINE__) +#define checkCUDA(err) _checkCUDA(err, __FILE__, __LINE__) + void INFO(const char* format, ...){ if(!LOG_INFO) // Don't print if logging info is disabled return; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h index 520c1954b237039aee681dccc44acdb9b94dc443..26e8e71bfc277fb57b92301f4db199f5aada0b60 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h @@ -1,6 +1,7 @@ #ifndef IMAGE_PROCESSING_COMMON_H #define IMAGE_PROCESSING_COMMON_H +#include <cuda_fp16.h> #include <cudnn.h> #include <device_launch_parameters.h> #include <vector> @@ -19,6 +20,16 @@ template <typename T> __host__ T resolve_func_ptr(void *func_symbol_ptr) { return (T)v_func_ptr; } +template <typename T> T *convertAndGetGPUData(Tensor *t) { + if (t->data_type == CUDNN_DATA_HALF) + throw std::runtime_error("Half tensor cannot be converted to this type"); + return static_cast<T *>(t->gpu_data); +} + +template <> float *convertAndGetGPUData<float>(Tensor *t); + +template <> half *convertAndGetGPUData<half>(Tensor *t); + std::vector<size_t> sizes(Tensor *t); std::vector<size_t> sizes(const Dimension &dim); diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh index 1375fc8476304af1e74bf6d6be9349cdd802e98d..30ec8569755a77ae63293aa9ca09c34967c229a1 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh @@ -42,7 +42,7 @@ std::tuple<size_t *, Scalar **> make_cuda_params( std::array<Scalar *, N> gpu_datas; std::transform(srcs.begin(), srcs.end(), gpu_datas.begin(), [](Tensor *t) { hostToDeviceCopy(t); - return (Scalar *)t->gpu_data; + return convertAndGetGPUData<Scalar>(t); }); size_t *cuda_strides; Scalar **cuda_gpu_data; @@ -80,7 +80,7 @@ mapGeneral(void *host_func_ptr, const std::array<Tensor *, N> &srcs) { unsigned threads = std::min(max_threads, n_elem); unsigned grids = std::min(max_grid, ceilDiv(n_elem, threads)); kernelMapBroadcast<Scalar, N><<<grids, threads>>>( - (Scalar *)target->gpu_data, n_elem, func_ptr, gpu_data, cuda_strides); + convertAndGetGPUData<Scalar>(target), n_elem, func_ptr, gpu_data, cuda_strides); cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); return target; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh index 51d8c4da3f41bdbff079288f743ee8222d123292..4f9424c04d2cc462c64de00ec4d4fb6d01e1da37 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh @@ -133,7 +133,8 @@ __host__ Tensor *reduceDim( std::vector<size_t> in_sizes = sizes(src), out_sizes = in_sizes; out_sizes[axis] = 1; auto *target = (Tensor *)create4DTensor( - CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_sizes[0], out_sizes[1], out_sizes[2], out_sizes[3]); + CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_sizes[0], out_sizes[1], + out_sizes[2], out_sizes[3]); changeTensorPlacement(target, DEVICE); // Calculate schedule parameters @@ -144,7 +145,7 @@ __host__ Tensor *reduceDim( in_sizes.begin() + axis + 1, in_sizes.end(), 1, std::multiplies<>()); size_t num_rows = num_irows * num_orows; - // Calculate approximation parameters + // Calculate approximation parameters if (skip_rate != 0.0f) INFO("Approximation happening...\n"); size_t approx_row_size = (size_t)((1 - skip_rate) * row_size); @@ -162,8 +163,8 @@ __host__ Tensor *reduceDim( dim3 threads(AlongDimTh, CrossDimTh); dim3 grid(grid_x, grid_y); kernelReduceDimParallel<Scalar, bin_float_op><<<grid, threads>>>( - (Scalar *)target->gpu_data, (Scalar *)src->gpu_data, init, func_ptr, - num_irows, num_orows, row_size, approx_row_size); + convertAndGetGPUData<Scalar>(target), convertAndGetGPUData<Scalar>(src), + init, func_ptr, num_irows, num_orows, row_size, approx_row_size); } else { DEBUG( "Reducing sequentially, row size = %lu, actually using %lu\n", row_size, @@ -175,10 +176,10 @@ __host__ Tensor *reduceDim( std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x)); dim3 grid(grid_x, grid_y); kernelReduceDimSeq<Scalar, bin_float_op><<<grid, threads>>>( - (Scalar *)target->gpu_data, (Scalar *)src->gpu_data, init, func_ptr, - num_irows, num_orows, row_size, approx_row_size); + convertAndGetGPUData<Scalar>(target), convertAndGetGPUData<Scalar>(src), + init, func_ptr, num_irows, num_orows, row_size, approx_row_size); } cudaDeviceSynchronize(); - checkCudaErrors(cudaGetLastError()); + checkCUDA(cudaGetLastError()); return target; } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h index 69982e656d6eaf989d15939432e28b235eb9cb6e..e329de391b5a64ceb81b1aab6d071d0b8d852da4 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h @@ -1,19 +1,19 @@ #ifndef IMG_TENSOR_RUNTIME_H #define IMG_TENSOR_RUNTIME_H -#include <cstddef> -#include "img_tensor_utils.h" #include "device_math.h" +#include "img_tensor_utils.h" +#include <cstddef> // *** Runtime declaration *** // void *tensorFft(void *input); -void *tensorReduce(void *input, size_t axis, void *func); +void *tensorReduce(void *input, size_t axis, void *func, bool fp16); void *tensorReductionSamplingReduce( void *input, size_t axis, void *func, int skip_level); void *tensorProjectiveT(void *input, void *transformation); -void *tensorMap1(void *f, void *i); -void *tensorMap2(void *f2, void *i1, void *i2); -void *tensorMap3(void *f3, void *i1, void *i2, void *i3); +void *tensorMap1(void *f, void *i, bool fp16); +void *tensorMap2(void *f2, void *i1, void *i2, bool fp16); +void *tensorMap3(void *f3, void *i1, void *i2, void *i3, bool fp16); // *** Wrapper API declaration *** // extern "C" { diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp index c30ddf25e5aeffef8123897af863b520846b62ce..f812460d175f42cbcf93c0f851b7ffc69fe3a14c 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp @@ -1,7 +1,17 @@ #include "functional/common.h" -#include <numeric> #include <functional> +#include <numeric> + +template <> float *convertAndGetGPUData<float>(Tensor *t) { + convertToFP32(t); + return static_cast<float *>(t->gpu_data); +} + +template <> half *convertAndGetGPUData<half>(Tensor *t) { + convertToFP16(t); + return static_cast<half *>(t->gpu_half_data); +} std::vector<size_t> sizes(const Dimension &dim) { return std::vector<size_t>(dim.dim_sizes, dim.dim_sizes + dim.num_dims); diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu index 633c5a6655d024e9d25257abb2c2a4c8c9a9e05f..c4589b741cd05487814063cc60065bece5a229f1 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu @@ -32,14 +32,14 @@ void *tensorFft(void *input) { int n_batch = int(all_dims[0]) * int(all_dims[1]); // Prepare input data hostToDeviceCopy(t_input); - auto *input_cuda = (cufftReal *)t_input->gpu_data; + auto *input_cuda = convertAndGetGPUData<cufftReal>(t_input); // Define output data // FIXME: make a flag for float2_; not CUDNN_DATA_FLOAT. auto *out_tensor = (Tensor *)create4DTensor( CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], width, (height / 2 + 1)); changeTensorPlacement(out_tensor, DEVICE); - auto *output_cuda = (cufftComplex *)out_tensor->gpu_data; + auto *output_cuda = convertAndGetGPUData<cufftComplex>(out_tensor); // Create a 2D FFT plan cufftHandle plan; checkCUFFT(cufftPlanMany( @@ -53,7 +53,7 @@ void *tensorFft(void *input) { return out_tensor; } -void *tensorReduce(void *input, size_t axis, void *func) { +void *tensorReduce(void *input, size_t axis, void *func, bool fp16) { INFO("Reduce\n"); auto *src = (Tensor *)input; if (axis >= src->dims.num_dims) @@ -62,7 +62,10 @@ void *tensorReduce(void *input, size_t axis, void *func) { throw std::runtime_error("Not supported"); // Skip 0% of sample - return reduceDim<float>(src, 0.0f, func, axis, 0.0f); + if (fp16) + return reduceDim<half>(src, 0.0f, func, axis, 0.0f); + else + return reduceDim<float>(src, 0.0f, func, axis, 0.0f); } void *tensorReductionSamplingReduce( @@ -91,22 +94,31 @@ void *tensorProjectiveT(void *input, void *transformation) { abort(); } -void *tensorMap1(void *f, void *i) { +void *tensorMap1(void *f, void *i, bool fp16) { INFO("Map1\n"); auto *src = (Tensor *)i; - return mapGeneral<float, 1>(f, {src}); + if (fp16) + return mapGeneral<half, 1>(f, {src}); + else + return mapGeneral<float, 1>(f, {src}); } -void *tensorMap2(void *f2, void *i1, void *i2) { +void *tensorMap2(void *f2, void *i1, void *i2, bool fp16) { INFO("Map2\n"); auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2; - return mapGeneral<float, 2>(f2, {src1, src2}); + if (fp16) + return mapGeneral<half, 2>(f2, {src1, src2}); + else + return mapGeneral<float, 2>(f2, {src1, src2}); } -void *tensorMap3(void *f3, void *i1, void *i2, void *i3) { +void *tensorMap3(void *f3, void *i1, void *i2, void *i3, bool fp16) { INFO("Map3\n"); auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2, *src3 = (Tensor *)i3; - return mapGeneral<float, 3>(f3, {src1, src2, src3}); + if (fp16) + return mapGeneral<half, 3>(f3, {src1, src2, src3}); + else + return mapGeneral<float, 3>(f3, {src1, src2, src3}); } // *** Wrapper API implementation *** // diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp index 6d909a1736482efcadb0797c1b47475c0084ce3c..adcce5ee28db75077c6285a619a9ef74d8b53b97 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp @@ -240,12 +240,12 @@ std::vector<float> PSNR(void *gold_ptr, void *approx_ptr) { (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 1, 1); std::memcpy(image_size_tensor->host_data, &image_size_f, sizeof(float)); - auto *diff = tensorMap2(device::fsub_ptrptr, gold_tensor, approx_tensor); - auto *diffsqr = tensorMap2(device::fmul_ptrptr, diff, diff); - auto *mse_sum_1d = tensorReduce(diffsqr, 3, device::fadd_ptrptr); - auto *mse_sum = tensorReduce(mse_sum_1d, 2, device::fadd_ptrptr); - auto *mse_avg = tensorMap2(device::fdiv_ptrptr, mse_sum, image_size_tensor); - auto *psnr_val = (Tensor *)tensorMap1(psnr_ptrptr, mse_avg); + auto *diff = tensorMap2(device::fsub_ptrptr, gold_tensor, approx_tensor, false); + auto *diffsqr = tensorMap2(device::fmul_ptrptr, diff, diff, false); + auto *mse_sum_1d = tensorReduce(diffsqr, 3, device::fadd_ptrptr, false); + auto *mse_sum = tensorReduce(mse_sum_1d, 2, device::fadd_ptrptr, false); + auto *mse_avg = tensorMap2(device::fdiv_ptrptr, mse_sum, image_size_tensor, false); + auto *psnr_val = (Tensor *)tensorMap1(psnr_ptrptr, mse_avg, false); auto *float_data = (float *)psnr_val->host_data; return std::vector<float>(float_data, float_data + batch_dim);