From 76f954de32a611cd1420928003a95af173708387 Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Sun, 27 Oct 2019 01:40:05 -0500
Subject: [PATCH] Adding calls for explicit FP32/FP16 conversion

---
 .../include/approx_techniques.h  | 84 +++++++++++--------
 .../include/approx_techniques2.h | 38 ++++-----
 2 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
index af0ed1e202..9689c6fce9 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
@@ -819,7 +819,7 @@ __global__ void depthwise_conv4_half3(__half* const __restrict__ y,
 
   __half t1;
 
-  int total = C_dim * H_dim * W_dim;
+  //int total = C_dim * H_dim * W_dim;
 
   t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim + start_w + q - bstart_w];
@@ -920,7 +920,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
                         int vertical_stride, int horizontal_stride,
                         int conv_mode, int conv_groups){
 
-  llvm_hpvm_initTensorRt(0);
 
   INFO("*** TensorConvolution \n");
   profileEvent("Conv");
@@ -935,7 +934,13 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 
   Tensor* output;
 
-
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  convertToFP32(input);
+  convertToFP32(filter);
+
+
   if (conv_groups > 32) {
     // TODO: Support other cases;
     hostToDeviceCopy(input);
@@ -949,7 +954,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
     w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
 
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
                                      CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
@@ -957,33 +962,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 
     // NOTE: Necessary to insert the above call for every output tensor
 
-    /*
-    if (c > 255) {
-      dim3 grid((n / 16), c);
-      dim3 block(h * w);
-      depthwise_conv << <grid, block >> > ((float*)output->gpu_data,
-        (float*)input->gpu_data, (float*)filter->gpu_data,
-        input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-        KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-
-    }*/
-
-    /*
-    dim3 grid((n / 12), c);
-    dim3 block(h * w);
-    depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data,
-      (float*)input->gpu_data, (float*)filter->gpu_data,
-      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-    if(n % 12 > 0){
-      dim3 grid2((n % 12), c);
-      dim3 block(h * w);
-      depthwise_conv <<<grid, block >>> ((float*)output->gpu_data,
-        (float*)input->gpu_data, (float*)filter->gpu_data,
-        input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-        KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12));
-    }
-    */
 
     int blockSize;
     blockSize = 64;
@@ -994,7 +972,8 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
       (float*)input->gpu_data, (float*)filter->gpu_data,
      input->dims.dim_sizes[0], input->dims.dim_sizes[1],
      input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+      KH, KW, h, w, vertical_pad, horizontal_pad,
+      vertical_stride, horizontal_stride);
 
   }
   else {
@@ -1043,11 +1022,11 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
     if (input->data_format == CUDNN_TENSOR_NCHW)
-      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
                                        CUDNN_TENSOR_NCHW, n, c, h, w);
     else if (input->data_format == CUDNN_TENSOR_NHWC) {
       DEBUG("* NHWC Format \n");
-      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
                                        CUDNN_TENSOR_NHWC, n, h, w, c);
     }
     else
@@ -1137,6 +1116,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 }
 
 
+// FIXME: Need to properly fix the new HALF type conversion
 void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
                             int vertical_pad, int horizontal_pad,
                             int vertical_stride, int horizontal_stride,
@@ -1165,6 +1145,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
+  convertToFP16(input);
+  convertToFP16(filter);
+
   /***** CONVERSIONS from FP32 to FP16 - on the GPU */
   size_t* input_dims = input->dims.dim_sizes;
@@ -1209,7 +1192,7 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+  output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type,
                                     CUDNN_TENSOR_NCHW, n, c, h, w); // FIXIT: more checks for data types needed
   output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF,
@@ -1797,7 +1780,7 @@ void* tensorConvPerf(void* input_ptr, void* filter_ptr,
   Tensor* new_output;
   if(input->data_format == CUDNN_TENSOR_NCHW)
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, //input->data_type,
                                           CUDNN_TENSOR_NCHW, n, c, h, w);
   else if(input->data_format == CUDNN_TENSOR_NHWC){
     DEBUG("* NHWC Format \n");
@@ -2078,3 +2061,32 @@ void* tensorConvolutionKernelSamp(void* input_ptr, void* filter_ptr,
   #endif
   return output;
 }
+
+
+  /*
+  if (c > 255) {
+    dim3 grid((n / 16), c);
+    dim3 block(h * w);
+    depthwise_conv << <grid, block >> > ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+
+  }*/
+
+  /*
+  dim3 grid((n / 12), c);
+  dim3 block(h * w);
+  depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data,
+    (float*)input->gpu_data, (float*)filter->gpu_data,
+    input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+    KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+  if(n % 12 > 0){
+    dim3 grid2((n % 12), c);
+    dim3 block(h * w);
+    depthwise_conv <<<grid, block >>> ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12));
+  }
+  */
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index f66b80d728..a81ffe2962 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -1,4 +1,6 @@
 
+#include "tensor_utils.cu"
+
 
 //This skips every xth row
@@ -173,10 +175,17 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   if (conv_groups == 0) {
     conv_groups = 1;
   }
+  Tensor* output;
 
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
+
+
+  convertToFP32(input);
+  convertToFP32(filter);
+
+
   int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
   c = filter->dims.dim_sizes[0]; //number of filters
@@ -196,7 +205,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   Tensor *new_output;
   if(row > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
                                      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
@@ -231,7 +240,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
                                    (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
 
-    new_output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
                                          CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
@@ -239,7 +248,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
     //interpolate
     int numBlocks = (n * c * h * w + 127) / 128;
     approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-                                            (float *)output->gpu_data, (float *)new_output->gpu_data,
+                                            (float *) output->gpu_data, (float *) new_output->gpu_data,
                                             row, start);
 
     cudaDeviceSynchronize();
@@ -247,7 +256,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
   }
   else if(col > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
                                      CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
@@ -282,7 +292,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
                                    (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
 
-    new_output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
                                          CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
@@ -298,7 +308,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
   }
   else{
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
                                      CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
@@ -336,19 +346,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 
   profileEvent("Conv_end", true);
 
-  #ifdef ERROR_INJECTION_ENABLED
-  if (op_counter >= total_ops) {
-    ERROR("No accuracy flag found \n");
-  }
-  int op_acc = op_accuracies[op_counter];
-  // Skip errorInjection if explicitly requested
-  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
-    op_acc = 0;
-  }
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorConv", op_acc);
-  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
-  op_counter++;
-  #endif
+
+
   return new_output;
 }
--
GitLab
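For readers unfamiliar with the tensor runtime, the change repeated across the hunks above follows one pattern: copy each operand to the device, explicitly convert it to the precision the routine computes in (FP32 for the CUTLASS and perforated paths, FP16 for the half paths), and create the output tensor with that same explicit type instead of inheriting input->data_type. The sketch below only illustrates that sequence; the wrapper name tensorConvFP32Sketch is hypothetical, and it assumes the helpers hostToDeviceCopy, convertToFP32, create4DTensor, and changeTensorPlacement behave as they are used in this patch.

// Hypothetical illustration (not part of the patch): the FP32 setup sequence
// that tensorConvCutlass and tensorConvPerfCuda now perform before launching
// their convolution kernels.
void* tensorConvFP32Sketch(void* input_ptr, void* filter_ptr,
                           int n, int c, int h, int w) {
  Tensor* input  = (Tensor*) input_ptr;
  Tensor* filter = (Tensor*) filter_ptr;

  hostToDeviceCopy(input);    // operands must live on the GPU first
  hostToDeviceCopy(filter);

  convertToFP32(input);       // explicit FP32 conversion added by this patch
  convertToFP32(filter);

  // Output is allocated with an explicit float_type rather than
  // input->data_type, so it matches the converted operands.
  Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) float_type,
                                            CUDNN_TENSOR_NCHW, n, c, h, w);
  changeTensorPlacement(output, DEVICE);

  return output;
}

The FP16 path in tensorHalfConvCutlass is symmetric: convertToFP16 on the operands and half_type for the output tensor.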