diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu index b97e5beadb7822cce12bdf2ee4d16407cd0483c4..546eb8286390fd5ff8e9bb09bd628a03f14b9f38 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu @@ -2,10 +2,16 @@ // //===----------------------------------------------------------------------===// // -// This file consists of the custom implementation of software approximations -// for tensor convolutions. The approximations implemented are feature sampling -// and perforation for FP32 and FP16 compute precisions. +// This file contains our CUDA-based implementations of convolution approximations // +// *Supported Approximations: Perforated Convolutions, Filter Sampling +// +// FP32 Convolution Routine: `tensorConvApprox` +// FP16 Convolution Routine: `tensorConvApproxHalf2` +// +// NOTE: These approximations are tuned for the NVIDIA Jetson TX2 device +// +// Author: Akash Kothari //===----------------------------------------------------------------------===// #include "tensor_utils.h" @@ -209,6 +215,7 @@ __global__ void convToGemmHalfInputNewIrregular2( const int W_out, const int V_stride, const int H_stride, const int reduced_filter_elem, const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id const int n = tx / (C * H_out * W_out); // output image number const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan @@ -1172,10 +1179,9 @@ convToGemmApprox(float *const __restrict__ output, } /// This function serves as an API with the custom implementation of convolution -/// with the perforation and filter sampling support. The compute precison is -/// FP32. This routine is invoked by the tuner for tuning approximations for -/// convolutions. -/// +/// with perforation and filter sampling support. The compute precision is FP32. +/// NOTE: This routine is used only for correctness testing +/// NOTE: This is NOT the main approximation routine used by HPVM void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, @@ -1331,8 +1337,6 @@ void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, num_filter_elem, c * h * w); checkCudaErrors(cudaDeviceSynchronize()); - // Do the matrix multiplication - // Want to multiply convData by filter->gpu_data[f * chan * KH * KW] float alpha = 1.0f, beta = 0.0f; checkCudaErrors(cublasSgemmStridedBatched( @@ -1345,7 +1349,6 @@ void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(convData); } - // Event("Conv_end"); //, true); return new_output; } @@ -1364,18 +1367,22 @@ __global__ void switchMatrixFull(int N, int n, int c, int h, int w, } } -/// This function serves as an API with the custom implementation of convolution -/// with the perforation and filter sampling support. The compute precison is -/// FP32. 
+ +/************* API for Approximation Convolution Implementations ************/ + +/// ** API for FP32 Convolution that supports Baseline (No Approx), Perforation, and Filter Sampling ** +/// - Arguments to control Approximation: +/// `row`: Controls the fraction of rows skipped (Perforation) - (1/row * 100)% rows skipped +/// `col`: Controls the fraction of columns skipped (Perforation) - (1/col * 100)% columns skipped +/// `skip_every`: Controls the fraction of filter elements skipped (Filter Sampling) - (1/skip_every * 100)% filter elems skipped +/// `offset`: Controls the tensor index at which sampling/perforation starts /// +/// For Baseline convolution, pass `row=1`, `col=1`, `skip_every=1` void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, int row, int col, int skip_every, int offset) { - //////INFO("*** TensorConvolution approximation \n"); - // Event("Conv"); - Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; // FIXME: Current hack to preserve backward compatibilty @@ -1386,36 +1393,22 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, hostToDeviceCopy(input); hostToDeviceCopy(filter); - ////Event("H2F_start"); convertToFP32(input); convertToFP32(filter); - ////Event("H2F_end"); const int n = input->dims.dim_sizes[0]; const int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const int h = - (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const int w = - (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + - 1; + const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - ////INFO("batch: %d\n", n); - ////INFO("channels: %d\n", input->dims.dim_sizes[1]); - ////INFO("num_filters: %d\n", c); - ////INFO("kernel height: %d\n", KH); - ////INFO("kernel width: %d\n", KW); - ////INFO("num_filter_elem: %d\n", num_filter_elem); - ////INFO("vertical_stride: %d\n", vertical_stride); - ////INFO("horizontal_stride: %d\n", horizontal_stride); - ////INFO("output height: %d\n", h); - ////INFO("output width: %d\n", w); + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; @@ -1432,8 +1425,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * - /// input->dims.dim_sizes[1] * h_eff * w)); const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; convToGemmPerfRow<<<gridSize, blockSize>>>( @@ -1464,7 +1455,7 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, const int w_eff = w - ((w - offset) / col) - rem_col; Tensor *output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement 
from host to device @@ -1475,8 +1466,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * - /// input->dims.dim_sizes[1] * h * w_eff)); const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; @@ -1494,7 +1483,7 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); - // interpolate + // Interpolate int blocksize = 128; int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; approxInterpolateCol<<<numBlocks, blocksize>>>( @@ -1518,16 +1507,13 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); const int filtBlockSize = 128; - ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); - const int filtGridSize = - (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; const float fac = ((float)skip_every) / ((float)skip_every - 1); - //////INFO("fac: %f\n", fac); const int blockSize = 128; - //////INFO("n * h * w : %d\n", (n * h * w )); const int gridSize = (n * h * w + blockSize - 1) / blockSize; + if (!(KH * KW % skip_every)) { - // ////INFO("REGULAR FILTERING\n"); + createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, @@ -1538,8 +1524,8 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, reduced_filter_elem, skip_every, offset); - } else { - // ////INFO("IRREGULAR FILTERING\n"); + } + else { createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, reduced_filter_elem, skip_every, offset, fac); @@ -1563,7 +1549,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(reducedFilter); } else { - // INFO("FP32 BASELINE\n"); Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(output, DEVICE); @@ -1575,25 +1560,17 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, const int blockSize = 128; const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * - /// input->dims.dim_sizes[1] * h * w)); + convToGemmFullInput<<<gridSize, blockSize>>>( convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, - skip_every, offset); // num_filter_elem); + skip_every, offset); + checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - /* - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h - * w, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float - *)new_output->gpu_data, h * w, c * h * w, n)); - */ 
+ checkCudaErrors(cublasGemmEx( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data, @@ -1609,7 +1586,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(convData); } - // Event("Conv_end"); return new_output; } @@ -1628,21 +1604,27 @@ __global__ void switchMatrixHalf(int N, int n, int c, int h, int w, } } -/// This function serves as an API to custom implementation of the -/// half-precision convolution with the perforation and filter sampling -/// support. + + + +/// ** API for FP16 Convolution that supports Baseline (No Approx), Perforation, and Filter Sampling ** +/// - Arguments to control Approximation: +/// `row`: Controls the fraction of rows skipped (Perforation) - (1/row * 100)% rows skipped +/// `col`: Controls the fraction of columns skipped (Perforation) - (1/col * 100)% columns skipped +/// `skip_every`: Controls the fraction of filter elements skipped (Filter Sampling) - (1/skip_every * 100)% filter elems skipped +/// `offset`: Controls the tensor index at which sampling/perforation starts /// +/// For Baseline convolution, pass `row=1`, `col=1`, `skip_every=1` void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, int row, int col, int skip_every, int offset) { - // INFO("*** TensorConvolution half approximation \n"); - // profileEvent("#Conv"); - + Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; @@ -1670,18 +1652,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(new_output, DEVICE); - // INFO("batch: %d\n", n); - // INFO("channels: %d\n", input->dims.dim_sizes[1]); - // INFO("num_filters: %d\n", c); - // INFO("kernel height: %d\n", KH); - // INFO("kernel width: %d\n", KW); - // INFO("num_filter_elem: %d\n", num_filter_elem); - // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); - // INFO("vertical_stride: %d\n", vertical_stride); - // INFO("horizontal_stride: %d\n", horizontal_stride); - // INFO("output height: %d\n", h); - // INFO("output width: %d\n", w); - // INFO("skip_every: %d\n", skip_every); + const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; @@ -1707,7 +1678,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, const int numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; if (h * w <= 64) { - // INFO("H *W <= 64\n"); + convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1730,7 +1701,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, checkCudaErrors(cudaDeviceSynchronize()); } else { - // INFO("H *W > 64\n"); + convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1773,7 +1744,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, const int
numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; if (h * w <= 64) { - // INFO("H *W <= 64\n"); + convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1794,8 +1765,8 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, (__half *)output_half->gpu_half_data, (__half *)new_output->gpu_half_data, col, offset); checkCudaErrors(cudaDeviceSynchronize()); - } else { - // INFO("H *W > 64\n"); + } + else { convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1836,18 +1807,15 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; const float fac = ((float)skip_every) / ((float)skip_every - 1); const int blockSize = 256; - // const int gridSize = (n * h * w + blockSize - 1) / blockSize; - // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); - // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); + const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if (c * num_filter_elem < - 500000) { // 250) {//c * reduced_filter_elem < 150000) { + if (c * num_filter_elem < 500000) { if (!(KH * KW % skip_every)) { - // INFO("---REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, fac); @@ -1862,16 +1830,16 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, w, vertical_stride, horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // INFO("---IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - // convToGemmHalfInputIrregular - convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( + + convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, @@ -1891,8 +1859,8 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, changeTensorPlacement(output_half, DEVICE); if (!(KH * KW % skip_every)) { - // INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, fac); @@ -1907,8 +1875,8 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, w, vertical_stride, horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // INFO("IRREGULAR 
FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); @@ -1943,7 +1911,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(convData); cudaFree(reducedFilter); } else { - // INFO("FP16 BASELINE\n"); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w); @@ -1955,7 +1923,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, const int blockSize = 256; const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - // convToGemmHalf + convToGemmHalfInputNew<<<gridSize, blockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
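
Usage sketch: a minimal example (not part of the patch itself) of how a caller might invoke the two routines documented in this diff, showing how `row`, `col`, `skip_every`, and `offset` select an approximation. The padding, stride, `conv_mode`, and `conv_groups` values below are illustrative placeholders, and the `input`/`filter` tensors are assumed to have been created through the HPVM tensor runtime (e.g. with `create4DTensor`).

// Hypothetical driver for the approximate convolution APIs documented above.
// `input` and `filter` are assumed to be NCHW tensors owned by the tensor runtime.
void *runApproxConvExamples(void *input, void *filter) {
  // Baseline (exact) FP32 convolution: row = col = skip_every = 1 disables
  // both perforation and filter sampling.
  void *exact = tensorConvApprox(input, filter,
                                 /*vertical_pad=*/1, /*horizontal_pad=*/1,
                                 /*vertical_stride=*/1, /*horizontal_stride=*/1,
                                 /*conv_mode=*/1, /*conv_groups=*/1,
                                 /*row=*/1, /*col=*/1, /*skip_every=*/1, /*offset=*/0);

  // Row perforation: row = 2 skips (1/2 * 100)% = 50% of output rows starting
  // at row index 1; the skipped rows are reconstructed by interpolation.
  void *perforated = tensorConvApprox(input, filter, 1, 1, 1, 1, 1, 1,
                                      /*row=*/2, /*col=*/1, /*skip_every=*/1, /*offset=*/1);

  // FP16 filter sampling: skip_every = 3 drops (1/3 * 100)% of the filter
  // elements (and the matching input patch elements) starting at offset 0.
  void *sampled = tensorConvApproxHalf2(input, filter, 1, 1, 1, 1, 1, 1,
                                        /*row=*/1, /*col=*/1, /*skip_every=*/3, /*offset=*/0);

  (void)exact;
  (void)perforated;
  return sampled;
}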