diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt index 7359651100d8c181b29e2096ba73db9c026e51cb..e0940eea58c3a14bda5b7d00d2033c7da8df68a7 100644 --- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -54,7 +54,7 @@ link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH} $ENV{CUDNN_PATH set( RUNTIME_SRCS_FILENAME approx_simulation.cu - approx_techniques.cu + group_conv.cu approx_techniques2.cu common.cpp configuration.cpp diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu deleted file mode 100644 index 14eab13522acc8402cd9f7d745343a30e5028abf..0000000000000000000000000000000000000000 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu +++ /dev/null @@ -1,1191 +0,0 @@ - -#include "tensor_utils.h" -#include "fp16_gemm.h" -#include "debug.h" -#include "global_data.h" -#include "profiling.h" -#include "op_overheads.h" -#include "error.h" - - -extern "C"{ - - - -__global__ void depthwise_convNew8(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ - - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - - const int num = 8; - - const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); - - const int start_h = (tx / W_out) * H_stride - H_pad; - const int start_w = (tx % W_out) * W_stride - W_pad; - - float c0 = 0; - float c1 = 0; - float c2 = 0; - float c3 = 0; - float c4 = 0; - float c5 = 0; - float c6 = 0; - float c7 = 0; - - const float* weights = &w[m * KH * KW]; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; - if(b + 1 < B) - c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; - if(b + 2 < B) - c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; - if(b + 3 < B) - c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; - if(b + 4 < B) - c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; - if(b + 5 < B) - c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; - if(b + 6 < B) - c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; - if(b + 7 < B) - c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; - - - } - } - - y4d(b, m, 0, tx) = c0; - if(b + 1 < B) - y4d(b + 1, m, 0, tx) = c1; - if(b + 2 < B) - y4d(b + 2, m, 0, tx) = c2; - if(b + 3 < B) - y4d(b + 3, m, 0, tx) = c3; - if(b + 4 < B) - y4d(b + 4, m, 0, tx) = c4; - if(b + 5 < B) - y4d(b + 5, m, 0, tx) = c5; - if(b + 6 < B) - y4d(b + 6, m, 0, tx) = c6; - if(b + 7 < B) - y4d(b + 7, m, 0, tx) = c7; - } - - #undef y4d - #undef x4d -} - - - - -__global__ void depthwise_convNew8_half2(__half* const __restrict__ y, - const __half* const __restrict__ x, - const __half* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, 
const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ - - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - - const int num = 8; - - const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); - - const int start_h = (tx / W_out) * H_stride - H_pad; - const int start_w = (tx % W_out) * W_stride - W_pad; - - __half2 c0 = __half2half2(0); - __half2 c1 = __half2half2(0); - __half2 c2 = __half2half2(0); - __half2 c3 = __half2half2(0); - - const __half* weights = &w[m * KH * KW]; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - - __half2 t1; - __half2 t2; - __half2 t3; - __half2 t4; - if(b + 7 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q)); - } - else if(b + 6 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q)); - - } - else if(b + 5 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - } - else if(b + 4 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q)); - - } - else if(b + 3 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - } - else if(b + 2 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q)); - - } - else if(b + 1 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - } - else{ - t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); - - } - - - c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); - c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); - c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2); - c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3); - - } - } - - y4d(b, m, 0, tx) = __high2half(c0); - if(b + 1 < B) - y4d(b + 1, m, 0, tx) = __low2half(c0); - if(b + 2 < B) - y4d(b + 2, 
m, 0, tx) = __high2half(c1); - if(b + 3 < B) - y4d(b + 3, m, 0, tx) = __low2half(c1); - if(b + 4 < B) - y4d(b + 4, m, 0, tx) = __high2half(c2); - if(b + 5 < B) - y4d(b + 5, m, 0, tx) = __low2half(c2); - if(b + 6 < B) - y4d(b + 6, m, 0, tx) = __high2half(c3); - if(b + 7 < B) - y4d(b + 7, m, 0, tx) = __low2half(c3); - } - - #undef y4d - #undef x4d -} - - - //When stride is 1 - __global__ void depthwise_conv4_half3(__half* const __restrict__ y, - const __half* const __restrict__ x, - const __half* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int C_dim, const int H_dim, const int W_dim) - { - -#define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] -#define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - - const int num = 1; - - const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out); - - if (m < M) { - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); - - const int start_h = (tx / W_out) - H_pad; - const int start_w = (tx % W_out) - W_pad; - - const int bstart_h = (blockIdx.y * blockDim.x % (H_out * W_out)) / W_out - H_pad; - const int bstart_w = (blockIdx.y * blockDim.x % (H_out * W_out)) % W_out - H_pad; - const int bstartm = (blockIdx.y * blockDim.x / (H_out * W_out)); - - extern __shared__ __half xdata[]; - - for (int i = 0; i < C_dim * H_dim * W_dim; i += blockDim.x) { - if (i / (H_dim * W_dim) + bstartm < M && (i % (H_dim * W_dim)) / W_dim + bstart_h > -1 && - (i % (H_dim * W_dim)) / W_dim + bstart_h < H && (i % (H_dim * W_dim)) % W_dim + bstart_w > -1 && - (i % (H_dim * W_dim)) % W_dim + bstart_w < W) { - xdata[i] = x4d(b, i / (H_dim * W_dim) + bstartm, (i % (H_dim * W_dim)) / W_dim + bstart_h, - (i % (H_dim * W_dim)) % W_dim + bstart_w); - } - } - __syncthreads(); - - __half c0; - const __half* weights = &w[m * KH * KW]; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - - __half t1; - - //int total = C_dim * H_dim * W_dim; - t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim + - start_w + q - bstart_w]; - - - c0 = __hfma(t1, weights[k], c0); - } - } - - y4d(b, m, 0, tx) = c0; - - - } - - #undef y4d - #undef x4d - } - - - -void* tensorConvCutlass(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - - - INFO("*** TensorConvolution \n"); - profileEvent("Conv"); - - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - - //FIXME: Current hack to preserve backward compatibilty - if (conv_groups == 0) { - conv_groups = 1; - } - - Tensor* output; - - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - convertToFP32(input); - convertToFP32(filter); - - - if (conv_groups > 32) { - // TODO: Support other cases; - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - int n, c, h, w; // output dimensions - n = input->dims.dim_sizes[0]; - c = input->dims.dim_sizes[1]; - const int KH = filter->dims.dim_sizes[2]; - const int KW = filter->dims.dim_sizes[3]; - h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - - output = 
(Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - - - - int blockSize; - blockSize = 64; - - dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); - dim3 block(blockSize); - depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - - } - else { - - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; - cudnnConvolutionMode_t mode; - if (conv_mode == 0) - mode = CUDNN_CONVOLUTION; - else if (conv_mode == 1) - mode = CUDNN_CROSS_CORRELATION; - - // FIXIT: Need to be more aware of the implications of alpha and beta - float alpha = 1.0f, beta = 0.0f; - - // TODO: Support other cases; - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); - - checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - - // NOTE: Adding support for grouped convolution - checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - - - cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - // FIXIT: Think if upscaling values need to be configurable? - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision - - int n, c, h, w; // output dimensions - // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - - - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - - if (input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - else if (input->data_format == CUDNN_TENSOR_NHWC) { - DEBUG("* NHWC Format \n"); - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else - ERROR("Unsupported Tensor Type"); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); - - if (convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) - ERROR("NULL descriptor! 
\n"); - - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - - - DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - - - // FIXIT: Algo shouldn't be hardcoded - convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); - - // Allocating memory for the convolution workspace - void* workspace; - checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - DEBUG("workspace size = %d \n", workspace_size); - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - } - - cudaDeviceSynchronize(); - profileEvent("Conv_end", true); - - - #ifdef ERROR_INJECTION_ENABLED - - if (op_counter >= total_ops) { - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - - // Skip errorInjection if explicitly requested - if (skip_tensors.find(op_counter) != skip_tensors.end()) { - op_acc = 0; - } - - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorConv", op_acc); - add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); - - op_counter++; - - #endif - - return output; - - -} - -// FIXME: Need to properly fix the new HALF type conversion -void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - - INFO("*** TensorHConvolution \n"); - profileEvent("#Conv"); - - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; - - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; - cudnnConvolutionMode_t mode; - - if(conv_mode == 0) - mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) - mode = CUDNN_CROSS_CORRELATION; - - // FIXIT: Need to be more aware of the implications of alpha and beta - float alpha = 1.0f, beta = 0.0f; - // NOTE: compute in half precision - cudnnDataType_t computeType = CUDNN_DATA_HALF; - - // NOTE: Moving inputs to GPU global memory - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - - // Float-Half Conversions - profileEvent("F2H_start"); - - convertToFP16(input); - convertToFP16(filter); - - profileEvent("F2H_end"); - /******* END OF INPUT DATA CONVERSIONS*/ - - - Tensor *output; - if(conv_groups > 1){ - int n = input->dims.dim_sizes[0]; - int c = input->dims.dim_sizes[1]; - const int KH = filter->dims.dim_sizes[2]; - const int KW = filter->dims.dim_sizes[3]; - int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - - - output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - - // NOTE: Changing output tensor 
placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - - int blockSize; - blockSize = 128; - - dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); - dim3 block(blockSize); - depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data, - (__half*) input->gpu_half_data, - (__half*) filter->gpu_half_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - cudaDeviceSynchronize(); - - - } - else{ - checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ - conv_groups = 1; - } - - // NOTE: Adding support for grouped convolution - checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision - - int n, c, h, w; // output dimensions - // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_half_desc, - filter->filter_half_desc, - &n, &c, &h, &w)); - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - - - output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", - output->data_type, output->data_format, - output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); - - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) - ERROR("NULL descriptor! 
\n"); - - - // NOTE: The following algo works with TRUE half precision - convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - - - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_half_desc, - filter->filter_half_desc, - convDesc, - output->tensor_half_desc, - convAlgo, - &workspace_size)); - - // Allocating memory for the convolution workspace - DEBUG("workspace size = %d \n", workspace_size); - void* workspace; - checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - - - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, - &alpha, - input->tensor_half_desc, - input->gpu_half_data, - filter->filter_half_desc, - filter->gpu_half_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, - output->tensor_half_desc, - output->gpu_half_data)); - - } - - profileEvent("H2F_start"); - - convertToFP32_offline(output); - - profileEvent("H2F_end"); - - - - profileEvent("#Conv_end"); - - - - return output; - -} - -void* tensorHalfConvCutlass2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - - INFO("*** TensorHConvolution \n"); - profileEvent("#Conv"); - - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; - cudnnConvolutionMode_t mode; - if (conv_mode == 0) - mode = CUDNN_CONVOLUTION; - else if (conv_mode == 1) - mode = CUDNN_CROSS_CORRELATION; - - // FIXIT: Need to be more aware of the implications of alpha and beta - float alpha = 1.0f, beta = 0.0f; - // NOTE: compute in half precision - cudnnDataType_t computeType = CUDNN_DATA_HALF; - - // NOTE: Moving inputs to GPU global memory - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - - /***** CONVERSIONS from FP32 to FP16 - on the GPU */ - size_t* input_dims = input->dims.dim_sizes; - size_t* filter_dims = filter->dims.dim_sizes; - - - profileEvent("F2H_start"); - - Tensor* input_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, - input_dims[0], input_dims[1], - input_dims[2], input_dims[3]); - - - changeTensorPlacement(input_half, DEVICE); - Tensor* filter_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, - filter_dims[0], filter_dims[1], - filter_dims[2], filter_dims[3]); - - - changeTensorPlacement(filter_half, DEVICE); - - - f2h((float*)input->gpu_data, input->num_elems, (half*)input_half->gpu_data); - f2h((float*)filter->gpu_data, filter->num_elems, (half*)filter_half->gpu_data); - - - /******* END OF INPUT DATA CONVERSIONS*/ - profileEvent("F2H_end"); - - Tensor* output; - Tensor* output_half; - - - if (conv_groups > 1 && horizontal_stride == 1 && vertical_stride == 1) { - int n = input->dims.dim_sizes[0]; - int c = input->dims.dim_sizes[1]; - const int KH = filter->dims.dim_sizes[2]; - const int KW = filter->dims.dim_sizes[3]; - int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - - - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - // FIXIT: more checks for data types needed - output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, - CUDNN_TENSOR_NCHW, n, c, h, w); - - - - // NOTE: Changing output tensor 
placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - - int blockSize; - blockSize = 128; - - dim3 grid(((n + 3) / 4), (c * h * w + blockSize - 1) / blockSize); - dim3 block(blockSize); - int C_dim = blockSize / (h * w) + 1 + 1; - int H_dim = blockSize % (h * w) / w + 1 + KH + 1; - int W_dim = blockSize % (h * w) % w + 1 + KW + 1; - depthwise_conv4_half3 << <grid, block, - sizeof(__half)* C_dim* H_dim* W_dim >> > - ((__half*) output_half->gpu_data, - - (__half*)input_half->gpu_data, - (__half*)filter_half->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, C_dim, H_dim, W_dim); - - - cudaDeviceSynchronize(); - - - } - else { - checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - - //FIXME: Current hack to preserve backward compatibilty - if (conv_groups == 0) { - conv_groups = 1; - } - - // NOTE: Adding support for grouped convolution - checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - - - // FIXIT: Think if upscaling values need to be configurable? - // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision - - int n, c, h, w; // output dimensions - // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - - - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - // FIXIT: more checks for data types needed - output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, - CUDNN_TENSOR_NCHW, n, c, h, w); - - - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); - - if (convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) - ERROR("NULL descriptor! 
\n"); - - - // NOTE: The following algo works with TRUE half precision - convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - - - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input_half->tensor_desc, - filter_half->filter_desc, - convDesc, - output_half->tensor_desc, - convAlgo, - &workspace_size)); - - // Allocating memory for the convolution workspace - DEBUG("workspace size = %d \n", workspace_size); - void* workspace; - checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - - - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, - &alpha, - input_half->tensor_desc, - input_half->gpu_data, - filter_half->filter_desc, - filter_half->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, - output_half->tensor_desc, - output_half->gpu_data)); - - } - - profileEvent("H2F_start"); - - // NOTE: Transforming half precision output to single precision - h2f((half*)output_half->gpu_data, output->num_elems, (float*)output->gpu_data); - - profileEvent("H2F_end"); - - profileEvent("#Conv_end"); - - - freeTensor(input_half); - freeTensor(filter_half); - freeTensor(output_half); - - return output; - -} - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void interpolateCol(int N, int old_w, int n, int c, int h, int w, float *old_data, float *new_data){ - - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - - if(col % 2 == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + col / 2]; - else if(col == w - 1) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2]; - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2] + - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col+1) / 2])/2; - - } -} - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void interpolateRow(int N, int old_h, int n, int c, int h, int w, float *old_data, float *new_data){ - - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - - if(row % 2 == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (row/2) * (w) + col]; - else if(row == h - 1) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (row-1)/2 * (w) + col]; - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[n * (c * old_h * w) + ch * (old_h * w) + (row -1)/2 * (w) + col] + - old_data[n * (c * old_h * w) + ch * (old_h * w) + ((row+1) / 2) * (w) + col])/2; - - } -} - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void interpolateXCol(int N, int old_w, int n, int c, int h, int w, - float *old_data, float *new_data, int num){ - - int 
index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - - if(col % num == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + col / num]; - else{ - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-(col%num)) / num]; - } - - } -} - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void interpolateXRow(int N, int old_h, int n, int c, int h, int w, - float *old_data, float *new_data, int num){ - - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - - if(row % num == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (row/num) * (w) + col]; - else{ - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (row - (row % num))/num * (w) + col]; - } - - } -} - - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void interpolateColHalf(int N, int old_w, int n, int c, int h, int w, __half *old_data, __half *new_data){ - - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - - if(col % 2 == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + col / 2]; - else if(col == w - 1) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2]; - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2], - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col+1) / 2]),2); - - } -} - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void interpolateRowHalf(int N, int old_h, int n, int c, int h, int w, __half *old_data, __half *new_data){ - - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - - if(row % 2 == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (row/2) * (w) + col]; - else if(row == h - 1) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (row-1)/2 * (w) + col]; - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[n * (c * old_h * w) + ch * (old_h * w) - + (row -1)/2 * (w) + col], - old_data[n * (c * old_h * w) 
+ ch * (old_h * w) - + ((row+1) / 2) * (w) + col]), 2); - - } -} - - - - - -//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols -__global__ void convToGemmApprox(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element? - const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0; - } - } - } - } -} - - -//produces COL MAJOR matrix with reduced_filter_elem rows and NF cols -__global__ void createReducedFilters(float * const __restrict__ output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / num_filter_elem; //filter index - const int offset = tx % num_filter_elem; //offset within filter - if(fIdx < NF) { //is thread id within bounds? - if(offset % skip_every != skip_every-1) { //are we including this filter element? 
- const int output_row = offset - (offset/skip_every); //calculate output row, taking skipping into account - output[fIdx*reduced_filter_elem + output_row] = input[tx] * 2; - } - } -} - - -void* tensorConvolutionKernelSamp(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, int skip_every){ - - INFO("*** TensorConvolution (w/ kernel sampling) \n"); - profileEvent("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty - if (conv_groups == 0) { - conv_groups = 1; - } - - Tensor* output; - // TODO: Support other cases; - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - - convertToFP32(input); - convertToFP32(filter); - - - int n, c, h, w; // output dimensions - n = input->dims.dim_sizes[0]; - c = filter->dims.dim_sizes[0]; //number of filters - const int KH = filter->dims.dim_sizes[2]; - const int KW = filter->dims.dim_sizes[3]; - h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem - const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - //reduced number after skipping - const int reduced_filter_elem = num_filter_elem - (num_filter_elem/skip_every); - float * convData; - int convDataSize = sizeof(float) * n * reduced_filter_elem * h * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - float * reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); - const int filtBlockSize = 128; - const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize; - createReducedFilters<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, c, - num_filter_elem, reduced_filter_elem, - skip_every); - const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - convToGemmApprox<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every); - checkCudaErrors(cudaDeviceSynchronize()); - //Do the matrix multiplication. 
Want to multiply convData by filter->gpu_data[f * chan * KH * KW] - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - &alpha, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w, c * h * w, - n)); - cudaFree(convData); - cudaFree(reducedFilter); - profileEvent("Conv_end", true); - #ifdef ERROR_INJECTION_ENABLED - if (op_counter >= total_ops) { - ERROR("No accuracy flag found \n"); - } - int op_acc = op_accuracies[op_counter]; - // Skip errorInjection if explicitly requested - if (skip_tensors.find(op_counter) != skip_tensors.end()) { - op_acc = 0; - } - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorConv", op_acc); - add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); - op_counter++; - #endif - return output; -} - - -}// End of Extern C diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu index 5fbc7139b95c22fdd91301f4007b0fe06ec08a69..8f2d840362ee523a458339b848e9080a2822d92f 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu @@ -1078,6 +1078,43 @@ __global__ void createReducedFiltersHalfIrregular(__half * output, } } + + +//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols +__global__ void convToGemmApprox(float * const __restrict__ output, + const float * const __restrict input, const int N, const int C, + const int H, const int W, + const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, + const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_out * W_out); //output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number + const int h = tx % (H_out * W_out) / W_out; //output height index (row number) + const int w = tx % W_out; //output width index (col number) + const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? + for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element? 
+ const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0; + } + } + } + } +} + + + void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu new file mode 100644 index 0000000000000000000000000000000000000000..fd8a23b9cad89fe9ac6618e8c1b0e962ab27cf15 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu @@ -0,0 +1,594 @@ + +#include "tensor_utils.h" +#include "fp16_gemm.h" +#include "debug.h" +#include "global_data.h" +#include "profiling.h" +#include "op_overheads.h" +#include "error.h" + + +extern "C"{ + + + +__global__ void depthwise_convNew8(float* const __restrict__ y, + const float* const __restrict__ x, + const float* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) +{ + + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + + const int num = 8; + + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + float c0 = 0; + float c1 = 0; + float c2 = 0; + float c3 = 0; + float c4 = 0; + float c5 = 0; + float c6 = 0; + float c7 = 0; + + const float* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; + if(b + 1 < B) + c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; + if(b + 2 < B) + c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; + if(b + 3 < B) + c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; + if(b + 4 < B) + c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; + if(b + 5 < B) + c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; + if(b + 6 < B) + c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; + if(b + 7 < B) + c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; + + + } + } + + y4d(b, m, 0, tx) = c0; + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = c1; + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = c2; + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = c3; + if(b + 4 < B) + y4d(b + 4, m, 0, tx) = c4; + if(b + 5 < B) + y4d(b + 5, m, 0, tx) = c5; + if(b + 6 < B) + y4d(b + 6, m, 0, tx) = c6; + if(b + 7 < B) + y4d(b + 7, m, 0, tx) = c7; + } + + #undef y4d + #undef x4d +} + + + + +__global__ void depthwise_convNew8_half2(__half* const __restrict__ y, + const __half* const __restrict__ x, + const __half* const __restrict__ w, + const int B, const int M, + const int 
H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) +{ + + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + + const int num = 8; + + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + __half2 c0 = __half2half2(0); + __half2 c1 = __half2half2(0); + __half2 c2 = __half2half2(0); + __half2 c3 = __half2half2(0); + + const __half* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + + __half2 t1; + __half2 t2; + __half2 t3; + __half2 t4; + if(b + 7 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q)); + } + else if(b + 6 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q)); + + } + else if(b + 5 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); + } + else if(b + 4 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q)); + + } + else if(b + 3 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + } + else if(b + 2 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q)); + + } + else if(b + 1 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + } + else{ + t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); + + } + + + c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); + c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); + c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2); + c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3); + + } + } + + y4d(b, m, 0, tx) = __high2half(c0); + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = 
__low2half(c0); + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = __high2half(c1); + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = __low2half(c1); + if(b + 4 < B) + y4d(b + 4, m, 0, tx) = __high2half(c2); + if(b + 5 < B) + y4d(b + 5, m, 0, tx) = __low2half(c2); + if(b + 6 < B) + y4d(b + 6, m, 0, tx) = __high2half(c3); + if(b + 7 < B) + y4d(b + 7, m, 0, tx) = __low2half(c3); + } + + #undef y4d + #undef x4d +} + + + +void* tensorConvCutlass(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups){ + + + INFO("*** TensorConvolution \n"); + profileEvent("Conv"); + + Tensor* input = (Tensor*)input_ptr; + Tensor* filter = (Tensor*)filter_ptr; + + //FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { + conv_groups = 1; + } + + Tensor* output; + + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + convertToFP32(input); + convertToFP32(filter); + + + if (conv_groups > 32) { + // TODO: Support other cases; + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + int n, c, h, w; // output dimensions + n = input->dims.dim_sizes[0]; + c = input->dims.dim_sizes[1]; + const int KH = filter->dims.dim_sizes[2]; + const int KW = filter->dims.dim_sizes[3]; + h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + + + int blockSize; + blockSize = 64; + + dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); + dim3 block(blockSize); + depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + + } + else { + + cudnnConvolutionDescriptor_t convDesc; + cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionMode_t mode; + if (conv_mode == 0) + mode = CUDNN_CONVOLUTION; + else if (conv_mode == 1) + mode = CUDNN_CROSS_CORRELATION; + + // FIXIT: Need to be more aware of the implications of alpha and beta + float alpha = 1.0f, beta = 0.0f; + + // TODO: Support other cases; + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); + + checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); + + // NOTE: Adding support for grouped convolution + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); + + + cudnnDataType_t computeType = CUDNN_DATA_FLOAT; + // FIXIT: Think if upscaling values need to be configurable? + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? 
+ checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, + vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision + + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, + input->tensor_desc, + filter->filter_desc, + &n, &c, &h, &w)); + + + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + + if (input->data_format == CUDNN_TENSOR_NCHW) + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + else if (input->data_format == CUDNN_TENSOR_NHWC) { + DEBUG("* NHWC Format \n"); + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } + else + ERROR("Unsupported Tensor Type"); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], + output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! \n"); + + + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, + input->tensor_desc, + filter->filter_desc, + convDesc, + output->tensor_desc, + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, + &convAlgo)); + + + DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); + + + // FIXIT: Algo shouldn't be hardcoded + convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + + size_t workspace_size; + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, + input->tensor_desc, + filter->filter_desc, + convDesc, + output->tensor_desc, + convAlgo, + &workspace_size)); + + // Allocating memory for the convolution workspace + void* workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + DEBUG("workspace size = %d \n", workspace_size); + + + checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, + input->gpu_data, filter->filter_desc, filter->gpu_data, + convDesc, convAlgo, workspace, workspace_size, + &beta, output->tensor_desc, output->gpu_data)); + } + + cudaDeviceSynchronize(); + profileEvent("Conv_end", true); + + + #ifdef ERROR_INJECTION_ENABLED + + if (op_counter >= total_ops) { + ERROR("No accuracy flag found \n"); + } + + int op_acc = op_accuracies[op_counter]; + + // Skip errorInjection if explicitly requested + if (skip_tensors.find(op_counter) != skip_tensors.end()) { + op_acc = 0; + } + + void* error_norms = tensorAddError(output, op_acc); + add_norms(error_norms, "tensorConv", op_acc); + add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); + + op_counter++; + + #endif + + return output; + + +} + +// FIXME: Need to properly fix the new HALF type conversion +void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, + int vertical_pad, int 
horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups){ + + INFO("*** TensorHConvolution \n"); + profileEvent("#Conv"); + + Tensor* input = (Tensor*) input_ptr; + Tensor* filter = (Tensor*) filter_ptr; + + cudnnConvolutionDescriptor_t convDesc; + cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionMode_t mode; + + if(conv_mode == 0) + mode = CUDNN_CONVOLUTION; + else if(conv_mode == 1) + mode = CUDNN_CROSS_CORRELATION; + + // FIXIT: Need to be more aware of the implications of alpha and beta + float alpha = 1.0f, beta = 0.0f; + // NOTE: compute in half precision + cudnnDataType_t computeType = CUDNN_DATA_HALF; + + // NOTE: Moving inputs to GPU global memory + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + + // Float-Half Conversions + profileEvent("F2H_start"); + + convertToFP16(input); + convertToFP16(filter); + + profileEvent("F2H_end"); + /******* END OF INPUT DATA CONVERSIONS*/ + + + Tensor *output; + if(conv_groups > 1){ + int n = input->dims.dim_sizes[0]; + int c = input->dims.dim_sizes[1]; + const int KH = filter->dims.dim_sizes[2]; + const int KW = filter->dims.dim_sizes[3]; + int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + + + output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + int blockSize; + blockSize = 128; + + dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); + dim3 block(blockSize); + depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data, + (__half*) input->gpu_half_data, + (__half*) filter->gpu_half_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, + vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + cudaDeviceSynchronize(); + + + } + else{ + checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); + + //FIXME: Current hack to preserve backward compatibilty + if(conv_groups == 0){ + conv_groups = 1; + } + + // NOTE: Adding support for grouped convolution + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); + + + checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, + vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision + + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, + input->tensor_half_desc, + filter->filter_half_desc, + &n, &c, &h, &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + + + output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", + output->data_type, output->data_format, + output->dims.dim_sizes[0], output->dims.dim_sizes[1], 
+ output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + + if(convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! \n"); + + + // NOTE: The following algo works with TRUE half precision + convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + + + size_t workspace_size; + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, + input->tensor_half_desc, + filter->filter_half_desc, + convDesc, + output->tensor_half_desc, + convAlgo, + &workspace_size)); + + // Allocating memory for the convolution workspace + DEBUG("workspace size = %d \n", workspace_size); + void* workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + + + + + checkCUDNN(cudnnConvolutionForward(cudnnHandle, + &alpha, + input->tensor_half_desc, + input->gpu_half_data, + filter->filter_half_desc, + filter->gpu_half_data, + convDesc, convAlgo, workspace, workspace_size, + &beta, + output->tensor_half_desc, + output->gpu_half_data)); + + } + + profileEvent("H2F_start"); + + convertToFP32_offline(output); + + profileEvent("H2F_end"); + + + profileEvent("#Conv_end"); + + + return output; + +} + + + + +}// End of Extern C +
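
Both convolution paths in this patch derive the output spatial size and the kernel launch geometry from the convolution parameters. The standalone host-side sketch below (not part of the patch; the tensor shape is a made-up example) reproduces that arithmetic for the FP32 depthwise path, where each blockIdx.x covers eight images of the batch and blockIdx.y together with threadIdx.x enumerates the C * H_out * W_out outputs of one image:

// Minimal sketch, assuming an example NCHW shape; mirrors the launch math that
// tensorConvCutlass uses before calling depthwise_convNew8.
#include <cstdio>

int main() {
  // Hypothetical input and depthwise filter (one KHxKW filter per channel).
  const int n = 32, c = 64, H = 56, W = 56;   // example input dims
  const int KH = 3, KW = 3;
  const int vertical_pad = 1, horizontal_pad = 1;
  const int vertical_stride = 1, horizontal_stride = 1;

  // Output spatial dims, same formula as the host code in the patch.
  const int h = (2 * vertical_pad + H - KH) / vertical_stride + 1;
  const int w = (2 * horizontal_pad + W - KW) / horizontal_stride + 1;

  // Launch geometry of the FP32 kernel (the patch uses blockSize = 64 here,
  // and 128 for the half-precision variant).
  const int blockSize = 64;
  const int grid_x = (n + 7) / 8;                              // 8 images per x-block
  const int grid_y = (c * h * w + blockSize - 1) / blockSize;  // outputs of one image

  printf("output: %d x %d x %d x %d\n", n, c, h, w);
  printf("grid = (%d, %d), block = %d\n", grid_x, grid_y, blockSize);
  return 0;
}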
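
The kernel-sampling path (convToGemmApprox plus createReducedFilters, with convToGemmApprox now duplicated into approx_techniques2.cu) drops every skip_every-th filter element and compacts the survivors before the batched GEMM. The CPU sketch below (illustrative only, with toy sizes) walks through that bookkeeping: an element at index e is kept when e % skip_every != skip_every - 1, lands in column e - e / skip_every, and, as in createReducedFilters, the kept weight is doubled, presumably to compensate for the dropped elements:

// CPU-side sketch of the skip_every index math; array sizes are toy values.
#include <cstdio>
#include <vector>

int main() {
  const int C = 2, KH = 3, KW = 3;              // toy filter shape
  const int num_filter_elem = C * KH * KW;      // 18
  const int skip_every = 3;
  // Same reduction as the host code: drop one element out of every skip_every.
  const int reduced_filter_elem = num_filter_elem - num_filter_elem / skip_every;  // 12

  std::vector<float> filter(num_filter_elem, 1.0f);
  std::vector<float> reduced(reduced_filter_elem, 0.0f);

  for (int e = 0; e < num_filter_elem; ++e) {
    if (e % skip_every != skip_every - 1) {      // keep this filter element?
      const int out_col = e - e / skip_every;    // compacted position
      reduced[out_col] = filter[e] * 2.0f;       // weight doubled, as in createReducedFilters
    }
  }
  printf("kept %d of %d filter elements\n", reduced_filter_elem, num_filter_elem);
  return 0;
}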
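
depthwise_convNew8_half2 needs only four __half2 accumulators for eight images because adjacent batch elements are packed into the two halves of a __half2 and advanced with a single __hfma2. The tiny kernel below (a standalone illustration with arbitrary values, not part of the runtime) shows the pack, fused-multiply-add, and unpack sequence, and the convention used when results are written back: the first image sits in the high half, the second in the low half.

// Demo of the __half2 packing trick; requires a GPU with half arithmetic (sm_53+).
#include <cuda_fp16.h>
#include <cstdio>

__global__ void packed_fma_demo(const float a0, const float a1,
                                const float weight, float* out) {
  // Pack two "images" worth of data into one register pair
  // (__halves2half2(lo, hi), matching t1 = __halves2half2(x[b+1], x[b]) in the patch).
  __half2 x   = __halves2half2(__float2half(a1), __float2half(a0));
  __half2 w2  = __half2half2(__float2half(weight));
  __half2 acc = __half2half2(__float2half(0.0f));
  acc = __hfma2(x, w2, acc);                // acc = x * w + acc on both halves at once
  out[0] = __half2float(__high2half(acc));  // result for image 0 (high half)
  out[1] = __half2float(__low2half(acc));   // result for image 1 (low half)
}

int main() {
  float* d_out;
  cudaMalloc(&d_out, 2 * sizeof(float));
  packed_fma_demo<<<1, 1>>>(2.0f, 3.0f, 0.5f, d_out);
  float h_out[2];
  cudaMemcpy(h_out, d_out, 2 * sizeof(float), cudaMemcpyDeviceToHost);
  printf("image0: %f, image1: %f\n", h_out[0], h_out[1]);  // expect 1.0 and 1.5
  cudaFree(d_out);
  return 0;
}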