diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
index 99deb53042e3c30949216af0dfed74100e3a1109..987865a867ac269721711a2b13c472f52128b8ee 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
@@ -38,6 +38,38 @@ void convToGemm(float * const __restrict__ output,
   }
 }
 
+__global__
+void convToGemmHalf(__half * const __restrict__ output,
+                    const __half * const __restrict__ input,
+                    const int N, const int C,
+                    const int H, const int W,
+                    const int KH, const int KW,
+                    const int V_pad, const int H_pad,
+                    const int H_out, const int W_out,
+                    const int V_stride, const int H_stride){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
+            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        } else {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
+        }
+      }
+    }
+  }
+}
+
 //This skips every xth row
 //H_eff is the number of rows calculated exactly
 __global__
@@ -797,28 +829,23 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   convertToFP32(filter);
   //profileEvent("H2F_end");
 
-  long int n, c, h, w; // output dimensions
-  n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  const long int n = input->dims.dim_sizes[0];
+  const long int c = filter->dims.dim_sizes[0]; //number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-
-  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  int rem_row = (h - offset) % row > 0;
-  int h_eff = h - ((h - offset) / row) - rem_row;
-
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  int rem_col = (w - offset) % col > 0;
-  int w_eff = w - ((w - offset) / col) - rem_col;
+  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
   Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
 
-  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+  if(row > 1) {
+    const int rem_row = (h - offset) % row > 0;
+    const int h_eff = h - ((h - offset) / row) - rem_row;
 
-  if(row > 1){
     Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
                                               CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
@@ -868,8 +895,10 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(col > 1){
+  } else if(col > 1) {
+    const int rem_col = (w - offset) % col > 0;
+    const int w_eff = w - ((w - offset) / col) - rem_col;
+
     Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
                                              CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
@@ -918,8 +947,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(skip_every > 1) {
+  } else if(skip_every > 1) {
     Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
                                              CUDNN_TENSOR_NCHW, n, c, h, w);
 
@@ -1011,8 +1039,6 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   } else {
     new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
                                           CUDNN_TENSOR_NCHW, n, c, h, w);
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(new_output, DEVICE);
 
     float * convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
@@ -1104,19 +1130,13 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
   profileEvent("F2H_end");
 
   profileEvent("Conv");
-  long int n, c, h, w; // output dimensions
-  n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  const long int n = input->dims.dim_sizes[0];
+  const long int c = filter->dims.dim_sizes[0]; //number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-
-  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  int rem_row = (h - offset) % row > 0;
-  int h_eff = h - ((h - offset) / row) - rem_row;
-
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  int rem_col = (w - offset) % col > 0;
-  int w_eff = w - ((w - offset) / col) - rem_col;
+  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
   INFO("input: %d %d %d %d\n", input->dims.dim_sizes[0], input->dims.dim_sizes[1],
        input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
@@ -1130,6 +1150,9 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
   changeTensorPlacement(new_output, DEVICE);
 
   if(row > 1){
+    const int rem_row = (h - offset) % row > 0;
+    const int h_eff = h - ((h - offset) / row) - rem_row;
+
     Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                                   CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
@@ -1137,9 +1160,6 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
 
-    //total number of filter elem
-    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
     __half * convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
@@ -1179,17 +1199,16 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
 
     freeTensor(output_half);
     cudaFree(convData);
-}
-  else if(col > 1){
+} else if(col > 1) {
+    const int rem_col = (w - offset) % col > 0;
+    const int w_eff = w - ((w - offset) / col) - rem_col;
+
     Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                                   CUDNN_TENSOR_NCHW, n, c, h, w_eff);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
-    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
+
     __half * convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
@@ -1228,12 +1247,10 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
 
     freeTensor(output_half);
     cudaFree(convData);
-  } else{
+  } else if(skip_every > 1) {
     Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                              CUDNN_TENSOR_NCHW, n, c, h, w);
-
-    //total number of filter elem
-    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
     //reduced number after skipping
     long int reduced_filter_elem;
     if(offset != skip_every) {
@@ -1319,6 +1336,41 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
     cudaFree(reducedFilter);
     freeTensor(output);
+  } else {
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+                                         CUDNN_TENSOR_NCHW,
+                                         n, c, h, w);
+
+    __half * convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmHalf<<<gridSize, blockSize>>>(convData,
+                                            (__half *)input->gpu_half_data, n,
+                                            input->dims.dim_sizes[1],
+                                            input->dims.dim_sizes[2],
+                                            input->dims.dim_sizes[3],
+                                            KH, KW, vertical_pad,
+                                            horizontal_pad, h, w, vertical_stride,
+                                            horizontal_stride);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+                                 n * h * w, c, num_filter_elem,
+                                 alpha_half,
+                                 convData, CUDA_R_16F, n * h * w,
+                                 (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+                                 beta_half,
+                                 (__half *) new_output->gpu_half_data, CUDA_R_16F, n * h * w,
+                                 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    cudaFree(convData);
   }
 
   profileEvent("H2F_start");