Commit c1a2278f authored by kotsifa2

Merge branch 'approx_hpvm' of gitlab.engr.illinois.edu:llvm/hpvm into approx_hpvm

parents 5dcef2ce 6d1e064d
@@ -2,6 +2,36 @@
#include "tensor_utils.cu"
//produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem cols
__global__ void convToGemmApproxHalf(__half * const __restrict__ output,
                                     const __half * const __restrict__ input, const int N, const int C,
                                     const int H, const int W, const int KH, const int KW, const int V_pad,
                                     const int H_pad, const int H_out, const int W_out, const int V_stride,
                                     const int H_stride, const int reduced_filter_elem,
                                     const int skip_every) {
  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
  const int n = tx / (C * H_out * W_out); //output image number
  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output channel number
  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
  const int w = tx % W_out; //output width index (col number)
  const int inH = h * V_stride - V_pad; //input height index (row number)
  const int inW = w * H_stride - H_pad; //input width index (col number)
  if(n < N) { //is thread id within bounds?
    for(int i = 0; i < KH; i++) {
      for(int j = 0; j < KW; j++) {
        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
        if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
          const int output_col = filter_elem_num - (filter_elem_num/skip_every); //output column, accounting for skipped elements
          if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
            output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] =
              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
          else
            output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
        }
      }
    }
  }
}
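// Editor's illustration (not part of the original commit): how the
// skip_every mapping above behaves. With skip_every = 3, filter elements
// 2, 5, 8, ... satisfy filter_elem_num % 3 == 2 and are dropped; a kept
// element maps to output_col = filter_elem_num - filter_elem_num/3, e.g.
// element 4 lands in column 4 - 1 = 3. The caller is expected to pass
// reduced_filter_elem = num_filter_elem - num_filter_elem/skip_every.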
//This skips every x-th row
//H_eff is the number of rows computed exactly
@@ -350,3 +380,477 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
  return new_output;
}
__global__
void convToGemmPerfRowHalf(__half * const __restrict__ output,
                           const __half * const __restrict__ input, const int N, const int C,
                           const int H, const int W, const int KH, const int KW, const int V_pad,
                           const int H_pad, const int H_out, const int W_out, const int V_stride,
                           const int H_stride, const int x, const int start, const int H_eff){
  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
  const int n = tx / (C * H_eff * W_out); //output image number
  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output channel number
  const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
  const int w = tx % W_out; //output width index (col number)
  int past_start = (h % (x - 1) >= (x - 1 - start));
  const int inH = (h / (x - 1) * x + h % (x-1) +
                   past_start) * V_stride - V_pad; //input height index (row number)
  const int inW = w * H_stride - H_pad; //input width index (col number)
  if(n < N) { //is thread id within bounds?
    for(int i = 0; i < KH; i++) {
      for(int j = 0; j < KW; j++) {
        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
          output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] =
            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
        else
          output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0;
      }
    }
  }
}
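// Editor's illustration (not part of the original commit): the row mapping
// above with x = 3, start = 0. Computed rows h = 0,1,2,3,4,... map to
// pre-stride output rows r = h/2*3 + h%2 = 0,1,3,4,6,... (past_start is 0
// in this configuration), so every row with r % 3 == 2 is skipped and later
// filled in by approxInterpolateRowHalf.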
//For use in tensorConvPerfCudaHalf
//Interpolates every x-th row starting from x - 1 - start
//N is total number of elements in final output array
__global__
void approxInterpolateRowHalf(int N, int old_h, int b, int c, int h, int w,
                              __half *old_data, __half *new_data, int x, int start){
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for(int i = index; i < N; i += stride){
    int col = ((i % (c * h * w)) % (h * w)) % w;
    int row = ((i % (c * h * w)) % (h * w)) / w;
    int ch = (i % (c * h * w)) / (h * w);
    int n = i / (c * h * w);
    int past_start = ((row % x) >= (x - 1 - start));
    if(row == h-1)
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
    else if (row == 0)
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
    else if(row % x == x - 1 - start){
      int past_startO = ((row - 1) % x) > (x - 1 - start);
      int oldIdx1 = ch * (b * old_h * w) + n * (old_h * w) +
                    ((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col;
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1 * w]), 2);
    }
    else
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * old_h * w) + n * (old_h * w) +
                 ((x-1) * (row / x) + row % x - past_start ) * (w) + col];
  }
}
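// Editor's illustration (not part of the original commit): continuing the
// x = 3, start = 0 example, full-output rows 2, 5, 8, ... were never
// computed. Each is reconstructed as the __half average of its two computed
// neighbours (e.g. row 2 from rows 1 and 3), while rows 0 and h-1 fall back
// to the nearest computed row.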
//This skips every x-th col
//W_eff is the number of cols computed exactly
__global__
void convToGemmPerfColHalf(__half * const __restrict__ output,
                           const __half * const __restrict__ input, const int N, const int C,
                           const int H, const int W, const int KH, const int KW, const int V_pad,
                           const int H_pad, const int H_out, const int W_out, const int V_stride,
                           const int H_stride, const int x, const int start, const int W_eff){
  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
  const int n = tx / (C * H_out * W_eff); //output image number
  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output channel number
  const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
  const int w = tx % W_eff; //output width index (col number)
  int past_start = (w % (x - 1)) >= (x - 1 - start);
  const int inH = h * V_stride - V_pad; //input height index (row number)
  const int inW = (w / (x - 1) * x + w % (x-1) +
                   past_start) * H_stride - H_pad; //input width index (col number)
  if(n < N) { //is thread id within bounds?
    for(int i = 0; i < KH; i++) {
      for(int j = 0; j < KW; j++) {
        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
          output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] =
            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
        else
          output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = 0;
      }
    }
  }
}
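// Editor's illustration (not part of the original commit): with x = 2,
// start = 0, past_start is always 0 and the kept pre-stride column is
// simply 2*w, i.e. even columns are computed exactly and every odd column
// is skipped, then reconstructed by approxInterpolateColHalf.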
//For use in tensorConvPerfCudaHalf
//Interpolates every x-th col starting from x - 1 - start
//N is total number of elements in final output array
__global__
void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
                              __half *old_data, __half *new_data, int x, int start){
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for(int i = index; i < N; i += stride){
    int col = ((i % (c * h * w)) % (h * w)) % w;
    int row = ((i % (c * h * w)) % (h * w)) / w;
    int ch = (i % (c * h * w)) / (h * w);
    int n = i / (c * h * w);
    int past_start = ((col % x) >= (x - 1 - start));
    if(col == w-1)
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
    else if (col == 0)
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
    else if(col % x == x - 1 - start){
      int past_startO = ((col - 1) % x) > (x - 1 - start);
      int oldIdx1 = ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
                    ((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2);
    }
    else
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
                 ((x-1) * (col / x) + col % x - past_start)];
  }
}
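// Editor's note (comment added for clarity, not in the original commit):
// the no-perforation GEMM path below produces its result grouped
// channel-major, [C][N][H][W]; switchMatrix permutes that buffer into the
// standard NCHW layout, i.e. old index ((ch * n + n_new) * h + row) * w + col
// moves to ((n_new * c + ch) * h + row) * w + col.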
__global__
void switchMatrix(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < N){
    int col = ((i % (c * h * w)) % (h * w)) % w;
    int row = ((i % (c * h * w)) % (h * w)) / w;
    int ch = (i % (c * h * w)) / (h * w);
    int n_new = i / (c * h * w);
    new_data[((n_new * c + ch) * h + row ) * w + col] =
      old_data[((ch * n + n_new) * h + row ) * w + col];
  }
}
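// Editor's note (comment added for clarity, not in the original commit):
// createNewFilter tiles old_filter end-to-end into new_filter so that each
// entry of a batched GEMM can be handed its own identical copy of the
// filter; newFilterSize is assumed to be a multiple of oldFilterSize.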
__global__
void createNewFilter(__half *new_filter, __half *old_filter,
                     int newFilterSize, int oldFilterSize){
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for(int i = index; i < newFilterSize; i += stride){
    new_filter[i] = old_filter[i % oldFilterSize];
  }
}
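// Editor's note (comment added for clarity, not in the original commit):
// createBatches fills the A/B/C device-pointer arrays in the form consumed
// by batched GEMM APIs such as cublasHgemmBatched, with batch i pointing at
// fixed-stride slices of convData, newFilter and output. Neither this
// helper nor createNewFilter is called in the hunks shown in this diff.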
__global__
void createBatches(int n, const __half * matA[], const __half * matB[], __half * matC[],
                   __half * convData, __half * newFilter, __half * output,
                   int aStride, int bStride, int cStride){
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for(int i = index; i < n; i += stride){
    matA[i] = &convData[i * aStride];
    matB[i] = &newFilter[i * bStride];
    matC[i] = &output[i * cStride];
  }
}
//produces one COL MAJOR matrix with N*H_out*W_out rows and num_filter_elem cols (no filter elements are skipped)
__global__ void convToGemmApproxHalfN(__half * const __restrict__ output,
                                      const __half * const __restrict__ input, const int N, const int C,
                                      const int H, const int W, const int KH, const int KW, const int V_pad,
                                      const int H_pad, const int H_out, const int W_out, const int V_stride,
                                      const int H_stride, const int reduced_filter_elem,
                                      const int skip_every) {
  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
  const int n = tx / (C * H_out * W_out); //output image number
  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output channel number
  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
  const int w = tx % W_out; //output width index (col number)
  const int inH = h * V_stride - V_pad; //input height index (row number)
  const int inW = w * H_stride - H_pad; //input width index (col number)
  if(n < N) { //is thread id within bounds?
    for(int i = 0; i < KH; i++) {
      for(int j = 0; j < KW; j++) {
        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
        const int output_col = filter_elem_num; //no skipping here: output column equals the filter element index
        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
          output[((output_col * N + n) * H_out + h) * W_out + w] =
            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
        else
          output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
      }
    }
  }
}
//start has to be less than row (for row perforation) or less than col (for column perforation)
//row and col have to be >= 1
//row = col = 1 means no perforation
void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
                             int vertical_pad, int horizontal_pad, int vertical_stride,
                             int horizontal_stride, int conv_mode, int conv_groups,
                             int row, int col, int start){
  INFO("*** TensorConvolution half perforation \n");
  Tensor* input = (Tensor*)input_ptr;
  Tensor* filter = (Tensor*)filter_ptr;
  //FIXME: Current hack to preserve backward compatibility
  if (conv_groups == 0) {
    conv_groups = 1;
  }
profileEvent("F2H_start");
hostToDeviceCopy(input);
hostToDeviceCopy(filter);
convertToFP16(input);
convertToFP16(filter);
/******* END OF INPUT DATA CONVERSIONS*/
profileEvent("F2H_end");
profileEvent("Conv");
Tensor* output_half;
int n, c, h, w; // output dimensions
n = input->dims.dim_sizes[0];
c = filter->dims.dim_sizes[0]; //number of filters
const int KH = filter->dims.dim_sizes[2];
const int KW = filter->dims.dim_sizes[3];
h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
int h_eff = h - h / row;
if(h % row > row - 1 - start)
h_eff = h_eff - 1;
w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
int w_eff = w - w / col;
if(w % col > col - 1 - start)
w_eff = w_eff - 1;
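  // Editor's illustration (not in the original commit): with h = 10,
  // row = 3, start = 0, h/row = 3 rows are perforated; h % row = 1 is not
  // greater than row - 1 - start = 2, so no extra row is dropped and
  // h_eff = 10 - 3 = 7 rows are computed exactly.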
  Tensor *new_output;
  if(row > 1){
    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, CUDNN_TENSOR_NCHW,
                                          n, c, h_eff, w);
    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output_half, DEVICE);
    // NOTE: Necessary to insert the above call for every output tensor
    //total number of filter elem
    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
    __half * convData;
    int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));
    const int blockSize = 256;
    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
                                                   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
                                                   input->dims.dim_sizes[3], KH, KW, vertical_pad,
                                                   horizontal_pad, h, w,
                                                   vertical_stride, horizontal_stride, row, start, h_eff);
    checkCudaErrors(cudaDeviceSynchronize());
    const __half alf = approx_float_to_half(1.0);
    const __half bet = approx_float_to_half(0.0);
    const __half *alpha_half = &alf;
    const __half *beta_half = &bet;
    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                 n * h_eff * w, c, num_filter_elem,
                                 alpha_half,
                                 convData, CUDA_R_16F, n * h_eff * w,
                                 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
                                 beta_half,
                                 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
                                 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
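    // Editor's note (comment added for clarity, not in the original commit):
    // viewed column-major, convData is an (n*h_eff*w) x num_filter_elem
    // matrix with lda = n*h_eff*w, the filter acts as a num_filter_elem x c
    // matrix, and the GEMM above writes the (n*h_eff*w) x c product into
    // output_half.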
    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                         CUDNN_TENSOR_NCHW, n, c, h, w);
    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(new_output, DEVICE);
    //interpolate
    int numBlocks = (n * c * h * w + 255) / 256;
    approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w,
                                                (__half *)output_half->gpu_half_data,
                                                (__half *)new_output->gpu_half_data,
                                                row, start);
    cudaDeviceSynchronize();
    cudaFree(output_half);
    cudaFree(convData);
  }
  else if(col > 1){
    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                          CUDNN_TENSOR_NCHW, n, c, h, w_eff);
    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output_half, DEVICE);
    // NOTE: Necessary to insert the above call for every output tensor
    //total number of filter elem
    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
    __half * convData;
    int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));
    const int blockSize = 256;
    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
    convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
                                                   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
                                                   input->dims.dim_sizes[3], KH, KW, vertical_pad,
                                                   horizontal_pad, h, w,
                                                   vertical_stride, horizontal_stride, col, start, w_eff);
    checkCudaErrors(cudaDeviceSynchronize());
    const __half alf = approx_float_to_half(1.0);
    const __half bet = approx_float_to_half(0.0);
    const __half *alpha_half = &alf;
    const __half *beta_half = &bet;
    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                 n * h * w_eff, c, num_filter_elem,
                                 alpha_half,
                                 convData, CUDA_R_16F, n * h * w_eff,
                                 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
                                 beta_half,
                                 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
                                 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                         CUDNN_TENSOR_NCHW, n, c, h, w);
    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(new_output, DEVICE);
    //interpolate
    int numBlocks = (n * c * h * w + 255) / 256;
    approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w,
                                                (__half *)output_half->gpu_half_data,
                                                (__half *)new_output->gpu_half_data,
                                                col, start);
    cudaDeviceSynchronize();
    cudaFree(output_half);
    cudaFree(convData);
  }
  else{
    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                          CUDNN_TENSOR_NCHW, c, n, h, w);
    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output_half, DEVICE);
    // NOTE: Necessary to insert the above call for every output tensor
    //total number of filter elem
    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
    __half * convData;
    int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));
    const int blockSize = 256;
    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
    convToGemmApproxHalfN<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
                                                   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
                                                   input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w,
                                                   vertical_stride, horizontal_stride, num_filter_elem, c * h * w);
    checkCudaErrors(cudaDeviceSynchronize());
    //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
    const __half alf = approx_float_to_half(1.0);
    const __half bet = approx_float_to_half(0.0);
    const __half *alpha_half = &alf;
    const __half *beta_half = &bet;
    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                                 n * h * w, c, num_filter_elem,
                                 alpha_half,
                                 convData, CUDA_R_16F, n * h * w,
                                 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
                                 beta_half,
                                 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
                                 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
    // profileEvent("gemm_end", true);
    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
                                         CUDNN_TENSOR_NCHW, n, c, h, w);
    changeTensorPlacement(new_output, DEVICE);
    int numBlocks = (n * c * h * w + 255) / 256;
    switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
                                    (__half *)output_half->gpu_half_data,
                                    (__half *)new_output->gpu_half_data);
    checkCudaErrors(cudaDeviceSynchronize());
    cudaFree(convData);
    cudaFree(output_half);
  }
profileEvent("Conv_end", true);
profileEvent("H2F_start");
convertToFP32(new_output);
profileEvent("H2F_end");
#ifdef ERROR_INJECTION_ENABLED
if (op_counter >= total_ops) {
ERROR("No accuracy flag found \n");
}
int op_acc = op_accuracies[op_counter];
// Skip errorInjection if explicitly requested
if (skip_tensors.find(op_counter) != skip_tensors.end()) {
op_acc = 0;
}
void* error_norms = tensorAddError(output, op_acc);
add_norms(error_norms, "tensorConv", op_acc);
add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
op_counter++;
#endif
return new_output;
}
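// Editor's usage sketch (hypothetical, not part of this commit): invoking
// the half-precision perforated convolution for a 3x3, stride-1, pad-1
// layer with every 2nd output row perforated. The wrapper name
// example_perf_conv is invented for illustration.
void* example_perf_conv(void* input, void* filter) {
  // row = 2, col = 1: perforate rows only; start = 0 picks which rows are
  // skipped (start must be < row). row = col = 1 would disable perforation.
  return tensorConvPerfCudaHalf(input, filter,
                                /*vertical_pad=*/1, /*horizontal_pad=*/1,
                                /*vertical_stride=*/1, /*horizontal_stride=*/1,
                                /*conv_mode=*/1, /*conv_groups=*/1,
                                /*row=*/2, /*col=*/1, /*start=*/0);
}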