Separating group convolution functions in a separate source

6e70be1c · Hashim Sharif · 3b0e3f0b · 6e70be1c · 6e70be1c · 6e70be1c
Commit 6e70be1c authored 5 years ago by Hashim Sharif
--- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -54,7 +54,7 @@ link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH} $ENV{CUDNN_PATH
 set(
  RUNTIME_SRCS_FILENAME
  approx_simulation.cu
-  approx_techniques.cu
+  group_conv.cu
  approx_techniques2.cu
  common.cpp
  configuration.cpp

--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
@@ -1078,6 +1078,43 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
  }
 }

+
+
+// Unrolls the convolution windows of `input` into matrix form while dropping
+// every skip_every-th filter element (the name suggests the result feeds a GEMM).
+// Produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem cols.
+//
+// Layout, as fixed by the indexing below:
+//   input  is read  at ((n*C + c)*H + y)*W + x
+//   output is written at ((n*reduced_filter_elem + col)*H_out + h)*W_out + w
+//
+// Thread mapping: the flat thread id (blockDim.x * blockIdx.x + threadIdx.x) is
+// decoded as (n, c, h, w), so the launch must provide at least
+// N * C * H_out * W_out threads along x; surplus threads return immediately.
+// Each thread walks the KH x KW window of its channel and writes one output
+// element per retained filter element (0 where the window lies in the padding).
+//
+// Filter element e = (c*KH + i)*KW + j is dropped when
+// e % skip_every == skip_every-1; retained elements are packed into
+// column e - e/skip_every.
+// NOTE(review): skip_every is used as a divisor and must be non-zero; this is
+// presumably guaranteed by the host-side caller -- confirm.
+//
+// Offsets into input/output are computed in size_t: the output holds roughly
+// KH*KW times more elements than there are threads, so a 32-bit offset can
+// overflow even when the thread id itself still fits in an int.
+__global__ void convToGemmApprox(float * const __restrict__ output,
+				 const float * const __restrict__ input, const int N, const int C,
+				 const int H, const int W,
+				 const int KH, const int KW, const int V_pad,
+				 const int H_pad, const int H_out,
+				 const int W_out, const int V_stride,
+				 const int H_stride, const int reduced_filter_elem,
+				 const int skip_every) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  if(n >= N)
+    return; //thread id is past the last image: nothing to do
+
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input row of the window's top edge (may be negative)
+  const int inW = w * H_stride - H_pad; //input col of the window's left edge (may be negative)
+
+  //loop-invariant pieces of the offsets, widened to size_t before multiplying
+  const size_t out_plane = (size_t)H_out * W_out; //elements in one output column
+  const size_t out_base = (size_t)n * reduced_filter_elem * out_plane
+                          + (size_t)h * W_out + w; //offset of (n, col 0, h, w)
+  const size_t in_base = ((size_t)n * C + c) * H * W; //offset of input plane (n, c)
+
+  for(int i = 0; i < KH; i++) {
+    const int y = inH + i; //input row touched by this filter row
+    const bool row_ok = (y >= 0 && y < H);
+    for(int j = 0; j < KW; j++) {
+      const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+      if(filter_elem_num % skip_every == skip_every - 1)
+        continue; //this filter element is skipped
+
+      //calculate output column, taking skipping into account
+      const int output_col = filter_elem_num - (filter_elem_num / skip_every);
+      const int x = inW + j; //input col touched by this filter col
+      const bool in_bounds = row_ok && x >= 0 && x < W;
+
+      //the input offset is only evaluated when (y, x) is inside the image,
+      //so the size_t conversions never see a negative coordinate
+      output[out_base + (size_t)output_col * out_plane] =
+        in_bounds ? input[in_base + (size_t)y * W + x] : 0.0f;
+    }
+  }
+}
+
+
+
 void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 			 int vertical_pad, int horizontal_pad, int vertical_stride,
 			 int horizontal_stride, int conv_mode, int conv_groups,

--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu