From 7c7949d804c73bbe30ade8ee485226bbe9de5c7b Mon Sep 17 00:00:00 2001
From: Yasmin <ycs4@cornell.edu>
Date: Sat, 7 Dec 2019 18:42:13 -0500
Subject: [PATCH] Add alexnet2 tests and tensorConvApproxHalf2 to investigate sampling accuracy discrepancy

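Adds two Alexnet2 CIFAR-10 test programs and a tensorConvApproxHalf2 routine,
intended to help track down an accuracy discrepancy between the half-precision
sampled convolution and the sampling simulation:

- alexnet2_approxhalf.cc runs the first conv layer through the new
  tensorConvApproxHalf2 (skip_every = 2, skip_offset = 1).
- alexnet2_sampsim.cc runs the same network with the first conv layer through
  tensorConvSampSim, for comparison.
- tensorConvApproxHalf2 (declared in approx_api.h, implemented in
  approx_techniques2.h) is a variant of tensorConvApproxHalf whose
  filter-reduction and im2col kernels use corrected offset numbering.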
---
 .../dnn_sources/src/alexnet2_approxhalf.cc    | 148 +++++++
 .../dnn_sources/src/alexnet2_sampsim.cc       | 148 +++++++
 .../tensor_runtime/include/approx_api.h       |   7 +
 .../include/approx_techniques2.h              | 370 ++++++++++++++++++
 4 files changed, 673 insertions(+)
 create mode 100644 llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc
 create mode 100644 llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc

diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc
new file mode 100644
index 0000000000..bd604a334a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_approxhalf.cc
@@ -0,0 +1,148 @@
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+
+
+/* NOTE: Reference Architecture to use for profiling */
+void testCifarNet(){
+
+  printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
+ 
+  std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); 
+  std::string input_path =  dir_prefix + std::string("norm_cifar_input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin"); 
+
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin",
+					  float_type, 32, 3, 3, 3);  
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin",
+					  float_type, 32, 32, 3, 3);  
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin",
+					float_type, 1, 32, 1, 1);
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin",
+					  float_type, 64, 32, 3, 3);  
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin",
+					  float_type, 64, 64, 3, 3);  
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin",
+					float_type, 1, 64, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin",
+					  float_type, 128, 64, 3, 3);  
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin",
+					float_type, 1, 128, 1, 1);
+  void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin",
+					  float_type, 128, 128, 3, 3);  
+  void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin",
+					float_type, 1, 128, 1, 1);
+  
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin",
+					 float_type, 1, 1, 2048, 10);  
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin",
+				      float_type, 1, 10, 1, 1);  
+ 
+  
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
+
+  startMemTracking();
+
+  int test_input_size = 10000;
+  int batch_size = 2500;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+  
+  for(int i = 0; i < batch_count; i++){
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+    
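+    // Per the tensorConvApproxHalf2 prototype added in approx_api.h, the arguments
+    // after the pads/strides are conv_mode, conv_groups, row, col, skip_every,
+    // skip_offset; here row = col = 1 (no perforation) and skip_every = 2,
+    // skip_offset = 1 drops every other filter element. Note conv_precision lands
+    // in the conv_groups slot, where 0 is treated as 1.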
+    void* conv1out = tensorConvApproxHalf2(input, conv1_filter, 1, 1, 1, 1,
+    					 conv_mode, conv_precision, 1, 1, 2, 1);
+  
+
+    tensorAdd(conv1out, conv1_bias); 
+    void* conv1_tanh = tensorTanh(conv1out);
+    
+    // 2nd Layer
+    void* conv2out = tensorConvApproxHalf(conv1_tanh, conv2_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv2out, conv2_bias); 
+    void* conv2_tanh = tensorTanh(conv2out);
+    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+     
+    // 3rd Layer
+    void* conv3out = tensorConvApproxHalf(pool2out, conv3_filter, 1, 1, 1, 1,
+					  conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv3out, conv3_bias); 
+    void* conv3_tanh = tensorTanh(conv3out);
+
+    // 4th Layer
+    void* conv4out = tensorConvApproxHalf(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv4out, conv4_bias); 
+    void* conv4_tanh = tensorTanh(conv4out);
+    void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // 5th Layer
+    void* conv5out = tensorConvApproxHalf(pool4out, conv5_filter, 1, 1, 1, 1,
+					  conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv5out, conv5_bias); 
+    void* conv5_tanh = tensorTanh(conv5out);
+
+    // 6th Layer
+    void* conv6out = tensorConvApproxHalf(conv5_tanh, conv6_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv6out, conv6_bias); 
+  
+    void* conv6_tanh = tensorTanh(conv6out);
+    void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // final FC Layer
+    void* gemm1out = tensorGemmGPU(pool6out, fc1_weights);  
+    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+    void* result = tensorSoftmax(gemm1biasout);
+
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+    float accuracy = computeAccuracy2(labels, batch_size, result); 
+    final_accuracy += accuracy;
+    
+    freeBatchMemory();
+  }
+
+  stopProfiling();
+
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
+
+}
+
+
+int main(int argc, char* argv[]){
+
+  llvm_hpvm_initTensorRt(0);
+
+  testCifarNet();
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc
new file mode 100644
index 0000000000..a0ac48b5ef
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_sampsim.cc
@@ -0,0 +1,148 @@
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+
+
+/* NOTE: Reference Architecture to use for profiling */
+void testCifarNet(){
+
+  printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
+ 
+  std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); 
+  std::string input_path =  dir_prefix + std::string("norm_cifar_input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin"); 
+
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin",
+					  float_type, 32, 3, 3, 3);  
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin",
+					  float_type, 32, 32, 3, 3);  
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin",
+					float_type, 1, 32, 1, 1);
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin",
+					  float_type, 64, 32, 3, 3);  
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin",
+					  float_type, 64, 64, 3, 3);  
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin",
+					float_type, 1, 64, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin",
+					  float_type, 128, 64, 3, 3);  
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin",
+					float_type, 1, 128, 1, 1);
+  void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin",
+					  float_type, 128, 128, 3, 3);  
+  void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin",
+					float_type, 1, 128, 1, 1);
+  
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin",
+					 float_type, 1, 1, 2048, 10);  
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin",
+				      float_type, 1, 10, 1, 1);  
+ 
+  
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
+
+  startMemTracking();
+
+  int test_input_size = 10000;
+  int batch_size = 2500;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+  
+  for(int i = 0; i < batch_count; i++){
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+
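+    // tensorConvSampSim (the sampling simulation, defined elsewhere in the
+    // runtime) replaces the first conv layer here; the trailing (2, 0) are
+    // presumably its skip_every/skip_offset, making this the reference run for
+    // the accuracy comparison against alexnet2_approxhalf.cc.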
+    void* conv1out = tensorConvSampSim(input, conv1_filter, 1, 1, 1, 1,
+    					 conv_mode, conv_precision, 2, 0);
+   
+
+    tensorAdd(conv1out, conv1_bias); 
+    void* conv1_tanh = tensorTanh(conv1out);
+    
+    // 2nd Layer
+    void* conv2out = tensorConvApproxHalf(conv1_tanh, conv2_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv2out, conv2_bias); 
+    void* conv2_tanh = tensorTanh(conv2out);
+    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+     
+    // 3rd Layer
+    void* conv3out = tensorConvApproxHalf(pool2out, conv3_filter, 1, 1, 1, 1,
+					  conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv3out, conv3_bias); 
+    void* conv3_tanh = tensorTanh(conv3out);
+
+    // 4th Layer
+    void* conv4out = tensorConvApproxHalf(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv4out, conv4_bias); 
+    void* conv4_tanh = tensorTanh(conv4out);
+    void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // 5th Layer
+    void* conv5out = tensorConvApproxHalf(pool4out, conv5_filter, 1, 1, 1, 1,
+					  conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv5out, conv5_bias); 
+    void* conv5_tanh = tensorTanh(conv5out);
+
+    // 6th Layer
+    void* conv6out = tensorConvApproxHalf(conv5_tanh, conv6_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision, 1, 1, 1, 1);
+    tensorAdd(conv6out, conv6_bias); 
+  
+    void* conv6_tanh = tensorTanh(conv6out);
+    void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // final FC Layer
+    void* gemm1out = tensorGemmGPU(pool6out, fc1_weights);  
+    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+    void* result = tensorSoftmax(gemm1biasout);
+
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+    float accuracy = computeAccuracy2(labels, batch_size, result); 
+    final_accuracy += accuracy;
+    
+    freeBatchMemory();
+  }
+
+  stopProfiling();
+
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
+
+}
+
+
+int main(int argc, char* argv[]){
+
+  llvm_hpvm_initTensorRt(0);
+
+  testCifarNet();
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
index 811fa4090f..50438eb22c 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
@@ -67,4 +67,11 @@ extern "C"{
 			 int row, int col,
 			 int skip_every, int skip_offset);
 
+  void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
+			     int vertical_pad, int horizontal_pad,
+			     int vertical_stride, int horizontal_stride,
+			     int conv_mode, int conv_groups,
+			     int row, int col,
+			     int skip_every, int skip_offset);
+
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index 6042cc7dae..97c7b9070e 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -1671,3 +1671,373 @@ void* tensorConvApproxHalf(void* input_ptr, void* filter_ptr,
   
   return new_output;
 }
+
+
+//Variants of the filter-reduction and im2col kernels, plus a convolution
+//wrapper, with corrected offset numbering
+
+//produces COL MAJOR matrix with reduced_filter_elem rows and NF cols
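+//E.g. with skip_every = 2 and skip_offset = 1, filter elements at offsets
+//0,2,4,... are kept and packed into rows 0,1,2,...; with skip_offset = 0 the
+//odd offsets are kept instead. Kept values are scaled by fac to compensate
+//for the dropped elements.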
+__global__ void createReducedFiltersHalfNew(__half * output,
+					 const __half * const __restrict input, const int NF,
+					 const int num_filter_elem, const int reduced_filter_elem,
+					 const int skip_every, const int skip_offset,
+					 const float fac) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int fIdx = tx / num_filter_elem; //filter index
+  const int offset = tx % num_filter_elem; //offset within filter
+  if(fIdx < NF) { //is thread id within bounds?
+    if(offset % skip_every != skip_offset) { //are we including this filter element?
+      const int output_row = offset -
+        (offset / skip_every + (offset % skip_every > skip_offset));
+      output[fIdx * reduced_filter_elem + output_row] =
+        __hmul(__float2half_rn(fac), input[tx]);
+    }
+  }
+}
+
+//COL Major matrix with N*H*W columns and reduced_filter_elem rows
+//skip_every = 1 (the no-sampling case, called with skip_offset = skip_every) keeps every filter element
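+//For each kept filter element, the output column is its index minus the number
+//of elements skipped before it, matching the row numbering used in
+//createReducedFiltersHalfNew so the reduced filter and the gathered input line
+//up in the GEMM.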
+__global__ void convToGemmHalfInputNew(__half * const __restrict__ output,
+				    const __half * const __restrict input,
+				    const int N, const int C,
+				    const int H, const int W,
+				    const int KH, const int KW, const int V_pad,
+				    const int H_pad, const int H_out,
+				    const int W_out, const int V_stride,
+				    const int H_stride, const int reduced_filter_elem,
+				    const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	if(filter_elem_num % skip_every != skip_offset) {
+	    int output_col = filter_elem_num -
+	      (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
+	    if(skip_every == 1)
+	      output_col = filter_elem_num;
+	    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	      output[((output_col*N + n) * H_out + h) * W_out + w] =
+		input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	    else
+	      output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
+	}
+      }
+    }
+  }
+}
+
+
+//COL Major matrix with N*H*W columns and reduced_filter_elem rows
+//Can only be used when skipping every other element in input sampling
+__global__ void convToGemmHalfInput2New(__half * const __restrict__ output,
+				     const __half * const __restrict input,
+				     const int N, const int C,
+				     const int H, const int W,
+				     const int KH, const int KW, const int V_pad,
+				     const int H_pad, const int H_out,
+				     const int W_out, const int V_stride,
+				     const int H_stride, const int reduced_filter_elem,
+				     const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    const int filter_elem_num = c * KH * KW;
+
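+    //filter_elem_num is the first global filter-element index for this channel;
+    //the loop starts at l = 1 when that index has the skipped parity, so only
+    //elements whose global index % 2 != skip_offset are gathered.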
+    for(int l = (filter_elem_num % 2 == skip_offset); l < KH * KW; l+=2) {
+      int i = l / KW;
+      int j = l % KW;
+
+      const int new_idx = filter_elem_num + i * KW + j;
+      const int output_col = new_idx -
+        (new_idx / 2 + (new_idx % 2 > skip_offset)); //new output column
+
+      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+        output[((output_col*N + n) * H_out + h) * W_out + w] =
+          input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+      else
+        output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
+
+    }
+  }
+}
+
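+//Half-precision approximate convolution with corrected offset numbering.
+//row > 1 (or col > 1) perforates every row-th output row (col-th column) and
+//the skipped positions are filled back in by interpolation; otherwise
+//skip_every and offset sample the filter elements, with offset == skip_every
+//meaning "no sampling". conv_groups == 0 is treated as 1 for backward
+//compatibility.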
+void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
+			   int vertical_pad, int horizontal_pad, int vertical_stride,
+			   int horizontal_stride, int conv_mode, int conv_groups,
+			   int row, int col, int skip_every, int offset){
+
+  INFO("*** TensorConvolution half approximation \n");
+  profileEvent("#Conv");
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  profileEvent("F2H_start");
+  convertToFP16(input);
+  convertToFP16(filter);
+  profileEvent("F2H_end");
+
+  /******* END OF INPUT DATA CONVERSIONS*/
+  profileEvent("F2H_end");
+
+  profileEvent("Conv");
+
+  int n, c, h, w; // output dimensions
+  n = input->dims.dim_sizes[0];
+  c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+
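+  //Effective output height/width under perforation: roughly every row-th row
+  //and col-th column of the output is skipped (offset selects which ones) and
+  //later reconstructed by approxInterpolateRowHalf / approxInterpolateColHalf.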
+  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  int h_eff = h - h / row;
+  if(h % row > row - 1 - offset)
+    h_eff = h_eff - 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  int w_eff = w - w / col;
+  if(w % col > col - 1 - offset)
+    w_eff = w_eff - 1;
+
+
+  INFO("input: %d %d %d %d\n", input->dims.dim_sizes[0], input->dims.dim_sizes[1],
+       input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
+  INFO("filter: %d %d %d %d\n", filter->dims.dim_sizes[0], filter->dims.dim_sizes[1],
+       filter->dims.dim_sizes[2], filter->dims.dim_sizes[3]);
+  INFO("output: %d %d %d %d\n", n, c, h, w);
+
+
+  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					       CUDNN_TENSOR_NCHW, n, c, h, w);
+  // NOTE: Changing output tensor placement from host to device
+  changeTensorPlacement(new_output, DEVICE);
+
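+  //Three paths: row perforation (row > 1), column perforation (col > 1), or
+  //filter-element sampling via skip_every/offset (the else branch).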
+  if(row > 1){
+    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+						  CUDNN_TENSOR_NCHW,
+						  n, c, h_eff, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
+						   horizontal_pad, h, w, vertical_stride,
+						   horizontal_stride, row, offset, h_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h_eff * w, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h_eff * w,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w,
+						(__half *)output_half->gpu_half_data,
+						(__half *)new_output->gpu_half_data,
+						row, offset);
+    cudaDeviceSynchronize();
+
+    freeTensor(output_half);
+    cudaFree(convData);
+  }
+  else if(col > 1){
+    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
+						   horizontal_pad, h, w, vertical_stride,
+						   horizontal_stride, col, offset, w_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h * w_eff, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h * w_eff,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w,
+						(__half *)output_half->gpu_half_data,
+						(__half *)new_output->gpu_half_data,
+						col, offset);
+
+    cudaDeviceSynchronize();
+
+    freeTensor(output_half);
+    cudaFree(convData);
+
+  }
+  else{
+    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					     CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+    //reduced number after skipping
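+    //offset == skip_every signals "no sampling": the full filter is used and
+    //the GEMM below reads filter->gpu_half_data directly instead of reducedFilter.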
+    int reduced_filter_elem;
+    if(offset != skip_every){
+      reduced_filter_elem = num_filter_elem - (num_filter_elem/skip_every);
+      if(num_filter_elem % skip_every > offset)
+	reduced_filter_elem = reduced_filter_elem - 1;
+    }
+    else
+      reduced_filter_elem = num_filter_elem;
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+    __half * reducedFilter;
+    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+    const int filtBlockSize = 128;
+    const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    if(offset != skip_every){
+      float fac = (skip_every * 1.0) / (skip_every - 1);
+      createReducedFiltersHalfNew<<<filtGridSize, filtBlockSize>>>(reducedFilter,
+								(__half *)filter->gpu_half_data,
+								c, num_filter_elem, reduced_filter_elem,
+								skip_every, offset, fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+    }
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
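+    //skip_every == 2 uses a specialized gather kernel; other values go through
+    //the general kernel, which special-cases skip_every == 1 to keep the
+    //original column numbering.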
+    if(skip_every == 2){
+      convToGemmHalfInput2New<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						    input->dims.dim_sizes[1],
+						    input->dims.dim_sizes[2],
+						    input->dims.dim_sizes[3],
+						    KH, KW, vertical_pad, horizontal_pad,
+						    h, w, vertical_stride, horizontal_stride,
+						    reduced_filter_elem, skip_every,
+						    offset);
+    }
+    else{
+      convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1],
+						   input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3],
+						   KH, KW, vertical_pad, horizontal_pad,
+						   h, w, vertical_stride, horizontal_stride,
+						   reduced_filter_elem, skip_every,
+						   offset);
+    }
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+    if(offset != skip_every)
+      checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				   n * h * w, c, reduced_filter_elem,
+				   alpha_half,
+				   convData, CUDA_R_16F, n * h * w,
+				   reducedFilter, CUDA_R_16F, reduced_filter_elem,
+				   beta_half,
+				   (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w,
+				   CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    else
+      checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				   n * h * w, c, reduced_filter_elem,
+				   alpha_half,
+				   convData, CUDA_R_16F, n * h * w,
+				   (__half*) filter->gpu_half_data, CUDA_R_16F,
+				   reduced_filter_elem,
+				   beta_half,
+				   (__half*) output->gpu_half_data, CUDA_R_16F, n * h * w,
+				   CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
+				    (__half *)output->gpu_half_data,
+				    (__half *)new_output->gpu_half_data);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    cudaFree(convData);
+    cudaFree(reducedFilter);
+    freeTensor(output);
+
+  }
+
+  profileEvent("H2F_start");
+  convertToFP32_offline(new_output);
+  profileEvent("H2F_end");
+
+  profileEvent("#Conv_end");
+
+
+  return new_output;
+}
-- 
GitLab