From 7dfa556bfd2a4fb5929d1f927a526ae6df39b86a Mon Sep 17 00:00:00 2001
From: Akash Kothari <akashk4@tyler.cs.illinois.edu>
Date: Tue, 19 Jan 2021 12:03:10 -0600
Subject: [PATCH] Rename approximation techniques CUDA file

---
 hpvm/projects/hpvm-tensor-rt/CMakeLists.txt   |    2 +-
 .../tensor_runtime/src/approx_techniques.cu   | 2010 +++++++++++++++++
 2 files changed, 2011 insertions(+), 1 deletion(-)
 create mode 100644 hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu

diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
index 2feeaa2fef..d5d7895dcd 100644
--- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -78,7 +78,7 @@ set(
   RUNTIME_SRCS_FILENAME
   approx_simulation.cu
   group_conv.cu
-  approx_techniques2.cu
+  approx_techniques.cu
   common.cpp
   configuration.cpp
   debug.cc
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
new file mode 100644
index 0000000000..8f08ad9e26
--- /dev/null
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
@@ -0,0 +1,2010 @@
+//===--------------------------- approx_techniques.cu --------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom implementations of software approximations
+// for tensor convolutions. The approximations implemented are feature sampling
+// and perforation for FP32 and FP16 compute precisions.
+//
+//===----------------------------------------------------------------------===//
+
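+// Approximation overview (summary of the kernels below):
+//  * Perforation: the convToGemmPerf{Row,Col}* kernels build an im2col matrix
+//    that leaves out every x-th convolution output row (resp. column) starting
+//    at `start`; the approxInterpolate{Row,Col}* kernels later reconstruct the
+//    skipped outputs from their retained neighbors.
+//  * Sampling: the convToGemm*Input{Regular,Irregular}* kernels build an
+//    im2col matrix over a reduced set of filter elements (every skip_every-th
+//    element, phased by skip_offset, is dropped), and the createReducedFilters*
+//    kernels gather the matching filter weights, scaled by `fac` to compensate
+//    for the dropped elements.
+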
+#include "tensor_utils.h"
+#include "approx_utils.h"
+#include "debug.h"
+#include "global_data.h"
+#include "fp16_gemm.h"
+#include "fp16_conversion.h"
+#include "profiling.h"
+
+extern "C"{
+
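+// Plain (non-approximate) im2col: for each image n, expands the input into a
+// matrix with C*KH*KW rows and H_out*W_out columns so the convolution can be
+// computed as a GEMM against the filter matrix. One thread handles one
+// (image, channel, output row, output column) tuple.
+//
+// Minimal host-side launch sketch (illustrative only: the block size, buffer
+// names, and the follow-up GEMM call are assumptions, not the runtime's
+// actual configuration):
+//
+//   const int total = N * C * H_out * W_out;
+//   convToGemm<<<(total + 127) / 128, 128>>>(im2col_buf, input, N, C, H, W,
+//                                            KH, KW, V_pad, H_pad, H_out,
+//                                            W_out, V_stride, H_stride,
+//                                            C * KH * KW);
+//   // ...then multiply the [num_filters x C*KH*KW] filter matrix against
+//   // im2col_buf, e.g. with cublasSgemmStridedBatched, one GEMM per image.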
+__global__ void convToGemm(float * const __restrict__ output,
+		       const float * const __restrict input, const int N, const int C,
+		       const int H, const int W, const int KH, const int KW, const int V_pad,
+		       const int H_pad, const int H_out, const int W_out, const int V_stride,
+		       const int H_stride, const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  if(n < N) {
+    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    const int inH = h * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad;
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+            output[out_index] = 0;
+      }
+    }
+  }
+}
+
+__global__ void convToGemmFullInput(float * const __restrict__ output,
+                    const float * const __restrict input,
+                    const int N, const int C,
+                    const int H, const int W,
+                    const int KH, const int KW, const int V_pad,
+                    const int H_pad, const int H_out,
+                    const int W_out, const int V_stride,
+                    const int H_stride,
+                    const int skip_every, const int skip_offset) {
+      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+      const int n = tx / (C * H_out * W_out); //output image number
+      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+      const int w = tx % W_out; //output width index (col number)
+      const int inH = h * V_stride - V_pad; //input height index (row number)
+      const int inW = w * H_stride - H_pad; //input width index (col number)
+      if(n < N) { //is thread id within bounds?
+          for(int i = 0; i < KH; i++) {
+              for(int j = 0; j < KW; j++) {
+                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+                  if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
+                      int output_col = filter_elem_num -
+                                       ((filter_elem_num + skip_every)/skip_every);
+                      if(skip_every == 1) output_col = filter_elem_num;
+                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+                          output[((output_col*N + n) * H_out + h) * W_out + w] =
+                              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+                      else
+                          output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
+                  }
+              }
+          }
+      }
+}
+
+__global__ void convToGemmHalfInputNew(__half * const __restrict__ output,
+                                    const __half * const __restrict input,
+                                    const int N, const int C,
+                                    const int H, const int W,
+                                    const int KH, const int KW, const int V_pad,
+                                    const int H_pad, const int H_out,
+                                    const int W_out, const int V_stride,
+                                    const int H_stride, const int reduced_filter_elem,
+                                    const int skip_every, const int skip_offset) {
+      
+      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+      const int n = tx / (C * H_out * W_out); //output image number
+      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+      const int w = tx % W_out; //output width index (col number)
+      const int inH = h * V_stride - V_pad; //input height index (row number)
+      const int inW = w * H_stride - H_pad; //input width index (col number)
+      if(n < N) { //is thread id within bounds?
+          for(int i = 0; i < KH; i++) {
+              for(int j = 0; j < KW; j++) {
+                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+                  if(filter_elem_num % skip_every != skip_offset) {
+                      int output_col = filter_elem_num -
+                                        (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
+                     if(skip_every == 1) output_col = filter_elem_num;
+                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+                          output[((output_col*N + n) * H_out + h) * W_out + w] =
+                                    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+                      else
+                          output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
+                  }
+              }
+           }
+      }
+}
+
+
+__global__
+void convToGemmHalf(__half * const __restrict__ output,
+                    const __half * const __restrict input,
+                    const int N, const int C,
+                    const int H, const int W,
+                    const int KH, const int KW,
+                    const int V_pad, const int H_pad,
+                    const int H_out, const int W_out,
+                    const int V_stride, const int H_stride){
+    
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (C * H_out * W_out); //output image number
+    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    const int inH = h * V_stride - V_pad; //input height index (row number)
+    const int inW = w * H_stride - H_pad; //input width index (col number)
+    if(n < N) { //is thread id within bounds?
+        for(int i = 0; i < KH; i++) {
+            for(int j = 0; j < KW; j++) {
+                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+                if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
+                                            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+                } else {
+                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
+                }
+            }
+        }
+    }
+}
+
+__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output,
+                                        const __half * const __restrict input,
+                                        const int N, const int C,
+                                        const int H, const int W,
+                                        const int KH, const int KW, const int V_pad,
+                                        const int H_pad, const int H_out,
+                                        const int W_out, const int V_stride,
+                                        const int H_stride, const int reduced_filter_elem,
+                                        const int skip_every, const int skip_offset) {
+      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+      const int n = tx / (C * H_out * W_out); //output image number
+      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+      const int w = tx % W_out; //output width index (col number)
+      const int inH = h * V_stride - V_pad; //input height index (row number)
+      const int inW = w * H_stride - H_pad; //input width index (col number)
+      if(n < N) { //is thread id within bounds?
+          for(int i = 0; i < KH; i++) {
+              for(int j = 0; j < KW; j++) {
+                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+                  if((filter_elem_num - skip_offset) % skip_every) {
+                      const int condition = (filter_elem_num < skip_offset);
+                      const int output_col = condition * filter_elem_num
+                                             + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
+                                                               - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+                      const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
+                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+                          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+                      else
+                          output[out_index] = 0;
+                  }
+              }
+          }
+      }
+}
+
+__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output,
+                                                const __half * const __restrict input,
+                                                const int N, const int C,
+                                                const int H, const int W,
+                                                const int KH, const int KW, const int V_pad,
+                                                const int H_pad, const int H_out,
+                                                const int W_out, const int V_stride,
+                                                const int H_stride, const int reduced_filter_elem,
+                                                const int skip_every, const int skip_offset) {
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (C * H_out * W_out); //output image number
+    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    const int inH = h * V_stride - V_pad; //input height index (row number)
+    const int inW = w * H_stride - H_pad; //input width index (col number)
+    if(n < N) { //is thread id within bounds?
+        for(int i = 0; i < KH; i++) {
+            for(int j = 0; j < KW; j++) {
+                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+                if((filter_elem_num - skip_offset) % skip_every) {
+                    const int condition = (filter_elem_num < skip_offset);
+                    const int output_col = condition * filter_elem_num
+                                           + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
+                                                             - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+                    const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
+                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+                    else
+                        output[out_index] = 0;
+                }
+            }
+        }
+    }
+}
+
+
+
+__global__ void convToGemmHalf2(__half * const __restrict__ output,
+                       const __half * const __restrict input, const int N, const int C,
+                       const int H, const int W, const int KH, const int KW, const int V_pad,
+                       const int H_pad, const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  if(n < N) { 
+    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    const int inH = h * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad;
+    for(int i = 0; i < KH; i++) { 
+      for(int j = 0; j < KW; j++) { 
+        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element 
+        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+            output[out_index] = 0;
+      }
+    }
+  }
+}
+
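+// Row perforation (FP32): builds an im2col matrix with only H_eff output rows
+// per image; every x-th convolution output row starting at row `start` is left
+// out, and h_index maps each retained row back to its source row in the input.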
+__global__ void convToGemmPerfRow(float * const __restrict__ output,
+		       const float * const __restrict input, const int N, const int C,
+		       const int H, const int W, const int KH, const int KW, const int V_pad,
+		       const int H_pad, const int H_out, const int W_out, const int V_stride,
+		       const int H_stride, const int x, const int start, const int H_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_eff * W_out); //output image number
+  if(n < N) { 
+    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
+    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    int h_index;
+    if(h < start) {
+        h_index = h;
+    } else {
+         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inH = h_index * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad; //input width index (col number)
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
+        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
+    }
+  }
+}
+
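+// Restores the full H_out rows from the row-perforated convolution result:
+// retained rows are copied through, and each skipped row is filled with the
+// average of two adjacent rows of the perforated tensor (the first and last
+// output rows are copied directly).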
+__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w,
+			  float *old_data, float *new_data, int x, int start){
+
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (c * h * w); //output image number
+    if(n < N) {
+        const int ch = tx % (c * h * w) / (h * w); //filter number
+        const int row = tx % (h * w) / w; //output height index (row number)
+        const int col = tx % w; //output width index (col number)
+    
+        if(row < start) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+        } else if(row == h-1) { 
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
+        } else if (row == 0) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+        } else if((row - start) % x == 0) { 
+            int row_index = row - ((row + 1 - start) / x);
+            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; 
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                (old_data[output_index] + old_data[output_index - w]) / 2;
+        } else {
+            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); 
+            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+        }
+    }
+}
+
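+// Column perforation (FP32): analogous to convToGemmPerfRow, but leaves out
+// every x-th convolution output column starting at column `start`, producing
+// W_eff columns per output row.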
+__global__ void convToGemmPerfCol(float * const __restrict__ output,
+		       const float * const __restrict input, const int N, const int C,
+		       const int H, const int W, const int KH, const int KW, const int V_pad,
+		       const int H_pad, const int H_out, const int W_out, const int V_stride,
+		       const int H_stride, const int x, const int start, const int W_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_eff); //output image number
+  if(n < N) { 
+    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
+    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
+    const int w = tx % W_eff; //output width index (col number)
+    int w_index;
+    if(w < start) {
+      w_index = w;
+    } else {
+      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inW = w_index * H_stride - H_pad; 
+    const int inH = h * V_stride - V_pad; //input height index (row number)
+
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = c * KH * KW  + i * KW + j; //index of this filter element
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
+      }
+    }
+  }
+}
+
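+// Column analogue of approxInterpolateRow: restores the full W_out columns
+// from the column-perforated result, averaging two adjacent columns of the
+// perforated tensor for each skipped column.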
+__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w,
+			                        float *old_data, float *new_data, int x, int start) { 
+
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (c * h * w); //output image number
+    if(n < N) {
+        const int ch = tx % (c * h * w) / (h * w); //output chan number
+        const int row = tx % (h * w) / w; //output height index (row number)
+        const int col = tx % w; //output width index (col number)
+
+        if(col < start) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
+                = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+        } else if(col == w - 1) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+        } else if (col == 0) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+        } else if((col - start) % x == 0) {
+            int col_index = col - ((col + 1 - start) / x);
+            int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                (old_data[output_index] + old_data[output_index - 1]) / 2;
+        } else {
+            int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+            int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+        }
+    }
+}
+
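+// FP16 variants of the perforation kernels above. The *Half2 variants differ
+// only in the im2col layout: they store the matrix filter-element-major across
+// the whole batch (out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w)
+// instead of per-image, and the matching approxInterpolate*Half2 kernels read
+// the convolution result back in that filter-major layout.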
+__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
+                       const __half * const __restrict input, const int N, const int C,
+                       const int H, const int W, const int KH, const int KW, const int V_pad,
+                       const int H_pad, const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int x, const int start, const int H_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_eff * W_out); //output image number
+  if(n < N) {
+    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
+    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    int h_index;
+    if(h < start) {
+        h_index = h;
+    } else {
+         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inH = h_index * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad; //input width index (col number)
+
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
+    	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+    	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
+    }
+  }
+}
+
+__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
+                       const __half * const __restrict input, const int N, const int C,
+                       const int H, const int W, const int KH, const int KW, const int V_pad,
+                       const int H_pad, const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int x, const int start, const int H_eff){
+    
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (C * H_eff * W_out); //output image number
+    if(n < N) { 
+        const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
+        const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
+        const int w = tx % W_out; //output width index (col number)
+        int h_index;                   
+        if(h < start) {                
+            h_index = h;               
+        } else {                       
+            h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;                                                            
+        }                              
+        const int inH = h_index * V_stride - V_pad;
+        const int inW = w * H_stride - H_pad; //input width index (col number)
+        for(int i = 0; i < KH; i++) {
+            for(int j = 0; j < KW; j++) {
+                const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
+                const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
+                if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+                    output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+                else
+                    output[out_index] = 0;
+            }
+        }
+    }
+}
+
+__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
+                          __half *old_data, __half *new_data, int x, int start) {
+
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (c * h * w); //output image number
+    if(n < N) {
+
+        const int ch = tx % (c * h * w) / (h * w); //filter number
+        const int row = tx % (h * w) / w; //output height index (row number)
+        const int col = tx % w; //output width index (col number)
+
+        if(row < start) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+        } else if(row == h-1) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
+        } else if (row == 0) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+        } else if((row - start) % x == 0) {
+            int row_index = row - ((row + 1 - start) / x);
+            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+				__hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+        } else {
+            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+        }
+    }
+}
+
+__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w,
+                          __half *old_data, __half *new_data, int x, int start) {
+    
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (c * h * w); //output image number
+    if(n < N) {
+
+        const int ch = tx % (c * h * w) / (h * w); //filter number
+        const int row = tx % (h * w) / w; //output height index (row number)
+        const int col = tx % w; //output width index (col number)
+        if(row < start) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                    old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col];
+       } else if(row == h-1) {
+           new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
+        } else if (row == 0) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+        } else if((row - start) % x == 0) {
+            const int row_index = row - ((row + 1 - start) / x);
+            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                    __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+        } else {
+            const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+        }
+    }
+}
+
+
+__global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
+                       const __half * const __restrict input, const int N, const int C,
+                       const int H, const int W, const int KH, const int KW, const int V_pad,
+                       const int H_pad, const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int x, const int start, const int W_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_eff); //output image number
+  if(n < N) {
+    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
+    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
+    const int w = tx % W_eff; //output width index (col number)
+    int w_index;
+    if(w < start) {
+      w_index = w;
+    } else {
+      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; //input height index (row number)
+
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
+        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w;
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+
+      }
+    }
+  }
+}
+
+__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
+                       const __half * const __restrict input, const int N, const int C,
+                        const int H, const int W, const int KH, const int KW, const int V_pad,
+                        const int H_pad, const int H_out, const int W_out, const int V_stride,
+                        const int H_stride, const int x, const int start, const int W_eff){
+
+      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+      const int n = tx / (C * H_out * W_eff); //output image number
+      if(n < N) {
+          const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
+          const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
+          const int w = tx % W_eff; //output width index (col number)
+          int w_index;
+          if(w < start) {
+              w_index = w;
+          } else {
+              w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+          }
+          const int inW = w_index * H_stride - H_pad;
+          const int inH = h * V_stride - V_pad; //input height index (row number)
+          for(int i = 0; i < KH; i++) {
+              for(int j = 0; j < KW; j++) {
+                  const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
+                  const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
+                  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+                  else
+                      output[out_index] = 0;
+              }
+        }
+    }
+}
+
+
+__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
+                                                __half *old_data, __half *new_data, int x, int start) {
+
+
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (c * h * w); //output image number
+    if(n < N) {
+        const int ch = tx % (c * h * w) / (h * w); //output chan number
+        const int row = tx % (h * w) / w; //output height index (row number)
+        const int col = tx % w; //output width index (col number)
+
+        if(col < start) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
+                = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+        } else if(col == w - 1) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+        } else if (col == 0) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+        } else if((col - start) % x == 0) {
+            int col_index = col - ((col + 1 - start) / x);
+            int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+        } else {
+            int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+            int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+        }
+    }
+}
+
+__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
+                                                __half *old_data, __half *new_data, int x, int start) {
+    
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (c * h * w); //output image number
+    if(n < N) {
+        const int ch = tx % (c * h * w) / (h * w); //output chan number
+        const int row = tx % (h * w) / w; //output height index (row number)
+        const int col = tx % w; //output width index (col number)
+        if(col < start) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
+                        = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
+                        //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+        } else if(col == w - 1) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                            old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
+                            //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+        } else if (col == 0) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
+                        //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+        } else if((col - start) % x == 0) {
+            const int col_index = col - ((col + 1 - start) / x);
+            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+            //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                            __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+        } else {
+            const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+            //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+        }
+    }
+}
+
+
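+// Sampling-based im2col (FP32): builds a reduced im2col matrix with
+// reduced_filter_elem rows per image, keeping only the filter elements that
+// survive the skip pattern. The "Regular" variant rotates the skip offset per
+// input channel ((skip_offset + ch) % skip_every); the "Irregular" variant
+// uses a single skip_offset across all channels.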
+__global__ void convToGemmFullInputRegular(float * const __restrict__ output,
+				    const float * const __restrict input,
+				    const int N, const int C,
+				    const int H, const int W,
+				    const int KH, const int KW, const int V_pad,
+				    const int H_pad, const int H_out,
+				    const int W_out, const int V_stride,
+				    const int H_stride, const int reduced_filter_elem,
+				    const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (H_out * W_out); //output image number
+  if(n < N) {
+    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    const int inH = h * V_stride - V_pad; //input height index (row number)
+    const int inW = w * H_stride - H_pad; //input width index (col number)
+    
+    #pragma unroll
+    for(int fi = 0; fi < reduced_filter_elem; fi++) {
+         const int ch = (fi * C) / reduced_filter_elem;
+         const int offset = (skip_offset + ch) % skip_every;
+         int in_index;
+         if(fi < offset) {
+             in_index = fi;
+         } else {
+             in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+                        + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
+        }
+        const int i = (in_index % (KW * KH)) / KW;
+        const int j = in_index % KW;
+        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; 
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+        } else {
+            output[out_index] = 0;
+        }
+      }
+    }
+}
+
+__global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
+                    const float * const __restrict input,
+                    const int N, const int C,
+                    const int H, const int W,
+                    const int KH, const int KW, const int V_pad,
+                    const int H_pad, const int H_out,
+                    const int W_out, const int V_stride,
+                    const int H_stride, const int reduced_filter_elem,
+                    const int skip_every, const int skip_offset) {
+    
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (H_out * W_out); //output image number
+    if(n < N) {
+        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+        const int w = tx % W_out; //output width index (col number)
+        const int inH = h * V_stride - V_pad; //input height index (row number)
+        const int inW = w * H_stride - H_pad; //input width index (col number)
+        
+        #pragma unroll
+        for(int fi = 0; fi < reduced_filter_elem; fi++) {
+            int in_index;
+            if(fi < skip_offset) {
+                in_index = fi;
+            } else {
+                in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+                            + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
+            }
+            const int ch = in_index / (KW * KH);
+            const int i = (in_index % (KW * KH)) / KW;
+            const int j = in_index % KW;
+            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+            } else {
+                output[out_index] = 0;
+            }
+        }
+    }
+}
+
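+// Builds the reduced filter matrix matching the sampled im2col layout: for
+// each filter, the surviving elements are gathered and scaled by `fac` to
+// compensate for the dropped ones.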
+__global__ void createReducedFiltersFullRegular(float * output,
+					 const float * const __restrict input, const int NF,
+					 const int num_filter_elem, const int reduced_filter_elem, 
+                     const int channels,
+					 const int skip_every, const int skip_offset, const float fac) {
+  
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int fIdx = tx / reduced_filter_elem; //filter index
+  if(fIdx < NF) { 
+    const int offset = tx % reduced_filter_elem; //offset within filter
+    const int ch = (offset * channels) / reduced_filter_elem;
+    const int channel_offset = (skip_offset + ch) % skip_every;
+      int in_index;
+      if(offset < channel_offset) {
+        in_index = offset;
+     } else {
+         in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
+                  + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
+     }
+    output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+  }
+}
+
+__global__ void createReducedFiltersFullIrregular(float * output,
+                     const float * const __restrict input, const int NF,
+                     const int num_filter_elem, const int reduced_filter_elem,
+                     const int skip_every, const int skip_offset, const float fac) {
+
+      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+      const int fIdx = tx / reduced_filter_elem; //filter index
+      if(fIdx < NF) {
+        const int offset = tx % reduced_filter_elem; //offset within filter
+        int in_index;
+        if(offset < skip_offset) {
+            in_index = offset;
+        } else {
+            in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
+                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; 
+        }
+        output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+    }
+}
+
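+// FP16 counterparts of the sampling im2col kernels; the *Regular2/*Irregular2
+// variants emit the filter-element-major batched layout
+// (out_index = ((fi * N + n) * H_out + h) * W_out + w) instead of the
+// per-image layout.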
+__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
+                                    const __half * const __restrict input,
+                                    const int N, const int C,
+                                    const int H, const int W,
+                                    const int KH, const int KW, const int V_pad,
+                                    const int H_pad, const int H_out,
+                                    const int W_out, const int V_stride,
+                                    const int H_stride, const int reduced_filter_elem,
+                                    const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  if(n < N) {
+    const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+    const int w = tx % W_out; //output width index (col number)
+    const int inH = h * V_stride - V_pad; //input height index (row number)
+    const int inW = w * H_stride - H_pad; //input width index (col number)
+    
+    #pragma unroll
+      for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
+        const int fi = ch * (reduced_filter_elem / C) + ki;
+        const int offset = (skip_offset + ch) % skip_every;
+        const bool condition = (fi < offset);
+        const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
+                                              + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
+        const int i = (in_index % (KW * KH)) / KW;
+        const int j = in_index % KW;
+        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
+            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+        } else {
+            output[out_index] = 0;
+        }
+      }
+    }
+}
+
+__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
+                                    const __half * const __restrict input,
+                                    const int N, const int C, 
+                                    const int H, const int W,
+                                    const int KH, const int KW, const int V_pad,
+                                    const int H_pad, const int H_out,
+                                    const int W_out, const int V_stride,
+                                    const int H_stride, const int reduced_filter_elem,
+                                    const int skip_every, const int skip_offset) {
+
+      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+      const int n = tx / (C * H_out * W_out); //output image number
+      if(n < N) {
+           const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+          const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+          const int w = tx % W_out; //output width index (col number)
+          const int inH = h * V_stride - V_pad; //input height index (row number)
+          const int inW = w * H_stride - H_pad; //input width index (col number)
+          
+          #pragma unroll
+          for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
+              const int fi = ch * (reduced_filter_elem / C) + ki;
+              const int offset = (skip_offset + ch) % skip_every;
+              const int condition = (fi < offset);
+              const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
+                                                    + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
+              const int i = (in_index % (KW * KH)) / KW;
+              const int j = in_index % KW;
+              const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+              if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+                  output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+              } else {
+                  output[out_index] = 0;
+             }
+        }
+    }
+}
+
+__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
+                    const __half * const __restrict input,
+                    const int N, const int C,
+                    const int H, const int W,
+                    const int KH, const int KW, const int V_pad,
+                    const int H_pad, const int H_out,
+                    const int W_out, const int V_stride,
+                    const int H_stride, const int reduced_filter_elem,
+                    const int skip_every, const int skip_offset) {
+
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (H_out * W_out); //output image number
+    if(n < N) {
+        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+        const int w = tx % W_out; //output width index (col number)
+        const int inH = h * V_stride - V_pad; //input height index (row number)
+        const int inW = w * H_stride - H_pad; //input width index (col number)
+        
+        #pragma unroll
+        for(int fi = 0; fi < reduced_filter_elem; fi++) {
+            const int condition = (fi < skip_offset);
+            const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+                                             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
+            const int ch = in_index / (KW * KH);
+            const int i = (in_index % (KW * KH)) / KW;
+            const int j = in_index % KW; 
+            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+            } else {
+                output[out_index] = 0;
+            }
+        }
+    }
+}
+
+__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output,
+                                    const __half * const __restrict input,
+                                    const int N, const int C,
+                                    const int H, const int W,
+                                    const int KH, const int KW, const int V_pad,
+                                    const int H_pad, const int H_out,
+                                    const int W_out, const int V_stride,
+                                    const int H_stride, const int reduced_filter_elem,
+                                    const int skip_every, const int skip_offset) {
+
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+    const int n = tx / (H_out * W_out); //output image number
+    if(n < N) {
+        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+        const int w = tx % W_out; //output width index (col number)
+        const int inH = h * V_stride - V_pad; //input height index (row number)
+        const int inW = w * H_stride - H_pad; //input width index (col number)
+       #pragma unroll 
+        for(int fi = 0; fi < reduced_filter_elem; fi++) {
+            const int condition = (fi < skip_offset);
+            const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+                                 + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
+            const int ch = in_index / (KW * KH);
+            const int i = (in_index % (KW * KH)) / KW;
+            const int j = in_index % KW;
+            const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+            //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+            } else {
+                output[out_index] = 0;
+            }
+        }
+    }
+}
+
+
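+// FP16 counterparts of createReducedFiltersFull{Regular,Irregular}: gather the
+// surviving filter elements and scale them by `fac` in half precision.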
+__global__ void createReducedFiltersHalfRegular(__half * output,
+                                         const __half * const __restrict input, const int NF,
+                                         const int num_filter_elem, const int reduced_filter_elem,
+                     			 const int channels,
+                                         const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int fIdx = tx / reduced_filter_elem; //filter index
+  if(fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; //offset within the reduced filter
+    const int ch = (offset * channels) / reduced_filter_elem; //channel this element maps to
+    const int channel_offset = (skip_offset + ch) % skip_every; //per-channel skip offset
+    //branchless mapping from the reduced-filter offset back to the original filter index
+    const int condition = (offset < channel_offset);
+    const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
+                          + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
+    //copy the retained weight, scaled by fac to compensate for the dropped elements
+    output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
+  }
+}
+
+__global__ void createReducedFiltersHalfIrregular(__half * output,
+                     const __half * const __restrict input, const int NF,
+                     const int num_filter_elem, const int reduced_filter_elem,
+                     const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int fIdx = tx / reduced_filter_elem; //filter index
+  if(fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; //offset within the reduced filter
+    //branchless mapping from the reduced-filter offset back to the original filter index
+    const int condition = (offset < skip_offset);
+    const int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
+                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
+    //copy the retained weight, scaled by fac to compensate for the dropped elements
+    output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
+  }
+}
+
+
+
+//produces N column-major matrices with H_out*W_out rows and reduced_filter_elem columns
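+//A filter element is dropped when its index satisfies index % skip_every == skip_every - 1;
+//a surviving element k lands in column k - k / skip_every. Illustrative example (not in the
+//original code): skip_every = 4 drops elements 3, 7, 11, ... and maps element 4 -> column 3,
+//element 8 -> column 6.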
+__global__ void convToGemmApprox(float * const __restrict__ output,
+				 const float * const __restrict input, const int N, const int C,
+				 const int H, const int W,
+				 const int KH, const int KW, const int V_pad,
+				 const int H_pad, const int H_out,
+				 const int W_out, const int V_stride,
+				 const int H_stride, const int reduced_filter_elem,
+				 const int skip_every) {
+  
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
+	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
+	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	  else
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
+	}
+      }
+    }
+  }
+}
+
+
+
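+// tensorConvPerfCuda: FP32 convolution with output perforation.
+// row > 1 computes only h_eff output rows (skipping every row-th row, starting at 'start')
+// and reconstructs the skipped rows by interpolation; col > 1 does the same for output
+// columns; otherwise a dense im2col + batched GEMM convolution is run.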
+void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
+			 int vertical_pad, int horizontal_pad, int vertical_stride,
+			 int horizontal_stride, int conv_mode, int conv_groups,
+			 int row, int col, int start){
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+  
+  Tensor* output;
+  // TODO: Support other cases;
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  //Event("H2F_start");
+  convertToFP32(input);
+  convertToFP32(filter);
+  //Event("H2F_end");
+  
+  long int n, c, h, w; // output dimensions
+  n = input->dims.dim_sizes[0];
+  c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+
+  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  int rem_row = (h - start) % row > 0;
+  int h_eff = h - ((h - start) / row) - rem_row;
+  
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  int rem_col = (w - start) % col > 0;
+  int w_eff = w - ((w - start) / col) - rem_col;
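+  // Illustrative example (not in the original code): h = 10, row = 2, start = 0 gives
+  // rem_row = 0 and h_eff = 10 - 10/2 - 0 = 5, i.e. half of the output rows are computed
+  // and the other half are filled in later by approxInterpolateRow.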
+
+  Tensor* new_output;
+  if(row > 1){
+    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    float* convData;
+    long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					       input->dims.dim_sizes[1],
+					       input->dims.dim_sizes[2],
+					       input->dims.dim_sizes[3],
+					       KH, KW,
+					       vertical_pad, horizontal_pad,
+					       h, w,
+					       vertical_stride, horizontal_stride,
+					       row, start, h_eff);
+    checkCudaErrors(cudaDeviceSynchronize());
+
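+    // Batched GEMM (column-major cuBLAS view): for each of the n images, multiply the
+    // (h_eff*w x num_filter_elem) patch matrix by the shared (num_filter_elem x c) filter
+    // matrix (stride 0 reuses the filters across the batch), producing an (h_eff*w x c)
+    // output slab per image.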
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      h_eff * w, c, num_filter_elem,
+					      &alpha,
+					      convData, h_eff * w,
+					      num_filter_elem * h_eff * w,
+					      (float *)filter->gpu_data,
+					      num_filter_elem, 0,
+					      &beta,
+					      (float *)output->gpu_data,
+					      h_eff * w, c * h_eff * w,
+					      n));
+
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 127) / 128;
+    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
+					    (float *) output->gpu_data,
+					    (float *) new_output->gpu_data,
+					    row, start);
+    cudaDeviceSynchronize();
+
+    freeTensor(output);
+    cudaFree(convData);
+  } else if(col > 1){
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    float * convData;
+    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					       input->dims.dim_sizes[1],
+					       input->dims.dim_sizes[2],
+					       input->dims.dim_sizes[3],
+					       KH, KW,
+					       vertical_pad, horizontal_pad, h, w,
+					       vertical_stride, horizontal_stride,
+					       col, start, w_eff);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      h * w_eff, c, num_filter_elem,
+					      &alpha,
+					      convData,
+					      h * w_eff, num_filter_elem * h * w_eff,
+					      (float *)filter->gpu_data,
+					      num_filter_elem, 0,
+					      &beta,
+					      (float *)output->gpu_data,
+					      h * w_eff, c * h * w_eff,
+					      n));
+
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 127) / 128;
+    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
+					    (float *)output->gpu_data,
+					    (float *)new_output->gpu_data,
+					    col, start);
+    cudaDeviceSynchronize();
+
+    freeTensor(output);
+    cudaFree(convData);
+  } else { 
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    float * convData;
+    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApprox<<<gridSize, blockSize>>>(convData,
+					      (float *)input->gpu_data, n,
+					      input->dims.dim_sizes[1],
+					      input->dims.dim_sizes[2],
+					      input->dims.dim_sizes[3],
+					      KH, KW,
+					      vertical_pad, horizontal_pad, h, w,
+					      vertical_stride, horizontal_stride,
+					      num_filter_elem, c * h * w);
+    checkCudaErrors(cudaDeviceSynchronize());
+    //Do the matrix multiplication
+    //Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+    
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      h * w, c, num_filter_elem,
+					      &alpha,
+					      convData, h * w, num_filter_elem * h * w,
+					      (float *)filter->gpu_data, num_filter_elem, 0,
+					      &beta,
+					      (float *)output->gpu_data, h * w, c * h * w,
+					      n));
+
+    new_output = output;
+    cudaFree(convData);
+  }
+
+  //Event("Conv_end");
+  return new_output;
+}
+
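+// switchMatrixFull reorders the output of the single large GEMM, which is laid out as
+// [C][N][H][W] (old_data), into the standard NCHW layout (new_data) expected by the
+// rest of the runtime.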
+__global__
+void switchMatrixFull(int N, int n, int c, int h, int w,
+              float *old_data, float *new_data){
+
+      int i = blockIdx.x * blockDim.x + threadIdx.x;
+      if(i < N){
+          int col = ((i % (c * h * w)) % (h * w)) % w;
+          int row = ((i % (c * h * w)) % (h * w)) / w;
+          int ch = (i % (c * h * w)) / (h * w);
+          int n_new = i / (c * h * w);
+          
+          new_data[((n_new * c + ch) * h + row ) * w + col] =
+                        old_data[((ch * n + n_new) * h + row ) * w + col];
+        }
+}
+
+
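+// tensorConvApprox: FP32 approximate convolution.
+// row > 1 or col > 1 selects output perforation (as in tensorConvPerfCuda);
+// skip_every > 1 selects filter sampling, where filter elements are dropped with period
+// skip_every (shifted by 'offset') and the remaining weights are rescaled;
+// otherwise the dense FP32 baseline is executed.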
+void* tensorConvApprox(void* input_ptr, void* filter_ptr,
+		       int vertical_pad, int horizontal_pad, int vertical_stride,
+		       int horizontal_stride, int conv_mode, int conv_groups,
+		       int row, int col, int skip_every, int offset){
+
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  ////Event("H2F_start");
+  convertToFP32(input);
+  convertToFP32(filter);
+  ////Event("H2F_end");
+
+  const int n = input->dims.dim_sizes[0];
+  const int c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+  const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
+				       CUDNN_TENSOR_NCHW, n, c, h, w);
+  // NOTE: Changing output tensor placement from host to device
+  changeTensorPlacement(new_output, DEVICE);
+  if(row > 1) {
+    const int rem_row = (h - offset) % row > 0;
+    const int h_eff = h - ((h - offset) / row) - rem_row;
+
+    Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+
+    float * convData;
+    long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					       input->dims.dim_sizes[1],
+					       input->dims.dim_sizes[2],
+					       input->dims.dim_sizes[3],
+					       KH, KW, vertical_pad, horizontal_pad,
+					       h, w,
+					       vertical_stride, horizontal_stride,
+					       row, offset, h_eff);
+    checkCudaErrors(cudaDeviceSynchronize());
+     
+     float alpha = 1.0f, beta = 0.0f;
+     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+                                                CUBLAS_OP_N, CUBLAS_OP_N,
+                                                h_eff * w, c, num_filter_elem,
+                                                &alpha,
+                                                convData, h_eff * w, num_filter_elem * h_eff * w,
+                                                (float *)filter->gpu_data, num_filter_elem, 0,
+                                                &beta,
+                                                (float *)output->gpu_data, h_eff * w, c * h_eff * w,
+                                                n));
+    //interpolate
+    int blocksize = 128;
+    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
+    approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
+					    (float *) output->gpu_data,
+					    (float *) new_output->gpu_data,
+					    row, offset);
+    cudaDeviceSynchronize();
+
+    freeTensor(output);
+    cudaFree(convData);
+  } else if(col > 1) {
+    const int rem_col = (w - offset) % col > 0;
+    const int w_eff = w - ((w - offset) / col) - rem_col;
+
+    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+
+    float * convData;
+    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					       input->dims.dim_sizes[1],
+					       input->dims.dim_sizes[2],
+					       input->dims.dim_sizes[3], KH, KW,
+					       vertical_pad, horizontal_pad, h, w,
+					       vertical_stride, horizontal_stride,
+					       col, offset, w_eff);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      h * w_eff, c, num_filter_elem,
+					      &alpha,
+					      convData, h * w_eff, num_filter_elem * h * w_eff,
+					      (float *)filter->gpu_data, num_filter_elem, 0,
+					      &beta,
+					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
+					      n));
+
+    //interpolate
+    int blocksize = 128;
+    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
+    approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
+					    (float *)output->gpu_data,
+					    (float *)new_output->gpu_data,
+					    col, offset);
+    cudaDeviceSynchronize();
+
+    freeTensor(output);
+    cudaFree(convData);
+  } else if(skip_every > 1) {
+    //reduced number after skipping
+    const int remainder = ((num_filter_elem - offset) % skip_every > 0);
+    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
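+    // Illustrative example (not in the original code): a 3x3x3 filter has
+    // num_filter_elem = 27; with skip_every = 3 and offset = 0 this gives
+    // reduced_filter_elem = 27 - 27/3 - 0 = 18 retained elements per filter.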
+
+    float* convData;
+    size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+    float* reducedFilter;
+    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
+    
+    const int filtBlockSize = 128;
+    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    //scale factor that compensates for the dropped filter elements
+    const float fac = ((float) skip_every) / ((float) skip_every - 1);
+    const int blockSize = 128;
+    const int gridSize = (n * h * w + blockSize - 1) / blockSize;
+    if(!(KH * KW % skip_every)) {
+        //regular sampling: skip_every evenly divides KH * KW
+        createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
+                                (float *)filter->gpu_data,
+								c, num_filter_elem,
+								reduced_filter_elem,
+								input->dims.dim_sizes[1], skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+        convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+                                                        input->dims.dim_sizes[1],
+                                                        input->dims.dim_sizes[2],
+                                                        input->dims.dim_sizes[3],
+                                                        KH, KW, vertical_pad, horizontal_pad,
+                                                        h, w, vertical_stride, horizontal_stride,
+                                                        reduced_filter_elem, skip_every, offset);
+    } else {
+        //irregular sampling: skip_every does not evenly divide KH * KW
+        createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
+                                    (float *)filter->gpu_data,
+                                    c, num_filter_elem,
+                                    reduced_filter_elem,
+                                    skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+        convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,     
+                                                                input->dims.dim_sizes[1],                                                     
+                                                                input->dims.dim_sizes[2],                                                 
+                                                                input->dims.dim_sizes[3],
+                                                                KH, KW, vertical_pad, horizontal_pad,
+                                                                h, w, vertical_stride, horizontal_stride,
+                                                                reduced_filter_elem, skip_every, offset);
+    }
+    checkCudaErrors(cudaDeviceSynchronize());
+    
+    const float alpha = 1.0;
+    const float beta = 0.0;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+                                            CUBLAS_OP_N, CUBLAS_OP_N,
+                                            h * w, c, reduced_filter_elem,
+                                            &alpha,
+                                            convData, h * w, reduced_filter_elem * h * w,
+                                            reducedFilter, reduced_filter_elem, 0,
+                                            &beta,
+                                            (float *)new_output->gpu_data, h * w, c * h * w,
+                                            n));
+    cudaFree(convData);
+    cudaFree(reducedFilter);
+  } else {
+      INFO("FP32 BASELINE\n");
+      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
+                               CUDNN_TENSOR_NCHW, n, c, h, w);
+    changeTensorPlacement(output, DEVICE);
+
+    float * convData;
+    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					       input->dims.dim_sizes[1],
+					       input->dims.dim_sizes[2],
+					       input->dims.dim_sizes[3],
+					       KH, KW, vertical_pad, horizontal_pad,
+					       h, w, vertical_stride, horizontal_stride, 
+                           skip_every, offset);
+    checkCudaErrors(cudaDeviceSynchronize());
+     
+     float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+                       n * h * w, c, num_filter_elem,
+                        &alpha,
+                        convData,
+                        CUDA_R_32F, n * h * w,
+                        (float *) filter->gpu_data, CUDA_R_32F,
+                        num_filter_elem,
+                        &beta,
+                        (float *) output->gpu_data,
+                        CUDA_R_32F, n * h * w,
+                        CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    
+    const int numBlocks = (n * c * h * w  + 255) / 256;
+    switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
+                                    (float *)output->gpu_data,
+                                    (float *)new_output->gpu_data);
+    
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    freeTensor(output);
+    cudaFree(convData);
+  }
+
+  //Event("Conv_end");
+  return new_output;
+}
+
+__global__
+void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
+
+      int i = blockIdx.x * blockDim.x + threadIdx.x;
+      if(i < N){
+            int col = ((i % (c * h * w)) % (h * w)) % w;
+            int row = ((i % (c * h * w)) % (h * w)) / w;
+            int ch = (i % (c * h * w)) / (h * w);
+            int n_new = i / (c * h * w);
+            
+            new_data[((n_new * c + ch) * h + row ) * w + col] =
+                            old_data[((ch * n + n_new) * h + row ) * w + col];
+      }
+}
+
+
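+// tensorConvApproxHalf2: FP16 counterpart of tensorConvApprox. The same approximation
+// choices apply (row/col output perforation, filter sampling via skip_every/offset, or
+// the dense baseline); all GEMMs run in half precision and the result is converted back
+// to FP32 before returning.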
+void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
+			   int vertical_pad, int horizontal_pad,
+			   int vertical_stride, int horizontal_stride,
+			   int conv_mode, int conv_groups,
+			   int row, int col, int skip_every, int offset) {
+
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+  
+  profileEvent("F2H_start");
+   convertToFP16(input);
+   convertToFP16(filter);
+  profileEvent("F2H_end");
+  
+  const long int n = input->dims.dim_sizes[0];
+  const long int c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					       CUDNN_TENSOR_NCHW, n, c, h, w);
+  changeTensorPlacement(new_output, DEVICE);
+  const __half alf = approx_float_to_half(1.0);
+  const __half bet = approx_float_to_half(0.0);
+  const __half *alpha_half = &alf;
+  const __half *beta_half = &bet;
+
+  if(row > 1){
+    const int rem_row = (h - offset) % row > 0;
+    const int h_eff = h - ((h - offset) / row) - rem_row;
+    
+    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+						  CUDNN_TENSOR_NCHW,
+						  n, c, h_eff, w);
+    changeTensorPlacement(output_half, DEVICE);
+
+    __half * convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+    
+    const int patchBlockSize = 256;
+    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / patchBlockSize;
+    const int interpolationBlocksize = 256;
+    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
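+    // For small output maps (h * w <= 64) the *Half2 patch kernels emit a batch-interleaved
+    // layout so a single cublasGemmEx covers the whole batch; larger maps use the per-image
+    // layout with cublasHgemmStridedBatched. Each path has a matching interpolation kernel.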
+    if(h * w <= 64) {
+        //INFO("H *W <= 64\n");
+        convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(convData,
+                                   (__half *)input->gpu_half_data, n,
+                                   input->dims.dim_sizes[1],
+                                   input->dims.dim_sizes[2],
+                                   input->dims.dim_sizes[3],
+                                   KH, KW, vertical_pad,
+                                   horizontal_pad, h, w, vertical_stride,
+                                   horizontal_stride, row, offset, h_eff);
+        checkCudaErrors(cudaDeviceSynchronize());
+        
+        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+                         n * h_eff * w, c, num_filter_elem,
+                         alpha_half,
+                         convData, CUDA_R_16F, n * h_eff * w,
+                         (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+                         beta_half,
+                         (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
+                         CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+        approxInterpolateRowHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
+                                        (__half *)output_half->gpu_half_data,
+                                        (__half *)new_output->gpu_half_data,
+                                        row, offset);
+        checkCudaErrors(cudaDeviceSynchronize());
+    
+    } else {
+        //INFO("H *W > 64\n");
+        convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(convData,
+						   (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1],
+						   input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3],
+						   KH, KW, vertical_pad,
+						   horizontal_pad, h, w, vertical_stride,
+						   horizontal_stride, row, offset, h_eff);
+        checkCudaErrors(cudaDeviceSynchronize());
+        
+        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
+                                                CUBLAS_OP_N, CUBLAS_OP_N,
+                                                h_eff * w, c, num_filter_elem,
+                                                alpha_half,
+                                                convData, h_eff * w, num_filter_elem * h_eff * w,
+                                                (__half *)filter->gpu_half_data, num_filter_elem, 0,
+                                                beta_half,
+                                                (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w,
+                                                n));    
+        
+        approxInterpolateRowHalf<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
+						(__half *)output_half->gpu_half_data,
+						(__half *)new_output->gpu_half_data,
+						row, offset);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+    }
+    freeTensor(output_half);
+    cudaFree(convData);
+} else if(col > 1) {
+    const int rem_col = (w - offset) % col > 0;
+    const int w_eff = w - ((w - offset) / col) - rem_col;
+
+    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    changeTensorPlacement(output_half, DEVICE);
+   
+    __half * convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+    
+    const int patchBlockSize = 256;
+    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / patchBlockSize;
+    const int interpolationBlocksize = 256;
+    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
+   if(h * w <= 64) {
+         //INFO("H *W <= 64\n");
+        convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
+                                                input->dims.dim_sizes[1],
+                                                input->dims.dim_sizes[2],
+                                                input->dims.dim_sizes[3], KH, KW, vertical_pad,
+                                                horizontal_pad, h, w, vertical_stride,
+                                                horizontal_stride, col, offset, w_eff);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+                                        n * h * w_eff, c, num_filter_elem,
+                                        alpha_half,
+                                        convData, CUDA_R_16F, n * h * w_eff,
+                                        (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+                                        beta_half,
+                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
+                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+         approxInterpolateColHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
+                                                        (__half *)output_half->gpu_half_data,
+                                                        (__half *)new_output->gpu_half_data,
+                                                        col, offset);
+          checkCudaErrors(cudaDeviceSynchronize());
+    } else {
+        //INFO("H *W > 64\n");
+        convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
+		                            				   input->dims.dim_sizes[1],
+                                                       input->dims.dim_sizes[2],
+                                                       input->dims.dim_sizes[3], KH, KW, vertical_pad,
+                                                       horizontal_pad, h, w, vertical_stride,
+                                                       horizontal_stride, col, offset, w_eff);
+        checkCudaErrors(cudaDeviceSynchronize());
+    
+        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
+                                              CUBLAS_OP_N, CUBLAS_OP_N,
+                                              h * w_eff, c, num_filter_elem,
+                                              alpha_half,
+                                              convData, h * w_eff, num_filter_elem * h * w_eff,
+                                              (__half *)filter->gpu_half_data, num_filter_elem, 0,
+                                              beta_half,
+                                              (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff,
+                                              n));
+
+         approxInterpolateColHalf<<<numInterpolationBlocks,interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
+                                 (__half *)output_half->gpu_half_data,
+                                 (__half *)new_output->gpu_half_data,
+                                 col, offset);
+         checkCudaErrors(cudaDeviceSynchronize());
+    }
+
+    freeTensor(output_half);
+    cudaFree(convData);
+  } else if(skip_every > 1) {
+    const int remainder = ((num_filter_elem - offset) % skip_every > 0);
+    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+
+    __half* convData;
+    size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+    __half* reducedFilter;
+    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+
+    const int filtBlockSize = 256;
+    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac =  ((float) skip_every) / ((float) skip_every - 1);
+    const int blockSize = 256;
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
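+    // Heuristic split on the filter-matrix size: for smaller filters, the per-image layout
+    // feeds cublasHgemmStridedBatched and writes directly into new_output; for larger ones,
+    // the "2" variants of the patch kernels feed one large cublasGemmEx into a temporary
+    // tensor that switchMatrixHalf then reorders to NCHW.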
+    if(c * num_filter_elem < 500000) {
+      if(!(KH * KW % skip_every)) {
+        //INFO("---REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
+                                                                (__half *)filter->gpu_half_data,
+								c, num_filter_elem,
+                                                                reduced_filter_elem,
+                                                                input->dims.dim_sizes[1], skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+	
+        const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+                                                        input->dims.dim_sizes[1],
+                                                        input->dims.dim_sizes[2],
+                                                        input->dims.dim_sizes[3],
+                                                        KH, KW, vertical_pad, horizontal_pad,
+                                                        h, w, vertical_stride, horizontal_stride,
+                                                        reduced_filter_elem, skip_every, offset);
+      } else {
+        //INFO("---IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
+                                    (__half *)filter->gpu_half_data,
+				    c, num_filter_elem,
+                                    reduced_filter_elem,
+                                    skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+        
+        const int gridSize = (n * h * w * input->dims.dim_sizes[1]  + blockSize - 1) / blockSize;
+	    //convToGemmHalfInputIrregular
+        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,  
+                                                                input->dims.dim_sizes[1],
+                                                                input->dims.dim_sizes[2],
+                                                                input->dims.dim_sizes[3],
+                                                                KH, KW, vertical_pad, horizontal_pad,
+                                                                h, w, vertical_stride, horizontal_stride,
+                                                                reduced_filter_elem, skip_every, offset);
+     }   
+     checkCudaErrors(cudaDeviceSynchronize());
+
+     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
+                                            CUBLAS_OP_N, CUBLAS_OP_N,
+                                            h * w, c, reduced_filter_elem,
+                                            alpha_half,
+                                            convData, h * w, reduced_filter_elem * h * w,
+                                            reducedFilter, reduced_filter_elem, 0,
+                                            beta_half,
+                                            (__half *)new_output->gpu_half_data, h * w, c * h * w,
+                                            n));
+    } else {
+        Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+                                 CUDNN_TENSOR_NCHW, n, c, h, w);
+        changeTensorPlacement(output_half, DEVICE);
+
+        if(!(KH * KW % skip_every)) {
+           //INFO("REGULAR FILTERING\n");
+            createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
+                                                        (__half *)filter->gpu_half_data,
+                                                        c, num_filter_elem,
+                                                        reduced_filter_elem,
+                                                        input->dims.dim_sizes[1], skip_every, offset, fac);
+            checkCudaErrors(cudaDeviceSynchronize());
+            
+            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+            convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+                                                                input->dims.dim_sizes[1],
+                                                                input->dims.dim_sizes[2],
+                                                                input->dims.dim_sizes[3],
+                                                                KH, KW, vertical_pad, horizontal_pad,
+                                                                h, w, vertical_stride, horizontal_stride,
+                                                                reduced_filter_elem, skip_every, offset);
+        } else {
+           // INFO("IRREGULAR FILTERING\n");
+            createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
+                                                                            (__half *)filter->gpu_half_data,
+                                                                            c, num_filter_elem,
+                                                                            reduced_filter_elem,
+                                                                            skip_every, offset, fac);
+            checkCudaErrors(cudaDeviceSynchronize());
+            
+            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+            convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+                                                                input->dims.dim_sizes[1],
+                                                                input->dims.dim_sizes[2],
+                                                                input->dims.dim_sizes[3],
+                                                                KH, KW, vertical_pad, horizontal_pad,
+                                                                h, w, vertical_stride, horizontal_stride,
+                                                                reduced_filter_elem, skip_every, offset);
+            }
+            checkCudaErrors(cudaDeviceSynchronize());
+
+            checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+                                        n * h * w, c, reduced_filter_elem,
+                                        alpha_half,
+                                        convData, CUDA_R_16F, n * h * w,
+                                         reducedFilter, CUDA_R_16F, reduced_filter_elem,
+                                        beta_half,
+                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
+                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+            
+            int numBlocks = (n * c * h * w  + 255) / 256;
+            switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
+                                    (__half *)output_half->gpu_half_data,
+                                    (__half *)new_output->gpu_half_data);
+            checkCudaErrors(cudaDeviceSynchronize());
+
+            freeTensor(output_half);
+    }
+    
+    cudaFree(convData);
+    cudaFree(reducedFilter);
+  } else {
+       //INFO("FP16 BASELINE\n");
+      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+                                   CUDNN_TENSOR_NCHW, n, c, h, w);
+      
+      changeTensorPlacement(output, DEVICE);
+      __half * convData;
+      long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+      checkCudaErrors(cudaMalloc(&convData, convDataSize));
+      
+      const int blockSize = 256;
+      const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+      //convToGemmHalf
+      convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData,
+                                                (__half *)input->gpu_half_data, n,
+                                                input->dims.dim_sizes[1],
+                                                input->dims.dim_sizes[2],
+                                                input->dims.dim_sizes[3],
+                                                KH, KW, vertical_pad,
+                                                horizontal_pad, h, w, vertical_stride,
+                                                horizontal_stride, num_filter_elem,
+                                                skip_every, offset);
+        checkCudaErrors(cudaDeviceSynchronize());
+        
+        const __half alf = approx_float_to_half(1.0);
+        const __half bet = approx_float_to_half(0.0);
+        const __half *alpha_half = &alf;
+        const __half *beta_half = &bet;
+        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+                                    n * h * w, c, num_filter_elem,
+                                    alpha_half,
+                                    convData, CUDA_R_16F, n * h * w,
+                                    (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+                                    beta_half,
+                                    (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w,
+                                    CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+        
+        const int numBlocks = (n * c * h * w  + 255) / 256;
+        switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data,
+                                            (__half *)new_output->gpu_half_data);
+        checkCudaErrors(cudaDeviceSynchronize());
+        
+        freeTensor(output);
+        cudaFree(convData);
+  }
+  //convert the half-precision result back to FP32 before returning it
+  profileEvent("H2F_start");
+  convertToFP32_offline(new_output);
+  profileEvent("H2F_end");
+  return new_output;
+}
+
+} // end of Extern C
-- 
GitLab