diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
index 5051b780894b6e758cf768e663739ea6b92c71e5..1f69bb74d184f59896494dbc9ea5b5a934f2ec67 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
@@ -1,9 +1,17 @@
+//===-------------------------- approx_simulation.cu ----------------------===//
+//
+//===----------------------------------------------------------------------===//
+//   
+//  This file implements the simulation of software approximations for
+//  tensor convolutions. The approximations implemented are feature sampling
+//  and perforation for FP32 and FP16 compute precisions.
+//
+//===----------------------------------------------------------------------===//
+
 
 #ifndef SIM_HEADER
 #define SIM_HEADER
 
-
-
 #include "tensor_runtime.h"
 #include "tensor_utils.h"
 #include "debug.h"
@@ -29,8 +37,6 @@
 #include <cassert>
 
 
-
-
 //N is new_data's size
 //n, c, h, w are the dimensions of new_data
 __global__
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
deleted file mode 100644
index 8f2d840362ee523a458339b848e9080a2822d92f..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
+++ /dev/null
@@ -1,2000 +0,0 @@
-
-#include "tensor_utils.h"
-#include "approx_utils.h"
-#include "debug.h"
-#include "global_data.h"
-#include "fp16_gemm.h"
-#include "fp16_conversion.h"
-#include "profiling.h"
-
-extern "C"{
-
-__global__ void convToGemm(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-        else
-            output[out_index] = 0;
-      }
-    }
-  }
-}
-
-__global__ void convToGemmFullInput(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride,
-                    const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen
-                  if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-                      int output_col = filter_elem_num -
-                               ((filter_elem_num + skip_every)/skip_every);
-                        if(skip_every == 1) output_col = filter_elem_num;
-                        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                            output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                        input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                        else         
-                            output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                   }                
-                }              
-            }                
-        }
-}
-
-__global__ void convToGemmHalfInputNew(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-      
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if(filter_elem_num % skip_every != skip_offset) {
-                      int output_col = filter_elem_num -
-                                        (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                     if(skip_every == 1) output_col = filter_elem_num;
-                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                          output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                      else
-                          output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                  }
-              }
-           }
-      }
-}
-
-
-__global__
-void convToGemmHalf(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW,
-                    const int V_pad, const int H_pad,
-                    const int H_out, const int W_out,
-                    const int V_stride, const int H_stride){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i
-    const int n = tx / (C * H_out * W_out); //output image numbe
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number
-    const int w = tx % W_out; //output width index (col number
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
-                                            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                } else {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
-                }
-            }
-        }
-    }
-}
-
-__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output,
-                                        const __half * const __restrict input,
-                                        const int N, const int C,
-                                        const int H, const int W,
-                                        const int KH, const int KW, const int V_pad,
-                                        const int H_pad, const int H_out,
-                                        const int W_out, const int V_stride,
-                                        const int H_stride, const int reduced_filter_elem,
-                                        const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  //const int ki = c * KH * KW + i;
-                  //const int kj = c * KH * KW + j;
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                     const int output_col = condition * filter_elem_num 
-                                    + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) 
-                                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));                   
-                  //if(filter_elem_num % skip_every != skip_offset) {
-                  // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                   //if(skip_every == 1)
-                   //    output_col = filter_elem_num;
-                    const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                       output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                     else
-                       output[out_index] = 0;
-              }
-            }
-        }
-    }
-}
-
-__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output,
-                                                const __half * const __restrict input,
-                                                const int N, const int C,
-                                                const int H, const int W,
-                                                const int KH, const int KW, const int V_pad,
-                                                const int H_pad, const int H_out,
-                                                const int W_out, const int V_stride,
-                                                const int H_stride, const int reduced_filter_elem,
-                                                const int skip_every, const int skip_offset) {
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_out * W_out); //output image number
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                //const int ki = c * KH * KW + i;
-                //const int kj = c * KH * KW + j;
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                    const int output_col = condition * filter_elem_num
-                                        + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
-                                        - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-                    //if(filter_elem_num % skip_every != skip_offset) {
-                    // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                    //if(skip_every == 1)
-                    //    output_col = filter_elem_num;
-                    const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
-                    //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
-        }
-    }
-}
-
-
-
-__global__ void convToGemmHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) { 
-      for(int j = 0; j < KW; j++) { 
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element 
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-        else
-            output[out_index] = 0;
-      }
-    }
-  }
-}
-
-__global__ void convToGemmPerfRow(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    int h_index;
-    if(h < start) {
-        h_index = h;
-    } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
-    }
-    const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-   //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-      //  int j = ki % KW;
-    for(int i = 0; i < KH; i++) {
-        for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
-    const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[out_index] = 0;
-      }
-    }
-  }
-}
-
-__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w,
-			  float *old_data, float *new_data, int x, int start){
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-    
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) { 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) { 
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                (old_data[output_index] + old_data[output_index - w]) / 2;
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); 
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
-    }
-}
-
-__global__ void convToGemmPerfCol(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
-    int w_index;
-    if(w < start) {
-      w_index = w;
-    } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
-    }
-    const int inW = w_index * H_stride - H_pad; 
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-       // int j = ki % KW;
-    
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW  + i * KW + j; //index of this filter element
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
-      }
-    }
-  }
-}
-
-__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w,
-			                        float *old_data, float *new_data, int x, int start) { 
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-   	 const int row = tx % (h * w) / w; //output height index (row number)
-    	const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] 
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-       		int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                	    (old_data[output_index] + old_data[output_index - 1]) / 2;
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);  
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-       	 	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
-    }
-}
-
-__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    int h_index;
-    if(h < start) {
-        h_index = h;
-    } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
-    }
-    const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-  // #pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-     //   int i = ki / KW; 
-     //   int j = ki % KW;
-   
-   for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-    	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-        else
-          output[out_index] = 0;
-      }
-    }
-  }
-}
-
-__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_eff * W_out); //output image numbe
-    if(n < N) { 
-        const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-        const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        int h_index;                   
-        if(h < start) {                
-            h_index = h;               
-        } else {                       
-            h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;                                                            
-        }                              
-        const int inH = h_index * V_stride - V_pad;
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        // #pragma unroll
-        //for (int ki = 0; ki < KH * KW; ki++) {
-            //   int i = ki / KW; 
-            //   int j = ki % KW; 
-            for(int i = 0; i < KH; i++) {
-                for(int j = 0; j < KW; j++) {
-                    const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-                    const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
-                    //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
-    }
-}
-
-__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image number
-    //const int stride = blockDim.x * gridDim.x;
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    //for(int i = index; i < N; i += stride){
-        //const int col = ((i % (c * h * w)) % (h * w)) % w;
-        //const int row = ((i % (c * h * w)) % (h * w)) / w;
-        //const int ch = (i % (c * h * w)) / (h * w);
-       // const int n = i / (c * h * w);
-
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-				__hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
-    }
-}
-
-__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-    
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image numbe
-    //const int stride = blockDim.x * gridDim.x;
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    //for(int i = index; i < N; i += stride){
-        //const int col = ((i % (c * h * w)) % (h * w)) % w;
-        //const int row = ((i % (c * h * w)) % (h * w)) / w;
-        //const int ch = (i % (c * h * w)) / (h * w);
-        //const int n = i / (c * h * w);
-        
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col];
-       } else if(row == h-1) {
-           new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            const int row_index = row - ((row + 1 - start) / x);
-            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
-    }
-}
-
-
-__global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
-    int w_index;
-    if(w < start) {
-      w_index = w;
-    } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
-    }
-    const int inW = w_index * H_stride - H_pad;
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-     //#pragma unroll
-    //  for (int ki = 0; ki < KH * KW; ki++) {               
-      //    int i = ki / KW;
-       //   int j = ki % KW; 
-    
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-        else
-          output[out_index] = 0;
-
-      }
-    }
-  }
-}
-
-__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                        const int H, const int W, const int KH, const int KW, const int V_pad,
-                        const int H_pad, const int H_out, const int W_out, const int V_stride,
-                        const int H_stride, const int x, const int start, const int W_eff){
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_eff); //output image number
-      if(n < N) {
-          const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-          const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-          const int w = tx % W_eff; //output width index (col number)
-          int w_index;
-          if(w < start) {
-              w_index = w;
-          } else {
-              w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
-          }
-          const int inW = w_index * H_stride - H_pad;
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          //#pragma unroll
-          //  for (int ki = 0; ki < KH * KW; ki++) {               
-              //    int i = ki / KW;
-              //   int j = ki % KW; 
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
-                  const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
-                  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                  else
-                      output[out_index] = 0;
-              }
-        }
-    }
-}
-
-
-__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int stride = blockDim.x * gridDim.x;
-    
-    //for(int i = index; i < N; i += stride){
-      //  const int col = ((i % (c * h * w)) % (h * w)) % w;
-      //  const int row = ((i % (c * h * w)) % (h * w)) / w;
-      //  const int ch = (i % (c * h * w)) / (h * w);
-      //  const int n = i / (c * h * w);
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-    	const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-        	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-			__hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
-   }
-}
-
-__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-    
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int stride = blockDim.x * gridDim.x;
-    
-   // for(int i = index; i < N; i += stride){
-       // const int col = ((i % (c * h * w)) % (h * w)) % w;
-       // const int row = ((i % (c * h * w)) % (h * w)) / w;
-       // const int ch = (i % (c * h * w)) / (h * w);
-       // const int n = i / (c * h * w);
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //output chan number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-        if(col < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                        = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-        } else if(col == w - 1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-                            //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-        } else if (col == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-        } else if((col - start) % x == 0) {
-            const int col_index = col - ((col + 1 - start) / x);
-            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-        } else {
-            const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
-    }
-}
-
-
-__global__ void convToGemmFullInputRegular(float * const __restrict__ output,
-				    const float * const __restrict input,
-				    const int N, const int C,
-				    const int H, const int W,
-				    const int KH, const int KW, const int V_pad,
-				    const int H_pad, const int H_out,
-				    const int W_out, const int V_stride,
-				    const int H_stride, const int reduced_filter_elem,
-				    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (H_out * W_out); //output image number
-  if(n < N) {
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         const int ch = (fi * C) / reduced_filter_elem;
-         const int offset = (skip_offset + ch) % skip_every;
-         int in_index;
-         if(fi < offset) {
-             in_index = fi;
-         } else {
-             in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                        + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-        }
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; 
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
-      }
-    }
-}
-
-__global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            int in_index;
-            if(fi < skip_offset) {
-                in_index = fi;
-            } else {
-                in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                            + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-            }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
-    }
-}
-
-__global__ void createReducedFiltersFullRegular(float * output,
-					 const float * const __restrict input, const int NF,
-					 const int num_filter_elem, const int reduced_filter_elem, 
-                     const int channels,
-					 const int skip_every, const int skip_offset, const float fac) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) { 
-    const int offset = tx % reduced_filter_elem; //offset within filter
-    const int ch = (offset * channels) / reduced_filter_elem;
-    const int channel_offset = (skip_offset + ch) % skip_every;
-      int in_index;
-      if(offset < channel_offset) {
-        in_index = offset;
-     } else {
-         in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                  + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-     }
-    output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
-  }
-}
-
-__global__ void createReducedFiltersFullIrregular(float * output,
-                     const float * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int fIdx = tx / reduced_filter_elem; //filter index
-      if(fIdx < NF) {
-        const int offset = tx % reduced_filter_elem; //offset within filter
-        int in_index;
-        if(offset < skip_offset) {
-            in_index = offset;
-        } else {
-            in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; 
-        }
-        output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
-    }
-}
-
-__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         //const int ch = (fi * C) / reduced_filter_elem;
-      for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-        const int fi = ch * (reduced_filter_elem / C) + ki;
-        const int offset = (skip_offset + ch) % skip_every;
-         //int in_index;
-         const bool condition = (fi < offset);
-         const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-         //if(fi < offset) {
-         //    in_index = fi;
-         //} else {
-         //    in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-           //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-       // }
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
-      }
-    }
-}
-
-__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C, 
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      if(n < N) {
-           const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-          const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-          const int w = tx % W_out; //output width index (col number)
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          const int inW = w * H_stride - H_pad; //input width index (col number)
-          
-          #pragma unroll
-           for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-               const int fi = ch * (reduced_filter_elem / C) + ki;
-          //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-           //   const int ch = (fi * C) / reduced_filter_elem;
-              const int offset = (skip_offset + ch) % skip_every;
-              const int condition = (fi < offset);
-             const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                          + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-             // int in_index;
-              //if(fi < offset) {
-               //   in_index = fi;
-              //} else {
-               //   in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                 //               + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-             // }
-              const int i = (in_index % (KW * KH)) / KW;
-              const int j = in_index % KW;
-              const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-              if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                  output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-              } else {
-                  output[out_index] = 0;
-             }
-        }
-    }
-}
-
-__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-            //int in_index;
-            //if(fi < skip_offset) {
-             //   in_index = fi;
-            //} else {        
-              //  in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-              //              + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW; 
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
-    }
-}
-
-__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-       #pragma unroll 
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                 + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-           // int in_index;
-           // if(fi < skip_offset) {
-           //     in_index = fi;
-           // } else {
-            //    in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                   //             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-            //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
-    }
-}
-
-
-__global__ void createReducedFiltersHalfRegular(__half * output,
-                                         const __half * const __restrict input, const int NF,
-                                         const int num_filter_elem, const int reduced_filter_elem,
-                     			 const int channels,
-                                         const int skip_every, const int skip_offset, const float fac) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int stride = blockDim.x * gridDim.x;
-  
-  //#pragma unroll
-  //for (int i = tx; i < NF; i += stride) {
-    const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) {
-    const int offset = tx % reduced_filter_elem; //offset within filter
-    const int ch = (offset * channels) / reduced_filter_elem;
-    const int channel_offset = (skip_offset + ch) % skip_every;
-    const int condition = (offset < channel_offset);
-    const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                          + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
-      
-     // int in_index;
-     // if(offset < channel_offset) {
-      //  in_index = offset;
-     //} else {
-       //  in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-         //         + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-    // }
-    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
- }
-}
-
-__global__ void createReducedFiltersHalfIrregular(__half * output,
-                     const __half * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-     //const int stride = blockDim.x * gridDim.x;
-      //#pragma unroll
-      //for (int i = tx; i < NF; i += stride) { 
-  
-      const int fIdx = tx / reduced_filter_elem; //filter index
-      if(fIdx < NF) {
-        const int offset = tx % reduced_filter_elem; //offset within filter
-        const int condition = (offset < skip_offset);
-        int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-        //}
-        output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
-    //}
-  }
-}
-
-
-
-//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols
-__global__ void convToGemmApprox(float * const __restrict__ output,
-				 const float * const __restrict input, const int N, const int C,
-				 const int H, const int W,
-				 const int KH, const int KW, const int V_pad,
-				 const int H_pad, const int H_out,
-				 const int W_out, const int V_stride,
-				 const int H_stride, const int reduced_filter_elem,
-				 const int skip_every) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
-	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
-	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	  else
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
-	}
-      }
-    }
-  }
-}
-
-
-
-void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad, int vertical_stride,
-			 int horizontal_stride, int conv_mode, int conv_groups,
-			 int row, int col, int start){
-
-  //////INFO("*** TensorConvolution (output perforation) \n");
-  //Event("Conv");
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
-  if (conv_groups == 0) {
-    conv_groups = 1;
-  }
-  
-  Tensor* output;
-  // TODO: Support other cases;
-  hostToDeviceCopy(input);
-  hostToDeviceCopy(filter);
-
-  //Event("H2F_start");
-  convertToFP32(input);
-  convertToFP32(filter);
-  //Event("H2F_end");
-  
-  long int n, c, h, w; // output dimensions
-  n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
-  const int KH = filter->dims.dim_sizes[2];
-  const int KW = filter->dims.dim_sizes[3];
-
-  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  int rem_row = (h - start) % row > 0;
-  int h_eff = h - ((h - start) / row) - rem_row;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  int rem_col = (w - start) % col > 0;
-  int w_eff = w - ((w - start) / col) - rem_col;
-
-  Tensor* new_output;
-  if(row > 1){
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
-    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
-    float* convData;
-    long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-
-    const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, start, h_eff);
-    checkCudaErrors(cudaDeviceSynchronize());
-
-    float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w,
-					      num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h_eff * w, c * h_eff * w,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(new_output, DEVICE);
-
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, start);
-    cudaDeviceSynchronize();
-
-    freeTensor(output);
-    cudaFree(convData);
-  } else if(col > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
-    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
-    float * convData;
-    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-
-    const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, start, w_eff);
-    checkCudaErrors(cudaDeviceSynchronize());
-
-    float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData,
-					      h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h * w_eff, c * h * w_eff,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(new_output, DEVICE);
-
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, start);
-    cudaDeviceSynchronize();
-
-    freeTensor(output);
-    cudaFree(convData);
-  } else { 
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
-    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
-    float * convData;
-    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-
-    const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    convToGemmApprox<<<gridSize, blockSize>>>(convData,
-					      (float *)input->gpu_data, n,
-					      input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2],
-					      input->dims.dim_sizes[3],
-					      KH, KW,
-					      vertical_pad, horizontal_pad, h, w,
-					      vertical_stride, horizontal_stride,
-					      num_filter_elem, c * h * w);
-    checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication
-    //Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
-    
-    float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w, num_filter_elem * h * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w, c * h * w,
-					      n));
-
-    new_output = output;
-    cudaFree(convData);
-  }
-
-  //Event("Conv_end"); //, true);
-  return new_output;
-}
-
-__global__
-void switchMatrixFull(int N, int n, int c, int h, int w,
-              float *old_data, float *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-          int col = ((i % (c * h * w)) % (h * w)) % w;
-          int row = ((i % (c * h * w)) % (h * w)) / w;
-          int ch = (i % (c * h * w)) / (h * w);
-          int n_new = i / (c * h * w);
-          
-          new_data[((n_new * c + ch) * h + row ) * w + col] =
-                        old_data[((ch * n + n_new) * h + row ) * w + col];
-        }
-}
-
-
-void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-		       int vertical_pad, int horizontal_pad, int vertical_stride,
-		       int horizontal_stride, int conv_mode, int conv_groups,
-		       int row, int col, int skip_every, int offset){
-
-  //////INFO("*** TensorConvolution approximation \n");
-  //Event("Conv");
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
-  if (conv_groups == 0) {
-    conv_groups = 1;
-  }
-
-  hostToDeviceCopy(input);
-  hostToDeviceCopy(filter);
-
-  ////Event("H2F_start");
-  convertToFP32(input);
-  convertToFP32(filter);
-  ////Event("H2F_end");
-
-  const int n = input->dims.dim_sizes[0];
-  const int c = filter->dims.dim_sizes[0]; //number of filters
-  const int KH = filter->dims.dim_sizes[2];
-  const int KW = filter->dims.dim_sizes[3];
-  const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
-  // NOTE: Changing output tensor placement from host to device
-  changeTensorPlacement(new_output, DEVICE);
-  ////INFO("batch: %d\n", n);
-  ////INFO("channels: %d\n", input->dims.dim_sizes[1]);
-  ////INFO("num_filters: %d\n", c);
-  ////INFO("kernel height: %d\n", KH);
-  ////INFO("kernel width: %d\n", KW);
-  ////INFO("num_filter_elem: %d\n", num_filter_elem);
-  ////INFO("vertical_stride: %d\n", vertical_stride);
-  ////INFO("horizontal_stride: %d\n", horizontal_stride);
-  ////INFO("output height: %d\n", h);
-  ////INFO("output width: %d\n", w);
-  if(row > 1) {
-    const int rem_row = (h - offset) % row > 0;
-    const int h_eff = h - ((h - offset) / row) - rem_row;
-
-    Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-
-    float * convData;
-    long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-
-    const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, offset, h_eff);
-    checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                &alpha,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (float *)filter->gpu_data, num_filter_elem, 0,
-                                                &beta,
-                                                (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-                                                n));
-    //interpolate
-    int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, offset);
-    cudaDeviceSynchronize();
-
-    freeTensor(output);
-    cudaFree(convData);
-  } else if(col > 1) {
-    const int rem_col = (w - offset) % col > 0;
-    const int w_eff = w - ((w - offset) / col) - rem_col;
-
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-
-    float * convData;
-    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-
-    const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, offset, w_eff);
-    checkCudaErrors(cudaDeviceSynchronize());
-
-    float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
-					      n));
-
-    //interpolate
-    int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, offset);
-    cudaDeviceSynchronize();
-
-    freeTensor(output);
-    cudaFree(convData);
-  } else if(skip_every > 1) {
-    //reduced number after skipping
-    const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
-
-    float* convData;
-    size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    float* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
-    
-    const int filtBlockSize = 128;
-    ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac =  ((float) skip_every) / ((float) skip_every - 1);
-    //////INFO("fac: %f\n", fac);
-    const int blockSize = 128;
-    //////INFO("n * h * w : %d\n", (n * h * w ));    
-    const int gridSize = (n * h * w + blockSize - 1) / blockSize;  
-    if(!(KH * KW % skip_every)) {
-       // ////INFO("REGULAR FILTERING\n");
-        createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                (float *)filter->gpu_data,
-								c, num_filter_elem,
-								reduced_filter_elem,
-								input->dims.dim_sizes[1], skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
-    } else {
-       // ////INFO("IRREGULAR FILTERING\n");
-        createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (float *)filter->gpu_data,
-                                    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,     
-                                                                input->dims.dim_sizes[1],                                                     
-                                                                input->dims.dim_sizes[2],                                                 
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-    }
-    checkCudaErrors(cudaDeviceSynchronize());
-    
-    const float alpha = 1.0;
-    const float beta = 0.0;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            &alpha,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
-    cudaFree(convData);
-    cudaFree(reducedFilter);
-  } else {
-      INFO("FP32 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-                               CUDNN_TENSOR_NCHW, n, c, h, w);
-    changeTensorPlacement(output, DEVICE);
-
-    float * convData;
-    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-
-    const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w));
-    convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w, vertical_stride, horizontal_stride, 
-                           skip_every, offset);//num_filter_elem);
-    checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     /*
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                          CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, num_filter_elem,
-                                            &alpha,
-                                            convData, h * w, num_filter_elem * h * w,
-                                            (float *)filter->gpu_data, num_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
-    */
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                       n * h * w, c, num_filter_elem,
-                        &alpha,
-                        convData,
-                        CUDA_R_32F, n * h * w,
-                        (float *) filter->gpu_data, CUDA_R_32F,
-                        num_filter_elem,
-                        &beta,
-                        (float *) output->gpu_data,
-                        CUDA_R_32F, n * h * w,
-                        CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-    
-    const int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (float *)output->gpu_data,
-                                    (float *)new_output->gpu_data);
-    
-    checkCudaErrors(cudaDeviceSynchronize());
-    cudaFree(convData);
-  }
-
-  //Event("Conv_end");
-  return new_output;
-}
-
-__global__
-void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-            int col = ((i % (c * h * w)) % (h * w)) % w;
-            int row = ((i % (c * h * w)) % (h * w)) / w;
-            int ch = (i % (c * h * w)) / (h * w);
-            int n_new = i / (c * h * w);
-            
-            new_data[((n_new * c + ch) * h + row ) * w + col] =
-                            old_data[((ch * n + n_new) * h + row ) * w + col];
-      }
-}
-
-
-void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad,
-			   int vertical_stride, int horizontal_stride,
-			   int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset) {
-
- //INFO("*** TensorConvolution half approximation \n");
- // profileEvent("#Conv");
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
-  if (conv_groups == 0) {
-    conv_groups = 1;
-  }
-
-  hostToDeviceCopy(input);
-  hostToDeviceCopy(filter);
-  
-  profileEvent("F2H_start");
-   convertToFP16(input);
-   convertToFP16(filter);
-  profileEvent("F2H_end");
-  
-  const long int n = input->dims.dim_sizes[0];
-  const long int c = filter->dims.dim_sizes[0]; //number of filters
-  const int KH = filter->dims.dim_sizes[2];
-  const int KW = filter->dims.dim_sizes[3];
-  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					       CUDNN_TENSOR_NCHW, n, c, h, w);
-  changeTensorPlacement(new_output, DEVICE);
-  //INFO("batch: %d\n", n);
-  // INFO("channels: %d\n", input->dims.dim_sizes[1]);
-  // INFO("num_filters: %d\n", c);
-  // INFO("kernel height: %d\n", KH);
-  // INFO("kernel width: %d\n", KW);   
-  // INFO("num_filter_elem: %d\n", num_filter_elem);
-   //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
-   //INFO("vertical_stride: %d\n", vertical_stride);
-   //INFO("horizontal_stride: %d\n", horizontal_stride);
-  // INFO("output height: %d\n", h);
-  // INFO("output width: %d\n", w);
-   //INFO("skip_every: %d\n", skip_every);
-  const __half alf = approx_float_to_half(1.0);
-  const __half bet = approx_float_to_half(0.0);
-  const __half *alpha_half = &alf;
-  const __half *beta_half = &bet;
-
-  if(row > 1){
-    const int rem_row = (h - offset) % row > 0;
-    const int h_eff = h - ((h - offset) / row) - rem_row;
-    
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW,
-						  n, c, h_eff, w);
-    changeTensorPlacement(output_half, DEVICE);
-
-    __half * convData;
-    long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    
-    const int patchBlockSize = 256;
-    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / patchBlockSize;
-    const int interpolationBlocksize = 256;
-    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
-    if(h * w <= 64) {
-        //INFO("H *W <= 64\n");
-        convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(convData,
-                                   (__half *)input->gpu_half_data, n,
-                                   input->dims.dim_sizes[1],
-                                   input->dims.dim_sizes[2],
-                                   input->dims.dim_sizes[3],
-                                   KH, KW, vertical_pad,
-                                   horizontal_pad, h, w, vertical_stride,
-                                   horizontal_stride, row, offset, h_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                         n * h_eff * w, c, num_filter_elem,
-                         alpha_half,
-                         convData, CUDA_R_16F, n * h_eff * w,
-                         (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                         beta_half,
-                         (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
-                         CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-        approxInterpolateRowHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-                                        (__half *)output_half->gpu_half_data,
-                                        (__half *)new_output->gpu_half_data,
-                                        row, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-    
-    } else {
-        //INFO("H *W > 64\n");
-        convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(convData,
-						   (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3],
-						   KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, row, offset, h_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                alpha_half,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                                beta_half,
-                                                (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w,
-                                                n));    
-        
-        approxInterpolateRowHalf<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						row, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-
-    }
-    freeTensor(output_half);
-    cudaFree(convData);
-} else if(col > 1) {
-    const int rem_col = (w - offset) % col > 0;
-    const int w_eff = w - ((w - offset) / col) - rem_col;
-
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
-    changeTensorPlacement(output_half, DEVICE);
-   
-    __half * convData;
-    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    
-    const int patchBlockSize = 256;
-    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / patchBlockSize;
-    const int interpolationBlocksize = 256;
-    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
-   if(h * w <= 64) {
-         //INFO("H *W <= 64\n");
-        convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, col, offset, w_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w_eff, c, num_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w_eff,
-                                        (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-         approxInterpolateColHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                                        (__half *)output_half->gpu_half_data,
-                                                        (__half *)new_output->gpu_half_data,
-                                                        col, offset);
-          checkCudaErrors(cudaDeviceSynchronize());
-    } else {
-        //INFO("H *W > 64\n");
-        convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
-		                            				   input->dims.dim_sizes[1],
-                                                       input->dims.dim_sizes[2],
-                                                       input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                       horizontal_pad, h, w, vertical_stride,
-                                                       horizontal_stride, col, offset, w_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-    
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                              CUBLAS_OP_N, CUBLAS_OP_N,
-                                              h * w_eff, c, num_filter_elem,
-                                              alpha_half,
-                                              convData, h * w_eff, num_filter_elem * h * w_eff,
-                                              (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                              beta_half,
-                                              (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff,
-                                              n));
-
-         approxInterpolateColHalf<<<numInterpolationBlocks,interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                 (__half *)output_half->gpu_half_data,
-                                 (__half *)new_output->gpu_half_data,
-                                 col, offset);
-         checkCudaErrors(cudaDeviceSynchronize());
-    }
-
-    freeTensor(output_half);
-    cudaFree(convData);
-  } else if(skip_every > 1) {
-    const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
-
-    __half* convData;
-    size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
-    checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    __half* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
-
-    const int filtBlockSize = 256;
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac =  ((float) skip_every) / ((float) skip_every - 1);
-    const int blockSize = 256;
-    //const int gridSize = (n * h * w + blockSize - 1) / blockSize;
-   // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
-   // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
-    const __half alf = approx_float_to_half(1.0);
-    const __half bet = approx_float_to_half(0.0);
-    const __half *alpha_half = &alf;
-    const __half *beta_half = &bet;
-    if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { 
-      if(!(KH * KW % skip_every)) {
-        //INFO("---REGULAR FILTERING\n");
-        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                (__half *)filter->gpu_half_data,
-								c, num_filter_elem,
-                                                                reduced_filter_elem,
-                                                                input->dims.dim_sizes[1], skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-	
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
-      } else {
-        //INFO("---IRREGULAR FILTERING\n");
-        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (__half *)filter->gpu_half_data,
-				    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1]  + blockSize - 1) / blockSize;
-	    //convToGemmHalfInputIrregular
-        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,  
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-     }   
-     checkCudaErrors(cudaDeviceSynchronize());
-
-     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            alpha_half,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            beta_half,
-                                            (__half *)new_output->gpu_half_data, h * w, c * h * w,
-                                            n));
-    } else {
-        Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                 CUDNN_TENSOR_NCHW, n, c, h, w);
-        changeTensorPlacement(output_half, DEVICE);
-
-        if(!(KH * KW % skip_every)) {
-           //INFO("REGULAR FILTERING\n");
-            createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                        (__half *)filter->gpu_half_data,
-                                                        c, num_filter_elem,
-                                                        reduced_filter_elem,
-                                                        input->dims.dim_sizes[1], skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-        } else {
-           // INFO("IRREGULAR FILTERING\n");
-            createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                            (__half *)filter->gpu_half_data,
-                                                                            c, num_filter_elem,
-                                                                            reduced_filter_elem,
-                                                                            skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-            }
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w, c, reduced_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w,
-                                         reducedFilter, CUDA_R_16F, reduced_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-            
-            int numBlocks = (n * c * h * w  + 255) / 256;
-            switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (__half *)output_half->gpu_half_data,
-                                    (__half *)new_output->gpu_half_data);
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            freeTensor(output_half);
-    }
-    
-    cudaFree(convData);
-    cudaFree(reducedFilter);
-  } else {
-       //INFO("FP16 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                   CUDNN_TENSOR_NCHW, n, c, h, w);
-      
-      changeTensorPlacement(output, DEVICE);
-      __half * convData;
-      long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
-      checkCudaErrors(cudaMalloc(&convData, convDataSize));
-      
-      const int blockSize = 256;
-      const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-      //convToGemmHalf
-      convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData,
-                                                (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3],
-                                                KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, num_filter_elem,
-                                                skip_every, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        const __half alf = approx_float_to_half(1.0);
-        const __half bet = approx_float_to_half(0.0);
-        const __half *alpha_half = &alf;
-        const __half *beta_half = &bet;
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                    n * h * w, c, num_filter_elem,
-                                    alpha_half,
-                                    convData, CUDA_R_16F, n * h * w,
-                                    (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                    beta_half,
-                                    (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w,
-                                    CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        
-        const int numBlocks = (n * c * h * w  + 255) / 256;
-        switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data,
-                                            (__half *)new_output->gpu_half_data);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        freeTensor(output);
-        cudaFree(convData);
-  }
-//    INFO("CONV DONE\n");
-  profileEvent("H2F_start");
-  convertToFP32_offline(new_output);
-  //convertToFP32(input);
-  //convertToFP32(filter);
-  profileEvent("H2F_end");
-  //profileEvent("#Conv_end");
-  //INFO("CONVOLUTION END\n");
-  return new_output;
-}
-
-} // end of Extern C
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 898d92c18cb8ad0b2df7a6d0c9d905c9649c53c1..9250810a2010a235074c0d29b8fe8bd63650324c 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -1,8 +1,13 @@
-/* This file includes the API implementation of the HPVM tensor runtime built for CPU
-**
-**  Author: Hashim Sharif
-**  Email: hsharif3@illinois.edu
-*/
+//===--------------------------- tensor_cpu_runtime.cc --------------------===//
+//
+//===----------------------------------------------------------------------===//
+//   
+// This file consists of the custom implementation of non-approximated and
+// approximated versions of tensor operations to execute on CPUs. The
+// software approximations implemented for tensor convolutions are feature
+// sampling and perforation, for FP32 compute precision only.
+//
+//===----------------------------------------------------------------------===//
 
 #include <algorithm>
 #include <cfloat>