diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
index ed59139f71043d03591f78c2e5d682e580b6264a..4f51e38205315686d391eab113a13f4be5006e10 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
@@ -63,25 +63,29 @@ void convToGemmPerfRow(float * const __restrict__ output,
   const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
   const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
   const int w = tx % W_out; //output width index (col number)
-  int past_start = (h % (x - 1) >= (x - 1 - start));
-  const int inH = (h / (x - 1) * x + h % (x-1) +
-		   past_start) * V_stride - V_pad; //input height index (row number)
+  
+  int h_index;
+  if(h < start) {
+      h_index = h;
+  } else {
+     h_index = ((h - start + 1) * x) / (x - 1) + ((h - start + 1) * x) % (x - 1) + start - 1;
+  }
+  const int inH = h_index * V_stride - V_pad;
   const int inW = w * H_stride - H_pad; //input width index (col number)
+  
   if(n < N) { //is thread id within bounds?
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
 	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
 	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] = 0;
+	  output[out_index] = 0;
 
       }
     }
   }
-
 }
 
 
@@ -89,40 +93,88 @@ void convToGemmPerfRow(float * const __restrict__ output,
 //Interpolates every xth row starting from x - 1 - start
 //N is total number of elements in final output array
 __global__
-void approxInterpolateRow(int N, int old_h, int n, int c, int h, int w,
+void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w,
 			  float *old_data, float *new_data, int x, int start){
 
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-    int past_start = ((row % x) >= (x - 1 - start));
-
-    if(row == h-1)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-    else if (row == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-    else if(row % x == x - 1 - start){
-      int past_startO = ((row - 1) % x) > (x - 1 - start);
-      int oldIdx1 = n * (c * old_h * w) + ch * (old_h * w) +
-	((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col;
-
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(old_data[oldIdx1] + old_data[oldIdx1 + 1 * w]) / 2;
+  //int index = blockIdx.x * blockDim.x + threadIdx.x;
+ // int stride = blockDim.x * gridDim.x;
+ const int tx = blockDim.x * blockIdx.x + threadIdx.x; if(tx >= N) return; //thread id; guard: ceil-div launch spawns tail threads past N
+ const int n = tx / (c * h * w); //output image number
+ const int ch = tx % (c * h * w) / (h * w); //output chan number
+ const int row = tx % (h * w) / w; //output height index (row number)
+ const int col = tx % w; //output width index (col number)
+
+  //for(int i = index; i < N; i += stride) {
+  //  int col = ((i % (c * h * w)) % (h * w)) % w;
+   // int row = ((i % (c * h * w)) % (h * w)) / w;
+    //int ch = (i % (c * h * w)) / (h * w);
+    //int n = i / (c * h * w);
+    
+    if(row < start) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+             old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if(row == h-1) { 
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+            old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
+    } else if (row == 0) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+            old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if((row - start) % x == 0) { 
+        int row_index = row - ((row + 1 - start) / x);
+        int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; 
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                (old_data[output_index] + old_data[output_index - w]) / 2;
+    } else {
+        int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); 
+        int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
     }
-    else
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) +
-		 ((x-1) * (row / x) + row % x - past_start )  * (w) + col];
+  //}
 
+}
 
-  }
+//For use in tensorConvPerfCuda
+//Interpolates every xth row starting from x - 1 - start
+//N is total number of elements in final output array
+__global__
+void approxInterpolateRow2(int N, int old_h, int j, int c, int h, int w,
+              float *old_data, float *new_data, int x, int start){
+
+     // int index = blockIdx.x * blockDim.x + threadIdx.x;
+      // int stride = blockDim.x * gridDim.x;
+      // for(int i = index; i < N; i += stride) {
+       //    int col = ((i % (c * h * w)) % (h * w)) % w;
+       //    int row = ((i % (c * h * w)) % (h * w)) / w;
+       //    int ch = (i % (c * h * w)) / (h * w);
+       //    int n = i / (c * h * w);
+     
+     const int tx = blockDim.x * blockIdx.x + threadIdx.x; if(tx >= N) return; //thread id; guard: ceil-div launch spawns tail threads past N
+      const int n = tx / (c * h * w); //output image number
+       const int ch = tx % (c * h * w) / (h * w); //output chan number
+        const int row = tx % (h * w) / w; //output height index (row number)
+         const int col = tx % w; //output width index (col number)
+
+
+           if(row < start) {
+               new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+               old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+          } else if(row == h-1) {
+              new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+              old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
+        } else if (row == 0) {
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+            old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+        } else if((row - start) % x == 0) {
+            int row_index = row - ((row + 1 - start) / x);
+            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+            (old_data[output_index] + old_data[output_index - w]) / 2;
+        } else {
+            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+        }
+   // }
 
 }
 
@@ -141,10 +193,14 @@ void convToGemmPerfCol(float * const __restrict__ output,
   const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
   const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
   const int w = tx % W_eff; //output width index (col number)
-  int past_start = (w % (x - 1)) >= (x - 1 - start);
+  int w_index;
+  if(w < start) {
+      w_index = w;
+  } else {
+      w_index = ((w - start + 1) * x) / (x - 1) + ((w - start + 1) * x) % (x - 1) + start - 1;
+  }
+  const int inW = w_index * H_stride - H_pad; //input width index (col number); keep -H_pad as in pre-image and half-precision twin
   const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = (w / (x - 1) * x + w % (x-1) +
-		   past_start) * H_stride - H_pad; //input width index (col number)
   if(n < N) { //is thread id within bounds?
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
@@ -167,45 +223,46 @@ void convToGemmPerfCol(float * const __restrict__ output,
 //Interpolates every xth col starting from x - 1 - start
 //N is total number of elements in final output array
 __global__
-void approxInterpolateCol(int N, int old_w, int n, int c, int h, int w,
+void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w,
 			  float *old_data, float *new_data, int x, int start){
 
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-    int past_start = ((col % x) >= (x - 1 - start));
-
-    if(col == w-1)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    else if (col == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    else if(col % x == x - 1 - start){
-      int past_startO = ((col - 1) % x) > (x - 1 - start);
-      int oldIdx1 = n * (c * h * old_w) + ch * (h * old_w) + row * old_w +
-	((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
-
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(old_data[oldIdx1] + old_data[oldIdx1 + 1]) / 2;
+ // int index = blockIdx.x * blockDim.x + threadIdx.x;
+//  int stride = blockDim.x * gridDim.x;
+
+//  for(int i = index; i < N; i += stride){
+  //  int col = ((i % (c * h * w)) % (h * w)) % w;
+  //  int row = ((i % (c * h * w)) % (h * w)) / w;
+  //  int ch = (i % (c * h * w)) / (h * w);
+  //  int n = i / (c * h * w);
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; if(tx >= N) return; //thread id; guard: ceil-div launch spawns tail threads past N
+    const int n = tx / (c * h * w); //output image number
+    const int ch = tx % (c * h * w) / (h * w); //output chan number
+    const int row = tx % (h * w) / w; //output height index (row number)
+    const int col = tx % w; //output width index (col number)
+
+    if(col < start) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] 
+                = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if(col == w - 1) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+            old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+    } else if (col == 0) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+            old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if((col - start) % x == 0) {
+        int col_index = col - ((col + 1 - start) / x);
+        int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                    (old_data[output_index] + old_data[output_index - 1]) / 2;
+    } else {
+        int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);  
+         int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
     }
-    else
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w +
-		 ((x-1) * (col / x) + col % x - past_start)];
-
-  }
-
+ // }   
 }
 
 
-
-  
 __global__
 void convToGemmPerfRowHalf(__half * const __restrict__ output,
 			   const __half * const __restrict input,
@@ -233,18 +290,17 @@ void convToGemmPerfRowHalf(__half * const __restrict__ output,
   if(n < N) { //is thread id within bounds?
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0;
-
+	    const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+	        output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] =
+	            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	   } else {
+	        output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0;
+       }
       }
     }
   }
-
 }
 
 
@@ -260,36 +316,46 @@ void approxInterpolateRowHalf(int N,
 			      int x,
 			      int start){
 
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
+ const int tx = blockDim.x * blockIdx.x + threadIdx.x; if(tx >= N) return; //thread id; guard: ceil-div launch spawns tail threads past N
+ const int n = tx / (c * h * w); //output image number
+ const int ch = tx % (c * h * w) / (h * w); //output chan number
+ const int row = tx % (h * w) / w; //output height index (row number)
+ const int col = tx % w; //output width index (col number)
+ 
+ // int index = blockIdx.x * blockDim.x + threadIdx.x;
+  //int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
+ // for(int i = index; i < N; i += stride){
+   // int col = ((i % (c * h * w)) % (h * w)) % w;
+   // int row = ((i % (c * h * w)) % (h * w)) / w;
+   // int ch = (i % (c * h * w)) / (h * w);
+   // int n = i / (c * h * w);
 
     if(row < start) {
-        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] 
-            = old_data[ch * (b * old_h * w) + n * (old_h * w) + (row * w) + col];
-    } else if(row == h-1) {
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	    old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+            old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col];
+            //old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if(row == h - 1) { 
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+            old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
+            //old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
     } else if (row == 0) {
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-    	old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
-    } else if((row - start) % x == 0) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+            old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+            //old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if((row - start) % x == 0) { 
         int row_index = row - ((row + 1 - start) / x);
-        int output_index = ch * (b * old_h * w) + n * (old_h * w) + (row_index * w) + col;
+        int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+        //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
         new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+            __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
     } else {
-        int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-         int output_index = ch * (b * old_h * w) + n * (old_h * w) + (row_index * w) + col;
-         new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    }
-  }
-
+        int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);  
+        int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+        //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+    } 
+   // }
 }
 
 
@@ -312,15 +378,18 @@ void convToGemmPerfColHalf(__half * const __restrict__ output,
   const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
   const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
   const int w = tx % W_eff; //output width index (col number)
-  int past_start = (w % (x - 1)) >= (x - 1 - start);
+  int w_index;
+  if(w < start) {
+      w_index = w;
+  } else {
+      w_index = ((w - start + 1) * x) / (x - 1) + ((w - start + 1) * x) % (x - 1) + start - 1;
+  }
+  const int inW = w_index * H_stride - H_pad;
   const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = (w / (x - 1) * x + w % (x-1) +
-		   past_start) * H_stride - H_pad; //input width index (col number)
   if(n < N) { //is thread id within bounds?
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
 	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
 	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
 	  output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] =
 	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
@@ -342,36 +411,46 @@ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
 			      __half *old_data, __half *new_data, int x, int start){
 
 
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
+ // int index = blockIdx.x * blockDim.x + threadIdx.x;
+ // int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-    int past_start = ((col % x) >= (x - 1 - start));
-
-    if(col == w-1)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-    else if (col == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
-    else if(col % x == x - 1 - start){
-      int past_startO = ((col - 1) % x) > (x - 1 - start);
-      int oldIdx1 = ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
-	((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
-
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	__hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2);
-    }
-    else
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
-		 ((x-1) * (col / x) + col % x - past_start)];
+ // for(int i = index; i < N; i += stride){
+  //  int col = ((i % (c * h * w)) % (h * w)) % w;
+  //  int row = ((i % (c * h * w)) % (h * w)) / w;
+   // int ch = (i % (c * h * w)) / (h * w);
+   // int n = i / (c * h * w);
 
-  } 
+    const int tx = blockDim.x * blockIdx.x + threadIdx.x; if(tx >= N) return; //thread id; guard: ceil-div launch spawns tail threads past N
+    const int n = tx / (c * h * w); //output image number
+    const int ch = tx % (c * h * w) / (h * w); //output chan number
+    const int row = tx % (h * w) / w; //output height index (row number)
+    const int col = tx % w; //output width index (col number)
+
+    if(col < start) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + col];
+               // old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if(col == w - 1) { 
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
+                //old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+    } else if (col == 0) {
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+                old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
+                //old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if((col - start) % x == 0) {
+        int col_index = col - ((col + 1 - start) / x);
+        int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + col_index;
+        //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
+                    __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {                                                                                                                                                        
+        int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);                                                                                  
+        int output_index =  ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + col_index;
+        //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+        new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
+    }  
+ // }
 }
 
 __global__
@@ -544,18 +623,23 @@ __global__ void convToGemmFullInput(float * const __restrict__ output,
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
 	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-
-	if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-	    int output_col = filter_elem_num -
-	      ((filter_elem_num + skip_every)/skip_every);
-	    if(skip_every == 1)
-	      output_col = filter_elem_num;
-	    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	      output[((output_col*N + n) * H_out + h) * W_out + w] =
-		input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	    else
-	      output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-	}
+    int output_col;
+    if((filter_elem_num - skip_offset) % skip_every != 0) {
+        output_col = filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - ((filter_elem_num + 1 - skip_offset) % skip_every > 0);
+    } else if (filter_elem_num < skip_offset) {
+        output_col = filter_elem_num;
+    } else {
+        continue;
+    }
+    const int output_index = h * W_out + w;
+    const int out_index = n * reduced_filter_elem * W_out * H_out + output_index * reduced_filter_elem + output_col;
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+      //output[((output_col*N + n) * H_out + h) * W_out + w] =
+		output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    else
+     output[out_index] = 0;
+     //output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
+      
       }
     }
   }
@@ -609,12 +693,22 @@ __global__ void createReducedFiltersFull(float * output,
   const int fIdx = tx / num_filter_elem; //filter index
   const int offset = tx % num_filter_elem; //offset within filter
   if(fIdx < NF) { //is thread id within bounds?
+      // Re-derive indices at reduced-filter granularity: the launch grid covers c * reduced_filter_elem
+      // threads, but fIdx/offset above use num_filter_elem, which under-covers filters and writes OOB.
+      const int f = tx / reduced_filter_elem; const int pos = tx % reduced_filter_elem;
+      if(f < NF && pos < skip_offset) { output[f * reduced_filter_elem + pos] = input[num_filter_elem * f + pos]; }
+      else if(f < NF) { // map reduced offset back to original filter element, scale to conserve sum
+        int red_index = ((pos - skip_offset + 1) * skip_every) / (skip_every - 1) +
+                    ((pos - skip_offset + 1) * skip_every) % (skip_every - 1) + skip_offset - 1;
+        output[f * reduced_filter_elem + pos] = (skip_every * 1.0 / (skip_every - 1)) * input[num_filter_elem * f + red_index];
+     }
+     /*
     if(offset % skip_every != skip_every-1-skip_offset) { //are we including this filter element?
       const int output_row = offset - ((offset + skip_every)/skip_every);
       //correct for skip_every = 2
             output[fIdx*reduced_filter_elem + output_row] =
 	      (skip_every * 1.0 / (skip_every - 1)) * input[tx];
     }
+    */
   }
 }
 
@@ -672,15 +766,12 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  long int h_eff = h - h / row;
-  if(h % row > row - 1 - start)
-    h_eff = h_eff - 1;
-
+  int rem_row = (h - start) % row > 0;
+  int h_eff = h - ((h - start) / row) - rem_row;
+  
   w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  long int w_eff = w - w / col;
-  if(w % col > col - 1 - start)
-    w_eff = w_eff - 1;
-
+  int rem_col = (w - start) % col > 0;
+  int w_eff = w - ((w - start) / col) - rem_col;
 
   Tensor* new_output;
   if(row > 1){
@@ -855,9 +946,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 }
 
 
-
-
-  
 /*** NOTE: tensorConvApprox  is the FP32 Baseline routine 
 
 ***/
@@ -893,15 +981,12 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  long int h_eff = h - h / row;
-  if(h % row > row - 1 - offset)
-    h_eff = h_eff - 1;
-
+  int rem_row = (h - offset) % row > 0;
+  int h_eff = h - ((h - offset) / row) - rem_row;
+  
   w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  long int w_eff = w - w / col;
-  if(w % col > col - 1 - offset)
-    w_eff = w_eff - 1;
-
+  int rem_col = (w - offset) % col > 0;
+  int w_eff = w - ((w - offset) / col) - rem_col;
 
   Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
 				       CUDNN_TENSOR_NCHW, n, c, h, w);
@@ -936,17 +1021,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
 
 
     checkCudaErrors(cudaDeviceSynchronize());
-
-    float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w, num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-					      n));
+     
+     float alpha = 1.0f, beta = 0.0f;
+     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+                                                CUBLAS_OP_N, CUBLAS_OP_N,
+                                                h_eff * w, c, num_filter_elem,
+                                                &alpha,
+                                                convData, h_eff * w, num_filter_elem * h_eff * w,
+                                                (float *)filter->gpu_data, num_filter_elem, 0,
+                                                &beta,
+                                                (float *)output->gpu_data, h_eff * w, c * h_eff * w,
+                                                n));
+     
 
     new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 					  CUDNN_TENSOR_NCHW, n, c, h, w);
@@ -954,8 +1040,9 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
     changeTensorPlacement(new_output, DEVICE);
 
     //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
+    int blocksize = 128;
+    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
+    approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
 					    (float *) output->gpu_data,
 					    (float *) new_output->gpu_data,
 					    row, offset);
@@ -965,7 +1052,6 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
   }
   else if(col > 1){
-    
     Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
@@ -1043,7 +1129,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
     float* reducedFilter;
     checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
     const int filtBlockSize = 128;
-    const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
     if(offset != skip_every)
       createReducedFiltersFull<<<filtGridSize, filtBlockSize>>>(reducedFilter,
 								(float *)filter->gpu_data,
@@ -1055,7 +1141,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
     const int blockSize = 128;
     const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
     if(skip_every == 2){
-      convToGemmFullInput2<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+      convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
 						    input->dims.dim_sizes[1],
 						    input->dims.dim_sizes[2],
 						    input->dims.dim_sizes[3],
@@ -1104,7 +1190,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
 
     int numBlocks = (n * c * h * w  + 255) / 256;
     switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-				    (float *)output->gpu_data,
+			    (float *)output->gpu_data,
 				    (float *)new_output->gpu_data);
 
     checkCudaErrors(cudaDeviceSynchronize());
@@ -1368,7 +1454,7 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
 
     freeTensor(output_half);
     cudaFree(convData);
-  }
+}
   else if(col > 1){
     Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
 						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);