diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
index 8f08ad9e263db5060373bb7450c185966dbea31f..1b770736bab93dd6a47cb4351dd0ad054e8eb14d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
@@ -169,19 +169,13 @@ __global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ outp
       if(n < N) { //is thread id within bounds?
           for(int i = 0; i < KH; i++) {
               for(int j = 0; j < KW; j++) {
-                  //const int ki = c * KH * KW + i;
-                  //const int kj = c * KH * KW + j;
+
                   const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
                   if((filter_elem_num - skip_offset) % skip_every) {
                     const int condition = (filter_elem_num < skip_offset);
                      const int output_col = condition * filter_elem_num 
                                     + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) 
-                                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));                   
-                  //if(filter_elem_num % skip_every != skip_offset) {
-                  // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                   //if(skip_every == 1)
-                   //    output_col = filter_elem_num;
+                                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));                   		     
                     const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
                     //((output_col*N + n) * H_out + h) * W_out + w;
                     if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
@@ -213,22 +207,16 @@ __global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ out
     if(n < N) { //is thread id within bounds?
         for(int i = 0; i < KH; i++) {
             for(int j = 0; j < KW; j++) {
-                //const int ki = c * KH * KW + i;
-                //const int kj = c * KH * KW + j;
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
                 if((filter_elem_num - skip_offset) % skip_every) {
                     const int condition = (filter_elem_num < skip_offset);
                     const int output_col = condition * filter_elem_num
                                         + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-                    //if(filter_elem_num % skip_every != skip_offset) {
-                    // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                    //if(skip_every == 1)
-                    //    output_col = filter_elem_num;
+
                     const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
-                    //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w
+                    
                     if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
                         output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
                     else
@@ -288,15 +276,13 @@ __global__ void convToGemmPerfRow(float * const __restrict__ output,
     }
     const int inH = h_index * V_stride - V_pad;
     const int inW = w * H_stride - H_pad; //input width index (col number)
-   //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-      //  int j = ki % KW;
+
     for(int i = 0; i < KH; i++) {
         for(int j = 0; j < KW; j++) {
 	const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
-    const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
 	  output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
 	else
 	  output[out_index] = 0;
@@ -357,11 +343,7 @@ __global__ void convToGemmPerfCol(float * const __restrict__ output,
     }
     const int inW = w_index * H_stride - H_pad; 
     const int inH = h * V_stride - V_pad; //input height index (row number)
-    //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-       // int j = ki % KW;
-    
+
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
 	const int filter_elem_num = c * KH * KW  + i * KW + j; //index of this filter element
@@ -427,11 +409,8 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
     }
     const int inH = h_index * V_stride - V_pad;
     const int inW = w * H_stride - H_pad; //input width index (col number)
-  // #pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-     //   int i = ki / KW; 
-     //   int j = ki % KW;
-   
+
+    
    for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
         const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
@@ -465,38 +444,31 @@ __global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
         }                              
         const int inH = h_index * V_stride - V_pad;
         const int inW = w * H_stride - H_pad; //input width index (col number)
-        // #pragma unroll
-        //for (int ki = 0; ki < KH * KW; ki++) {
-            //   int i = ki / KW; 
-            //   int j = ki % KW; 
-            for(int i = 0; i < KH; i++) {
-                for(int j = 0; j < KW; j++) {
-                    const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-                    const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
-                    //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
+
+
+	for(int i = 0; i < KH; i++) {
+	  for(int j = 0; j < KW; j++) {
+	    const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
+	    const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
+
+	    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	      output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	    else
+	      output[out_index] = 0;
+
+	  }
+	}
+	
     }
 }
 
 __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
                           __half *old_data, __half *new_data, int x, int start) {
 
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image number
-    //const int stride = blockDim.x * gridDim.x;
+
     const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
     const int n = tx / (c * h * w); //output image number
     if(n < N) {
-    //for(int i = index; i < N; i += stride){
-        //const int col = ((i % (c * h * w)) % (h * w)) % w;
-        //const int row = ((i % (c * h * w)) % (h * w)) / w;
-        //const int ch = (i % (c * h * w)) / (h * w);
-       // const int n = i / (c * h * w);
 
         const int ch = tx % (c * h * w) / (h * w); //filter number
         const int row = tx % (h * w) / w; //output height index (row number)
@@ -527,17 +499,9 @@ __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
 __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w,
                           __half *old_data, __half *new_data, int x, int start) {
     
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image numbe
-    //const int stride = blockDim.x * gridDim.x;
     const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
     const int n = tx / (c * h * w); //output image number
     if(n < N) {
-    //for(int i = index; i < N; i += stride){
-        //const int col = ((i % (c * h * w)) % (h * w)) % w;
-        //const int row = ((i % (c * h * w)) % (h * w)) / w;
-        //const int ch = (i % (c * h * w)) / (h * w);
-        //const int n = i / (c * h * w);
         
         const int ch = tx % (c * h * w) / (h * w); //filter number
         const int row = tx % (h * w) / w; //output height index (row number)
@@ -554,13 +518,11 @@ __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
         } else if((row - start) % x == 0) {
             const int row_index = row - ((row + 1 - start) / x);
             const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
             new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
                     __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
         } else {
             const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
             const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
             new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
         }
     }
@@ -587,11 +549,7 @@ __global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
     }
     const int inW = w_index * H_stride - H_pad;
     const int inH = h * V_stride - V_pad; //input height index (row number)
-     //#pragma unroll
-    //  for (int ki = 0; ki < KH * KW; ki++) {               
-      //    int i = ki / KW;
-       //   int j = ki % KW; 
-    
+
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
         const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
@@ -626,10 +584,8 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
           }
           const int inW = w_index * H_stride - H_pad;
           const int inH = h * V_stride - V_pad; //input height index (row number)
-          //#pragma unroll
-          //  for (int ki = 0; ki < KH * KW; ki++) {               
-              //    int i = ki / KW;
-              //   int j = ki % KW; 
+
+
           for(int i = 0; i < KH; i++) {
               for(int j = 0; j < KW; j++) {
                   const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
@@ -647,15 +603,6 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
 __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
                                                 __half *old_data, __half *new_data, int x, int start) {
 
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int stride = blockDim.x * gridDim.x;
-    
-    //for(int i = index; i < N; i += stride){
-      //  const int col = ((i % (c * h * w)) % (h * w)) % w;
-      //  const int row = ((i % (c * h * w)) % (h * w)) / w;
-      //  const int ch = (i % (c * h * w)) / (h * w);
-      //  const int n = i / (c * h * w);
-
     const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
     const int n = tx / (c * h * w); //output image number
     if(n < N) {
@@ -688,14 +635,6 @@ __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
 __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
                                                 __half *old_data, __half *new_data, int x, int start) {
     
-    //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int stride = blockDim.x * gridDim.x;
-    
-   // for(int i = index; i < N; i += stride){
-       // const int col = ((i % (c * h * w)) % (h * w)) % w;
-       // const int row = ((i % (c * h * w)) % (h * w)) / w;
-       // const int ch = (i % (c * h * w)) / (h * w);
-       // const int n = i / (c * h * w);
     const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
     const int n = tx / (c * h * w); //output image number
     if(n < N) {
@@ -705,25 +644,23 @@ __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
         if(col < start) {
             new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
                         = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+   
         } else if(col == w - 1) {
             new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
                             old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-                            //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+   
         } else if (col == 0) {
             new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
                         old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+   
         } else if((col - start) % x == 0) {
             const int col_index = col - ((col + 1 - start) / x);
             const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
             new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
                             __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
         } else {
             const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
             const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
             new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
         }
     }
@@ -759,6 +696,7 @@ __global__ void convToGemmFullInputRegular(float * const __restrict__ output,
              in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
                         + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
         }
+	 
         const int i = (in_index % (KW * KH)) / KW;
         const int j = in_index % KW;
         const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; 
@@ -809,13 +747,15 @@ __global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
             }
         }
     }
+
+    
 }
 
 __global__ void createReducedFiltersFullRegular(float * output,
-					 const float * const __restrict input, const int NF,
-					 const int num_filter_elem, const int reduced_filter_elem, 
-                     const int channels,
-					 const int skip_every, const int skip_offset, const float fac) {
+						const float * const __restrict input, const int NF,
+						const int num_filter_elem, const int reduced_filter_elem, 
+						const int channels,
+						const int skip_every, const int skip_offset, const float fac) {
   
   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
   const int fIdx = tx / reduced_filter_elem; //filter index
@@ -826,11 +766,13 @@ __global__ void createReducedFiltersFullRegular(float * output,
       int in_index;
       if(offset < channel_offset) {
         in_index = offset;
-     } else {
+      }
+      else {
          in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
                   + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-     }
-    output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+      }
+      
+      output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
   }
 }
 
@@ -873,30 +815,23 @@ __global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
     const int inH = h * V_stride - V_pad; //input height index (row number)
     const int inW = w * H_stride - H_pad; //input width index (col number)
     
-    #pragma unroll
-    //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         //const int ch = (fi * C) / reduced_filter_elem;
+      #pragma unroll
       for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-        const int fi = ch * (reduced_filter_elem / C) + ki;
-        const int offset = (skip_offset + ch) % skip_every;
-         //int in_index;
+         const int fi = ch * (reduced_filter_elem / C) + ki;
+         const int offset = (skip_offset + ch) % skip_every;
+   
          const bool condition = (fi < offset);
          const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
                                                 + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-         //if(fi < offset) {
-         //    in_index = fi;
-         //} else {
-         //    in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-           //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-       // }
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
+  
+         const int i = (in_index % (KW * KH)) / KW;
+         const int j = in_index % KW;
+         const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+         if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
+             output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+         } else {
             output[out_index] = 0;
-        }
+         }
       }
     }
 }
@@ -922,26 +857,20 @@ __global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
           
           #pragma unroll
            for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-               const int fi = ch * (reduced_filter_elem / C) + ki;
-          //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-           //   const int ch = (fi * C) / reduced_filter_elem;
+
+	      const int fi = ch * (reduced_filter_elem / C) + ki;	          
               const int offset = (skip_offset + ch) % skip_every;
               const int condition = (fi < offset);
-             const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
+              const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
                                                           + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-             // int in_index;
-              //if(fi < offset) {
-               //   in_index = fi;
-              //} else {
-               //   in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                 //               + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-             // }
+         
               const int i = (in_index % (KW * KH)) / KW;
               const int j = in_index % KW;
               const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
               if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
                   output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-              } else {
+              }
+	      else {
                   output[out_index] = 0;
              }
         }
@@ -971,20 +900,15 @@ __global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
             const int condition = (fi < skip_offset);
             const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
                                              + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-            //int in_index;
-            //if(fi < skip_offset) {
-             //   in_index = fi;
-            //} else {        
-              //  in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-              //              + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
-            const int ch = in_index / (KW * KH);
+
+	    const int ch = in_index / (KW * KH);
             const int i = (in_index % (KW * KH)) / KW;
             const int j = in_index % KW; 
             const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
             if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
                 output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
+            }
+	    else {
                 output[out_index] = 0;
             }
         }
@@ -1013,18 +937,11 @@ __global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output
             const int condition = (fi < skip_offset);
             const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
                                  + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-           // int in_index;
-           // if(fi < skip_offset) {
-           //     in_index = fi;
-           // } else {
-            //    in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                   //             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
+      
             const int ch = in_index / (KW * KH);
             const int i = (in_index % (KW * KH)) / KW;
             const int j = in_index % KW;
             const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-            //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
             if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
                 output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
             } else {
@@ -1042,11 +959,8 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
                                          const int skip_every, const int skip_offset, const float fac) {
 
   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int stride = blockDim.x * gridDim.x;
-  
-  //#pragma unroll
-  //for (int i = tx; i < NF; i += stride) {
-    const int fIdx = tx / reduced_filter_elem; //filter index
+
+  const int fIdx = tx / reduced_filter_elem; //filter index
   if(fIdx < NF) {
     const int offset = tx % reduced_filter_elem; //offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
@@ -1055,15 +969,9 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
     const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
                           + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
       
-     // int in_index;
-     // if(offset < channel_offset) {
-      //  in_index = offset;
-     //} else {
-       //  in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-         //         + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-    // }
     output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
  }
+  
 }
 
 __global__ void createReducedFiltersHalfIrregular(__half * output,
@@ -1071,21 +979,20 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
                      const int num_filter_elem, const int reduced_filter_elem,
                      const int skip_every, const int skip_offset, const float fac) {
 
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-     //const int stride = blockDim.x * gridDim.x;
-      //#pragma unroll
-      //for (int i = tx; i < NF; i += stride) { 
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int fIdx = tx / reduced_filter_elem; //filter index
   
-      const int fIdx = tx / reduced_filter_elem; //filter index
-      if(fIdx < NF) {
-        const int offset = tx % reduced_filter_elem; //offset within filter
-        const int condition = (offset < skip_offset);
-        int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
+  if(fIdx < NF) {
+
+    const int offset = tx % reduced_filter_elem; //offset within filter
+    const int condition = (offset < skip_offset);
+    
+    int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
                      + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-        //}
-        output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
-    //}
+        
+    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
   }
+      
 }
 
 
@@ -1112,7 +1019,7 @@ __global__ void convToGemmApprox(float * const __restrict__ output,
       for(int j = 0; j < KW; j++) {
 	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
 	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
-	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
+	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //cal output column, taking skipping into account
 	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
 	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
 	  else
@@ -1130,8 +1037,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 			 int horizontal_stride, int conv_mode, int conv_groups,
 			 int row, int col, int start){
 
-  //////INFO("*** TensorConvolution (output perforation) \n");
-  //Event("Conv");
   Tensor* input = (Tensor*)input_ptr;
   Tensor* filter = (Tensor*)filter_ptr;
   //FIXME: Current hack to preserve backward compatibilty
@@ -1144,10 +1049,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-  //Event("H2F_start");
   convertToFP32(input);
   convertToFP32(filter);
-  //Event("H2F_end");
   
   long int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
@@ -1221,14 +1124,14 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(col > 1){
+  }
+  else if(col > 1){
     output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
     float * convData;
@@ -1550,7 +1453,8 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
-      INFO("FP32 BASELINE\n");
+
+      //INFO("FP32 BASELINE\n");
       Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
                                CUDNN_TENSOR_NCHW, n, c, h, w);
     changeTensorPlacement(output, DEVICE);
@@ -1996,14 +1900,12 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
         freeTensor(output);
         cudaFree(convData);
   }
-//    INFO("CONV DONE\n");
+
   profileEvent("H2F_start");
   convertToFP32_offline(new_output);
-  //convertToFP32(input);
-  //convertToFP32(filter);
+
   profileEvent("H2F_end");
-  //profileEvent("#Conv_end");
-  //INFO("CONVOLUTION END\n");
+
   return new_output;
 }