diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
index e81a78860bc1cdb412b823f5fff58cea4c5048fa..987627e5fb8227e853f66b6ae8c8d33c1b40a638 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
@@ -3,507 +3,507 @@
 
 
 __global__ void depthwise_conv8(float* const __restrict__ y,
-	const float* const __restrict__ x,
-	const float* const __restrict__ w,
-	const int B, const int M,
-	const int H, const int W, const int KH,
-	const int KW, const int H_out, const int W_out,
-	const int H_pad, const int W_pad,
-	const int H_stride, const int W_stride)
+				const float* const __restrict__ x,
+				const float* const __restrict__ w,
+				const int B, const int M,
+				const int H, const int W, const int KH,
+				const int KW, const int H_out, const int W_out,
+				const int H_pad, const int W_pad,
+				const int H_stride, const int W_stride)
 {
 
   
-       #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-       #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
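+  // NCHW indexing helpers: this is a depthwise convolution, so the input channel
+  // count equals the filter count M and both tensors are flattened with M.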
  
-	const int num = 8;
-
-	const int b = blockIdx.x * num;
-	const int m = blockIdx.y; //current filter/channel
-
-	const int tx = threadIdx.x;
-
-	const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
-	const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
-
-
-	const float* weights = &w[m * KH * KW];
-	float c0 = 0;
-	float c1 = 0;
-	float c2 = 0;
-	float c3 = 0;
-	float c4 = 0;
-	float c5 = 0;
-	float c6 = 0;
-	float c7 = 0;
-
-	for (int k = 0; k < KH * KW; k++) {
-		int p = k / KW;
-		int q = k % KW;
-
-		if (start_h + p > -1 && start_h + p < H &&
-			start_w + q > -1 && start_w + q < W) {
-
-			c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
-			c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
-			c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
-			c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
-			c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
-			c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
-			c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
-			c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
-
-		}
-	}
-
-	y4d(b, m, 0, tx) = c0;
-	y4d(b + 1, m, 0, tx) = c1;
-	y4d(b + 2, m, 0, tx) = c2;
-	y4d(b + 3, m, 0, tx) = c3;
-	y4d(b + 4, m, 0, tx) = c4;
-	y4d(b + 5, m, 0, tx) = c5;
-	y4d(b + 6, m, 0, tx) = c6;
-	y4d(b + 7, m, 0, tx) = c7;
-
-       #undef y4d
-       #undef x4d
+  const int num = 8;
+
+  const int b = blockIdx.x * num;
+  const int m = blockIdx.y; //current filter/channel
+
+  const int tx = threadIdx.x;
+
+  const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
+  const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
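+  // Each thread produces one output pixel: threadIdx.x linearizes (h_out, w_out),
+  // and start_h/start_w are the top-left input coordinates of its receptive field.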
+
+
+  const float* weights = &w[m * KH * KW];
+  float c0 = 0;
+  float c1 = 0;
+  float c2 = 0;
+  float c3 = 0;
+  float c4 = 0;
+  float c5 = 0;
+  float c6 = 0;
+  float c7 = 0;
+
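+  // Single pass over the KH x KW filter; each weights[k] is reused for all eight
+  // batch images b..b+7. There are no b + i < B guards here, so B is assumed to
+  // be a multiple of 8 (depthwise_convNew8 below is the bounds-checked variant).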
+  for (int k = 0; k < KH * KW; k++) {
+    int p = k / KW;
+    int q = k % KW;
+
+    if (start_h + p > -1 && start_h + p < H &&
+	start_w + q > -1 && start_w + q < W) {
+
+      c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
+      c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
+      c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
+      c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
+      c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
+      c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
+      c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
+      c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
+
+    }
+  }
+
+  y4d(b, m, 0, tx) = c0;
+  y4d(b + 1, m, 0, tx) = c1;
+  y4d(b + 2, m, 0, tx) = c2;
+  y4d(b + 3, m, 0, tx) = c3;
+  y4d(b + 4, m, 0, tx) = c4;
+  y4d(b + 5, m, 0, tx) = c5;
+  y4d(b + 6, m, 0, tx) = c6;
+  y4d(b + 7, m, 0, tx) = c7;
+
+  #undef y4d
+  #undef x4d
 }
 
 
 __global__ void depthwise_conv(float* const __restrict__ y,
-	const float* const __restrict__ x,
-	const float* const __restrict__ w,
-	const int B, const int M,
-	const int H, const int W, const int KH,
-	const int KW, const int H_out, const int W_out,
-	const int H_pad, const int W_pad,
+			       const float* const __restrict__ x,
+			       const float* const __restrict__ w,
+			       const int B, const int M,
+			       const int H, const int W, const int KH,
+			       const int KW, const int H_out, const int W_out,
+			       const int H_pad, const int W_pad,
 			       const int H_stride, const int W_stride, const int start_batch)
 {
 
-        #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-        #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
-	const int num = 1;
+  const int num = 1;
 
-	const int b = num * blockIdx.x + start_batch;
-	const int m = blockIdx.y; //current filter/channel
+  const int b = num * blockIdx.x + start_batch;
+  const int m = blockIdx.y; //current filter/channel
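+  // Single-image variant (num = 1): the commented-out launch code in
+  // tensorConvCutlass uses it for the batch remainder starting at start_batch;
+  // the b + i < B check before the store keeps that tail in bounds.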
 
-	const int tx = threadIdx.x;
+  const int tx = threadIdx.x;
 
-	const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
-	const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
+  const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
+  const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
 
-	float C[num] = { 0 };
+  float C[num] = { 0 };
 
-	const float* weights = &w[m * KH * KW];
+  const float* weights = &w[m * KH * KW];
 
-	for (int k = 0; k < KH * KW; k++) {
-		int p = k / KW;
-		int q = k % KW;
+  for (int k = 0; k < KH * KW; k++) {
+    int p = k / KW;
+    int q = k % KW;
 
-                #pragma unroll
-		for (int i = 0; i < num; i++) {
-			if (start_h + p > -1 && start_h + p < H &&
-				start_w + q > -1 && start_w + q < W) {
+    #pragma unroll
+    for (int i = 0; i < num; i++) {
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
 
-				C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
-			}
+	C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
+      }
 
-		}
-	}
+    }
+  }
 
-        #pragma unroll
-	for (int i = 0; i < num; i++) {
-	  if(b + i < B)
-		y4d(b + i, m, 0, tx) = C[i];
+  #pragma unroll
+  for (int i = 0; i < num; i++) {
+    if(b + i < B)
+      y4d(b + i, m, 0, tx) = C[i];
 
-	}
+  }
 	
 
-       #undef y4d 
-       #undef x4d
+  #undef y4d 
+  #undef x4d
 }
 
 __global__ void depthwise_conv12(float* const __restrict__ y,
-	const float* const __restrict__ x,
-	const float* const __restrict__ w,
-	const int B, const int M,
-	const int H, const int W, const int KH,
-	const int KW, const int H_out, const int W_out,
-	const int H_pad, const int W_pad,
-	const int H_stride, const int W_stride)
+				 const float* const __restrict__ x,
+				 const float* const __restrict__ w,
+				 const int B, const int M,
+				 const int H, const int W, const int KH,
+				 const int KW, const int H_out, const int W_out,
+				 const int H_pad, const int W_pad,
+				 const int H_stride, const int W_stride)
 {
 
-        #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-        #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
-	const int num = 12;
+  const int num = 12;
 
-	const int b = num * blockIdx.x;
-	const int m = blockIdx.y; //current filter/channel
+  const int b = num * blockIdx.x;
+  const int m = blockIdx.y; //current filter/channel
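+  // 12-wide batch unrolling; the b + i < B checks below are commented out, so
+  // this kernel assumes the batch size is a multiple of 12.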
 
-	const int tx = threadIdx.x;
+  const int tx = threadIdx.x;
 
-	const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
-	const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
+  const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
+  const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
 
-	float C[num] = { 0 };
+  float C[num] = { 0 };
 
-	const float* weights = &w[m * KH * KW];
+  const float* weights = &w[m * KH * KW];
 
-	for (int k = 0; k < KH * KW; k++) {
-		int p = k / KW;
-		int q = k % KW;
+  for (int k = 0; k < KH * KW; k++) {
+    int p = k / KW;
+    int q = k % KW;
 
-		if (start_h + p > -1 && start_h + p < H &&
-				start_w + q > -1 && start_w + q < W) {
+    if (start_h + p > -1 && start_h + p < H &&
+	start_w + q > -1 && start_w + q < W) {
 
-                  #pragma unroll
-		  for (int i = 0; i < num; i++) {
-		    //if(b + i < B)
-		      C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
-		  }
+      #pragma unroll
+      for (int i = 0; i < num; i++) {
+	//if(b + i < B)
+	C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
+      }
 
-		}
-	}
+    }
+  }
 
-        #pragma unroll
-	for (int i = 0; i < num; i++) {
-	  //if(b + i < B)
-	    y4d(b + i, m, 0, tx) = C[i];
+  #pragma unroll
+  for (int i = 0; i < num; i++) {
+    //if(b + i < B)
+    y4d(b + i, m, 0, tx) = C[i];
 
-	}
+  }
 	
 
-       #undef y4d 
-       #undef x4d
+  #undef y4d
+  #undef x4d

 }
 
 __global__ void depthwise_convNew(float* const __restrict__ y,
-	const float* const __restrict__ x,
-	const float* const __restrict__ w,
-	const int B, const int M,
-	const int H, const int W, const int KH,
-	const int KW, const int H_out, const int W_out,
-	const int H_pad, const int W_pad,
-	const int H_stride, const int W_stride)
+				  const float* const __restrict__ x,
+				  const float* const __restrict__ w,
+				  const int B, const int M,
+				  const int H, const int W, const int KH,
+				  const int KW, const int H_out, const int W_out,
+				  const int H_pad, const int W_pad,
+				  const int H_stride, const int W_stride)
 {
 
-        #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-        #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
-	const int num = 12;
+  const int num = 12;
 
-	const int b = num * blockIdx.x;
-	const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out); 
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out); 
 
-	const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+  const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
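+  // Channel m and output pixel tx are both derived from the flattened
+  // blockIdx.y * blockDim.x + threadIdx.x index, so blockDim.x no longer has to
+  // equal H_out * W_out as in the kernels above.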
 
-	const int start_h = (tx / W_out) * H_stride - H_pad;
-	const int start_w = (tx % W_out) * W_stride - W_pad;
+  const int start_h = (tx / W_out) * H_stride - H_pad;
+  const int start_w = (tx % W_out) * W_stride - W_pad;
 
-	float C[num] = { 0 };
+  float C[num] = { 0 };
 
-	const float* weights = &w[m * KH * KW];
+  const float* weights = &w[m * KH * KW];
 
-	for (int k = 0; k < KH * KW; k++) {
-		int p = k / KW;
-		int q = k % KW;
+  for (int k = 0; k < KH * KW; k++) {
+    int p = k / KW;
+    int q = k % KW;
 
-		if (start_h + p > -1 && start_h + p < H &&
-				start_w + q > -1 && start_w + q < W) {
+    if (start_h + p > -1 && start_h + p < H &&
+	start_w + q > -1 && start_w + q < W) {
 
-                  #pragma unroll
-		  for (int i = 0; i < num; i++) {
-		    if(b + i < B)
-		      C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
-		  }
+      #pragma unroll
+      for (int i = 0; i < num; i++) {
+	if(b + i < B)
+	  C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
+      }
 
-		}
-	}
+    }
+  }
 
-        #pragma unroll
-	for (int i = 0; i < num; i++) {
-	  if(b + i < B)
-	    y4d(b + i, m, 0, tx) = C[i];
+  #pragma unroll
+  for (int i = 0; i < num; i++) {
+    if(b + i < B)
+      y4d(b + i, m, 0, tx) = C[i];
 
-	}
+  }
 	
 
-       #undef y4d 
-       #undef x4d
+  #undef y4d 
+  #undef x4d
 }
 
 __global__ void depthwise_convNew8(float* const __restrict__ y,
-	const float* const __restrict__ x,
-	const float* const __restrict__ w,
-	const int B, const int M,
-	const int H, const int W, const int KH,
-	const int KW, const int H_out, const int W_out,
-	const int H_pad, const int W_pad,
-	const int H_stride, const int W_stride)
+				   const float* const __restrict__ x,
+				   const float* const __restrict__ w,
+				   const int B, const int M,
+				   const int H, const int W, const int KH,
+				   const int KW, const int H_out, const int W_out,
+				   const int H_pad, const int W_pad,
+				   const int H_stride, const int W_stride)
 {
 
-        #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-        #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
-	const int num = 8;
+  const int num = 8;
 
-	const int b = num * blockIdx.x;
-	const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
 	
-	if(m < M){
-	const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
-
-	const int start_h = (tx / W_out) * H_stride - H_pad;
-	const int start_w = (tx % W_out) * W_stride - W_pad;
-
-	float c0 = 0;
-	float c1 = 0;
-	float c2 = 0;
-	float c3 = 0;
-	float c4 = 0;
-	float c5 = 0;
-	float c6 = 0;
-	float c7 = 0;
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
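+    // 8-way batch unrolling with per-image b + i < B checks, so any batch size is
+    // handled; threads mapped past the last channel are filtered by the m < M guard.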
+
+    float c0 = 0;
+    float c1 = 0;
+    float c2 = 0;
+    float c3 = 0;
+    float c4 = 0;
+    float c5 = 0;
+    float c6 = 0;
+    float c7 = 0;
 	
-	const float* weights = &w[m * KH * KW];
-
-	for (int k = 0; k < KH * KW; k++) {
-		int p = k / KW;
-		int q = k % KW;
-
-		if (start_h + p > -1 && start_h + p < H &&
-				start_w + q > -1 && start_w + q < W) {
-
-		  c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
-                  if(b + 1 < B)
-		      c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 2 < B)
-		      c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 3 < B)
-		      c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 4 < B)
-		      c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 5 < B)
-		      c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 6 < B)
-		      c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 7 < B)
-		      c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
-    
+    const float* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
 
-		}
-	}
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
 
-	y4d(b, m, 0, tx) = c0;	
-        if(b + 1 < B)
-	  y4d(b + 1, m, 0, tx) = c1;
+	c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
+	if(b + 1 < B)
+	  c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
 	if(b + 2 < B)
-	  y4d(b + 2, m, 0, tx) = c2;
+	  c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
 	if(b + 3 < B)
-	  y4d(b + 3, m, 0, tx) = c3;
+	  c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
 	if(b + 4 < B)
-	  y4d(b + 4, m, 0, tx) = c4;
+	  c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
 	if(b + 5 < B)
-	  y4d(b + 5, m, 0, tx) = c5;
+	  c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
 	if(b + 6 < B)
-	  y4d(b + 6, m, 0, tx) = c6;
+	  c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
 	if(b + 7 < B)
-	  y4d(b + 7, m, 0, tx) = c7;
-	}
+	  c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
+    
+
+      }
+    }
+
+    y4d(b, m, 0, tx) = c0;	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = c1;
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = c2;
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = c3;
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = c4;
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = c5;
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = c6;
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = c7;
+  }
 	
-       #undef y4d 
-       #undef x4d
+  #undef y4d 
+  #undef x4d
 }
 
 __global__ void depthwise_convNew8_half(__half* const __restrict__ y,
-	const __half* const __restrict__ x,
-	const __half* const __restrict__ w,
-	const int B, const int M,
-	const int H, const int W, const int KH,
-	const int KW, const int H_out, const int W_out,
-	const int H_pad, const int W_pad,
-	const int H_stride, const int W_stride)
+					const __half* const __restrict__ x,
+					const __half* const __restrict__ w,
+					const int B, const int M,
+					const int H, const int W, const int KH,
+					const int KW, const int H_out, const int W_out,
+					const int H_pad, const int W_pad,
+					const int H_stride, const int W_stride)
 {
 
-        #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-        #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
-	const int num = 8;
+  const int num = 8;
 
-	const int b = num * blockIdx.x;
-	const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
 	
-	if(m < M){
-	const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
-
-	const int start_h = (tx / W_out) * H_stride - H_pad;
-	const int start_w = (tx % W_out) * W_stride - W_pad;
-
-	__half c0 = 0;
-	__half c1 = 0;
-	__half c2 = 0;
-	__half c3 = 0;
-	__half c4 = 0;
-	__half c5 = 0;
-	__half c6 = 0;
-	__half c7 = 0;
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
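+    // Half-precision variant of depthwise_convNew8: same indexing and bounds
+    // checks, with accumulation done via the __hfma fused multiply-add intrinsic.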
+
+    __half c0 = 0;
+    __half c1 = 0;
+    __half c2 = 0;
+    __half c3 = 0;
+    __half c4 = 0;
+    __half c5 = 0;
+    __half c6 = 0;
+    __half c7 = 0;
 	
-	const __half* weights = &w[m * KH * KW];
-
-	for (int k = 0; k < KH * KW; k++) {
-		int p = k / KW;
-		int q = k % KW;
-
-		if (start_h + p > -1 && start_h + p < H &&
-				start_w + q > -1 && start_w + q < W) {
-
-		  c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0);
-                  if(b + 1 < B)
-		    c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1);
-		  if(b + 2 < B)
-		    c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2);
-		  if(b + 3 < B)
-		    c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3);
-		  if(b + 4 < B)
-		    c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4);
-		  if(b + 5 < B)
-		    c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5);
-		  if(b + 6 < B)
-		    c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6);
-		  if(b + 7 < B)
-		    c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7);
-    
+    const __half* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
 
-		}
-	}
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
 
-	y4d(b, m, 0, tx) = c0;	
-        if(b + 1 < B)
-	  y4d(b + 1, m, 0, tx) = c1;
+	c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0);
+	if(b + 1 < B)
+	  c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1);
 	if(b + 2 < B)
-	  y4d(b + 2, m, 0, tx) = c2;
+	  c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2);
 	if(b + 3 < B)
-	  y4d(b + 3, m, 0, tx) = c3;
+	  c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3);
 	if(b + 4 < B)
-	  y4d(b + 4, m, 0, tx) = c4;
+	  c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4);
 	if(b + 5 < B)
-	  y4d(b + 5, m, 0, tx) = c5;
+	  c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5);
 	if(b + 6 < B)
-	  y4d(b + 6, m, 0, tx) = c6;
+	  c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6);
 	if(b + 7 < B)
-	  y4d(b + 7, m, 0, tx) = c7;
-	}
+	  c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7);
+    
+
+      }
+    }
+
+    y4d(b, m, 0, tx) = c0;	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = c1;
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = c2;
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = c3;
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = c4;
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = c5;
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = c6;
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = c7;
+  }
 	
-       #undef y4d 
-       #undef x4d
+  #undef y4d 
+  #undef x4d
 }
 
 
 
 __global__ void depthwise_convNew12(float* const __restrict__ y,
-	const float* const __restrict__ x,
-	const float* const __restrict__ w,
-	const int B, const int M,
-	const int H, const int W, const int KH,
-	const int KW, const int H_out, const int W_out,
-	const int H_pad, const int W_pad,
-	const int H_stride, const int W_stride)
+				    const float* const __restrict__ x,
+				    const float* const __restrict__ w,
+				    const int B, const int M,
+				    const int H, const int W, const int KH,
+				    const int KW, const int H_out, const int W_out,
+				    const int H_pad, const int W_pad,
+				    const int H_stride, const int W_stride)
 {
 
-        #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-        #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
-	const int num = 12;
+  const int num = 12;
 
-	const int b = num * blockIdx.x;
-	const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
 	
-	if(m < M){
-	const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
-
-	const int start_h = (tx / W_out) * H_stride - H_pad;
-	const int start_w = (tx % W_out) * W_stride - W_pad;
-
-	float c0 = 0;
-	float c1 = 0;
-	float c2 = 0;
-	float c3 = 0;
-	float c4 = 0;
-	float c5 = 0;
-	float c6 = 0;
-	float c7 = 0;
-	float c8 = 0;
-	float c9 = 0;
-	float c10 = 0;
-	float c11 = 0;
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
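+    // 12-way batch unrolling; same structure as depthwise_convNew8 with four
+    // extra accumulators (c8..c11) and matching bounds checks.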
+
+    float c0 = 0;
+    float c1 = 0;
+    float c2 = 0;
+    float c3 = 0;
+    float c4 = 0;
+    float c5 = 0;
+    float c6 = 0;
+    float c7 = 0;
+    float c8 = 0;
+    float c9 = 0;
+    float c10 = 0;
+    float c11 = 0;
 	
-	const float* weights = &w[m * KH * KW];
-
-	for (int k = 0; k < KH * KW; k++) {
-		int p = k / KW;
-		int q = k % KW;
-
-		if (start_h + p > -1 && start_h + p < H &&
-				start_w + q > -1 && start_w + q < W) {
-
-		  c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
-                  if(b + 1 < B)
-		      c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 2 < B)
-		      c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 3 < B)
-		      c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 4 < B)
-		      c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 5 < B)
-		      c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 6 < B)
-		      c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 7 < B)
-		      c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 8 < B)
-		      c8 += x4d(b + 8, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 9 < B)
-		      c9 += x4d(b + 9, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 10 < B)
-		      c10 += x4d(b + 10, m, start_h + p, start_w + q) * weights[k];
-		  if(b + 11 < B)
-		      c11 += x4d(b + 11, m, start_h + p, start_w + q) * weights[k];
-    
+    const float* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
 
-		}
-	}
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
 
-	y4d(b, m, 0, tx) = c0;	
-        if(b + 1 < B)
-	  y4d(b + 1, m, 0, tx) = c1;
+	c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
+	if(b + 1 < B)
+	  c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
 	if(b + 2 < B)
-	  y4d(b + 2, m, 0, tx) = c2;
+	  c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
 	if(b + 3 < B)
-	  y4d(b + 3, m, 0, tx) = c3;
+	  c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
 	if(b + 4 < B)
-	  y4d(b + 4, m, 0, tx) = c4;
+	  c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
 	if(b + 5 < B)
-	  y4d(b + 5, m, 0, tx) = c5;
+	  c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
 	if(b + 6 < B)
-	  y4d(b + 6, m, 0, tx) = c6;
+	  c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
 	if(b + 7 < B)
-	  y4d(b + 7, m, 0, tx) = c7;
+	  c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
 	if(b + 8 < B)
-	  y4d(b + 8, m, 0, tx) = c8;
+	  c8 += x4d(b + 8, m, start_h + p, start_w + q) * weights[k];
 	if(b + 9 < B)
-	  y4d(b + 9, m, 0, tx) = c9;
+	  c9 += x4d(b + 9, m, start_h + p, start_w + q) * weights[k];
 	if(b + 10 < B)
-	  y4d(b + 10, m, 0, tx) = c10;
+	  c10 += x4d(b + 10, m, start_h + p, start_w + q) * weights[k];
 	if(b + 11 < B)
-	  y4d(b + 11, m, 0, tx) = c11;
+	  c11 += x4d(b + 11, m, start_h + p, start_w + q) * weights[k];
+    
+
+      }
+    }
+
+    y4d(b, m, 0, tx) = c0;	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = c1;
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = c2;
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = c3;
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = c4;
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = c5;
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = c6;
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = c7;
+    if(b + 8 < B)
+      y4d(b + 8, m, 0, tx) = c8;
+    if(b + 9 < B)
+      y4d(b + 9, m, 0, tx) = c9;
+    if(b + 10 < B)
+      y4d(b + 10, m, 0, tx) = c10;
+    if(b + 11 < B)
+      y4d(b + 11, m, 0, tx) = c11;
 	
-	}
+  }
 	
-       #undef y4d 
-       #undef x4d
+  #undef y4d 
+  #undef x4d
 }
 
 
@@ -512,218 +512,219 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 			int vertical_stride, int horizontal_stride,
 			int conv_mode, int conv_groups){
 
-	llvm_hpvm_initTensorRt(0);
+  llvm_hpvm_initTensorRt(0);
 
-	INFO("*** TensorConvolution \n");
-	profileEvent("Conv");
+  INFO("*** TensorConvolution \n");
+  profileEvent("Conv");
 
-	Tensor* input = (Tensor*)input_ptr;
-	Tensor* filter = (Tensor*)filter_ptr;
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
 
-	//FIXME: Current hack to preserve backward compatibilty
-	if (conv_groups == 0) {
-		conv_groups = 1;
-	}
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
 
-	Tensor* output;
+  Tensor* output;
 
 	
-	if (conv_groups > 32) {
-		// TODO: Support other cases;  
-		hostToDeviceCopy(input);
-		hostToDeviceCopy(filter);
-
-		int n, c, h, w; // output dimensions  
-		n = input->dims.dim_sizes[0];
-		c = input->dims.dim_sizes[1];
-		const int KH = filter->dims.dim_sizes[2];
-		const int KW = filter->dims.dim_sizes[3];
-		h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-		w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-
-		output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
-			CUDNN_TENSOR_NCHW, n, c, h, w);
-
-		// NOTE: Changing output tensor placement from host to device
-		changeTensorPlacement(output, DEVICE);
-		// NOTE: Necessary to insert the above call for every output tensor
-
-
-		/*
-		if (c > 255) {
-		  dim3 grid((n / 16), c);
-		  dim3 block(h * w);
-		  depthwise_conv << <grid, block >> > ((float*)output->gpu_data,
-				(float*)input->gpu_data, (float*)filter->gpu_data,
-				input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-				KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-
-				}*/
-
-		/*
-		  dim3 grid((n / 12), c);
-		  dim3 block(h * w);
-		  depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data,
-				(float*)input->gpu_data, (float*)filter->gpu_data,
-				input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-				KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-		  if(n % 12 > 0){ 
-		    dim3 grid2((n % 12), c);
-		    dim3 block(h * w);
-		    depthwise_conv <<<grid, block >>> ((float*)output->gpu_data,
-				(float*)input->gpu_data, (float*)filter->gpu_data,
-				input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-						   KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12));
-						   }
-		*/
+  if (conv_groups > 32) {
+    // TODO: Support other cases;  
+    hostToDeviceCopy(input);
+    hostToDeviceCopy(filter);
+
+    int n, c, h, w; // output dimensions  
+    n = input->dims.dim_sizes[0];
+    c = input->dims.dim_sizes[1];
+    const int KH = filter->dims.dim_sizes[2];
+    const int KW = filter->dims.dim_sizes[3];
+    h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+    w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+
+    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+
+
+    /*
+      if (c > 255) {
+      dim3 grid((n / 16), c);
+      dim3 block(h * w);
+      depthwise_conv << <grid, block >> > ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+
+      }*/
+
+    /*
+      dim3 grid((n / 12), c);
+      dim3 block(h * w);
+      depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+      if(n % 12 > 0){ 
+      dim3 grid2((n % 12), c);
+      dim3 block(h * w);
+      depthwise_conv <<<grid, block >>> ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12));
+      }
+    */
 		
-		int blockSize;
-		blockSize = 128;
+    int blockSize;
+    blockSize = 128;
 		
-		dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
-		dim3 block(blockSize);
-		depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data,
-			(float*)input->gpu_data, (float*)filter->gpu_data,
-			input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-			KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-
-	}
-	else {
-
-		cudnnConvolutionDescriptor_t convDesc;
-		cudnnConvolutionFwdAlgo_t convAlgo;
-		cudnnConvolutionMode_t mode;
-		if (conv_mode == 0)
-			mode = CUDNN_CONVOLUTION;
-		else if (conv_mode == 1)
-			mode = CUDNN_CROSS_CORRELATION;
-
-		// FIXIT: Need to be more aware of the implications of alpha and beta
-		float alpha = 1.0f, beta = 0.0f;
-
-		// TODO: Support other cases;  
-		hostToDeviceCopy(input);
-		hostToDeviceCopy(filter);
-
-		INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
-
-		checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
-
-		// NOTE: Adding support for grouped convolution
-		checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
-
-
-		cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-		// FIXIT: Think if upscaling values need to be configurable?
-		// IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-		checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-			vertical_pad, horizontal_pad, // conv padding
-			vertical_stride, horizontal_stride, // conv strides
-			1, 1, // upscaling values
-			mode, // mode is configurable
-			computeType)); // defines compute precision
+    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
+    dim3 block(blockSize);
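+    // Launch depthwise_convNew8: grid.x covers the batch in groups of 8, grid.y
+    // tiles the c * h * w output elements with 128-thread blocks.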
+    depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data,
+					 (float*)input->gpu_data, (float*)filter->gpu_data,
+					 input->dims.dim_sizes[0], input->dims.dim_sizes[1],
+					 input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+					 KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
 
-		int n, c, h, w; // output dimensions  
-		// Find dimension of convolution output
-		checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-			input->tensor_desc,
-			filter->filter_desc,
-			&n, &c, &h, &w));
+  }
+  else {
 
+    cudnnConvolutionDescriptor_t convDesc;
+    cudnnConvolutionFwdAlgo_t convAlgo;
+    cudnnConvolutionMode_t mode;
+    if (conv_mode == 0)
+      mode = CUDNN_CONVOLUTION;
+    else if (conv_mode == 1)
+      mode = CUDNN_CROSS_CORRELATION;
 
-		DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+    // FIXIT: Need to be more aware of the implications of alpha and beta
+    float alpha = 1.0f, beta = 0.0f;
 
-		if (input->data_format == CUDNN_TENSOR_NCHW)
-			output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
-				CUDNN_TENSOR_NCHW, n, c, h, w);
-		else if (input->data_format == CUDNN_TENSOR_NHWC) {
-			DEBUG("* NHWC Format \n");
-			output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
-				CUDNN_TENSOR_NHWC, n, h, w, c);
-		}
-		else
-			ERROR("Unsupported Tensor Type");
+    // TODO: Support other cases;  
+    hostToDeviceCopy(input);
+    hostToDeviceCopy(filter);
 
-		// NOTE: Changing output tensor placement from host to device
-		changeTensorPlacement(output, DEVICE);
-		// NOTE: Necessary to insert the above call for every output tensor
+    INFO("vertical_stride = %d, horizontal_stride = %d \n", vertical_stride, horizontal_stride);
 
-		DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-			output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-			output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-		if (convDesc == NULL || input->tensor_desc == NULL ||
-			filter->filter_desc == NULL || output->tensor_desc == NULL)
-			ERROR("NULL descriptor! \n");
+    // NOTE: Adding support for grouped convolution
+    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
 
-		// NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-		checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-			input->tensor_desc,
-			filter->filter_desc,
-			convDesc,
-			output->tensor_desc,
-			CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-			//CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-			0,
-			&convAlgo));
+    cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
+    // FIXIT: Think if upscaling values need to be configurable?
+    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
+    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
+					       vertical_pad, horizontal_pad, // conv padding
+					       vertical_stride, horizontal_stride, // conv strides
+					       1, 1, // upscaling values
+					       mode, // mode is configurable
+					       computeType)); // defines compute precision
 
+    int n, c, h, w; // output dimensions  
+    // Find dimension of convolution output
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
+						     input->tensor_desc,
+						     filter->filter_desc,
+						     &n, &c, &h, &w));
 
-		DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-			CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-			CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-		// FIXIT: Algo shouldn't be hardcoded
-		convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+    if (input->data_format == CUDNN_TENSOR_NCHW)
+      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+				       CUDNN_TENSOR_NCHW, n, c, h, w);
+    else if (input->data_format == CUDNN_TENSOR_NHWC) {
+      DEBUG("* NHWC Format \n");
+      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+				       CUDNN_TENSOR_NHWC, n, h, w, c);
+    }
+    else
+      ERROR("Unsupported Tensor Type");
 
-		size_t workspace_size;
-		checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-			input->tensor_desc,
-			filter->filter_desc,
-			convDesc,
-			output->tensor_desc,
-			convAlgo,
-			&workspace_size));
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
 
-		// Allocating memory for the convolution workspace
-		void* workspace;
-		checkCudaErrors(cudaMalloc(&workspace, workspace_size));
-		DEBUG("workspace size = %d \n", workspace_size);
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
+	  output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
+	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
 
+    if (convDesc == NULL || input->tensor_desc == NULL ||
+	filter->filter_desc == NULL || output->tensor_desc == NULL)
+      ERROR("NULL descriptor! \n");
 
-		checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-			input->gpu_data, filter->filter_desc, filter->gpu_data,
-			convDesc, convAlgo, workspace, workspace_size,
-			&beta, output->tensor_desc, output->gpu_data));
-	}
 
-	cudaDeviceSynchronize();
-	profileEvent("Conv_end", true);
+    // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
+    checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
+						   input->tensor_desc,
+						   filter->filter_desc,
+						   convDesc,
+						   output->tensor_desc,
+						   CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+						   //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+						   0,
+						   &convAlgo));
+
+
+    DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
+	  CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+	  CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
+
 
+    // FIXIT: Algo shouldn't be hardcoded
+    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
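+    // The algorithm picked by the heuristic above is discarded (see FIXIT) and
+    // IMPLICIT_GEMM is used instead; the query below sizes its workspace.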
 
-        #ifdef ERROR_INJECTION_ENABLED
+    size_t workspace_size;
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
+						       input->tensor_desc,
+						       filter->filter_desc,
+						       convDesc,
+						       output->tensor_desc,
+						       convAlgo,
+						       &workspace_size));
 
-	if (op_counter >= total_ops) {
-		ERROR("No accuracy flag found \n");
-	}
+    // Allocating memory for the convolution workspace
+    void* workspace;
+    checkCudaErrors(cudaMalloc(&workspace, workspace_size));
+    DEBUG("workspace size = %zu \n", workspace_size);
 
-	int op_acc = op_accuracies[op_counter];
 
-	// Skip errorInjection if explicitly requested
-	if (skip_tensors.find(op_counter) != skip_tensors.end()) {
-		op_acc = 0;
-	}
+    checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
+				       input->gpu_data, filter->filter_desc, filter->gpu_data,
+				       convDesc, convAlgo, workspace, workspace_size,
+				       &beta, output->tensor_desc, output->gpu_data));
+  }
+
+  cudaDeviceSynchronize();
+  profileEvent("Conv_end", true);
 
-	void* error_norms = tensorAddError(output, op_acc);
-	add_norms(error_norms, "tensorConv", op_acc);
-	add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
 
-	op_counter++;
+  #ifdef ERROR_INJECTION_ENABLED
 
-        #endif   
+  if (op_counter >= total_ops) {
+    ERROR("No accuracy flag found \n");
+  }
 
-	return output;
+  int op_acc = op_accuracies[op_counter];
+
+  // Skip errorInjection if explicitly requested
+  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
+    op_acc = 0;
+  }
+
+  void* error_norms = tensorAddError(output, op_acc);
+  add_norms(error_norms, "tensorConv", op_acc);
+  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
+
+  op_counter++;
+
+  #endif   
+
+  return output;
 
 
 }
@@ -819,99 +820,102 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
     dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
     dim3 block(blockSize);
     depthwise_convNew8_half<<<grid, block>>> ((__half*)output_half->gpu_data,
-			(__half*)input_half->gpu_data, (__half*)filter_half->gpu_data,
-			input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-			KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+					      (__half*)input_half->gpu_data, (__half*)filter_half->gpu_data,
+					      input->dims.dim_sizes[0], input->dims.dim_sizes[1],
+					      input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+					      KH, KW, h, w,
+					      vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
     cudaDeviceSynchronize();
 
     
   }
-  else{
-  checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
+  else{    
+    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
-    conv_groups = 1;
-  }
+    //FIXME: Current hack to preserve backward compatibility
+    if(conv_groups == 0){
+      conv_groups = 1;
+    }
   
-  // NOTE: Adding support for grouped convolution
-  checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
+    // NOTE: Adding support for grouped convolution
+    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
   
-  // FIXIT: Think if upscaling values need to be configurable?
-  // IMP-FIXIT:  CUDNN Cross correlation is only used in the Lenet context
-  // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     vertical_stride, horizontal_stride, // conv strides
-					     1, 1, // upscaling values
-					     mode, // mode is configurable
-					     computeType)); // defines compute precision
-
-  int n, c, h, w; // output dimensions
-  // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-  DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+    // FIXIT: Think if upscaling values need to be configurable?
+    // IMP-FIXIT:  CUDNN Cross correlation is only used in the Lenet context
+    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
+    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
+					       vertical_pad, horizontal_pad, // conv padding
+					       vertical_stride, horizontal_stride, // conv strides
+					       1, 1, // upscaling values
+					       mode, // mode is configurable
+					       computeType)); // defines compute precision
+
+    int n, c, h, w; // output dimensions
+    // Find dimension of convolution output
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
+						     input->tensor_desc,
+						     filter->filter_desc,
+						     &n, &c, &h, &w));
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
 
-  output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
-					    CUDNN_TENSOR_NCHW, n, c, h, w);
-  // FIXIT: more checks for data types needed
-  output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF,
-						 CUDNN_TENSOR_NCHW, n, c, h, w);
+    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+				      CUDNN_TENSOR_NCHW, n, c, h, w);
+    // FIXIT: more checks for data types needed
+    output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF,
+					   CUDNN_TENSOR_NCHW, n, c, h, w);
 
 
   
-  // NOTE: Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE);
-  // NOTE: Necessary to insert the above call for every output tensor
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
+	  output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
+	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
-    ERROR("NULL descriptor! \n");
+    if(convDesc == NULL || input->tensor_desc == NULL ||
+       filter->filter_desc == NULL || output->tensor_desc == NULL)
+      ERROR("NULL descriptor! \n");
 
 
-  // NOTE: The following algo works with TRUE half precision
-  convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-  //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+    // NOTE: The following algo works with TRUE half precision
+    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
   
-  size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input_half->tensor_desc,
-						     filter_half->filter_desc,
-						     convDesc,
-						     output_half->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
-
-  // Allocating memory for the convolution workspace
-  DEBUG("workspace size = %d \n", workspace_size);
-  void* workspace;
-  checkCudaErrors(cudaMalloc(&workspace, workspace_size));
-
-
-
-
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle,
-				     &alpha,
-				     input_half->tensor_desc,
-				     input_half->gpu_data,
-				     filter_half->filter_desc,
-				     filter_half->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta,
-				     output_half->tensor_desc,
-				     output_half->gpu_data));
+    size_t workspace_size;
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
+						       input_half->tensor_desc,
+						       filter_half->filter_desc,
+						       convDesc,
+						       output_half->tensor_desc,
+						       convAlgo,
+						       &workspace_size));
+
+    // Allocating memory for the convolution workspace
+    DEBUG("workspace size = %zu \n", workspace_size);
+    void* workspace;
+    checkCudaErrors(cudaMalloc(&workspace, workspace_size));
+
+
+
+
+    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
+				       &alpha,
+				       input_half->tensor_desc,
+				       input_half->gpu_data,
+				       filter_half->filter_desc,
+				       filter_half->gpu_data,
+				       convDesc, convAlgo, workspace, workspace_size,
+				       &beta,
+				       output_half->tensor_desc,
+				       output_half->gpu_data));
 
   }
+  
   profileEvent("H2F_start");
 
   // NOTE: Transforming half precision output to single precision
@@ -919,7 +923,7 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
 
   profileEvent("H2F_end");
 
-  profileEvent("#tensorHalfConv_end");
+  profileEvent("#Conv_end");
 
 
   freeTensor(input_half);