Adding Efficient CUDA-based versions for FP16 and FP32 Depthwise Convolution

9ffb63b8 · Huzaifa · f9b1d79c · 9ffb63b8
Commit 9ffb63b8 authored 6 years ago by Huzaifa
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
@@ -2,72 +2,7 @@



-__global__ void depthwise_conv8(float* const __restrict__ y,
-				const float* const __restrict__ x,
-				const float* const __restrict__ w,
-				const int B, const int M,
-				const int H, const int W, const int KH,
-				const int KW, const int H_out, const int W_out,
-				const int H_pad, const int W_pad,
-				const int H_stride, const int W_stride)
-{
-
-  
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
- 
-  const int num = 8;
-
-  const int b = blockIdx.x * num;
-  const int m = blockIdx.y; //current filter/channel
-
-  const int tx = threadIdx.x;
-
-  const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
-  const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
-

-  const float* weights = &w[m * KH * KW];
-  float c0 = 0;
-  float c1 = 0;
-  float c2 = 0;
-  float c3 = 0;
-  float c4 = 0;
-  float c5 = 0;
-  float c6 = 0;
-  float c7 = 0;
-
-  for (int k = 0; k < KH * KW; k++) {
-    int p = k / KW;
-    int q = k % KW;
-
-    if (start_h + p > -1 && start_h + p < H &&
-	start_w + q > -1 && start_w + q < W) {
-
-      c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
-      c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
-      c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
-      c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
-      c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
-      c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
-      c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
-      c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
-
-    }
-  }
-
-  y4d(b, m, 0, tx) = c0;
-  y4d(b + 1, m, 0, tx) = c1;
-  y4d(b + 2, m, 0, tx) = c2;
-  y4d(b + 3, m, 0, tx) = c3;
-  y4d(b + 4, m, 0, tx) = c4;
-  y4d(b + 5, m, 0, tx) = c5;
-  y4d(b + 6, m, 0, tx) = c6;
-  y4d(b + 7, m, 0, tx) = c7;
-
-  #undef y4d
-  #undef x4d
-}


 __global__ void depthwise_conv(float* const __restrict__ y,
@@ -124,60 +59,6 @@ __global__ void depthwise_conv(float* const __restrict__ y,
  #undef x4d
 }

-__global__ void depthwise_conv12(float* const __restrict__ y,
-				 const float* const __restrict__ x,
-				 const float* const __restrict__ w,
-				 const int B, const int M,
-				 const int H, const int W, const int KH,
-				 const int KW, const int H_out, const int W_out,
-				 const int H_pad, const int W_pad,
-				 const int H_stride, const int W_stride)
-{
-
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
-
-  const int num = 12;
-
-  const int b = num * blockIdx.x;
-  const int m = blockIdx.y; //current filter/channel
-
-  const int tx = threadIdx.x;
-
-  const int start_h = (threadIdx.x / W_out) * H_stride - H_pad;
-  const int start_w = (threadIdx.x % W_out) * W_stride - W_pad;
-
-  float C[num] = { 0 };
-
-  const float* weights = &w[m * KH * KW];
-
-  for (int k = 0; k < KH * KW; k++) {
-    int p = k / KW;
-    int q = k % KW;
-
-    if (start_h + p > -1 && start_h + p < H &&
-	start_w + q > -1 && start_w + q < W) {
-
-      #pragma unroll
-      for (int i = 0; i < num; i++) {
-	//if(b + i < B)
-	C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k];
-      }
-
-    }
-  }
-
-  #pragma unroll
-  for (int i = 0; i < num; i++) {
-    //if(b + i < B)
-    y4d(b + i, m, 0, tx) = C[i];
-
-  }
-	
-
-#undef y4d 
-#undef x4d
-}

 __global__ void depthwise_convNew(float* const __restrict__ y,
 				  const float* const __restrict__ x,
@@ -400,6 +281,165 @@ __global__ void depthwise_convNew8_half(__half* const __restrict__ y,
  #undef x4d
 }

+__global__ void depthwise_convNew8_half1(__half* const __restrict__ y,
+					const __half* const __restrict__ x,
+					const __half* const __restrict__ w,
+					const int B, const int M,
+					const int H, const int W, const int KH,
+					const int KW, const int H_out, const int W_out,
+					const int H_pad, const int W_pad,
+					const int H_stride, const int W_stride)
+{
+
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 8;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+	
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
+
+    __half c0 = 0;
+    __half c1 = 0;
+    __half c2 = 0;
+    __half c3 = 0;
+    __half c4 = 0;
+    __half c5 = 0;
+    __half c6 = 0;
+    __half c7 = 0;
+	
+    const __half* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
+
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
+
+	c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0);
+      }
+    }
+
+    if(b + 1 < B){
+      for (int k = 0; k < KH * KW; k++) {
+	int p = k / KW;
+	int q = k % KW;
+
+	if (start_h + p > -1 && start_h + p < H &&
+	    start_w + q > -1 && start_w + q < W) {
+
+	  c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1);
+	}
+      }
+    }
+
+    if(b + 2 < B){
+      for (int k = 0; k < KH * KW; k++) {
+	int p = k / KW;
+	int q = k % KW;
+
+	if (start_h + p > -1 && start_h + p < H &&
+	    start_w + q > -1 && start_w + q < W) {
+
+	  c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2);
+	}
+      }
+    }
+
+    if(b + 3 < B){
+      for (int k = 0; k < KH * KW; k++) {
+	int p = k / KW;
+	int q = k % KW;
+
+	if (start_h + p > -1 && start_h + p < H &&
+	    start_w + q > -1 && start_w + q < W) {
+
+	  c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3);
+	}
+      }
+    }
+
+    if(b + 4 < B){
+      for (int k = 0; k < KH * KW; k++) {
+	int p = k / KW;
+	int q = k % KW;
+
+	if (start_h + p > -1 && start_h + p < H &&
+	    start_w + q > -1 && start_w + q < W) {
+
+	  c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4);
+	}
+      }
+    }
+
+    if(b + 5 < B){
+      for (int k = 0; k < KH * KW; k++) {
+	int p = k / KW;
+	int q = k % KW;
+
+	if (start_h + p > -1 && start_h + p < H &&
+	    start_w + q > -1 && start_w + q < W) {
+
+	  c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5);
+	}
+      }
+    }
+
+    if(b + 6 < B){
+      for (int k = 0; k < KH * KW; k++) {
+	int p = k / KW;
+	int q = k % KW;
+
+	if (start_h + p > -1 && start_h + p < H &&
+	    start_w + q > -1 && start_w + q < W) {
+
+	  c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6);
+	}
+      }
+    }
+
+    if(b + 7 < B){
+      for (int k = 0; k < KH * KW; k++) {
+	int p = k / KW;
+	int q = k % KW;
+
+	if (start_h + p > -1 && start_h + p < H &&
+	    start_w + q > -1 && start_w + q < W) {
+
+	  c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7);
+	}
+      }
+    }
+
+    
+
+    y4d(b, m, 0, tx) = c0;	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = c1;
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = c2;
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = c3;
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = c4;
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = c5;
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = c6;
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = c7;
+  }
+	
+  #undef y4d 
+  #undef x4d
+}


 __global__ void depthwise_convNew12(float* const __restrict__ y,
@@ -507,6 +547,374 @@ __global__ void depthwise_convNew12(float* const __restrict__ y,
 }


+__global__ void depthwise_convNew12_half(__half* const __restrict__ y,
+				    const __half* const __restrict__ x,
+				    const __half* const __restrict__ w,
+				    const int B, const int M,
+				    const int H, const int W, const int KH,
+				    const int KW, const int H_out, const int W_out,
+				    const int H_pad, const int W_pad,
+				    const int H_stride, const int W_stride)
+{
+
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 12;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+	
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
+
+    __half c0 = 0;
+    __half c1 = 0;
+    __half c2 = 0;
+    __half c3 = 0;
+    __half c4 = 0;
+    __half c5 = 0;
+    __half c6 = 0;
+    __half c7 = 0;
+    __half c8 = 0;
+    __half c9 = 0;
+    __half c10 = 0;
+    __half c11 = 0;
+	
+    const __half* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
+
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
+
+	c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0);
+	if(b + 1 < B)
+	  c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1);
+	if(b + 2 < B)
+	  c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2);
+	if(b + 3 < B)
+	  c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3);
+	if(b + 4 < B)
+	  c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4);
+	if(b + 5 < B)
+	  c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5);
+	if(b + 6 < B)
+	  c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6);
+	if(b + 7 < B)
+	  c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7);
+	if(b + 8 < B)
+	  c8 = __hfma(x4d(b + 8, m, start_h + p, start_w + q), weights[k], c8);
+	if(b + 9 < B)
+	  c9 = __hfma(x4d(b + 9, m, start_h + p, start_w + q), weights[k], c9);
+	if(b + 10 < B)
+	  c10 = __hfma(x4d(b + 10, m, start_h + p, start_w + q), weights[k], c10);
+	if(b + 11 < B)
+	  c11 = __hfma(x4d(b + 11, m, start_h + p, start_w + q), weights[k], c11);
+    
+
+      }
+    }
+
+    y4d(b, m, 0, tx) = c0;	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = c1;
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = c2;
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = c3;
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = c4;
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = c5;
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = c6;
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = c7;
+    if(b + 8 < B)
+      y4d(b + 8, m, 0, tx) = c8;
+    if(b + 9 < B)
+      y4d(b + 9, m, 0, tx) = c9;
+    if(b + 10 < B)
+      y4d(b + 10, m, 0, tx) = c10;
+    if(b + 11 < B)
+      y4d(b + 11, m, 0, tx) = c11;
+	
+  }
+	
+  #undef y4d 
+  #undef x4d
+}
+
+
+__global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
+					const __half* const __restrict__ x,
+					const __half* const __restrict__ w,
+					const int B, const int M,
+					const int H, const int W, const int KH,
+					const int KW, const int H_out, const int W_out,
+					const int H_pad, const int W_pad,
+					const int H_stride, const int W_stride)
+{
+
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 8;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+	
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
+
+    __half2 c0 = __half2half2(0);
+    __half2 c1 = __half2half2(0);
+    __half2 c2 = __half2half2(0);
+    __half2 c3 = __half2half2(0);
+    	
+    const __half* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
+
+      
+	__half2 t1;
+	__half2 t2;
+	__half2 t3;
+	__half2 t4;
+	if(b + 7 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
+	  t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q));
+	}
+	else if(b + 6 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
+	  t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
+
+	}
+	else if(b + 5 < B){
+	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	    t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
+	}
+	else if(b + 4 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	  t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
+
+	}
+	else if(b + 3 < B){
+	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	 }
+	else if(b + 2 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
+
+	}
+	else if(b + 1 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	}
+	else{
+	  t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
+
+	 }
+
+	
+	c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
+	c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
+	c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
+	c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
+
+      }
+    }
+
+    y4d(b, m, 0, tx) = __high2half(c0);	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = __low2half(c0);
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = __high2half(c1);
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = __low2half(c1);
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = __high2half(c2);
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = __low2half(c2);
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = __high2half(c3);
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = __low2half(c3);
+  }
+	
+  #undef y4d 
+  #undef x4d
+}
+
+
+//When stride is 1
+__global__ void depthwise_conv4_half3(__half* const __restrict__ y,
+	const __half* const __restrict__ x,
+	const __half* const __restrict__ w,
+	const int B, const int M,
+	const int H, const int W, const int KH,
+	const int KW, const int H_out, const int W_out,
+	const int H_pad, const int W_pad,
+	const int C_dim, const int H_dim, const int W_dim)
+{
+
+#define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+#define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+	const int num = 1;
+
+	const int b = num * blockIdx.x;
+	const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
+
+	if (m < M) {
+		const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
+
+		const int start_h = (tx / W_out) - H_pad;
+		const int start_w = (tx % W_out) - W_pad;
+
+		const int bstart_h = (blockIdx.y * blockDim.x % (H_out * W_out)) / W_out - H_pad;
+		const int bstart_w = (blockIdx.y * blockDim.x % (H_out * W_out)) % W_out - H_pad;
+		const int bstartm = (blockIdx.y * blockDim.x / (H_out * W_out));
+
+		extern __shared__ __half xdata[];
+
+		for (int i = 0; i < C_dim * H_dim * W_dim; i += blockDim.x) {
+			if (i / (H_dim * W_dim) + bstartm < M && (i % (H_dim * W_dim)) / W_dim + bstart_h > -1 &&
+				(i % (H_dim * W_dim)) / W_dim + bstart_h < H && (i % (H_dim * W_dim)) % W_dim + bstart_w > -1 &&
+				(i % (H_dim * W_dim)) % W_dim + bstart_w < W) {
+				xdata[i] = x4d(b, i / (H_dim * W_dim) + bstartm, (i % (H_dim * W_dim)) / W_dim + bstart_h,
+					(i % (H_dim * W_dim)) % W_dim + bstart_w);
+			}
+		}
+		__syncthreads();
+
+		__half c0;
+		const __half* weights = &w[m * KH * KW];
+
+		for (int k = 0; k < KH * KW; k++) {
+			int p = k / KW;
+			int q = k % KW;
+			if (start_h + p > -1 && start_h + p < H &&
+				start_w + q > -1 && start_w + q < W) {
+
+
+				__half t1;
+
+				int total = C_dim * H_dim * W_dim;
+				t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim +
+					start_w + q - bstart_w];
+
+
+				c0 = __hfma(t1, weights[k], c0);
+			}
+		}
+
+		y4d(b, m, 0, tx) = c0;
+
+
+	}
+
+#undef y4d 
+#undef x4d
+}
+
+
+__global__ void depthwise_convNew4_half2(__half* const __restrict__ y,
+					const __half* const __restrict__ x,
+					const __half* const __restrict__ w,
+					const int B, const int M,
+					const int H, const int W, const int KH,
+					const int KW, const int H_out, const int W_out,
+					const int H_pad, const int W_pad,
+					const int H_stride, const int W_stride)
+{
+
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 4;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+	
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
+
+    __half2 c0 = __half2half2(0);
+    __half2 c1 = __half2half2(0);
+ 
+    const __half* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
+
+      
+	__half2 t1;
+	__half2 t2;
+	if(b + 3 < B){
+	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	 }
+	else if(b + 2 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
+
+	}
+	else if(b + 1 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	}
+	else{
+	  t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
+
+	 }
+
+	
+	c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
+	c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
+	
+      }
+    }
+
+    y4d(b, m, 0, tx) = __high2half(c0);	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = __low2half(c0);
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = __high2half(c1);
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = __low2half(c1);
+
+  }
+	
+  #undef y4d 
+  #undef x4d
+}
+
+
 void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 			int vertical_pad, int horizontal_pad,
 			int vertical_stride, int horizontal_stride,
@@ -578,7 +986,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
    */
 		
    int blockSize;
-    blockSize = 128;
+    blockSize = 64;
 		
    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
    dim3 block(blockSize);
@@ -797,10 +1205,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
    const int KW = filter->dims.dim_sizes[3];
    int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-
    
    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-
+    

    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
 				      CUDNN_TENSOR_NCHW, n, c, h, w);
@@ -816,10 +1223,10 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,

    int blockSize;
    blockSize = 128;
-		
+
    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
    dim3 block(blockSize);
-    depthwise_convNew8_half<<<grid, block>>> ((__half*)output_half->gpu_data,
+    depthwise_convNew8_half2<<<grid, block>>> ((__half*)output_half->gpu_data,
 					      (__half*)input_half->gpu_data, (__half*)filter_half->gpu_data,
 					      input->dims.dim_sizes[0], input->dims.dim_sizes[1],
 					      input->dims.dim_sizes[2], input->dims.dim_sizes[3],
@@ -934,6 +1341,213 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,

 }

+void* tensorHalfConvCutlass2(void* input_ptr, void* filter_ptr,
+			    int vertical_pad, int horizontal_pad,
+			    int vertical_stride, int horizontal_stride,
+			    int conv_mode, int conv_groups){
+
+	INFO("*** TensorHConvolution \n");
+	profileEvent("#Conv");
+
+	Tensor* input = (Tensor*)input_ptr;
+	Tensor* filter = (Tensor*)filter_ptr;
+
+	cudnnConvolutionDescriptor_t convDesc;
+	cudnnConvolutionFwdAlgo_t convAlgo;
+	cudnnConvolutionMode_t mode;
+	if (conv_mode == 0)
+		mode = CUDNN_CONVOLUTION;
+	else if (conv_mode == 1)
+		mode = CUDNN_CROSS_CORRELATION;
+
+	// FIXIT: Need to be more aware of the implications of alpha and beta
+	float alpha = 1.0f, beta = 0.0f;
+	// NOTE: compute in half precision
+	cudnnDataType_t computeType = CUDNN_DATA_HALF;
+
+	// NOTE: Moving inputs to GPU global memory
+	hostToDeviceCopy(input);
+	hostToDeviceCopy(filter);
+
+
+	/***** CONVERSIONS from FP32 to FP16 - on the GPU */
+	size_t* input_dims = input->dims.dim_sizes;
+	size_t* filter_dims = filter->dims.dim_sizes;
+
+
+	profileEvent("F2H_start");
+
+	Tensor* input_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW,
+		input_dims[0], input_dims[1],
+		input_dims[2], input_dims[3]);
+
+
+	changeTensorPlacement(input_half, DEVICE);
+	Tensor* filter_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW,
+		filter_dims[0], filter_dims[1],
+		filter_dims[2], filter_dims[3]);
+
+
+	changeTensorPlacement(filter_half, DEVICE);
+
+
+	f2h((float*)input->gpu_data, input->num_elems, (half*)input_half->gpu_data);
+	f2h((float*)filter->gpu_data, filter->num_elems, (half*)filter_half->gpu_data);
+
+
+	/******* END OF INPUT DATA CONVERSIONS*/
+	profileEvent("F2H_end");
+
+	Tensor* output;
+	Tensor* output_half;
+
+
+	if (conv_groups > 1 && horizontal_stride == 1 && vertical_stride == 1) {
+		int n = input->dims.dim_sizes[0];
+		int c = input->dims.dim_sizes[1];
+		const int KH = filter->dims.dim_sizes[2];
+		const int KW = filter->dims.dim_sizes[3];
+		int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+		int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+
+		DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+
+
+		output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+			CUDNN_TENSOR_NCHW, n, c, h, w);
+		// FIXIT: more checks for data types needed
+		output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF,
+			CUDNN_TENSOR_NCHW, n, c, h, w);
+
+
+
+		// NOTE: Changing output tensor placement from host to device
+		changeTensorPlacement(output, DEVICE);
+		// NOTE: Necessary to insert the above call for every output tensor
+
+		int blockSize;
+		blockSize = 128;
+
+		dim3 grid(((n + 3) / 4), (c * h * w + blockSize - 1) / blockSize);
+		dim3 block(blockSize);
+		int C_dim = blockSize / (h * w) + 1 + 1;
+		int H_dim = blockSize % (h * w) / w + 1 + KH + 1;
+		int W_dim = blockSize % (h * w) % w + 1 + KW + 1;
+		depthwise_conv4_half3 << <grid, block, sizeof(__half)* C_dim* H_dim* W_dim >> > ((__half*)output_half->gpu_data,
+			(__half*)input_half->gpu_data, (__half*)filter_half->gpu_data,
+			input->dims.dim_sizes[0], input->dims.dim_sizes[1],
+			input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+			KH, KW, h, w,
+			vertical_pad, horizontal_pad, C_dim, H_dim, W_dim);
+		cudaDeviceSynchronize();
+
+
+	}
+	else {
+		checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
+
+		//FIXME: Current hack to preserve backward compatibilty
+		if (conv_groups == 0) {
+			conv_groups = 1;
+		}
+
+		// NOTE: Adding support for grouped convolution
+		checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
+
+
+		// FIXIT: Think if upscaling values need to be configurable?
+		// IMP-FIXIT:  CUDNN Cross correlation is only used in the Lenet context
+		// IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
+		checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
+			vertical_pad, horizontal_pad, // conv padding
+			vertical_stride, horizontal_stride, // conv strides
+			1, 1, // upscaling values
+			mode, // mode is configurable
+			computeType)); // defines compute precision
+
+		int n, c, h, w; // output dimensions
+		// Find dimension of convolution output
+		checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
+			input->tensor_desc,
+			filter->filter_desc,
+			&n, &c, &h, &w));
+		DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+
+
+		output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+			CUDNN_TENSOR_NCHW, n, c, h, w);
+		// FIXIT: more checks for data types needed
+		output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF,
+			CUDNN_TENSOR_NCHW, n, c, h, w);
+
+
+
+		// NOTE: Changing output tensor placement from host to device
+		changeTensorPlacement(output, DEVICE);
+		// NOTE: Necessary to insert the above call for every output tensor
+
+		DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
+			output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
+			output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+
+		if (convDesc == NULL || input->tensor_desc == NULL ||
+			filter->filter_desc == NULL || output->tensor_desc == NULL)
+			ERROR("NULL descriptor! \n");
+
+
+		// NOTE: The following algo works with TRUE half precision
+		convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+		//convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+
+
+		size_t workspace_size;
+		checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
+			input_half->tensor_desc,
+			filter_half->filter_desc,
+			convDesc,
+			output_half->tensor_desc,
+			convAlgo,
+			&workspace_size));
+
+		// Allocating memory for the convolution workspace
+		DEBUG("workspace size = %d \n", workspace_size);
+		void* workspace;
+		checkCudaErrors(cudaMalloc(&workspace, workspace_size));
+
+
+
+
+		checkCUDNN(cudnnConvolutionForward(cudnnHandle,
+			&alpha,
+			input_half->tensor_desc,
+			input_half->gpu_data,
+			filter_half->filter_desc,
+			filter_half->gpu_data,
+			convDesc, convAlgo, workspace, workspace_size,
+			&beta,
+			output_half->tensor_desc,
+			output_half->gpu_data));
+
+	}
+
+	profileEvent("H2F_start");
+
+	// NOTE: Transforming half precision output to single precision
+	h2f((half*)output_half->gpu_data, output->num_elems, (float*)output->gpu_data);
+
+	profileEvent("H2F_end");
+
+	profileEvent("#Conv_end");
+
+
+	freeTensor(input_half);
+	freeTensor(filter_half);
+	freeTensor(output_half);
+
+	return output;
+
+}
+

 // Perforated Tensor Conv with 'perforation_rate' parameter
 void* tensorConvPerf(void* input, void* filter,