diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
index 7359651100d8c181b29e2096ba73db9c026e51cb..e0940eea58c3a14bda5b7d00d2033c7da8df68a7 100644
--- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -54,7 +54,7 @@ link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH} $ENV{CUDNN_PATH
 set(
   RUNTIME_SRCS_FILENAME
   approx_simulation.cu
-  approx_techniques.cu
+  group_conv.cu
   approx_techniques2.cu
   common.cpp
   configuration.cpp
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
deleted file mode 100644
index 14eab13522acc8402cd9f7d745343a30e5028abf..0000000000000000000000000000000000000000
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+++ /dev/null
@@ -1,1191 +0,0 @@
-
-#include "tensor_utils.h"
-#include "fp16_gemm.h"
-#include "debug.h"
-#include "global_data.h"
-#include "profiling.h"
-#include "op_overheads.h"
-#include "error.h"
-
-
-extern "C"{
-
-
-
-__global__ void depthwise_convNew8(float* const __restrict__ y,
-				   const float* const __restrict__ x,
-				   const float* const __restrict__ w,
-				   const int B, const int M,
-				   const int H, const int W, const int KH,
-				   const int KW, const int H_out, const int W_out,
-				   const int H_pad, const int W_pad,
-				   const int H_stride, const int W_stride)
-{
-
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
-
-  const int num = 8;
-
-  const int b = num * blockIdx.x;
-  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
-	
-  if(m < M){
-    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
-
-    const int start_h = (tx / W_out) * H_stride - H_pad;
-    const int start_w = (tx % W_out) * W_stride - W_pad;
-
-    float c0 = 0;
-    float c1 = 0;
-    float c2 = 0;
-    float c3 = 0;
-    float c4 = 0;
-    float c5 = 0;
-    float c6 = 0;
-    float c7 = 0;
-	
-    const float* weights = &w[m * KH * KW];
-
-    for (int k = 0; k < KH * KW; k++) {
-      int p = k / KW;
-      int q = k % KW;
-
-      if (start_h + p > -1 && start_h + p < H &&
-	  start_w + q > -1 && start_w + q < W) {
-
-	c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
-	if(b + 1 < B)
-	  c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
-	if(b + 2 < B)
-	  c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
-	if(b + 3 < B)
-	  c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
-	if(b + 4 < B)
-	  c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
-	if(b + 5 < B)
-	  c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
-	if(b + 6 < B)
-	  c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
-	if(b + 7 < B)
-	  c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
-    
-
-      }
-    }
-
-    y4d(b, m, 0, tx) = c0;	
-    if(b + 1 < B)
-      y4d(b + 1, m, 0, tx) = c1;
-    if(b + 2 < B)
-      y4d(b + 2, m, 0, tx) = c2;
-    if(b + 3 < B)
-      y4d(b + 3, m, 0, tx) = c3;
-    if(b + 4 < B)
-      y4d(b + 4, m, 0, tx) = c4;
-    if(b + 5 < B)
-      y4d(b + 5, m, 0, tx) = c5;
-    if(b + 6 < B)
-      y4d(b + 6, m, 0, tx) = c6;
-    if(b + 7 < B)
-      y4d(b + 7, m, 0, tx) = c7;
-  }
-	
-  #undef y4d 
-  #undef x4d
-}
-
-
-
-
-__global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
-					const __half* const __restrict__ x,
-					const __half* const __restrict__ w,
-					const int B, const int M,
-					const int H, const int W, const int KH,
-					const int KW, const int H_out, const int W_out,
-					const int H_pad, const int W_pad,
-					const int H_stride, const int W_stride)
-{
-
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
-
-  const int num = 8;
-
-  const int b = num * blockIdx.x;
-  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
-	
-  if(m < M){
-    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
-
-    const int start_h = (tx / W_out) * H_stride - H_pad;
-    const int start_w = (tx % W_out) * W_stride - W_pad;
-
-    __half2 c0 = __half2half2(0);
-    __half2 c1 = __half2half2(0);
-    __half2 c2 = __half2half2(0);
-    __half2 c3 = __half2half2(0);
-    	
-    const __half* weights = &w[m * KH * KW];
-
-    for (int k = 0; k < KH * KW; k++) {
-      int p = k / KW;
-      int q = k % KW;
-      if (start_h + p > -1 && start_h + p < H &&
-	  start_w + q > -1 && start_w + q < W) {
-
-      
-	__half2 t1;
-	__half2 t2;
-	__half2 t3;
-	__half2 t4;
-	if(b + 7 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	  t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q));
-	}
-	else if(b + 6 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	  t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 5 < B){
-	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	    t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	}
-	else if(b + 4 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 3 < B){
-	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	 }
-	else if(b + 2 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 1 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	}
-	else{
-	  t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
-
-	 }
-
-	
-	c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
-	c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
-	c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
-	c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
-
-      }
-    }
-
-    y4d(b, m, 0, tx) = __high2half(c0);	
-    if(b + 1 < B)
-      y4d(b + 1, m, 0, tx) = __low2half(c0);
-    if(b + 2 < B)
-      y4d(b + 2, m, 0, tx) = __high2half(c1);
-    if(b + 3 < B)
-      y4d(b + 3, m, 0, tx) = __low2half(c1);
-    if(b + 4 < B)
-      y4d(b + 4, m, 0, tx) = __high2half(c2);
-    if(b + 5 < B)
-      y4d(b + 5, m, 0, tx) = __low2half(c2);
-    if(b + 6 < B)
-      y4d(b + 6, m, 0, tx) = __high2half(c3);
-    if(b + 7 < B)
-      y4d(b + 7, m, 0, tx) = __low2half(c3);
-  }
-	
-  #undef y4d 
-  #undef x4d
-}
-
-
-  //When stride is 1
-  __global__ void depthwise_conv4_half3(__half* const __restrict__ y,
-					const __half* const __restrict__ x,
-					const __half* const __restrict__ w,
-					const int B, const int M,
-					const int H, const int W, const int KH,
-					const int KW, const int H_out, const int W_out,
-					const int H_pad, const int W_pad,
-					const int C_dim, const int H_dim, const int W_dim)
-  {
-
-#define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-#define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
-
-    const int num = 1;
-
-    const int b = num * blockIdx.x;
-    const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
-
-    if (m < M) {
-      const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
-
-      const int start_h = (tx / W_out) - H_pad;
-      const int start_w = (tx % W_out) - W_pad;
-
-      const int bstart_h = (blockIdx.y * blockDim.x % (H_out * W_out)) / W_out - H_pad;
-      const int bstart_w = (blockIdx.y * blockDim.x % (H_out * W_out)) % W_out - H_pad;
-      const int bstartm = (blockIdx.y * blockDim.x / (H_out * W_out));
-
-      extern __shared__ __half xdata[];
-
-      for (int i = 0; i < C_dim * H_dim * W_dim; i += blockDim.x) {
-	if (i / (H_dim * W_dim) + bstartm < M && (i % (H_dim * W_dim)) / W_dim + bstart_h > -1 &&
-	    (i % (H_dim * W_dim)) / W_dim + bstart_h < H && (i % (H_dim * W_dim)) % W_dim + bstart_w > -1 &&
-	    (i % (H_dim * W_dim)) % W_dim + bstart_w < W) {
-	  xdata[i] = x4d(b, i / (H_dim * W_dim) + bstartm, (i % (H_dim * W_dim)) / W_dim + bstart_h,
-			 (i % (H_dim * W_dim)) % W_dim + bstart_w);
-	}
-      }
-      __syncthreads();
-
-      __half c0;
-      const __half* weights = &w[m * KH * KW];
-
-      for (int k = 0; k < KH * KW; k++) {
-	int p = k / KW;
-	int q = k % KW;
-	if (start_h + p > -1 && start_h + p < H &&
-	    start_w + q > -1 && start_w + q < W) {
-
-
-	  __half t1;
-
-	  //int total = C_dim * H_dim * W_dim;
-	  t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim +
-		     start_w + q - bstart_w];
-
-
-	  c0 = __hfma(t1, weights[k], c0);
-	}
-      }
-
-      y4d(b, m, 0, tx) = c0;
-
-
-    }
-
-  #undef y4d 
-  #undef x4d
- }
-
-
-
-void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){
-
-
-  INFO("*** TensorConvolution \n");
-  profileEvent("Conv");
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-
-  //FIXME: Current hack to preserve backward compatibilty
-  if (conv_groups == 0) {
-    conv_groups = 1;
-  }
-
-  Tensor* output;
-
-  hostToDeviceCopy(input);
-  hostToDeviceCopy(filter);
-
-  convertToFP32(input);
-  convertToFP32(filter);
-
-  
-  if (conv_groups > 32) {
-    // TODO: Support other cases;  
-    hostToDeviceCopy(input);
-    hostToDeviceCopy(filter);
-
-    int n, c, h, w; // output dimensions  
-    n = input->dims.dim_sizes[0];
-    c = input->dims.dim_sizes[1];
-    const int KH = filter->dims.dim_sizes[2];
-    const int KW = filter->dims.dim_sizes[3];
-    h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-    w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-
-
-		
-    int blockSize;
-    blockSize = 64;
-		
-    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
-    dim3 block(blockSize);
-    depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data,
-					 (float*)input->gpu_data, (float*)filter->gpu_data,
-					 input->dims.dim_sizes[0], input->dims.dim_sizes[1],
-					 input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					 KH, KW, h, w, vertical_pad, horizontal_pad,
-					 vertical_stride, horizontal_stride);
-
-  }
-  else {
-
-    cudnnConvolutionDescriptor_t convDesc;
-    cudnnConvolutionFwdAlgo_t convAlgo;
-    cudnnConvolutionMode_t mode;
-    if (conv_mode == 0)
-      mode = CUDNN_CONVOLUTION;
-    else if (conv_mode == 1)
-      mode = CUDNN_CROSS_CORRELATION;
-
-    // FIXIT: Need to be more aware of the implications of alpha and beta
-    float alpha = 1.0f, beta = 0.0f;
-
-    // TODO: Support other cases;  
-    hostToDeviceCopy(input);
-    hostToDeviceCopy(filter);
-
-    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
-
-    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
-
-    // NOTE: Adding support for grouped convolution
-    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
-
-
-    cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-    // FIXIT: Think if upscaling values need to be configurable?
-    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					       vertical_pad, horizontal_pad, // conv padding
-					       vertical_stride, horizontal_stride, // conv strides
-					       1, 1, // upscaling values
-					       mode, // mode is configurable
-					       computeType)); // defines compute precision
-
-    int n, c, h, w; // output dimensions  
-    // Find dimension of convolution output
-    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     &n, &c, &h, &w));
-
-
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-
-    if (input->data_format == CUDNN_TENSOR_NCHW)
-      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
-    else if (input->data_format == CUDNN_TENSOR_NHWC) {
-      DEBUG("* NHWC Format \n");
-      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				       CUDNN_TENSOR_NHWC, n, h, w, c);
-    }
-    else
-      ERROR("Unsupported Tensor Type");
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-
-    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	  output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
-
-    if (convDesc == NULL || input->tensor_desc == NULL ||
-	filter->filter_desc == NULL || output->tensor_desc == NULL)
-      ERROR("NULL descriptor! \n");
-
-
-    // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-    checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   convDesc,
-						   output->tensor_desc,
-						   CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						   //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						   0,
-						   &convAlgo));
-
-
-    DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	  CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	  CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
-
-    // FIXIT: Algo shouldn't be hardcoded
-    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-
-    size_t workspace_size;
-    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						       input->tensor_desc,
-						       filter->filter_desc,
-						       convDesc,
-						       output->tensor_desc,
-						       convAlgo,
-						       &workspace_size));
-
-    // Allocating memory for the convolution workspace
-    void* workspace;
-    checkCudaErrors(cudaMalloc(&workspace, workspace_size));
-    DEBUG("workspace size = %d \n", workspace_size);
-
-
-    checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				       input->gpu_data, filter->filter_desc, filter->gpu_data,
-				       convDesc, convAlgo, workspace, workspace_size,
-				       &beta, output->tensor_desc, output->gpu_data));
-  }
-
-  cudaDeviceSynchronize();
-  profileEvent("Conv_end", true);
-
-
-  #ifdef ERROR_INJECTION_ENABLED
-
-  if (op_counter >= total_ops) {
-    ERROR("No accuracy flag found \n");
-  }
-
-  int op_acc = op_accuracies[op_counter];
-
-  // Skip errorInjection if explicitly requested
-  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
-    op_acc = 0;
-  }
-
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorConv", op_acc);
-  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
-
-  op_counter++;
-
-  #endif   
-
-  return output;
-
-
-}
-
-// FIXME: Need to properly fix the new HALF type conversion
-void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
-			    int vertical_pad, int horizontal_pad,
-			    int vertical_stride, int horizontal_stride,
-			    int conv_mode, int conv_groups){
-
-  INFO("*** TensorHConvolution \n");
-  profileEvent("#Conv");
-
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
-
-  cudnnConvolutionDescriptor_t convDesc;
-  cudnnConvolutionFwdAlgo_t convAlgo;
-  cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
-    mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
-    mode = CUDNN_CROSS_CORRELATION;
-
-  // FIXIT: Need to be more aware of the implications of alpha and beta
-  float alpha = 1.0f, beta = 0.0f;
-  // NOTE: compute in half precision
-  cudnnDataType_t computeType = CUDNN_DATA_HALF;
-
-  // NOTE: Moving inputs to GPU global memory
-  hostToDeviceCopy(input);
-  hostToDeviceCopy(filter);
-
-
-  // Float-Half Conversions
-  profileEvent("F2H_start");
-
-  convertToFP16(input);
-  convertToFP16(filter);  
-
-  profileEvent("F2H_end");
-  /******* END OF INPUT DATA CONVERSIONS*/
-
-  
-  Tensor *output;
-  if(conv_groups > 1){
-    int n = input->dims.dim_sizes[0];
-    int c = input->dims.dim_sizes[1];
-    const int KH = filter->dims.dim_sizes[2];
-    const int KW = filter->dims.dim_sizes[3];
-    int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-    
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-    
-
-    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, 
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-
-  
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-
-    int blockSize;
-    blockSize = 128;
-
-    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
-    dim3 block(blockSize);
-    depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data,
-					      (__half*) input->gpu_half_data,
-					      (__half*) filter->gpu_half_data,
-					      input->dims.dim_sizes[0], input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					      KH, KW, h, w,
-					      vertical_pad, horizontal_pad,
-					      vertical_stride, horizontal_stride);
-    cudaDeviceSynchronize();
-
-    
-  }
-  else{    
-    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
-
-    //FIXME: Current hack to preserve backward compatibilty
-    if(conv_groups == 0){
-      conv_groups = 1;
-    }
-  
-    // NOTE: Adding support for grouped convolution
-    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
-
-  
-    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					       vertical_pad, horizontal_pad, // conv padding
-					       vertical_stride, horizontal_stride, // conv strides
-					       1, 1, // upscaling values
-					       mode, // mode is configurable
-					       computeType)); // defines compute precision
-
-    int n, c, h, w; // output dimensions
-    // Find dimension of convolution output
-    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						     input->tensor_half_desc,
-						     filter->filter_half_desc,
-						     &n, &c, &h, &w));
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-
-
-    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-
-  
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-
-    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
-	  output->data_type, output->data_format,
-	  output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
-
-    if(convDesc == NULL || input->tensor_desc == NULL ||
-       filter->filter_desc == NULL || output->tensor_desc == NULL)
-      ERROR("NULL descriptor! \n");
-
-
-    // NOTE: The following algo works with TRUE half precision
-    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-
-  
-    size_t workspace_size;
-    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						       input->tensor_half_desc,
-						       filter->filter_half_desc,
-						       convDesc,
-						       output->tensor_half_desc,
-						       convAlgo,
-						       &workspace_size));
-
-    // Allocating memory for the convolution workspace
-    DEBUG("workspace size = %d \n", workspace_size);
-    void* workspace;
-    checkCudaErrors(cudaMalloc(&workspace, workspace_size));
-
-
-
-
-    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
-				       &alpha,
-				       input->tensor_half_desc,
-				       input->gpu_half_data,
-				       filter->filter_half_desc,
-				       filter->gpu_half_data,
-				       convDesc, convAlgo, workspace, workspace_size,
-				       &beta,
-				       output->tensor_half_desc,
-				       output->gpu_half_data));
-
-  }
-  
-  profileEvent("H2F_start");
-
-  convertToFP32_offline(output);
-  
-  profileEvent("H2F_end");
-
-
-
-  profileEvent("#Conv_end");
-
-
-  
-  return output;
-
-}
-
-void* tensorHalfConvCutlass2(void* input_ptr, void* filter_ptr,
-			     int vertical_pad, int horizontal_pad,
-			     int vertical_stride, int horizontal_stride,
-			     int conv_mode, int conv_groups){
-
-  INFO("*** TensorHConvolution \n");
-  profileEvent("#Conv");
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-
-  cudnnConvolutionDescriptor_t convDesc;
-  cudnnConvolutionFwdAlgo_t convAlgo;
-  cudnnConvolutionMode_t mode;
-  if (conv_mode == 0)
-    mode = CUDNN_CONVOLUTION;
-  else if (conv_mode == 1)
-    mode = CUDNN_CROSS_CORRELATION;
-
-  // FIXIT: Need to be more aware of the implications of alpha and beta
-  float alpha = 1.0f, beta = 0.0f;
-  // NOTE: compute in half precision
-  cudnnDataType_t computeType = CUDNN_DATA_HALF;
-
-  // NOTE: Moving inputs to GPU global memory
-  hostToDeviceCopy(input);
-  hostToDeviceCopy(filter);
-
-
-  /***** CONVERSIONS from FP32 to FP16 - on the GPU */
-  size_t* input_dims = input->dims.dim_sizes;
-  size_t* filter_dims = filter->dims.dim_sizes;
-
-
-  profileEvent("F2H_start");
-
-  Tensor* input_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW,
-					       input_dims[0], input_dims[1],
-					       input_dims[2], input_dims[3]);
-
-
-  changeTensorPlacement(input_half, DEVICE);
-  Tensor* filter_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW,
-						filter_dims[0], filter_dims[1],
-						filter_dims[2], filter_dims[3]);
-
-
-  changeTensorPlacement(filter_half, DEVICE);
-
-
-  f2h((float*)input->gpu_data, input->num_elems, (half*)input_half->gpu_data);
-  f2h((float*)filter->gpu_data, filter->num_elems, (half*)filter_half->gpu_data);
-
-
-  /******* END OF INPUT DATA CONVERSIONS*/
-  profileEvent("F2H_end");
-
-  Tensor* output;
-  Tensor* output_half;
-
-
-  if (conv_groups > 1 && horizontal_stride == 1 && vertical_stride == 1) {
-    int n = input->dims.dim_sizes[0];
-    int c = input->dims.dim_sizes[1];
-    const int KH = filter->dims.dim_sizes[2];
-    const int KW = filter->dims.dim_sizes[3];
-    int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-
-
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
-    // FIXIT: more checks for data types needed
-    output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF,
-					  CUDNN_TENSOR_NCHW, n, c, h, w);
-
-
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-
-    int blockSize;
-    blockSize = 128;
-
-    dim3 grid(((n + 3) / 4), (c * h * w + blockSize - 1) / blockSize);
-    dim3 block(blockSize);
-    int C_dim = blockSize / (h * w) + 1 + 1;
-    int H_dim = blockSize % (h * w) / w + 1 + KH + 1;
-    int W_dim = blockSize % (h * w) % w + 1 + KW + 1;
-    depthwise_conv4_half3 << <grid, block,
-			   sizeof(__half)* C_dim* H_dim* W_dim >> >
-      ((__half*) output_half->gpu_data,
-       
-			  (__half*)input_half->gpu_data,
-       (__half*)filter_half->gpu_data,
-       input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2],
-       input->dims.dim_sizes[3],
-       KH, KW, h, w, vertical_pad, horizontal_pad, C_dim, H_dim, W_dim);
-
-    
-    cudaDeviceSynchronize();
-
-
-  }
-  else {
-    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
-
-    //FIXME: Current hack to preserve backward compatibilty
-    if (conv_groups == 0) {
-      conv_groups = 1;
-    }
-
-    // NOTE: Adding support for grouped convolution
-    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
-
-
-    // FIXIT: Think if upscaling values need to be configurable?
-    // IMP-FIXIT:  CUDNN Cross correlation is only used in the Lenet context
-    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					       vertical_pad, horizontal_pad, // conv padding
-					       vertical_stride, horizontal_stride, // conv strides
-					       1, 1, // upscaling values
-					       mode, // mode is configurable
-					       computeType)); // defines compute precision
-
-    int n, c, h, w; // output dimensions
-    // Find dimension of convolution output
-    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     &n, &c, &h, &w));
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-
-
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
-    // FIXIT: more checks for data types needed
-    output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF,
-					  CUDNN_TENSOR_NCHW, n, c, h, w);
-
-
-
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(output, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-
-    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
-	  output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
-
-    if (convDesc == NULL || input->tensor_desc == NULL ||
-	filter->filter_desc == NULL || output->tensor_desc == NULL)
-      ERROR("NULL descriptor! \n");
-
-
-    // NOTE: The following algo works with TRUE half precision
-    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-
-
-    size_t workspace_size;
-    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						       input_half->tensor_desc,
-						       filter_half->filter_desc,
-						       convDesc,
-						       output_half->tensor_desc,
-						       convAlgo,
-						       &workspace_size));
-
-    // Allocating memory for the convolution workspace
-    DEBUG("workspace size = %d \n", workspace_size);
-    void* workspace;
-    checkCudaErrors(cudaMalloc(&workspace, workspace_size));
-
-
-
-
-    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
-				       &alpha,
-				       input_half->tensor_desc,
-				       input_half->gpu_data,
-				       filter_half->filter_desc,
-				       filter_half->gpu_data,
-				       convDesc, convAlgo, workspace, workspace_size,
-				       &beta,
-				       output_half->tensor_desc,
-				       output_half->gpu_data));
-
-  }
-
-  profileEvent("H2F_start");
-
-  // NOTE: Transforming half precision output to single precision
-  h2f((half*)output_half->gpu_data, output->num_elems, (float*)output->gpu_data);
-
-  profileEvent("H2F_end");
-
-  profileEvent("#Conv_end");
-
-
-  freeTensor(input_half);
-  freeTensor(filter_half);
-  freeTensor(output_half);
-
-  return output;
-
-}
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void interpolateCol(int N, int old_w, int n, int c, int h, int w, float *old_data, float *new_data){
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-
-    if(col % 2 == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + col / 2];
-    else if(col == w - 1)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2];
-    else
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2] +
-	 old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col+1) / 2])/2;
-
-  }
-}
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void interpolateRow(int N, int old_h, int n, int c, int h, int w, float *old_data, float *new_data){
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-
-    if(row % 2 == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (row/2) * (w) + col];
-    else if(row == h - 1)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (row-1)/2 * (w) + col];
-    else
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(old_data[n * (c * old_h * w) + ch * (old_h * w) + (row -1)/2 * (w) + col] +
-	 old_data[n * (c * old_h * w) + ch * (old_h * w) + ((row+1) / 2) * (w) + col])/2;
-
-  }
-}
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void interpolateXCol(int N, int old_w, int n, int c, int h, int w,
-		     float *old_data, float *new_data, int num){
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-
-    if(col % num == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + col / num];
-    else{
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-(col%num)) / num];
-    }
-
-  }
-}
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void interpolateXRow(int N, int old_h, int n, int c, int h, int w,
-		     float *old_data, float *new_data, int num){
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-
-    if(row % num == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (row/num) * (w) + col];
-    else{
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (row - (row % num))/num * (w) + col];
-    }
-
-  }
-}
-
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void interpolateColHalf(int N, int old_w, int n, int c, int h, int w, __half *old_data, __half *new_data){
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-
-    if(col % 2 == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + col / 2];
-    else if(col == w - 1)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2];
-    else
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	__hdiv(__hadd(old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col-1) / 2],
-		      old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + (col+1) / 2]),2);
-
-  }
-}
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void interpolateRowHalf(int N, int old_h, int n, int c, int h, int w, __half *old_data, __half *new_data){
-
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int stride = blockDim.x * gridDim.x;
-
-  for(int i = index; i < N; i += stride){
-    int col = ((i % (c * h * w)) % (h * w)) % w;
-    int row = ((i % (c * h * w)) % (h * w)) / w;
-    int ch = (i % (c * h * w)) / (h * w);
-    int n = i / (c * h * w);
-
-    if(row % 2 == 0)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (row/2) * (w) + col];
-    else if(row == h - 1)
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	old_data[n * (c * old_h * w) + ch * (old_h * w) + (row-1)/2 * (w) + col];
-    else
-      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	__hdiv(__hadd(old_data[n * (c * old_h * w) + ch * (old_h * w)
-			       + (row -1)/2 * (w) + col],
-		               old_data[n * (c * old_h * w) + ch * (old_h * w)
-			       + ((row+1) / 2) * (w) + col]), 2);
-
-  }
-}
-
-
-
-  
-
-//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols
-__global__ void convToGemmApprox(float * const __restrict__ output,
-				 const float * const __restrict input, const int N, const int C,
-				 const int H, const int W,
-				 const int KH, const int KW, const int V_pad,
-				 const int H_pad, const int H_out,
-				 const int W_out, const int V_stride,
-				 const int H_stride, const int reduced_filter_elem,
-				 const int skip_every) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
-	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
-	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	  else
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
-	}
-      }
-    }
-  }
-}
-
-
-//produces COL MAJOR matrix with reduced_filter_elem rows and NF cols
-__global__ void createReducedFilters(float * const __restrict__ output,
-				     const float * const __restrict input, const int NF,
-				     const int num_filter_elem, const int reduced_filter_elem,
-				     const int skip_every) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / num_filter_elem; //filter index
-  const int offset = tx % num_filter_elem; //offset within filter
-  if(fIdx < NF) { //is thread id within bounds?
-    if(offset % skip_every != skip_every-1) { //are we including this filter element?
-      const int output_row = offset - (offset/skip_every); //calculate output row, taking skipping into account
-      output[fIdx*reduced_filter_elem + output_row] = input[tx] * 2;
-    }
-  }
-}
-
-
-void* tensorConvolutionKernelSamp(void* input_ptr, void* filter_ptr,
-				  int vertical_pad, int horizontal_pad,
-				  int vertical_stride, int horizontal_stride,
-				  int conv_mode, int conv_groups, int skip_every){
-
-  INFO("*** TensorConvolution (w/ kernel sampling) \n");
-  profileEvent("Conv");
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
-  if (conv_groups == 0) {
-    conv_groups = 1;
-  }
-
-  Tensor* output;
-  // TODO: Support other cases;
-  hostToDeviceCopy(input);
-  hostToDeviceCopy(filter);
-
-
-  convertToFP32(input);
-  convertToFP32(filter);
-  
-  
-  int n, c, h, w; // output dimensions
-  n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
-  const int KH = filter->dims.dim_sizes[2];
-  const int KW = filter->dims.dim_sizes[3];
-  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
-				   CUDNN_TENSOR_NCHW, n, c, h, w);
-  // NOTE: Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE);
-  // NOTE: Necessary to insert the above call for every output tensor
-  //total number of filter elem
-  const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-  //reduced number after skipping
-  const int reduced_filter_elem = num_filter_elem - (num_filter_elem/skip_every);
-  float * convData;
-  int convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
-  checkCudaErrors(cudaMalloc(&convData, convDataSize));
-  float * reducedFilter;
-  checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
-  const int filtBlockSize = 128;
-  const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize;
-  createReducedFilters<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-							(float *)filter->gpu_data, c,
-							num_filter_elem, reduced_filter_elem,
-							skip_every);
-  const int blockSize = 128;
-  const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-  convToGemmApprox<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					    input->dims.dim_sizes[1],
-					    input->dims.dim_sizes[2],
-					    input->dims.dim_sizes[3],
-					    KH, KW, vertical_pad, horizontal_pad, h, w,
-					    vertical_stride, horizontal_stride,
-					    reduced_filter_elem, skip_every);
-  checkCudaErrors(cudaDeviceSynchronize());
-  //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
-  float alpha = 1.0f, beta = 0.0f;
-  checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					    CUBLAS_OP_N, CUBLAS_OP_N,
-					    h * w, c, reduced_filter_elem,
-					    &alpha,
-					    convData, h * w, reduced_filter_elem * h * w,
-					    reducedFilter, reduced_filter_elem, 0,
-					    &beta,
-					    (float *)output->gpu_data, h * w, c * h * w,
-					    n));
-  cudaFree(convData);
-  cudaFree(reducedFilter);
-  profileEvent("Conv_end", true);
-      #ifdef ERROR_INJECTION_ENABLED
-  if (op_counter >= total_ops) {
-    ERROR("No accuracy flag found \n");
-  }
-  int op_acc = op_accuracies[op_counter];
-  // Skip errorInjection if explicitly requested
-  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
-    op_acc = 0;
-  }
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorConv", op_acc);
-  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
-  op_counter++;
-      #endif
-  return output;
-}
-
-
-}// End of Extern C
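
Note: the depthwise kernels above (depthwise_convNew8 and its half-precision variants) all evaluate the same per-channel convolution, differing only in batching and arithmetic precision; depthwise_convNew8 and depthwise_convNew8_half2 are re-added unchanged in the new group_conv.cu below. For reference, a minimal single-threaded CPU sketch of that computation, assuming dense NCHW float buffers (the helper name and signature are illustrative only, not part of the runtime API):

// CPU reference for the depthwise convolution computed by depthwise_convNew8:
// one KH x KW filter per input channel, with zero padding and striding.
void depthwise_conv_reference(float* y, const float* x, const float* w,
                              int B, int M, int H, int W, int KH, int KW,
                              int H_pad, int W_pad, int H_stride, int W_stride) {
  const int H_out = (2 * H_pad + H - KH) / H_stride + 1;
  const int W_out = (2 * W_pad + W - KW) / W_stride + 1;
  for (int b = 0; b < B; b++)
    for (int m = 0; m < M; m++)                      // one filter per channel
      for (int ho = 0; ho < H_out; ho++)
        for (int wo = 0; wo < W_out; wo++) {
          float acc = 0.0f;
          for (int p = 0; p < KH; p++)
            for (int q = 0; q < KW; q++) {
              int hi = ho * H_stride - H_pad + p;    // input row (may fall in padding)
              int wi = wo * W_stride - W_pad + q;    // input col (may fall in padding)
              if (hi >= 0 && hi < H && wi >= 0 && wi < W)
                acc += x[((b * M + m) * H + hi) * W + wi] * w[(m * KH + p) * KW + q];
            }
          y[((b * M + m) * H_out + ho) * W_out + wo] = acc;
        }
}
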
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
index 5fbc7139b95c22fdd91301f4007b0fe06ec08a69..8f2d840362ee523a458339b848e9080a2822d92f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
@@ -1078,6 +1078,43 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
   }
 }
 
+
+
+//produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem cols
+__global__ void convToGemmApprox(float * const __restrict__ output,
+				 const float * const __restrict input, const int N, const int C,
+				 const int H, const int W,
+				 const int KH, const int KW, const int V_pad,
+				 const int H_pad, const int H_out,
+				 const int W_out, const int V_stride,
+				 const int H_stride, const int reduced_filter_elem,
+				 const int skip_every) {
+  
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
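+  // Each thread owns one (n, c, h, w) output position; it writes the KH*KW patch
+  // entries for that position, dropping every skip_every-th filter element and
+  // compacting the survivors into column e - e/skip_every of the GEMM input.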
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
+	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
+	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	  else
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
+	}
+      }
+    }
+  }
+}
+
+
+
 void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 			 int vertical_pad, int horizontal_pad, int vertical_stride,
 			 int horizontal_stride, int conv_mode, int conv_groups,
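
Note: convToGemmApprox (added above, and previously defined in the deleted approx_techniques.cu) performs im2col with filter-element sampling: every skip_every-th filter element is dropped and the survivors are compacted, so the GEMM inner dimension shrinks from num_filter_elem to reduced_filter_elem = num_filter_elem - num_filter_elem/skip_every. In the deleted tensorConvolutionKernelSamp path it is paired with createReducedFilters (which scales the kept weights by 2) and a cublasSgemmStridedBatched call over the reduced dimension. A small host-side sketch of the index mapping, with illustrative names only:

#include <cstdio>

// Returns the compacted GEMM column for filter element e, or -1 if e is skipped.
// Mirrors the index arithmetic in convToGemmApprox / createReducedFilters.
int reduced_column(int e, int skip_every) {
  if (e % skip_every == skip_every - 1)
    return -1;                        // dropped element
  return e - e / skip_every;          // survivors are packed contiguously
}

int main() {
  const int num_filter_elem = 9;      // e.g. a 3x3 filter on a single input channel
  const int skip_every = 3;           // drop every third filter element
  const int reduced_filter_elem = num_filter_elem - num_filter_elem / skip_every;  // = 6
  for (int e = 0; e < num_filter_elem; e++)
    printf("filter elem %d -> column %d\n", e, reduced_column(e, skip_every));
  printf("reduced_filter_elem = %d\n", reduced_filter_elem);
  return 0;
}
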
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fd8a23b9cad89fe9ac6618e8c1b0e962ab27cf15
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
@@ -0,0 +1,594 @@
+
+#include "tensor_utils.h"
+#include "fp16_gemm.h"
+#include "debug.h"
+#include "global_data.h"
+#include "profiling.h"
+#include "op_overheads.h"
+#include "error.h"
+
+
+extern "C"{
+
+
+
+__global__ void depthwise_convNew8(float* const __restrict__ y,
+				   const float* const __restrict__ x,
+				   const float* const __restrict__ w,
+				   const int B, const int M,
+				   const int H, const int W, const int KH,
+				   const int KW, const int H_out, const int W_out,
+				   const int H_pad, const int W_pad,
+				   const int H_stride, const int W_stride)
+{
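+  // Thread mapping: blockIdx.x selects a group of 8 consecutive batch images;
+  // (blockIdx.y, threadIdx.x) is split into a channel index m and a spatial
+  // output position tx, and each thread accumulates up to 8 images at (m, tx),
+  // bounds-checked against B.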
+
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 8;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+	
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
+
+    float c0 = 0;
+    float c1 = 0;
+    float c2 = 0;
+    float c3 = 0;
+    float c4 = 0;
+    float c5 = 0;
+    float c6 = 0;
+    float c7 = 0;
+	
+    const float* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
+
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
+
+	c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
+	if(b + 1 < B)
+	  c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
+	if(b + 2 < B)
+	  c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
+	if(b + 3 < B)
+	  c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
+	if(b + 4 < B)
+	  c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
+	if(b + 5 < B)
+	  c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
+	if(b + 6 < B)
+	  c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
+	if(b + 7 < B)
+	  c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
+    
+
+      }
+    }
+
+    y4d(b, m, 0, tx) = c0;	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = c1;
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = c2;
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = c3;
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = c4;
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = c5;
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = c6;
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = c7;
+  }
+	
+  #undef y4d 
+  #undef x4d
+}
+
+
+
+
+__global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
+					const __half* const __restrict__ x,
+					const __half* const __restrict__ w,
+					const int B, const int M,
+					const int H, const int W, const int KH,
+					const int KW, const int H_out, const int W_out,
+					const int H_pad, const int W_pad,
+					const int H_stride, const int W_stride)
+{
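+  // Same thread mapping as depthwise_convNew8, but batch images are packed two
+  // per __half2 lane, so four __hfma2 accumulators (c0..c3) cover up to 8 images.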
+
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 8;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
+	
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - H_pad;
+    const int start_w = (tx % W_out) * W_stride - W_pad;
+
+    __half2 c0 = __half2half2(0);
+    __half2 c1 = __half2half2(0);
+    __half2 c2 = __half2half2(0);
+    __half2 c3 = __half2half2(0);
+    	
+    const __half* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
+      if (start_h + p > -1 && start_h + p < H &&
+	  start_w + q > -1 && start_w + q < W) {
+
+      
+	__half2 t1;
+	__half2 t2;
+	__half2 t3;
+	__half2 t4;
+	if(b + 7 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
+	  t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q));
+	}
+	else if(b + 6 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
+	  t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
+
+	}
+	else if(b + 5 < B){
+	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	    t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
+	}
+	else if(b + 4 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	  t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
+
+	}
+	else if(b + 3 < B){
+	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
+	 }
+	else if(b + 2 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	  t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
+
+	}
+	else if(b + 1 < B){
+	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
+	}
+	else{
+	  t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
+
+	 }
+
+	
+	c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
+	c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
+	c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
+	c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
+
+      }
+    }
+
+    y4d(b, m, 0, tx) = __high2half(c0);	
+    if(b + 1 < B)
+      y4d(b + 1, m, 0, tx) = __low2half(c0);
+    if(b + 2 < B)
+      y4d(b + 2, m, 0, tx) = __high2half(c1);
+    if(b + 3 < B)
+      y4d(b + 3, m, 0, tx) = __low2half(c1);
+    if(b + 4 < B)
+      y4d(b + 4, m, 0, tx) = __high2half(c2);
+    if(b + 5 < B)
+      y4d(b + 5, m, 0, tx) = __low2half(c2);
+    if(b + 6 < B)
+      y4d(b + 6, m, 0, tx) = __high2half(c3);
+    if(b + 7 < B)
+      y4d(b + 7, m, 0, tx) = __low2half(c3);
+  }
+	
+  #undef y4d 
+  #undef x4d
+}
+
+
+
+void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
+			int vertical_pad, int horizontal_pad,
+			int vertical_stride, int horizontal_stride,
+			int conv_mode, int conv_groups){
+
+
+  INFO("*** TensorConvolution \n");
+  profileEvent("Conv");
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+
+  Tensor* output;
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  convertToFP32(input);
+  convertToFP32(filter);
+
+  
+  if (conv_groups > 32) {
+    // TODO: Support other cases;  
+    hostToDeviceCopy(input);
+    hostToDeviceCopy(filter);
+
+    int n, c, h, w; // output dimensions  
+    n = input->dims.dim_sizes[0];
+    c = input->dims.dim_sizes[1];
+    const int KH = filter->dims.dim_sizes[2];
+    const int KW = filter->dims.dim_sizes[3];
+    h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+    w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+
+
+		
+    int blockSize;
+    blockSize = 64;
+		
+    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
+    dim3 block(blockSize);
+    depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data,
+					 (float*)input->gpu_data, (float*)filter->gpu_data,
+					 input->dims.dim_sizes[0], input->dims.dim_sizes[1],
+					 input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+					 KH, KW, h, w, vertical_pad, horizontal_pad,
+					 vertical_stride, horizontal_stride);
+
+  }
+  else {
+
+    cudnnConvolutionDescriptor_t convDesc;
+    cudnnConvolutionFwdAlgo_t convAlgo;
+    cudnnConvolutionMode_t mode;
+    if (conv_mode == 0)
+      mode = CUDNN_CONVOLUTION;
+    else if (conv_mode == 1)
+      mode = CUDNN_CROSS_CORRELATION;
+
+    // FIXIT: Need to be more aware of the implications of alpha and beta
+    float alpha = 1.0f, beta = 0.0f;
+
+    // TODO: Support other cases;  
+    hostToDeviceCopy(input);
+    hostToDeviceCopy(filter);
+
+    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
+
+    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
+
+    // NOTE: Adding support for grouped convolution
+    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
+
+
+    cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
+    // FIXIT: Think if upscaling values need to be configurable?
+    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
+    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
+					       vertical_pad, horizontal_pad, // conv padding
+					       vertical_stride, horizontal_stride, // conv strides
+					       1, 1, // upscaling values
+					       mode, // mode is configurable
+					       computeType)); // defines compute precision
+
+    int n, c, h, w; // output dimensions  
+    // Find dimension of convolution output
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
+						     input->tensor_desc,
+						     filter->filter_desc,
+						     &n, &c, &h, &w));
+
+
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+
+    if (input->data_format == CUDNN_TENSOR_NCHW)
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+				       CUDNN_TENSOR_NCHW, n, c, h, w);
+    else if (input->data_format == CUDNN_TENSOR_NHWC) {
+      DEBUG("* NHWC Format \n");
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
+				       CUDNN_TENSOR_NHWC, n, h, w, c);
+    }
+    else
+      ERROR("Unsupported Tensor Type");
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
+	  output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
+	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+
+    if (convDesc == NULL || input->tensor_desc == NULL ||
+	filter->filter_desc == NULL || output->tensor_desc == NULL)
+      ERROR("NULL descriptor! \n");
+
+
+    // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
+    checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
+						   input->tensor_desc,
+						   filter->filter_desc,
+						   convDesc,
+						   output->tensor_desc,
+						   CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+						   //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+						   0,
+						   &convAlgo));
+
+
+    DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
+	  CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+	  CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
+
+
+    // FIXIT: The algorithm should not be hardcoded - this overrides the algorithm selected above
+    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+
+    size_t workspace_size;
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
+						       input->tensor_desc,
+						       filter->filter_desc,
+						       convDesc,
+						       output->tensor_desc,
+						       convAlgo,
+						       &workspace_size));
+
+    // Allocating memory for the convolution workspace
+    void* workspace;
+    checkCudaErrors(cudaMalloc(&workspace, workspace_size));
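+    // FIXIT: this workspace is never freed (no matching cudaFree), and convDesc is
+    // never destroyed, so repeated calls will leak GPU memory.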
+    DEBUG("workspace size = %d \n", workspace_size);
+
+
+    checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
+				       input->gpu_data, filter->filter_desc, filter->gpu_data,
+				       convDesc, convAlgo, workspace, workspace_size,
+				       &beta, output->tensor_desc, output->gpu_data));
+  }
+
+  cudaDeviceSynchronize();
+  profileEvent("Conv_end", true);
+
+
+  #ifdef ERROR_INJECTION_ENABLED
+
+  if (op_counter >= total_ops) {
+    ERROR("No accuracy flag found \n");
+  }
+
+  int op_acc = op_accuracies[op_counter];
+
+  // Skip errorInjection if explicitly requested
+  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
+    op_acc = 0;
+  }
+
+  void* error_norms = tensorAddError(output, op_acc);
+  add_norms(error_norms, "tensorConv", op_acc);
+  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
+
+  op_counter++;
+
+  #endif   
+
+  return output;
+
+
+}
+
+// FIXME: Need to properly fix the new HALF type conversion
+void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
+			    int vertical_pad, int horizontal_pad,
+			    int vertical_stride, int horizontal_stride,
+			    int conv_mode, int conv_groups){
+
+  INFO("*** TensorHConvolution \n");
+  profileEvent("#Conv");
+
+  Tensor* input = (Tensor*) input_ptr;
+  Tensor* filter = (Tensor*) filter_ptr;
+
+  cudnnConvolutionDescriptor_t convDesc;
+  cudnnConvolutionFwdAlgo_t convAlgo;
+  cudnnConvolutionMode_t mode;
+  
+  if(conv_mode == 0)
+    mode = CUDNN_CONVOLUTION;
+  else
+    mode = CUDNN_CROSS_CORRELATION;
+
+  // FIXIT: Revisit the implications of the alpha and beta scaling factors
+  float alpha = 1.0f, beta = 0.0f;
+  // NOTE: compute in half precision
+  cudnnDataType_t computeType = CUDNN_DATA_HALF;
+
+  // NOTE: Moving inputs to GPU global memory
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+
+  // Float-Half Conversions
+  profileEvent("F2H_start");
+
+  convertToFP16(input);
+  convertToFP16(filter);  
+
+  profileEvent("F2H_end");
+  /******* END OF INPUT DATA CONVERSIONS *******/
+
+  
+  Tensor *output;
+  if(conv_groups > 1){
+    int n = input->dims.dim_sizes[0];
+    int c = input->dims.dim_sizes[1];
+    const int KH = filter->dims.dim_sizes[2];
+    const int KW = filter->dims.dim_sizes[3];
+    int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
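+    // Example: for a 32x32 input with a 3x3 kernel, padding 1, and stride 1,
+    // h = (2*1 + 32 - 3)/1 + 1 = 32, so the spatial dimensions are preserved.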
+    
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+    
+
+    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, 
+				      CUDNN_TENSOR_NCHW, n, c, h, w);
+
+  
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+
+    int blockSize = 128;
+
+    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
+    dim3 block(blockSize);
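+    // As on the FP32 path, grid.x covers the batch in groups of 8 (the *New8 kernels
+    // accumulate 8 images per thread) and grid.y * blockSize covers the c * h * w
+    // output positions of a single image.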
+    depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data,
+					      (__half*) input->gpu_half_data,
+					      (__half*) filter->gpu_half_data,
+					      input->dims.dim_sizes[0], input->dims.dim_sizes[1],
+					      input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+					      KH, KW, h, w,
+					      vertical_pad, horizontal_pad,
+					      vertical_stride, horizontal_stride);
+    cudaDeviceSynchronize();
+
+    
+  }
+  else{    
+    checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
+
+    // FIXME: Current hack to preserve backward compatibility
+    if(conv_groups == 0){
+      conv_groups = 1;
+    }
+  
+    // NOTE: Adding support for grouped convolution
+    checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
+
+  
+    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
+					       vertical_pad, horizontal_pad, // conv padding
+					       vertical_stride, horizontal_stride, // conv strides
+					       1, 1, // upscaling values
+					       mode, // mode is configurable
+					       computeType)); // defines compute precision
+
+    int n, c, h, w; // output dimensions
+    // Find dimension of convolution output
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
+						     input->tensor_half_desc,
+						     filter->filter_half_desc,
+						     &n, &c, &h, &w));
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+
+
+    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type,
+				      CUDNN_TENSOR_NCHW, n, c, h, w);
+
+  
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
+	  output->data_type, output->data_format,
+	  output->dims.dim_sizes[0], output->dims.dim_sizes[1],
+	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+
+    if(convDesc == NULL || input->tensor_half_desc == NULL ||
+       filter->filter_half_desc == NULL || output->tensor_half_desc == NULL)
+      ERROR("NULL descriptor! \n");
+
+
+    // NOTE: The following algo works with TRUE half precision
+    convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+
+  
+    size_t workspace_size;
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
+						       input->tensor_half_desc,
+						       filter->filter_half_desc,
+						       convDesc,
+						       output->tensor_half_desc,
+						       convAlgo,
+						       &workspace_size));
+
+    // Allocating memory for the convolution workspace
+    DEBUG("workspace size = %d \n", workspace_size);
+    void* workspace;
+    checkCudaErrors(cudaMalloc(&workspace, workspace_size));
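+    // FIXIT: as on the FP32 path, workspace and convDesc are not released after the
+    // convolution completes.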
+
+
+
+
+    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
+				       &alpha,
+				       input->tensor_half_desc,
+				       input->gpu_half_data,
+				       filter->filter_half_desc,
+				       filter->gpu_half_data,
+				       convDesc, convAlgo, workspace, workspace_size,
+				       &beta,
+				       output->tensor_half_desc,
+				       output->gpu_half_data));
+
+  }
+  
+  profileEvent("H2F_start");
+
+  convertToFP32_offline(output);
+  
+  profileEvent("H2F_end");
+
+
+  profileEvent("#Conv_end");
+
+  
+  return output;
+
+}
+
+
+} // End of Extern C
+