diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
index 99deb53042e3c30949216af0dfed74100e3a1109..987865a867ac269721711a2b13c472f52128b8ee 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu
@@ -38,6 +38,40 @@ void convToGemm(float * const __restrict__ output,
   }
 }
 
+__global__
+void convToGemmHalf(__half * const __restrict__ output,
+			   const __half * const __restrict__ input,
+			   const int N, const int C,
+			   const int H, const int W,
+			   const int KH, const int KW,
+			   const int V_pad, const int H_pad,
+			   const int H_out, const int W_out,
+			   const int V_stride, const int H_stride){
+
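+  // One thread per cell of the unrolled (im2col) input matrix: the flat
+  // thread id decomposes into (image, input channel, output row, output col).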
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //input channel number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
+            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        } else {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
+        }
+      }
+    }
+  }
+}
+
 //This skips every xth row
 //H_eff is the number of rows calculated exactly
 __global__
@@ -797,28 +831,24 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   convertToFP32(filter);
   //profileEvent("H2F_end");
 
-  long int n, c, h, w; // output dimensions
-  n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  const long int n = input->dims.dim_sizes[0];
+  const long int c = filter->dims.dim_sizes[0]; //number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-
-  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  int rem_row = (h - offset) % row > 0;
-  int h_eff = h - ((h - offset) / row) - rem_row;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  int rem_col = (w - offset) % col > 0;
-  int w_eff = w - ((w - offset) / col) - rem_col;
+  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; //total number of filter elems
 
   Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
 				       CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
 
-  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+  if(row > 1) {
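+    // h_eff counts the output rows computed exactly: every row-th row after offset is skipped (row perforation).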
+    const int rem_row = (h - offset) % row > 0;
+    const int h_eff = h - ((h - offset) / row) - rem_row;
 
-  if(row > 1){
     Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
@@ -868,8 +898,11 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(col > 1){
+  } else if(col > 1) {
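+    // w_eff counts the output columns computed exactly: every col-th column after offset is skipped (column perforation).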
+    const int rem_col = (w - offset) % col > 0;
+    const int w_eff = w - ((w - offset) / col) - rem_col;
+
     Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
@@ -918,8 +951,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(skip_every > 1) {
+  } else if(skip_every > 1) {
     Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
 					     CUDNN_TENSOR_NCHW, n, c, h, w);
 
@@ -1011,8 +1043,6 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   } else {
     new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 					  CUDNN_TENSOR_NCHW, n, c, h, w);
-    // NOTE: Changing output tensor placement from host to device
-    changeTensorPlacement(new_output, DEVICE);
 
     float * convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
@@ -1104,19 +1134,13 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
   profileEvent("F2H_end");
   profileEvent("Conv");
 
-  long int n, c, h, w; // output dimensions
-  n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  const long int n = input->dims.dim_sizes[0];
+  const long int c = filter->dims.dim_sizes[0]; //number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-
-  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  int rem_row = (h - offset) % row > 0;
-  int h_eff = h - ((h - offset) / row) - rem_row;
-
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-  int rem_col = (w - offset) % col > 0;
-  int w_eff = w - ((w - offset) / col) - rem_col;
+  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; //total number of filter elems
 
   INFO("input: %d %d %d %d\n", input->dims.dim_sizes[0], input->dims.dim_sizes[1],
        input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
@@ -1130,6 +1154,10 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
   changeTensorPlacement(new_output, DEVICE);
 
   if(row > 1){
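+    // Same row-perforation bookkeeping as in tensorConvApprox above.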
+    const int rem_row = (h - offset) % row > 0;
+    const int h_eff = h - ((h - offset) / row) - rem_row;
+
     Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
 						  CUDNN_TENSOR_NCHW,
 						  n, c, h_eff, w);
@@ -1137,9 +1165,6 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
 
-    //total number of filter elem
-    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
     __half * convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
@@ -1179,17 +1204,17 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
 
     freeTensor(output_half);
     cudaFree(convData);
-}
-  else if(col > 1){
+  } else if(col > 1) {
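+    // Same column-perforation bookkeeping as in tensorConvApprox above.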
+    const int rem_col = (w - offset) % col > 0;
+    const int w_eff = w - ((w - offset) / col) - rem_col;
+
     Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
 						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
-    // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
-    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
-
+
     __half * convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
@@ -1228,12 +1253,10 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
 
     freeTensor(output_half);
     cudaFree(convData);
-  } else{
+  } else if(skip_every > 1) {
     Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
 					     CUDNN_TENSOR_NCHW, n, c, h, w);
-
-    //total number of filter elem
-    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
     //reduced number after skipping
     long int reduced_filter_elem;
     if(offset != skip_every) {
@@ -1319,6 +1342,44 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
     cudaFree(reducedFilter);
     freeTensor(output);
+  } else {
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+						  CUDNN_TENSOR_NCHW,
+						  n, c, h, w);
+
+    __half * convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
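+    // Launch one thread per (image, input channel, output pixel) cell of the im2col matrix.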
+    convToGemmHalf<<<gridSize, blockSize>>>(convData,
+						   (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1],
+						   input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3],
+						   KH, KW, vertical_pad,
+						   horizontal_pad, h, w, vertical_stride,
+						   horizontal_stride);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
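+    // Column-major GEMM: the [n*h*w x num_filter_elem] im2col matrix times the
+    // [num_filter_elem x c] filter matrix yields the [n*h*w x c] output, all in fp16.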
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h * w, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h * w,
+				 (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half *) new_output->gpu_half_data, CUDA_R_16F, n * h * w,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    cudaFree(convData);
   }
 
   profileEvent("H2F_start");