diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index a81ffe296233178126555bbb53babdcd4192a7bf..0a741316682324ca6270aab2066ebc9f0b48bcdf 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -2,6 +2,36 @@
 #include "tensor_utils.cu"
 
 
+//produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem cols
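+//Filter elements whose flat index is congruent to skip_every-1 (mod skip_every) are
+//dropped; a retained element k is packed into column k - k/skip_every, so
+//reduced_filter_elem is expected to be the number of retained filter elements.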
+__global__ void convToGemmApproxHalf(__half * const __restrict__ output,
+				     const __half * const __restrict input, const int N, const int C,
+				     const int H, const int W, const int KH, const int KW, const int V_pad,
+				     const int H_pad, const int H_out, const int W_out, const int V_stride,
+				     const int H_stride, const int reduced_filter_elem,
+				     const int skip_every) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //input channel number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
+	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
+	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	  else
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
+	}
+      }
+    }
+  }
+}
+
 
 //This skips every xth row
 //H_eff is the number of rows calculated exactly
@@ -350,3 +380,477 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   
   return new_output;
 }
+
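+//Half-precision im2col that skips every xth output row; H_eff is the number of
+//rows computed exactly (the skipped rows are reconstructed later by
+//approxInterpolateRowHalf). Produces one col-major matrix with N*H_eff*W_out
+//rows and one column per filter element.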
+__global__
+void convToGemmPerfRowHalf(__half * const __restrict__ output,
+			   const __half * const __restrict input, const int N, const int C,
+			   const int H, const int W, const int KH, const int KW, const int V_pad,
+			   const int H_pad, const int H_out, const int W_out, const int V_stride,
+			   const int H_stride, const int x, const int start, const int H_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_eff * W_out); //output image number
+  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //input channel number
+  const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  int past_start = (h % (x - 1) >= (x - 1 - start));
+  const int inH = (h / (x - 1) * x + h % (x-1) +
+		   past_start) * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0;
+
+      }
+    }
+  }
+
+}
+
+
+//For use in tensorConvPerfCuda
+//Interpolates every xth row starting from x - 1 - start
+//N is total number of elements in final output array
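+//A skipped row is reconstructed as the average of the computed rows directly
+//above and below it; the first and last output rows are copied as-is.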
+__global__
+void approxInterpolateRowHalf(int N, int old_h, int b, int c, int h, int w,
+			      __half *old_data, __half *new_data, int x, int start){
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < N; i += stride){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n = i / (c * h * w);
+    int past_start = ((row % x) >= (x - 1 - start));
+
+    if(row == h-1)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
+    else if (row == 0)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+    else if(row % x == x - 1 - start){
+      int past_startO = ((row - 1) % x) > (x - 1 - start);
+      int oldIdx1 = ch * (b * old_h * w) + n * (old_h * w) +
+	((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col;
+
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	__hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1 * w]), 2);
+    }
+    else
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * old_h * w) + n * (old_h * w) +
+		 ((x-1) * (row / x) + row % x - past_start )  * (w) + col];
+
+
+  }
+
+}
+
+
+//This skips every xth col
+//W_eff is the number of cols calculated exactly
+__global__
+void convToGemmPerfColHalf(__half * const __restrict__ output,
+			   const __half * const __restrict input, const int N, const int C,
+			   const int H, const int W, const int KH, const int KW, const int V_pad,
+			   const int H_pad, const int H_out, const int W_out, const int V_stride,
+			   const int H_stride, const int x, const int start, const int W_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_eff); //output image number
+  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //input channel number
+  const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
+  const int w = tx % W_eff; //output width index (col number)
+  int past_start = (w % (x - 1)) >= (x - 1 - start);
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = (w / (x - 1) * x + w % (x-1) +
+		   past_start) * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = 0;
+
+      }
+    }
+  }
+
+}
+
+
+//For use in tensorConvPerfCuda
+//Interpolates every xth col starting from x - 1 - start
+//N is total number of elements in final output array
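+//A skipped col is reconstructed as the average of the computed cols to its
+//left and right; the first and last output cols are copied as-is.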
+__global__
+void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
+			      __half *old_data, __half *new_data, int x, int start){
+
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < N; i += stride){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n = i / (c * h * w);
+    int past_start = ((col % x) >= (x - 1 - start));
+
+    if(col == w-1)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
+    else if (col == 0)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
+    else if(col % x == x - 1 - start){
+      int past_startO = ((col - 1) % x) > (x - 1 - start);
+      int oldIdx1 = ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
+	((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
+
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	__hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2);
+    }
+    else
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
+		 ((x-1) * (col / x) + col % x - past_start)];
+
+  } 
+}
+
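+//Reorders the GEMM result from [c][n][h][w] (one contiguous block per filter)
+//into standard NCHW layout.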
+__global__
+void switchMatrix(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
+
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if(i < N){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
+
+    new_data[((n_new * c + ch) * h + row ) * w + col] =
+      old_data[((ch * n + n_new) * h + row ) * w + col];
+  }
+
+}
+
+
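+//Tiles the original filter newFilterSize/oldFilterSize times so a batched GEMM
+//can point each batch entry at its own copy of the filter (see createBatches below).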
+__global__
+void createNewFilter(__half *new_filter, __half *old_filter,
+		     int newFilterSize, int oldFilterSize){
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < newFilterSize; i += stride){
+    new_filter[i] = old_filter[i % oldFilterSize];
+  }
+}
+
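+//Fills the per-batch pointer arrays for a batched GEMM: entry i of matA/matB/matC
+//points aStride/bStride/cStride elements into convData/newFilter/output respectively.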
+__global__
+void createBatches(int n, const __half * matA[], const __half * matB[], __half * matC[],
+		   __half * convData, __half * newFilter, __half * output,
+		   int aStride, int bStride, int cStride){
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < n; i += stride){
+    matA[i] = &convData[i * aStride];
+    matB[i] = &newFilter[i * bStride];
+    matC[i] = &output[i * cStride];
+  }
+}
+
+//produces one col-major matrix with N*H_out*W_out rows and reduced_filter_elem cols (no filter elements are skipped)
+__global__ void convToGemmApproxHalfN(__half * const __restrict__ output,
+				     const __half * const __restrict input, const int N, const int C,
+				     const int H, const int W, const int KH, const int KW, const int V_pad,
+				     const int H_pad, const int H_out, const int W_out, const int V_stride,
+				     const int H_stride, const int reduced_filter_elem,
+				     const int skip_every) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //input channel number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+	const int output_col = filter_elem_num; //output column (no skipping in this variant)
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((output_col * N + n) * H_out + h) * W_out + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+
+      }
+    }
+  }
+}
+
+//start has to be less than row (when row > 1) or less than col (when col > 1)
+//row and col have to be >= 0
+//row = col = 1 means no perforation
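+//
+//Illustrative call (argument values below are hypothetical):
+//  void* out = tensorConvPerfCudaHalf(input, filter,
+//                                     /*pads*/ 1, 1, /*strides*/ 1, 1,
+//                                     /*conv_mode*/ 1, /*conv_groups*/ 1,
+//                                     /*row*/ 2, /*col*/ 1, /*start*/ 0);
+//computes every other output row exactly and interpolates the skipped rows.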
+void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
+			     int vertical_pad, int horizontal_pad, int vertical_stride,
+			     int horizontal_stride, int conv_mode, int conv_groups,
+			     int row, int col, int start){
+
+  INFO("*** TensorConvolution half perforation \n");
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+
+  profileEvent("F2H_start");
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  convertToFP16(input);
+  convertToFP16(filter);
+
+  /******* END OF INPUT DATA CONVERSIONS */
+  profileEvent("F2H_end");
+
+  profileEvent("Conv");
+
+  Tensor* output_half;
+  int n, c, h, w; // output dimensions
+  n = input->dims.dim_sizes[0];
+  c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+
+  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  int h_eff = h - h / row;
+  if(h % row > row - 1 - start)
+    h_eff = h_eff - 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  int w_eff = w - w / col;
+  if(w % col > col - 1 - start)
+    w_eff = w_eff - 1;
+
+
+  Tensor *new_output;
+  if(row > 1){
+    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, CUDNN_TENSOR_NCHW,
+					  n, c, h_eff, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
+						   horizontal_pad, h, w,
+						   vertical_stride, horizontal_stride, row, start, h_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
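+    // GEMM: A = convData, (n*h_eff*w) x num_filter_elem col-major;
+    // B = filter, num_filter_elem x c; C = output_half, (n*h_eff*w) x c,
+    // i.e. one contiguous [n][h_eff][w] block per output filter.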
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h_eff * w, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h_eff * w,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+    
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w,
+						(__half *)output_half->gpu_half_data,
+						(__half *)new_output->gpu_half_data,
+						row, start);
+    cudaDeviceSynchronize();
+
+    cudaFree(output_half);
+    cudaFree(convData);
+  }
+  else if(col > 1){
+    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
+						   horizontal_pad, h, w,
+						   vertical_stride, horizontal_stride, col, start, w_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+    
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h * w_eff, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h * w_eff,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+    
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w,
+						(__half *)output_half->gpu_half_data,
+						(__half *)new_output->gpu_half_data,
+						col, start);
+    
+    cudaDeviceSynchronize();
+
+    cudaFree(output_half);
+    cudaFree(convData);
+
+  }
+  else{
+    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					  CUDNN_TENSOR_NCHW, c, n, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApproxHalfN<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						  input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						  input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w,
+						  vertical_stride, horizontal_stride, num_filter_elem, c * h * w);
+    checkCudaErrors(cudaDeviceSynchronize());
+    //Do the matrix multiplication. Want to multiply convData by filter->gpu_half_data[f * chan * KH * KW]
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h * w, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h * w,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+
+
+    // profileEvent("gemm_end", true);
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					  CUDNN_TENSOR_NCHW, n, c, h, w);
+    changeTensorPlacement(new_output, DEVICE);
+
+    
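+    // The GEMM result is laid out [c][n][h][w]; switchMatrix rearranges it into
+    // the NCHW layout expected by new_output.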
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
+				    (__half *)output_half->gpu_half_data,
+				    (__half *)new_output->gpu_half_data);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    
+    cudaFree(convData);
+    cudaFree(output_half);
+  }
+
+  profileEvent("Conv_end", true);
+
+  profileEvent("H2F_start");
+
+  convertToFP32(new_output);
+
+  profileEvent("H2F_end");
+
+
+  #ifdef ERROR_INJECTION_ENABLED
+  if (op_counter >= total_ops) {
+    ERROR("No accuracy flag found \n");
+  }
+  int op_acc = op_accuracies[op_counter];
+  // Skip errorInjection if explicitly requested
+  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
+    op_acc = 0;
+  }
+  void* error_norms = tensorAddError(new_output, op_acc);
+  add_norms(error_norms, "tensorConv", op_acc);
+  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
+  op_counter++;
+  #endif
+  return new_output;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc5cddf8a2121a937dbe8cd4582fe1022fd99f48
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
@@ -0,0 +1,238 @@
+#ifndef APPROXHPVM_IMG_RUNTIME_UTILS
+#define APPROXHPVM_IMG_RUNTIME_UTILS
+
+#include "configuration.h"
+#include "hpvm-rt-controller.h"
+
+#include "img_tensor_runtime.h"
+
+
+// Utilities header for ApproxHPVM image runtime API (wrapper runtime API)
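+// Each handler receives the (approximation, parameter) tuples chosen for one
+// tensor operation, dispatches on the selected approximation, runs the
+// corresponding tensor-runtime call under the profiler, and records its
+// time/energy with the runtime controller before returning the result tensor.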
+
+void* handleTensorFftApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorFft(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorFft", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorFft", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorReduceApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorReduce(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second); // and here
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorReductionSamplingReduce(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorReductionSamplingReduce",
+                                              pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorReductionSamplingReduce",
+                                                pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorProjectiveTApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorProjectiveT(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorProjectiveT", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorProjectiveT", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorMap1ApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorMap1(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorMap1", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorMap1", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorMap2ApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorMap2(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorMap2", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorMap2", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorMap3ApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorMap3(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorMap3", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorMap3", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index b482cef5377e0f879b43f06a7ebbfbe01b39be09..14dc8f20f2111e85e82630cdbcc0c695a39c5ecd 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -72,7 +72,9 @@ public:
     FP32,
     FP16,
     PERFORATION,
-//    INPUT_SAMPLING,
+    INPUT_SAMPLING,
+    REDUCTION_SAMPLING,
+//  ADDITIONAL_APPROXIMATION_METHOD
     APPROX_END
   };
 
@@ -91,6 +93,15 @@ public:
     POOL_MEAN,
     POOL_MIN,
     SOFTMAX,
+    FFT,
+    REDUCE,
+    PROJECTIVE_T,
+    MAP1,
+    MAP2,
+    MAP3,
+//    STENCIL,
+//    COSINE_T,
+//  ADDITIONAL_TENSOR_OPERATION
     TENSOR_OP_END
   };
 
@@ -269,6 +280,24 @@ void GPUNodeConfiguration::print() {
       case TENSOR_OP::SOFTMAX :
         DEBUG("softmax");
         break;
+      case TENSOR_OP::FFT :
+        DEBUG("fft");
+        break;
+      case TENSOR_OP::REDUCE :
+        DEBUG("reduce");
+        break;
+      case TENSOR_OP::PROJECTIVE_T :
+        DEBUG("projectiveT");
+        break;
+      case TENSOR_OP::MAP1 :
+        DEBUG("map1");
+        break;
+      case TENSOR_OP::MAP2 :
+        DEBUG("map2");
+        break;
+      case TENSOR_OP::MAP3 :
+        DEBUG("map3");
+        break;
       default :
         ERROR("Unknown tensor operation.");
         break;
@@ -288,6 +317,12 @@ void GPUNodeConfiguration::print() {
         case APPROX::PERFORATION :
           DEBUG("perf");
           break;
+        case APPROX::INPUT_SAMPLING :
+          DEBUG("input_samp");
+          break;
+        case APPROX::REDUCTION_SAMPLING :
+          DEBUG("red_samp");
+          break;
         default:
           ERROR("Unknown approximation option");
           break;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
index 911f42b955a72cb756aadc1fc78231187ef3394e..21c6df7f1749e891dba257bbb1933c3beefb8c4f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
@@ -735,6 +735,30 @@ void RuntimeController::readConfigurationFile(const char *str) {
 	  DEBUG ("Found softmax operation\n");
 	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::SOFTMAX);
 	  idx++;
+	} else if (tokens[idx] == "fft") {
+	  DEBUG ("Found fft operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::FFT);
+	  idx++;
+	} else if (tokens[idx] == "reduce") {
+	  DEBUG ("Found reduce operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::REDUCE);
+	  idx++;
+	} else if (tokens[idx] == "projectiveT") {
+	  DEBUG ("Found projectiveT operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T);
+	  idx++;
+	} else if (tokens[idx] == "map1") {
+	  DEBUG ("Found map1 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP1);
+	  idx++;
+	} else if (tokens[idx] == "map2") {
+	  DEBUG ("Found map2 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP2);
+	  idx++;
+	} else if (tokens[idx] == "map3") {
+	  DEBUG ("Found map3 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP3);
+	  idx++;
 	} else /*Not a new operation. This means an approximation option*/
 	  if (tokens[idx] == "fp32") {
 	    DEBUG("Found fp32 option\n");
@@ -756,6 +780,18 @@ void RuntimeController::readConfigurationFile(const char *str) {
 	    DEBUG("perf parameter: %d\n", perf);
         NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::PERFORATION, perf);
           idx += 2;
+        } else if (tokens[idx] == "input_samp") {
+	    DEBUG("Found input_samp option\n");
+        int input_samp = std::stoi(tokens[idx+1]);
+	    DEBUG("input_samp parameter: %d\n", input_samp);
+        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::INPUT_SAMPLING, input_samp);
+          idx += 2;
+        } else if (tokens[idx] == "red_samp") {
+	    DEBUG("Found red_samp option\n");
+        int red_samp = std::stoi(tokens[idx+1]);
+	    DEBUG("red_samp parameter: %d\n", red_samp);
+        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp);
+          idx += 2;
         }
 	// TODO: other approximation options handled here
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c098719e52e31fcd06b6425964c8e1d48a15210
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
@@ -0,0 +1,26 @@
+#ifndef IMG_TENSOR_RUNTIME_H
+#define IMG_TENSOR_RUNTIME_H
+
+// ***                        Runtime declaration                        *** //
+void* tensorFft(void* input);
+void* tensorReduce(void* input);
+void* tensorReductionSamplingReduce(void* input);
+void* tensorProjectiveT(void* input);
+void* tensorMap1(void* input);
+void* tensorMap2(void* input);
+void* tensorMap3(void* input);
+
+// ***                      Wrapper API declaration                      *** //
+void* wrapper_tensorFft(const char* hpvm_node_id, void* input);
+void* wrapper_tensorReduce(const char* hpvm_node_id, void* input);
+void* wrapper_tensorProjectiveT(const char* hpvm_node_id, void* input);
+void* wrapper_tensorMap1(const char* hpvm_node_id, void* input);
+void* wrapper_tensorMap2(const char* hpvm_node_id, void* input);
+void* wrapper_tensorMap3(const char* hpvm_node_id, void* input);
+
+// Tentative
+void* wrapper_tensorStencil(const char* hpvm_node_id, void* input);
+void* wrapper_tensorCosineT(const char* hpvm_node_id, void* input);
+
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
index 6041bb4de989be20acef973ac7f632b838a097a4..06c492c9e8fb45e0a51de153e8cf434a79a50e23 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
@@ -10,6 +10,7 @@
 #include "tensor.h"
 #include "rt-controller-api.h"
 
+#include "img_tensor_runtime.h"
 
 #ifndef CUDNN_HEADER
 #define CUDNN_HEADER
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0460e490fd2b188b85f53cf9b109f09ac3d6b83a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
@@ -0,0 +1,137 @@
+#include "../include/debug.h"
+#include "../include/img_tensor_runtime.h"
+#include "../include/approxhpvm_img_runtime_utils.h" 
+
+// ***                       Runtime implementation                      *** //
+void* tensorFft(void* input) {
+
+}
+
+void* tensorReduce(void* input) {
+
+}
+
+void* tensorReductionSamplingReduce(void* input) {
+
+}
+
+void* tensorProjectiveT(void* input) {
+
+}
+
+void* tensorMap1(void* input) {
+
+}
+
+void* tensorMap2(void* input) {
+
+}
+
+void* tensorMap3(void* input) {
+
+}
+
+
+// ***                     Wrapper API implementation                    *** //
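+// Each wrapper looks up the GPUNodeConfiguration for its HPVM node, checks that
+// the configuration contains exactly one tensor operation of the expected kind,
+// and forwards the node's approximation tuples to the matching handler in
+// approxhpvm_img_runtime_utils.h.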
+
+void* wrapper_tensorFft(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a fft operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::FFT &&
+         "Invalid configuration generated for tensor fft wrapper operation");
+  return handleTensorFftApproximationTuples(ApproxChoices[0].second,
+                                            input);
+}
+
+void* wrapper_tensorReduce(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a reduce operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::REDUCE &&
+         "Invalid configuration generated for tensor reduce wrapper operation");
+  return handleTensorReduceApproximationTuples(ApproxChoices[0].second,
+                                            input);
+}
+
+void* wrapper_tensorProjectiveT(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a projectiveT operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T &&
+         "Invalid configuration generated for tensor projectiveT wrapper operation");
+  return handleTensorProjectiveTApproximationTuples(ApproxChoices[0].second,
+                                            input);
+}
+
+void* wrapper_tensorMap1(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a map1 operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP1 &&
+         "Invalid configuration generated for tensor map1 wrapper operation");
+  return handleTensorMap1ApproximationTuples(ApproxChoices[0].second,
+                                            input);
+}
+
+void* wrapper_tensorMap2(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a map2 operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP2 &&
+         "Invalid configuration generated for tensor map2 wrapper operation");
+  return handleTensorMap2ApproximationTuples(ApproxChoices[0].second,
+                                            input);
+}
+
+void* wrapper_tensorMap3(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a map3 operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP3 &&
+         "Invalid configuration generated for tensor map3 wrapper operation");
+  return handleTensorMap3ApproximationTuples(ApproxChoices[0].second,
+                                            input);
+}
+
+// Tentative
+void* wrapper_tensorStencil(const char* hpvm_node_id, void* input) {
+  ERROR("Stencil operation currently unsupported.\n");
+  abort();
+}
+
+void* wrapper_tensorCosineT(const char* hpvm_node_id, void* input) {
+  ERROR("CosineT operation currently unsupported.\n");
+  abort();
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 9e58f36a402844c33a1cb665ae4113e6e6a8534f..cc2a5d6ba91ff3c7bae29980340142656e2ef62b 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -49,6 +49,8 @@
 
 #include "../include/approx_simulation.h"
 
+// Image tensor runtime implementation
+#include "img_tensor_runtime.cu"
 
 //** Potential Improvements:
 //   1) Add support for dataypes beyond floats and half