diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index d3a037ea152c279c1528d8a4ec24d32f28df64f9..89148bd5e05e350c29ac49c2eed0a5b93696d38a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -34,9 +34,10 @@
 #include <omp.h>
 
 // Tensor runtime header files
-//#include "tensor_cpu.h"
 #include "tensor.h"
+#include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
+#include "approx_api.h"
 
 void llvm_hpvm_initTensorRtCPU() {
   // NOTE: Do Nothing
@@ -50,78 +52,6 @@ void hpvm_request_tensorCPU(void *tensor, int destination) {
   // NOTE: Do Nothing
 }
 
-std::vector<void *> PtrVect;
-
-void freeBatchMemory() {
-  for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
-    free(*it);
-  }
-  PtrVect.erase(PtrVect.begin(), PtrVect.end());
-}
-
-int getTypeSizeCPU(int data_type) __attribute__((always_inline));
-inline int getTypeSizeCPU(int data_type) {
-  return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
-}
-
-void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems)
-    __attribute__((always_inline));
-inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type,
-                              size_t num_elems) {
-  int type_size = getTypeSizeCPU(data_type);
-  size_t size_in_bytes = type_size * num_elems;
-  tensor->size_in_bytes = size_in_bytes;
-}
-
-void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
-                    bool freeMemory = true) __attribute__((always_inline));
-inline void allocateMemCPU(struct Tensor *tensor, int data_type,
-                           size_t num_elems, bool freeMemory) {
-  setSizeInBytesCPU(tensor, data_type, num_elems);
-  tensor->data_type = data_type;
-  tensor->num_elems = num_elems;
-  tensor->host_data =
-      (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
-  if (freeMemory)
-    PtrVect.push_back(tensor->host_data);
-}
-
-void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes)
-    __attribute__((always_inline));
-inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr,
-                              size_t size_in_bytes) {
-  Tensor *tensor = (Tensor *)tensor_ptr;
-  if (tensor->size_in_bytes != size_in_bytes) {
-    printf("The destination and source sizes don't match");
-  }
-  memcpy(tensor->host_data, data_ptr,
-         size_in_bytes); // Is this efficient enough?
-}
-
-void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
-                        size_t dim2_size, size_t dim3_size, size_t dim4_size,
-                        bool freeMemory = true) __attribute__((always_inline));
-inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
-                               size_t dim2_size, size_t dim3_size,
-                               size_t dim4_size, bool freeMemory) {
-  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
-  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  if (freeMemory)
-    PtrVect.push_back(tensor);
-  allocateMemCPU(tensor, data_type, num_elems, freeMemory);
-
-  // Setting the tensor dimensions
-  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
-  dim_sizes[0] = dim1_size;
-  dim_sizes[1] = dim2_size;
-  dim_sizes[2] = dim3_size;
-  dim_sizes[3] = dim4_size;
-  tensor->dims.dim_sizes = dim_sizes;
-  tensor->dims.num_dims = 4;
-  tensor->data_placement = HOST;
-  return tensor;
-}
-
 void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
                                   int vertical_pad, int horizontal_pad,
                                   int vertical_stride, int horizontal_stride,
@@ -146,7 +76,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
   int num_filter_elem = kernel_height * kernel_width * channels;
   int output_size = output_width * output_height;
   printf("--CREATE 4D TENSOR\n");
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
-                                               output_height, output_width);
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
+                                            output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
   printf("CREATED 4D TENSOR\n");
@@ -235,7 +165,7 @@ void *tensorRegularFilterSamplingConvolutionCPU(
       num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
   const int output_size = output_width * output_height;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
-                                               output_height, output_width);
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
+                                            output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
 
@@ -359,7 +289,7 @@ void *tensorIrregularFilterSamplingConvolutionCPU(
       num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
   const int output_size = output_width * output_height;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
-                                               output_height, output_width);
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
+                                            output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
 
@@ -478,7 +408,7 @@ void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
   int num_filter_elem = kernel_height * kernel_width * channels;
   int full_output_size = full_output_height * full_output_width;
 
-  Tensor *full_output = (Tensor *)create4DTensorCPU(
+  Tensor *full_output = (Tensor *)create4DTensor(
       0, 0, batch_size, num_filters, full_output_height, full_output_width);
   float *__restrict__ full_output_data = (float *)full_output->host_data;
 
@@ -619,7 +549,7 @@ void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
   int num_filter_elem = kernel_height * kernel_width * channels;
   int full_output_size = full_output_height * full_output_width;
 
-  Tensor *full_output = (Tensor *)create4DTensorCPU(
+  Tensor *full_output = (Tensor *)create4DTensor(
       0, 0, batch_size, num_filters, full_output_height, full_output_width);
   float *__restrict__ full_output_data = (float *)full_output->host_data;
 
@@ -785,7 +715,6 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
   const int channels = input->dims.dim_sizes[1];
   const int image_height = input->dims.dim_sizes[2];
   const int image_width = input->dims.dim_sizes[3];
-  const int num_filters = filter->dims.dim_sizes[0];
   const int kernel_height = filter->dims.dim_sizes[2];
   const int kernel_width = filter->dims.dim_sizes[3];
   const int output_height =
@@ -797,8 +726,8 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
   const int num_filter_elem = filter_dim * channels;
   const int output_size = output_width * output_height;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(
-      0, 0, batch_size, num_filters, channels, output_height * output_width);
+  Tensor *output = (Tensor *)create4DTensor(
+      0, 0, batch_size, channels, output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
 
   const long int conv_data_size = sizeof(float) * num_filter_elem *
@@ -836,22 +765,18 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
         }
       }
     }
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
+    for (int m = 0; m < output_size; ++m) {
         for (int ch = 0; ch < channels; ch++) {
           float sum = 0;
 #pragma omp simd reduction(+ : sum)
           for (int k = 0; k < filter_dim; ++k) {
             int input_index = k + ch * filter_dim + num_filter_elem * m +
                               b * num_filter_elem * output_size;
-            sum += host_data[input_index] *
-                   host_filter[p * num_filter_elem + ch * filter_dim + k];
+            sum += host_data[input_index] * host_filter[ch * filter_dim + k];
           }
-          output_data[b * (output_size * num_filters * channels) +
-                      p * output_size * channels + ch * output_size + m] = sum;
+          output_data[b * (output_size * channels) + ch * output_size + m] = sum;
         }
-      }
-    }
+    }
   }
 
   free(host_data);
@@ -928,7 +853,7 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
   int x_radius = (window_width - 1) / 2;
   int y_radius = (window_height - 1) / 2;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels,
-                                               output_height, output_width);
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
+                                            output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
 
@@ -1026,7 +951,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
   int m = lhs->dims.dim_sizes[0];
   int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
+  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
 
   float *__restrict__ lhs_arr = (float *)lhs->host_data;
   float *__restrict__ rhs_arr = (float *)rhs->host_data;