diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
index 138ddd0887b57ce583b8f5cfeaba19ad7d20eb4e..330d97600e6cdcf44bb93dbf28625cca8051c3ec 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
@@ -3,48 +3,44 @@
 #ifndef APPROXHPVM_RUNTIME_UTILS
 #define APPROXHPVM_RUNTIME_UTILS
 
+
+#include "tensor_runtime.h"
+#include "tensor_cpu_runtime.h"
 #include "configuration.h"
 #include "hpvm-rt-controller.h"
-#include "tensor_runtime.h"
 
 #include "approx_knob_utils.h"
 
 // Utilities header for ApproxHPVM runtime API (wrapper runtime API)
 
-void *handleTensorAddApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input, void *bias) {
+//----------------------------------------------------------------------------//
+//---                      CPU Approximation handling                      ---//
+//----------------------------------------------------------------------------//
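+// Each handler below follows the same pattern: inspect the (APPROX, param)
+// tuples selected for this node, dispatch on the approximation variant, and
+// wrap the tensor kernel call between RC->resume_profiler() and
+// RC->pause_profiler(), recording the measured time and energy with the
+// runtime controller RC.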
 
-  if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+void* handleTensorAddApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input, void* bias) {
+
+  if (approxTuples.size() == 1) {
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorAdd(input, bias);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfAdd(input, bias);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorAddCPU(input, bias);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -57,42 +53,32 @@ void *handleTensorAddApproximationTuples(
   return NULL;
 }
 
-void *handleTensorMulApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *lhs, void *rhs) {
+void* handleTensorMulApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* lhs, void* rhs) {
 
   if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorGemmGPU(lhs, rhs);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfGemmGPU(lhs, rhs);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorGemmCPU(lhs, rhs);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
     abort();
@@ -103,90 +89,79 @@ void *handleTensorMulApproximationTuples(
   return NULL;
 }
 
-void *handleTensorConvApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input, void *filter, int conv_pad_h, int conv_pad_w,
-    int conv_stride_h, int conv_stride_w) {
+void* handleTensorConvApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input, void* filter,
+  int conv_pad_h, int conv_pad_w,
+  int conv_stride_h, int conv_stride_w) {
 
   if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w,
-                               conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
-
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out =
-          tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
-                                conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
-
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf",
-                                             pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::PERFORATION:
-    case GPUNodeConfiguration::APPROX::PERFORATION_HP: {
-      PerfParams params = perfParamSet->getPerfParams(param);
-      // PerfParams params = PerfParamSet().getPerfParams(param);
-      INFO("perforation param = %i\n", param);
-      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-           params.row, params.col, params.skip_offset);
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorConvApproxHalf2(
-          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
-          1, 1, params.row, params.col, 1, params.skip_offset);
-
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)",
-                                           pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)",
-                                             pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING:
-    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: {
-      SampParams params = sampParamSet->getSampParams(param);
-      // SampParams params = SampParamSet().getSampParams(param);
-      INFO("sampling param = %i\n", param);
-      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
-           params.skip_offset);
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
-                                    conv_stride_h, conv_stride_w, 1, 1, 1, 1,
-                                    params.skip_rate, params.skip_offset);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)",
-                                           pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)",
-                                             pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorConvApproxCPU(input, filter,
+                                    conv_pad_h, conv_pad_w,
+                                    conv_stride_h, conv_stride_w,
+                                    1, 1,
+                                    1, 1, 1, 1);
+
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
+        return t_out;
+        }
+      case CPUNodeConfiguration::APPROX::PERFORATION :
+        {
+          PerfParams params = perfParamSet->getPerfParams(param);
+          INFO("perforation param = %i\n", param);
+          INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
+                params.row, params.col, params.skip_offset);
+          void* t_out;
+          RC->resume_profiler();
+          t_out = tensorConvApproxCPU(input, filter,
+                                      conv_pad_h, conv_pad_w,
+                                      conv_stride_h, conv_stride_w,
+                                      1, 1,
+                                      params.row, params.col, 1, params.skip_offset);
+
+          RC->pause_profiler();
+          std::pair<double, double> pinfo = RC->get_time_energy();
+          RC->reset_profiler();
+          RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", pinfo.first);
+          RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", pinfo.second);
+          return t_out;
+        }
+      case CPUNodeConfiguration::APPROX::INPUT_SAMPLING :
+        {
+          SampParams params = sampParamSet->getSampParams(param);
+          INFO("sampling param = %i\n", param);
+          INFO("params.skip_rate = %i, params.skip_offset = %i\n",
+                params.skip_rate, params.skip_offset);
+          void* t_out;
+          RC->resume_profiler();
+          t_out = tensorConvApproxCPU(input, filter,
+                                      conv_pad_h, conv_pad_w,
+                                      conv_stride_h, conv_stride_w,
+                                      1, 1,
+                                      1, 1,
+                                      params.skip_rate, params.skip_offset);
+          RC->pause_profiler();
+          std::pair<double, double> pinfo = RC->get_time_energy();
+          RC->reset_profiler();
+          RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", pinfo.first);
+          RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", pinfo.second);
+          return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -199,49 +174,75 @@ void *handleTensorConvApproximationTuples(
   return NULL;
 }
 
-void *handleTensorGroupConvApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input, void *filter, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int conv_groups) {
+void* handleTensorGroupConvApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input, void* filter,
+  int vertical_pad, int horizontal_pad,
+  int vertical_stride, int horizontal_stride,
+  int conv_mode, int conv_groups) {
 
   if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorConvCutlass(input, filter, vertical_pad, horizontal_pad,
-                                vertical_stride, horizontal_stride, conv_mode,
-                                conv_groups);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfConvCutlass(input, filter, vertical_pad, horizontal_pad,
-                                    vertical_stride, horizontal_stride,
-                                    conv_mode, conv_groups);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass",
-                                           pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass",
-                                             pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorConvCutlassCPU(input, filter,
+                                     vertical_pad, horizontal_pad,
+                                     vertical_stride, horizontal_stride,
+                                     conv_mode, conv_groups);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorBatchNormApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input_ptr, void* gamma_ptr, void* beta_ptr,
+  void* mean_ptr, void* variance_ptr, double epsilon) {
+
+  if (approxTuples.size() == 1) {
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr,
+                                  mean_ptr, variance_ptr, epsilon);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
@@ -253,44 +254,211 @@ void *handleTensorGroupConvApproximationTuples(
   return NULL;
 }
 
-void *handleTensorBatchNormApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr,
-    void *variance_ptr, double epsilon) {
+void* handleTensorReluApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
 
   if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
-                              variance_ptr, epsilon);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
-      return t_out;
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorReluCPU(input);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
     }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
-                                  variance_ptr, epsilon);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm",
-                                             pinfo.second);
-      return t_out;
+  return NULL;
+}
+
+void* handleTensorClippedReluApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input, float min, float max) {
+
+  if (approxTuples.size() == 1) {
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorRelu2CPU(input, min, max);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorTanhApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+
+  if (approxTuples.size() == 1) {
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorTanhCPU(input);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
     }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
+  return NULL;
+}
+
+void* handleTensorPoolingApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input_ptr, int poolFunction,
+  int window_height, int window_width,
+  int vertical_pad, int horizontal_pad,
+  int vertical_stride, int horizontal_stride) {
+
+  if (approxTuples.size() == 1) {
+    enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case CPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorPoolingCPU(input_ptr,
+                                 poolFunction,
+                                 window_height, window_width,
+                                 vertical_pad, horizontal_pad,
+                                 vertical_stride, horizontal_stride);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
       abort();
+    }
+  return NULL;
+}
+
+void* handleTensorSoftmaxApproximationTuples_CPU(
+  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input_ptr) {
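+  // TODO: if approximation choices are added for softmax operation,
+  // implement this like the other handle* functions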
+  void* t_out;
+  RC->resume_profiler();
+  t_out = tensorSoftmaxCPU(input_ptr);
+  RC->pause_profiler();
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationComputeTime("tensorSoftmaxCPU", pinfo.first);
+  RC->addToCurrentIterationComputeEnergy("tensorSoftmaxCPU", pinfo.second);
+  return t_out;
+}
+
+//----------------------------------------------------------------------------//
+//---                      GPU Approximation handling                      ---//
+//----------------------------------------------------------------------------//
+
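+// The GPU handlers below mirror the CPU versions above, additionally
+// dispatching on FP16 variants and, for convolutions, the half-precision
+// (_HP) perforation and input-sampling knobs.
+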
+void* handleTensorAddApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input, void* bias) {
+
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorAdd(input, bias);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfAdd(input, bias);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -303,42 +471,44 @@ void *handleTensorBatchNormApproximationTuples(
   return NULL;
 }
 
-void *handleTensorReluApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input) {
+void* handleTensorMulApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* lhs, void* rhs) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorRelu(input);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfRelu(input);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorGemmGPU(lhs, rhs);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfGemmGPU(lhs, rhs);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
     abort();
@@ -349,40 +519,100 @@ void *handleTensorReluApproximationTuples(
   return NULL;
 }
 
-void *handleTensorClippedReluApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input, float min, float max) {
+void* handleTensorConvApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input, void* filter, 
+  int conv_pad_h, int conv_pad_w,
+  int conv_stride_h, int conv_stride_w) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorRelu2(input, min, max);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfRelu2(input, min, max);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorConvApprox(input, filter,
+                                 conv_pad_h, conv_pad_w,
+                                 conv_stride_h, conv_stride_w,
+                                 1, 1,
+                                 1, 1, 1, 1);
+
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorConvApproxHalf2(input, filter,
+                                     conv_pad_h, conv_pad_w,
+                                     conv_stride_h, conv_stride_w,
+                                     1, 1,
+                                     1, 1, 1, 1);
+
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::PERFORATION :
+      case GPUNodeConfiguration::APPROX::PERFORATION_HP :
+        {
+          PerfParams params = perfParamSet->getPerfParams(param);
+          INFO("perforation param = %i\n", param);
+          INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
+                params.row, params.col, params.skip_offset);
+          void* t_out;
+          RC->resume_profiler();
+          t_out = tensorConvApproxHalf2(input, filter,
+                                       conv_pad_h, conv_pad_w,
+                                       conv_stride_h, conv_stride_w,
+                                       1, 1,
+                                       params.row, params.col, 1, params.skip_offset);
+
+          RC->pause_profiler();
+          std::pair<double, double> pinfo = RC->get_time_energy();
+          RC->reset_profiler();
+          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", pinfo.first);
+          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", pinfo.second);
+          return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING :
+      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP :
+        {
+          SampParams params = sampParamSet->getSampParams(param);
+          INFO("sampling param = %i\n", param);
+          INFO("params.skip_rate = %i, params.skip_offset = %i\n",
+                params.skip_rate, params.skip_offset);
+          void* t_out;
+          RC->resume_profiler();
+          t_out = tensorConvApproxHalf2(input, filter,
+                                       conv_pad_h, conv_pad_w,
+                                       conv_stride_h, conv_stride_w,
+                                       1, 1,
+                                       1, 1,
+                                       params.skip_rate, params.skip_offset);
+          RC->pause_profiler();
+          std::pair<double, double> pinfo = RC->get_time_energy();
+          RC->reset_profiler();
+          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", pinfo.first);
+          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", pinfo.second);
+          return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -395,41 +625,103 @@ void *handleTensorClippedReluApproximationTuples(
   return NULL;
 }
 
-void *handleTensorTanhApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input) {
+void* handleTensorGroupConvApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input, void* filter,
+  int vertical_pad, int horizontal_pad,
+  int vertical_stride, int horizontal_stride,
+  int conv_mode, int conv_groups) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorTanh(input);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfTanh(input);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorConvCutlass(input, filter,
+                                  vertical_pad, horizontal_pad,
+                                  vertical_stride, horizontal_stride,
+                                  conv_mode, conv_groups);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfConvCutlass(input, filter,
+                                      vertical_pad, horizontal_pad,
+                                      vertical_stride, horizontal_stride,
+                                      conv_mode, conv_groups);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
+    }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
+  return NULL;
+}
+
+void* handleTensorBatchNormApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input_ptr, void* gamma_ptr, void* beta_ptr,
+  void* mean_ptr, void* variance_ptr, double epsilon) {
+
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr,
+                               mean_ptr, variance_ptr, epsilon);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr,
+                                   mean_ptr, variance_ptr, epsilon);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
@@ -441,64 +733,215 @@ void *handleTensorTanhApproximationTuples(
   return NULL;
 }
 
-void *handleTensorPoolingApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input_ptr, int poolFunction, int window_height, int window_width,
-    int vertical_pad, int horizontal_pad, int vertical_stride,
-    int horizontal_stride) {
+void* handleTensorReluApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorPooling(input_ptr, poolFunction, window_height,
-                            window_width, vertical_pad, horizontal_pad,
-                            vertical_stride, horizontal_stride);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
-      return t_out;
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorRelu(input);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfRelu(input);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
     }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorHalfPooling(input_ptr, poolFunction, window_height,
-                                window_width, vertical_pad, horizontal_pad,
-                                vertical_stride, horizontal_stride);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
-      return t_out;
+  return NULL;
+}
+
+void* handleTensorClippedReluApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input, float min, float max) {
+
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorRelu2(input, min, max);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfRelu2(input, min, max);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
     }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
+  return NULL;
+}
+
+void* handleTensorTanhApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorTanh(input);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfTanh(input);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
       abort();
+    }
+  return NULL;
+}
+
+void* handleTensorPoolingApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+  void* input_ptr, int poolFunction,
+  int window_height, int window_width,
+  int vertical_pad, int horizontal_pad,
+  int vertical_stride, int horizontal_stride) {
+
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorPooling(input_ptr,
+                             poolFunction,
+                             window_height, window_width,
+                             vertical_pad, horizontal_pad,
+                             vertical_stride, horizontal_stride);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::FP16 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorHalfPooling(input_ptr,
+                                 poolFunction,
+                                 window_height, window_width,
+                                 vertical_pad, horizontal_pad,
+                                 vertical_stride, horizontal_stride);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
       // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
     }
-  } else if (approxTuples.size() == 2) {
-    ERROR("Currently unsupported case");
-    abort();
-  } else {
-    ERROR("Unsupported case");
-    abort();
-  }
   return NULL;
 }
 
-void *handleTensorSoftmaxApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input_ptr) {
-  // TODO: if approximation choices are added for softmax operation,
+void* handleTensorSoftmaxApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input_ptr) {
+  // TODO: if approximation choices are added for softmax operation,
   // implement this like the other handle* functions
-  void *t_out;
+  void* t_out;
   RC->resume_profiler();
   t_out = tensorSoftmax(input_ptr);
   RC->pause_profiler();
@@ -509,4 +952,5 @@ void *handleTensorSoftmaxApproximationTuples(
   return t_out;
 }
 
+
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index 2067609c5a476291a27763b80a558da099e62e60..d27f463e789fae2e2c41bf31ea6498b47fd5240f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -11,7 +11,7 @@
 // Describes the internal choices made for an ApproxHPVM node
 class NodeConfiguration {
 public:
-  enum NODE_CONFIGURATION_TARGET { PROMISE, GPU, END };
+  enum NODE_CONFIGURATION_TARGET { PROMISE, GPU, CPU, END };
 
 protected:
   enum NODE_CONFIGURATION_TARGET NODE_CONFIGURATION_TARGET_ID;
@@ -21,6 +21,8 @@ public:
 
   bool isGPUNodeConfiguration();
 
+  bool isCPUNodeConfiguration();
+
   virtual void print() = 0;
 };
 
@@ -108,6 +110,57 @@ public:
   void print() override;
 };
 
+class CPUNodeConfiguration : public NodeConfiguration {
+public:
+  // Approximation methods available for this HW type
+  enum APPROX {
+    FP32,
+    PERFORATION,
+    INPUT_SAMPLING,
+    //  ADDITIONAL_APPROXIMATION_METHOD
+    APPROX_END
+  };
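+  // PERFORATION and INPUT_SAMPLING carry an integer knob that the runtime
+  // handlers resolve (via perfParamSet->getPerfParams and
+  // sampParamSet->getSampParams) into concrete perforation/sampling
+  // parameters; FP32 ignores the knob.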
+
+  // Operations to be approximated in the node using this configuration
+  enum TENSOR_OP {
+    ADD,
+    BATCHNORM,
+    CONV,
+    GROUP_CONV,
+    MUL,
+    RELU,
+    CLIPPED_RELU,
+    TANH,
+    POOL_MAX,
+    POOL_MEAN,
+    POOL_MIN,
+    SOFTMAX,
+    //  ADDITIONAL_TENSOR_OPERATION
+    TENSOR_OP_END
+  };
+
+private:
+  // A vector, containing pairs of approximation method and tunable parameter
+  // (expressed as int, or ignored when not applicable) for each operation
+  std::vector<
+      std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>>
+      ApproxChoices;
+
+public:
+  void pushNewTensorOperation(enum TENSOR_OP top);
+
+  void pushNewApproximationChoiceForOperation(enum APPROX approx, int u);
+
+  std::vector<
+      std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>> &
+  getApproxChoices();
+
+  CPUNodeConfiguration();
+  ~CPUNodeConfiguration();
+
+  void print() override;
+};
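+
+// Illustrative use (hypothetical knob values): a CPU node whose convolution
+// runs with perforation knob 1 would be configured as
+//   CPUNodeConfiguration *conf = new CPUNodeConfiguration();
+//   conf->pushNewTensorOperation(CPUNodeConfiguration::TENSOR_OP::CONV);
+//   conf->pushNewApproximationChoiceForOperation(
+//       CPUNodeConfiguration::APPROX::PERFORATION, 1);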
+
 // Configuration : Includes configuration information :
 // - name
 // - speedup
@@ -140,8 +193,8 @@ struct Configuration {
 // Comparison operator definition, in increasing accuracy loss
 // (for std sort, used in pareto optimal computation)
 struct ConfigurationLessThan {
-  bool operator()(const struct Configuration &a,
-                  const struct Configuration &b) const;
+  bool operator()(
+      const struct Configuration &a, const struct Configuration &b) const;
 };
 
 // Comparison operator definition, in increasing accuracy loss
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
index 24f69c03903faf29b074284482f172efa334549f..31214eaaa799d5cd8d4de5b0935b41aa7fce617d 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
@@ -1,64 +1,75 @@
-#include <cmath>
+#include <stdio.h>
 #include <cstdlib>
+#include <cmath>
 #include <memory>
-#include <stdio.h>
 #include <string>
 
-#ifndef CUDNN_HEADER
-#define CUDNN_HEADER
 
-extern "C" {
-/****  Initialization Routine - Must be inserted at program start (in the
- * backend)  ****/
-void llvm_hpvm_initTensorRt(int gpuid = 0);
-void llvm_hpvm_cleanupTensorRt();
+#ifndef TENSOR_CPU_HEADER
+#define TENSOR_CPU_HEADER
 
-// Routine to moving tensor data (from and to GPU,CPU)
-void hpvm_request_tensor(void *tensor, int destination);
 
-// NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for
-// cuDNN operations NOTE: The only data format supported as of now is: NCHW
-// (batch_dimension, channels, Height, Width)
-void *create4DTensor(int data_type, int data_format, size_t dim1_size,
-                     size_t dim2_size, size_t dim3_size, size_t dim4_size,
-                     bool freeMemory = true);
+extern "C"{
+  /****  Initialization Routine - Must be inserted at program start (in the backend)  ****/
+  void llvm_hpvm_initTensorRtCPU();
+  void llvm_hpvm_cleanupTensorRtCPU();
 
-void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes);
+  // Routine for moving tensor data (to and from the GPU and CPU)
+  void hpvm_request_tensorCPU(void* tensor, int destination);
 
-/********** Tensor Operation API ******/
 
-// NOTE: For conv_mode, only value '1' is supported
-void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                           int horizontal_pad, int vertical_stride,
-                           int horizontal_stride, int conv_mode,
-                           int compute_precision, int row, int col,
-                           int skip_every, int start);
+  // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
+  // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width)
+  // void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
+  //                      size_t dim3_size, size_t dim4_size, bool freeMemory = true);
+
+  void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);
 
-void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                           int horizontal_pad, int vertical_stride,
-                           int horizontal_stride, int conv_mode,
-                           int conv_groups);
+  /********** Tensor Operation API ******/
 
-void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
-                         void *mean_ptr, void *variance_ptr, double epsilon);
+  // NOTE: For conv_mode, only value '1' is supported
+  void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr,
+                             int vertical_pad, int horizontal_pad,
+                             int vertical_stride, int horizontal_stride,
+                             int conv_mode, int compute_precision,
+                             int row, int col, int skip_every, int start);
 
-void *tensorPoolingCPU(void *input, int poolFunction, int window_height,
-                       int window_width, int vertical_pad, int horizontal_pad,
-                       int vertical_stride, int horizontal_stride);
+  void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr,
+                            int vertical_pad, int horizontal_pad,
+                            int vertical_stride, int horizontal_stride,
+                            int conv_mode, int compute_precision,
+                            int row, int col, int skip_every, int start);
 
-void *tensorGemmCPU(void *lhs, void *rhs);
+  void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
+                             int vertical_pad, int horizontal_pad,
+                             int vertical_stride, int horizontal_stride,
+                             int conv_mode, int conv_groups);
+
+  void* tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
+                           void* mean_ptr, void* variance_ptr, double epsilon);
 
-void *tensorAddCPU(void *x, void *bias);
 
-void *tensorReluCPU(void *input);
+  void* tensorPoolingCPU(void* input,
+                         int poolFunction,
+                         int window_height, int window_width,
+                         int vertical_pad, int horizontal_pad,
+                         int vertical_stride, int horizontal_stride);
 
-void *tensorRelu2CPU(void *input, float min, float max);
+  void* tensorGemmCPU(void* lhs, void* rhs);
 
-void *tensorTanhCPU(void *input);
+  void* tensorAddCPU(void* x, void* bias);
 
-void *tensorSoftmaxCPU(void *input);
+  void* tensorReluCPU(void* input);
+
+  void* tensorRelu2CPU(void* input, float min, float max);
+
+  void* tensorTanhCPU(void* input);
+
+  void* tensorSoftmaxCPU(void* input);
+
 }
 
+
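+// Illustrative call sequence (a sketch, not part of the API; assumes x and
+// bias are valid tensors created by the runtime):
+//   llvm_hpvm_initTensorRtCPU();
+//   void* sum = tensorAddCPU(x, bias);
+//   void* act = tensorReluCPU(sum);
+//   llvm_hpvm_cleanupTensorRtCPU();
+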
 /*
 void dummyFunction(){
 
@@ -92,8 +103,9 @@ void dummyFunction(){
   void* tensorTanhPtr = (void*) &tensorTanh;
   void* tensorHalfTanhPtr = (void*) &tensorHalfTanh;
   void* tensorSoftmaxPtr = (void*) &tensorSoftmax;
-  void* tensorAddErrorPtr = (void*) &tensorAddError;
+  void* tensorAddErrorPtr = (void*) &tensorAddError;    
 }
 */
 
+
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
index 517b7c7009645c43c7f2af6fa733b4205590efd8..d9d598d2a64cd898bc6c2b51607e1fb92b9afb8a 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
@@ -2,7 +2,9 @@
 
 using P_APPROX = PROMISENodeConfiguration::APPROX;
 using G_APPROX = GPUNodeConfiguration::APPROX;
+using C_APPROX = CPUNodeConfiguration::APPROX;
 using G_TENSOR_OP = GPUNodeConfiguration::TENSOR_OP;
+using C_TENSOR_OP = CPUNodeConfiguration::TENSOR_OP;
 
 bool NodeConfiguration::isPROMISENodeConfiguration() {
   return NODE_CONFIGURATION_TARGET_ID == PROMISE;
@@ -12,8 +14,12 @@ bool NodeConfiguration::isGPUNodeConfiguration() {
   return NODE_CONFIGURATION_TARGET_ID == GPU;
 }
 
-void PROMISENodeConfiguration::pushNewApproximationChoice(P_APPROX approx,
-                                                          int u) {
+bool NodeConfiguration::isCPUNodeConfiguration() {
+  return NODE_CONFIGURATION_TARGET_ID == CPU;
+}
+
+void PROMISENodeConfiguration::pushNewApproximationChoice(
+    P_APPROX approx, int u) {
   ApproxChoices.push_back(std::make_pair(approx, u));
 }
 
@@ -28,7 +34,7 @@ PROMISENodeConfiguration::PROMISENodeConfiguration() {
 
 PROMISENodeConfiguration::~PROMISENodeConfiguration() {}
 
-void GPUNodeConfiguration::pushNewTensorOperation(enum TENSOR_OP top) {
+void GPUNodeConfiguration::pushNewTensorOperation(G_TENSOR_OP top) {
   std::vector<std::pair<G_APPROX, int>> emptyVec;
   ApproxChoices.push_back(std::make_pair(top, emptyVec));
 }
@@ -36,8 +42,9 @@ void GPUNodeConfiguration::pushNewTensorOperation(enum TENSOR_OP top) {
 void GPUNodeConfiguration::pushNewApproximationChoiceForOperation(
     G_APPROX approx, int u) {
   unsigned size = ApproxChoices.size();
-  CUSTOM_ASSERT(size >= 1 &&
-                "Cannot apply approximation choice to non existent operation.");
+  CUSTOM_ASSERT(
+      size >= 1 &&
+      "Cannot apply approximation choice to non existent operation.");
   ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
 }
 
@@ -51,8 +58,32 @@ GPUNodeConfiguration::GPUNodeConfiguration() {
 }
 GPUNodeConfiguration::~GPUNodeConfiguration() {}
 
-Configuration::Configuration(std::string &n, float f, float e, float a,
-                             float al)
+void CPUNodeConfiguration::pushNewTensorOperation(C_TENSOR_OP top) {
+  std::vector<std::pair<C_APPROX, int>> emptyVec;
+  ApproxChoices.push_back(std::make_pair(top, emptyVec));
+}
+
+void CPUNodeConfiguration::pushNewApproximationChoiceForOperation(
+    C_APPROX approx, int u) {
+  unsigned size = ApproxChoices.size();
+  CUSTOM_ASSERT(
+      size >= 1 &&
+      "Cannot apply approximation choice to non existent operation.");
+  ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
+}
+
+std::vector<std::pair<C_TENSOR_OP, std::vector<std::pair<C_APPROX, int>>>> &
+CPUNodeConfiguration::getApproxChoices() {
+  return ApproxChoices;
+}
+
+CPUNodeConfiguration::CPUNodeConfiguration() {
+  NODE_CONFIGURATION_TARGET_ID = CPU;
+}
+CPUNodeConfiguration::~CPUNodeConfiguration() {}
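+
+// Usage sketch (illustrative only): configuring a CPU node that runs a
+// convolution approximated by perforation with knob value 2:
+//   CPUNodeConfiguration conf;
+//   conf.pushNewTensorOperation(CPUNodeConfiguration::TENSOR_OP::CONV);
+//   conf.pushNewApproximationChoiceForOperation(
+//       CPUNodeConfiguration::APPROX::PERFORATION, 2);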
+
+Configuration::Configuration(
+    std::string &n, float f, float e, float a, float al)
     : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {}
 
 float Configuration::getSpeedup() { return speedup; }
@@ -62,20 +93,20 @@ float Configuration::getEnergy() { return energy; }
 float Configuration::getAccuracy() { return accuracy; }
 
 float Configuration::getAccuracyLoss() { return accuracyLoss; }
-bool ConfigurationLessThan::operator()(const struct Configuration &a,
-                                       const struct Configuration &b) const {
+bool ConfigurationLessThan::
+operator()(const struct Configuration &a, const struct Configuration &b) const {
   return (a.accuracyLoss < b.accuracyLoss);
 }
-bool ConfigurationLessThan_AL::operator()(const struct Configuration *a,
-                                          const float &b) const {
+bool ConfigurationLessThan_AL::
+operator()(const struct Configuration *a, const float &b) const {
   return (a->accuracyLoss < b);
 }
-bool ConfigurationLessThan_SP::operator()(const struct Configuration *a,
-                                          const float &b) const {
+bool ConfigurationLessThan_SP::
+operator()(const struct Configuration *a, const float &b) const {
   return (a->speedup < b);
 }
-bool ConfigurationLessThan_E::operator()(const struct Configuration *a,
-                                         const float &b) const {
+bool ConfigurationLessThan_E::
+operator()(const struct Configuration *a, const float &b) const {
   return (a->energy < b);
 }
 
@@ -92,10 +123,10 @@ void PROMISENodeConfiguration::print() {
     case P_APPROX::SWING_LEVEL:
       printf("swing_level");
       break;
+      // TODO additional approx methods to be printed here
     default:
       ERROR("Unknown approximation option");
       break;
-      // TODO additional approx methods to be printed here
     }
     printf(" %d", it.second);
   }
@@ -110,64 +141,64 @@ void GPUNodeConfiguration::print() {
 
     printf(" ");
     switch (it.first) {
-    case TENSOR_OP::ADD:
+    case G_TENSOR_OP::ADD:
       printf("add");
       break;
-    case TENSOR_OP::BATCHNORM:
+    case G_TENSOR_OP::BATCHNORM:
       printf("batchnorm");
       break;
-    case TENSOR_OP::CONV:
+    case G_TENSOR_OP::CONV:
       printf("conv");
       break;
-    case TENSOR_OP::GROUP_CONV:
+    case G_TENSOR_OP::GROUP_CONV:
       printf("group_conv");
       break;
-    case TENSOR_OP::MUL:
+    case G_TENSOR_OP::MUL:
       printf("mul");
       break;
-    case TENSOR_OP::RELU:
+    case G_TENSOR_OP::RELU:
       printf("relu");
       break;
-    case TENSOR_OP::CLIPPED_RELU:
+    case G_TENSOR_OP::CLIPPED_RELU:
       printf("clipped_relu");
       break;
-    case TENSOR_OP::TANH:
+    case G_TENSOR_OP::TANH:
       printf("tanh");
       break;
-    case TENSOR_OP::POOL_MAX:
+    case G_TENSOR_OP::POOL_MAX:
       printf("pool_max");
       break;
-    case TENSOR_OP::POOL_MEAN:
+    case G_TENSOR_OP::POOL_MEAN:
       printf("pool_mean");
       break;
-    case TENSOR_OP::POOL_MIN:
+    case G_TENSOR_OP::POOL_MIN:
       printf("pool_min");
       break;
-    case TENSOR_OP::SOFTMAX:
+    case G_TENSOR_OP::SOFTMAX:
       printf("softmax");
       break;
-    case TENSOR_OP::FFT:
+    case G_TENSOR_OP::FFT:
       printf("fft");
       break;
-    case TENSOR_OP::REDUCE:
+    case G_TENSOR_OP::REDUCE:
       printf("reduce");
       break;
-    case TENSOR_OP::PROJECTIVE_T:
+    case G_TENSOR_OP::PROJECTIVE_T:
       printf("projectiveT");
       break;
-    case TENSOR_OP::MAP1:
+    case G_TENSOR_OP::MAP1:
       printf("map1");
       break;
-    case TENSOR_OP::MAP2:
+    case G_TENSOR_OP::MAP2:
       printf("map2");
       break;
-    case TENSOR_OP::MAP3:
+    case G_TENSOR_OP::MAP3:
       printf("map3");
       break;
+      // TODO additional operations to be printed here
     default:
       ERROR("Unknown tensor operation.");
       break;
-      // TODO additional operations to be printed here
     }
 
     auto &approxVec = it.second;
@@ -195,10 +226,85 @@ void GPUNodeConfiguration::print() {
       case G_APPROX::REDUCTION_SAMPLING:
         printf("red_samp");
         break;
+        // TODO additional approx methods to be printed here
       default:
         ERROR("Unknown approximation option");
         break;
+      }
+
+      printf(" %d", inner_it.second);
+    }
+  }
+
+  printf("\n");
+}
+
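+// Example output fragment (illustrative): a node with a perforated conv and
+// an fp32 add prints as " cpu conv perf 2 add fp32 1".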
+void CPUNodeConfiguration::print() {
+
+  printf(" cpu");
+  for (auto &it : ApproxChoices) {
+
+    printf(" ");
+    switch (it.first) {
+    case C_TENSOR_OP::ADD:
+      printf("add");
+      break;
+    case C_TENSOR_OP::BATCHNORM:
+      printf("batchnorm");
+      break;
+    case C_TENSOR_OP::CONV:
+      printf("conv");
+      break;
+    case C_TENSOR_OP::GROUP_CONV:
+      printf("group_conv");
+      break;
+    case C_TENSOR_OP::MUL:
+      printf("mul");
+      break;
+    case C_TENSOR_OP::RELU:
+      printf("relu");
+      break;
+    case C_TENSOR_OP::CLIPPED_RELU:
+      printf("clipped_relu");
+      break;
+    case C_TENSOR_OP::TANH:
+      printf("tanh");
+      break;
+    case C_TENSOR_OP::POOL_MAX:
+      printf("pool_max");
+      break;
+    case C_TENSOR_OP::POOL_MEAN:
+      printf("pool_mean");
+      break;
+    case C_TENSOR_OP::POOL_MIN:
+      printf("pool_min");
+      break;
+    case C_TENSOR_OP::SOFTMAX:
+      printf("softmax");
+      break;
+      // TODO additional operations to be printed here
+    default:
+      ERROR("Unknown tensor operation.");
+      break;
+    }
+
+    auto &approxVec = it.second;
+    for (auto &inner_it : approxVec) {
+      printf(" ");
+      switch (inner_it.first) {
+      case C_APPROX::FP32:
+        printf("fp32");
+        break;
+      case C_APPROX::PERFORATION:
+        printf("perf");
+        break;
+      case C_APPROX::INPUT_SAMPLING:
+        printf("samp");
+        break;
         // TODO additional approx methods to be printed here
+      default:
+        ERROR("Unknown approximation option");
+        break;
       }
 
       printf(" %d", inner_it.second);
@@ -211,8 +317,9 @@ void GPUNodeConfiguration::print() {
 void Configuration::print() {
 
   printf("+++++\n");
-  printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
-         accuracyLoss);
+  printf(
+      "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
+      accuracyLoss);
   for (std::map<std::string, NodeConfiguration *>::const_iterator it =
            setup.begin();
        it != setup.end(); ++it) {
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index fd308a3409dc679a07b5374238e7150fb3c34beb..2dcbf9dcef6f049439f1f7332662db33f532e7a6 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -1,35 +1,37 @@
 
 #include "hpvm-rt-controller.h"
-#include "global_data.h"
 #include "img_tensor_utils.h"
+#include "global_data.h"
 #include <fstream>
 
 //-------- Functionality to read and update frequency on Jetson board -------//
 /*const char* available_freqs[] = {"140250000", "229500000", "318750000",
-                                 "408000000", "497250000", "586500000",
+                                 "408000000", "497250000", "586500000", 
                                  "675750000", "765000000", "854250000",
                                  "943500000", "1032750000", "1122000000",
                                  "1211250000", "1300500000"};
 
 */
 
+
 const int available_freqs[] = {
-    140250000,  // 0
-    229500000,  // 1
-    318750000,  // 2
-    408000000,  // 3
-    497250000,  // 4
-    586500000,  // 5
-    675750000,  // 6
-    765000000,  // 7
-    854250000,  // 8
-    943500000,  // 9
-    1032750000, // 10
-    1122000000, // 11
-    1211250000, // 12
-    1300500000  // 13
+140250000, // 0
+229500000, // 1
+318750000, // 2
+408000000, // 3
+497250000, // 4
+586500000, // 5
+675750000, // 6
+765000000, // 7
+854250000, // 8
+943500000, // 9
+1032750000,// 10
+1122000000,// 11
+1211250000,// 12
+1300500000 // 13
 };
 
+
 /*void updateJetsonGPUFreq(int freq_level) {
 
   if (freq_level < 0 || freq_level > 13) {
@@ -37,7 +39,7 @@ const int available_freqs[] = {
     abort();
   }
 
-  const char* freq_val = available_freqs[freq_level];
+  const char* freq_val = available_freqs[freq_level]; 
   printf("freq-val[0] = %s \n", freq_val);
 
   FILE* max_file =
@@ -47,7 +49,7 @@ const int available_freqs[] = {
   }
   fwrite(freq_val, strlen(freq_val), 1, max_file);
   fclose(max_file);
-
+  
   FILE* min_file =
     fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+");
   if (min_file == NULL){
@@ -68,7 +70,7 @@ unsigned long int readJetsonGPUFreq() {
 
   char buf[50];
   char* ptr;
-
+  
   fread(buf, 50, 1, cur_freq_file);
   unsigned long cur_freq = strtoul(buf, &ptr, 10);
   fclose(cur_freq_file);
@@ -77,15 +79,14 @@ unsigned long int readJetsonGPUFreq() {
 
 */
 
+
 // Sets frequency
 void setFreq(unsigned freq_index) {
 
   unsigned target_freq = available_freqs[freq_index];
-
-  const char *const min_freq_file =
-      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
-  const char *const max_freq_file =
-      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
+  
+  const char * const min_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
+  const char * const max_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
 
   std::ofstream min_stream;
   std::ofstream max_stream;
@@ -104,8 +105,7 @@ void setFreq(unsigned freq_index) {
 unsigned recordFreq() {
 
   // Current frequency file
-  const char *const cur_freq_file =
-      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
+  const char * const cur_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
   std::ifstream cur_stream;
   cur_stream.open(cur_freq_file, std::ifstream::in);
 
@@ -118,6 +118,10 @@ unsigned recordFreq() {
   return cur_freq;
 }
 
+
+
+
+
 //---------------------------------------------------------------------------//
 
 /*
@@ -131,13 +135,13 @@ bool fileExists(const std::string &file) {
 
 // There will be no frequency request for the first batch
 // Therefore, we skip the first element by initializing to 1, not 0.
-FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf)
-    : idx_list(il), rep_factor(rf), count(1), idx(0) {}
+FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) :
+  idx_list(il), rep_factor(rf), count(1), idx(0) {}
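+
+// Illustrative trace (not in the source): with idx_list = {13, 12} and
+// rep_factor = 2, successive getNextIndex() calls return 13, 12, 12, 13, 13,
+// 12, 12, ... (the first index repeats one time fewer since count starts at 1).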
 
 unsigned FrequencyIndexList::getNextIndex() {
   if (count == rep_factor) {
     count = 0;
-    idx = (idx + 1) % idx_list.size();
+    idx = (idx+1) % idx_list.size();
   }
   count++;
   return idx_list[idx];
@@ -204,7 +208,7 @@ void ProfileInfo::readIterationFrequency() {
   frequency_current_iteration = recordFreq();
 #else
   frequency_current_iteration = 0;
-#endif // JETSON_EXECUTION
+#endif //JETSON_EXECUTION
 }
 
 unsigned long ProfileInfo::getIterationFrequency() {
@@ -271,14 +275,15 @@ void ProfileInfo::printToFile() {
   // to have equal sizes, in outer and inner vectors both,
   // and all time_info and energy_info vectors must have the same size.
   unsigned iterations = tensor_time_info.size();
-  CUSTOM_ASSERT((tensor_time_info.size() == iterations) &&
-                (tensor_energy_info.size() == iterations) &&
-                (control_time_info.size() == iterations) &&
-                (control_energy_info.size() == iterations) &&
-                (config_time_info.size() == iterations) &&
-                (config_energy_info.size() == iterations) &&
-                (frequency_info.size() == iterations) &&
-                "time_info, energy_info, frequency_info size: \
+  CUSTOM_ASSERT(
+      (tensor_time_info.size() == iterations) &&
+      (tensor_energy_info.size() == iterations) &&
+      (control_time_info.size() == iterations) &&
+      (control_energy_info.size() == iterations) &&
+      (config_time_info.size() == iterations) &&
+      (config_energy_info.size() == iterations) &&
+      (frequency_info.size() == iterations) &&
+      "time_info, energy_info, frequency_info size: \
                    iteration number does not match.");
 
   for (unsigned i = 0; i < tensor_time_info.size(); i++) {
@@ -328,8 +333,8 @@ ProfileInfo::ProfileInfo()
       time_control_current_iteration(0.0), time_config_current_iteration(0.0),
       energy_compute_current_iteration(0.0),
       energy_control_current_iteration(0.0),
-      energy_config_current_iteration(0.0), frequency_current_iteration(0),
-      in_iteration(false) {}
+      energy_config_current_iteration(0.0),
+      frequency_current_iteration(0), in_iteration(false) {}
 
 Slowdowns::Slowdowns() {
   idx = 0;
@@ -371,37 +376,37 @@ void RuntimeController::stop_profiler() {
     profiler->stop_profiler();
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &
-RuntimeController::getSpeedupConfigurations() {
+std::vector<struct Configuration *> &RuntimeController::
+getSpeedupConfigurations() {
   return SpeedupConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &
-RuntimeController::getEnergyConfigurations() {
+std::vector<struct Configuration *> &RuntimeController::
+getEnergyConfigurations() {
   return EnergyConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &
-RuntimeController::getThreeDCurveConfigurations() {
+std::vector<struct Configuration *> &RuntimeController::
+getThreeDCurveConfigurations() {
   return ThreeDCurveConfigurations;
 }
 // For testing purposes only - do not use widely
 unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; }
 
 double RuntimeController::getCurrentConfigurationSpeedup() {
-  return (double)(*Configurations)[configurationIdx]->speedup;
+  return (double) (*Configurations)[configurationIdx]->speedup;
 }
 
 double RuntimeController::getCurrentConfigurationEnergy() {
-  return (double)(*Configurations)[configurationIdx]->energy;
+  return (double) (*Configurations)[configurationIdx]->energy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracy() {
-  return (double)(*Configurations)[configurationIdx]->accuracy;
+  return (double) (*Configurations)[configurationIdx]->accuracy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracyLoss() {
-  return (double)(*Configurations)[configurationIdx]->accuracyLoss;
+  return (double) (*Configurations)[configurationIdx]->accuracyLoss;
 }
 
 std::vector<float> &RuntimeController::getQuantizationRanges(const char *data) {
@@ -443,10 +448,8 @@ void RuntimeController::init(const char *Cstr, const char *Qstr) {
   // Pseudo random variable (when we did few experiments)
   // or true random numbers for probabilistic control
   pseudo_rd = 0.0;
-  std::random_device
-      rd; // Will be used to obtain a seed for the random number engine
-  generator =
-      std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd()
+  std::random_device rd;  //Will be used to obtain a seed for the random number engine
+  generator = std::mt19937 (rd()); //Standard mersenne_twister_engine seeded with rd()
   distr = std::uniform_real_distribution<>(0.0, 1.0);
 
   g_freq = available_freqs[13];
@@ -468,8 +471,8 @@ void RuntimeController::end_iteration() {
     PI->end_iteration();
 }
 
-void RuntimeController::addToCurrentIterationComputeTime(const char *s,
-                                                         double t) {
+void RuntimeController::addToCurrentIterationComputeTime(
+    const char *s, double t) {
   if (PI)
     PI->addToCurrentIterationComputeTime(s, t);
 }
@@ -484,8 +487,8 @@ void RuntimeController::addToCurrentIterationConfigTime(double t) {
     PI->addToCurrentIterationConfigTime(t);
 }
 
-void RuntimeController::addToCurrentIterationComputeEnergy(const char *s,
-                                                           double e) {
+void RuntimeController::addToCurrentIterationComputeEnergy(
+    const char *s, double e) {
   if (PI)
     PI->addToCurrentIterationComputeEnergy(s, e);
 }
@@ -523,8 +526,8 @@ void RuntimeController::updateFrequency() {
   //--- updateJetsonGPUFreq(freq_idx);
 
   setFreq(freq_idx);
-
-#endif // JETSON_EXECUTION
+  
+#endif //JETSON_EXECUTION
 }
 
 void RuntimeController::writeProfileInfo() {
@@ -557,9 +560,11 @@ std::pair<double, double> RuntimeController::fc_profile(
     const unsigned num_rows_a, const unsigned num_cols_a,
     const unsigned num_rows_b, const unsigned num_cols_b,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b,
-                                        num_cols_b, voltage_swing, patch_factor)
-                  : std::make_pair(0.0, 0.0));
+  return (
+      promise ? promise->fc_profile(
+                    num_rows_a, num_cols_a, num_rows_b, num_cols_b,
+                    voltage_swing, patch_factor)
+              : std::make_pair(0.0, 0.0));
 }
 
 std::pair<double, double> RuntimeController::conv_profile(
@@ -567,16 +572,17 @@ std::pair<double, double> RuntimeController::conv_profile(
     const unsigned c_out, const unsigned c_in, const unsigned k_h,
     const unsigned k_w, const unsigned s_h, const unsigned s_w,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (promise ? promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w,
-                                          s_h, s_w, voltage_swing, patch_factor)
-                  : std::make_pair(0.0, 0.0));
+  return (
+      promise ? promise->conv_profile(
+                    n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing,
+                    patch_factor)
+              : std::make_pair(0.0, 0.0));
 }
 
 // Constructor and destructor
 RuntimeController::RuntimeController() {
   configurationIdx = 0;
-  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
-                               10);
+  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10);
 #ifdef ACTIVE_PROFILING
   PI = new ProfileInfo();
   profiler = new Profiler();
@@ -713,13 +719,14 @@ void RuntimeController::readConfigurationFile(const char *str) {
   std::getline(qin, first_line);
   DEBUG("first_line: %s\n", first_line.c_str());
 
-  try {
+  try{
     baseline_time = std::stod(first_line);
     DEBUG("Baseline time: %lf\n\n", baseline_time);
-  } catch (...) {
+  }
+  catch(...){
     ERROR("Please Add/Fix Baseline Time at Top of Config File.. ");
   }
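+  // e.g. (illustrative): a configuration file whose first line is "42.373"
+  // sets baseline_time to 42.373.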
-
+  
   for (std::string line; std::getline(qin, line);) {
     DEBUG("line: %s\n", line.c_str());
 
@@ -751,9 +758,9 @@ void RuntimeController::readConfigurationFile(const char *str) {
     if (readingFirstLine) {
       // Read first line, to create the new configuration struct
       readingFirstLine = false;
-      InitialConfigurations.push_back(
-          Configuration(tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
-                        std::stof(tokens[3]), std::stof(tokens[4])));
+      InitialConfigurations.push_back(Configuration(
+          tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
+          std::stof(tokens[3]), std::stof(tokens[4])));
       continue;
     }
 
@@ -761,8 +768,8 @@ void RuntimeController::readConfigurationFile(const char *str) {
       DEBUG("Found promise configuration\n");
 
       // There must be at least one approximation option
-      CUSTOM_ASSERT((tokens.size() >= 2) &&
-                    "Not enough approximation options.");
+      CUSTOM_ASSERT(
+          (tokens.size() >= 2) && "Not enough approximation options.");
 
       PROMISENodeConfiguration *NodeConf = new PROMISENodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -785,8 +792,9 @@ void RuntimeController::readConfigurationFile(const char *str) {
       DEBUG("Found gpu configuration\n");
 
       // There must be at least one operation, with an approximation option
-      CUSTOM_ASSERT((tokens.size() >= 5) &&
-                    "Not enough operations - approximation options.");
+      CUSTOM_ASSERT(
+          (tokens.size() >= 5) &&
+          "Not enough operations - approximation options.");
 
       GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -938,6 +946,106 @@ void RuntimeController::readConfigurationFile(const char *str) {
         // TODO: other approximation options handled here
       }
 
+    } else if (tokens[1] == "cpu") {
+      DEBUG("Found gpu configuration\n");
+
+      // There must be at least one operation, with an approximation option
+      CUSTOM_ASSERT(
+          (tokens.size() >= 5) &&
+          "Not enough operations - approximation options.");
+
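+      // Shape of a cpu line (illustrative): <node> cpu <op> [<approx> <knob>]...
+      // e.g. "node1 cpu conv perf 2 add fp32 1"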
+      CPUNodeConfiguration *NodeConf = new CPUNodeConfiguration();
+      InitialConfigurations.back().setup.insert(
+          std::make_pair(tokens[0], NodeConf));
+
+      unsigned idx = 2;
+      while (idx < tokens.size()) {
+        if (tokens[idx] == "add") {
+          DEBUG("Found add operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::ADD);
+          idx++;
+        } else if (tokens[idx] == "batchnorm") {
+          DEBUG("Found batchnorm operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::BATCHNORM);
+          idx++;
+        } else if (tokens[idx] == "conv") {
+          DEBUG("Found conv operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::CONV);
+          idx++;
+        } else if (tokens[idx] == "group_conv") {
+          DEBUG("Found group_conv operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::GROUP_CONV);
+          idx++;
+        } else if (tokens[idx] == "mul") {
+          DEBUG("Found mul operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::MUL);
+          idx++;
+        } else if (tokens[idx] == "relu") {
+          DEBUG("Found relu operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::RELU);
+          idx++;
+        } else if (tokens[idx] == "clipped_relu") {
+          DEBUG("Found clipped_relu operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU);
+          idx++;
+        } else if (tokens[idx] == "tanh") {
+          DEBUG("Found tanh operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::TANH);
+          idx++;
+        } else if (tokens[idx] == "pool_max") {
+          DEBUG("Found pool_max operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::POOL_MAX);
+          idx++;
+        } else if (tokens[idx] == "pool_mean") {
+          DEBUG("Found pool_mean operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::POOL_MEAN);
+          idx++;
+        } else if (tokens[idx] == "pool_min") {
+          DEBUG("Found pool_min operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::POOL_MIN);
+          idx++;
+        } else if (tokens[idx] == "softmax") {
+          DEBUG("Found softmax operation\n");
+          NodeConf->pushNewTensorOperation(
+              CPUNodeConfiguration::TENSOR_OP::SOFTMAX);
+          idx++;
+        } else /*Not a new operation. This means an approximation option*/
+            if (tokens[idx] == "fp32") {
+          DEBUG("Found fp32 option\n");
+          int fp32 = std::stoi(tokens[idx + 1]);
+          DEBUG("fp32 parameter: %d, ignoring\n", fp32);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              CPUNodeConfiguration::APPROX::FP32, fp32);
+          idx += 2;
+        } else if (tokens[idx] == "perf") {
+          DEBUG("Found perf option\n");
+          int perf = std::stoi(tokens[idx + 1]);
+          DEBUG("perf parameter: %d\n", perf);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              CPUNodeConfiguration::APPROX::PERFORATION, perf);
+          idx += 2;
+        } else if (tokens[idx] == "samp") {
+          DEBUG("Found samp option\n");
+          int samp = std::stoi(tokens[idx + 1]);
+          DEBUG("samp parameter: %d\n", samp);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              CPUNodeConfiguration::APPROX::INPUT_SAMPLING, samp);
+          idx += 2;
+        }
+        // TODO: other approximation options handled here
+      }
+
     } else {
       DEBUG("Invalid Configuration File\n");
       exit(1);
@@ -960,8 +1068,9 @@ void RuntimeController::computeParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(InitialConfigurations.begin() + 1, InitialConfigurations.end(),
-            ConfigurationLessThan());
+  std::sort(
+      InitialConfigurations.begin() + 1, InitialConfigurations.end(),
+      ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) {
@@ -995,12 +1104,14 @@ void RuntimeController::computeParetoConfigurationPoints() {
         en_idx = i;
       }
     }
-    DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
-          InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
+    DEBUG(
+        "accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
+        InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
     // Found best speedup for this accuracy point (not dominated by any of
     // these).
-    DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n",
-          InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
+    DEBUG(
+        "accuracy loss = %f, energy = %f, at en_idx = %d\n",
+        InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
     // Found best energy for this accuracy point (not dominated by any of
     // these).
 
@@ -1070,8 +1181,9 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(InitialConfigurations.begin(), InitialConfigurations.end(),
-            ConfigurationLessThan());
+  std::sort(
+      InitialConfigurations.begin(), InitialConfigurations.end(),
+      ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) {
@@ -1105,10 +1217,11 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
         }
       }
       if (!dominated) {
-        DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
-              InitialConfigurations[i].accuracyLoss,
-              InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
-              i);
+        DEBUG(
+            "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
+            InitialConfigurations[i].accuracyLoss,
+            InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
+            i);
         Indices.push_back(i);
       }
     }
@@ -1167,22 +1280,31 @@ void RuntimeController::printConfigurations(
   }
 }
 
-unsigned long RuntimeController::getLastFrequency() { return g_freq; }
+unsigned long RuntimeController::getLastFrequency() {
+  return g_freq;
+}
 
-void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; }
+void RuntimeController::setLastFrequency(unsigned long f) {
+  g_freq = f;
+}
 
-double RuntimeController::getLastSpeedup() { return g_speedup; }
+double RuntimeController::getLastSpeedup() {
+  return g_speedup;
+}
 
-void RuntimeController::setLastSpeedup(double s) { g_speedup = s; }
+void RuntimeController::setLastSpeedup(double s) {
+  g_speedup = s;
+}
 
 void RuntimeController::findNextConfiguration() {
   configurationIdx = (configurationIdx + 1) % Configurations->size();
-  DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n",
-        configurationIdx);
+  DEBUG(
+      "findNextConfiguration: Updated configurationIdx to %u.\n",
+      configurationIdx);
 }
 
-void RuntimeController::findTargetConfiguration(float goal,
-                                                enum SEARCH_KIND sk) {
+void RuntimeController::findTargetConfiguration(
+    float goal, enum SEARCH_KIND sk) {
   // We search in range begin(), end()-1 . It is OK to decrement end(), because
   // the configurations vector always points to one of the pareto curves, and
   // they are never empty - we have always pushed at least one configuration.
@@ -1192,25 +1314,25 @@ void RuntimeController::findTargetConfiguration(float goal,
   switch (sk) {
   case SPEEDUP: {
     Configurations = &SpeedupConfigurations;
-    low_it =
-        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
-                         goal, ConfigurationLessThan_SP());
+    low_it = std::lower_bound(
+        Configurations->begin(), Configurations->end() - 1, goal,
+        ConfigurationLessThan_SP());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ENERGY: {
     Configurations = &EnergyConfigurations;
-    low_it =
-        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
-                         goal, ConfigurationLessThan_E());
+    low_it = std::lower_bound(
+        Configurations->begin(), Configurations->end() - 1, goal,
+        ConfigurationLessThan_E());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ACCURACY_LOSS: {
     Configurations = &SpeedupConfigurations;
-    low_it =
-        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
-                         goal, ConfigurationLessThan_AL());
+    low_it = std::lower_bound(
+        Configurations->begin(), Configurations->end() - 1, goal,
+        ConfigurationLessThan_AL());
     if ((*low_it)->accuracyLoss > goal)
       --low_it;
     configurationIdx = low_it - Configurations->begin();
@@ -1225,8 +1347,9 @@ void RuntimeController::findTargetConfiguration(float goal,
   // After search, low_it points to the Configuration element holding the goal
   // value, or to the one immediately below it if no exact match exists
 
-  DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n",
-        configurationIdx);
+  DEBUG(
+      "findTargetConfiguration: Updated configurationIdx to %u.\n",
+      configurationIdx);
 }
 
 void RuntimeController::adjustTargetConfiguration(float goal) {
@@ -1237,8 +1360,8 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
   // Find configuration before the selected one.
   // There is always one, unless goal is 1. Then, we would pick baseline, and
   //  both upper and lower should be the same configuration, at index 0.
-  unsigned prev_conf_idx =
-      configurationIdx > 0 ? configurationIdx - 1 : configurationIdx;
+  unsigned prev_conf_idx = configurationIdx > 0 ? configurationIdx - 1
+                                                : configurationIdx;
   // Get the two configurations' speedup, and compute the appropriate ranges
   float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup;
   float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup;
@@ -1257,32 +1380,32 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
 
     //***--- Probability adjustment strategy 1 ---***//
     // No big adjustments at edges of probability range
-    //    float adjust_val = 0.0;
-    //    if (low_pb < high_pb) {
-    //      adjust_val = low_pb * 0.2;
-    //    } else {
-    //      adjust_val = high_pb * 0.2;
-    //    }
-    //    low_pb -= adjust_val;
-    //    high_pb += adjust_val;
+//    float adjust_val = 0.0;
+//    if (low_pb < high_pb) {
+//      adjust_val = low_pb * 0.2;
+//    } else {
+//      adjust_val = high_pb * 0.2;
+//    }
+//    low_pb -= adjust_val;
+//    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 2 ---***//
     // No big adjustment at high edge of probability range
-    //    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
-    //    low_pb -= adjust_val;
-    //    high_pb += adjust_val;
+//    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
+//    low_pb -= adjust_val;
+//    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 3 ---***//
-    // Similar to 2, but higher always increases, more significantly
-    //    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
-    //    low_pb -= adjust_val;
-    //    high_pb += adjust_val;
+    //Similar to 2, but higher always increases, more significantly
+//    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
+//    low_pb -= adjust_val;
+//    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 4 ---***//
-    // Similar to 2, but higher always increases, more significantly
+    //Similar to 2, but higher always increases, more significantly
     // Low end, high end a bit less aggressive than total range
     float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6;
     adjust_val = adjust_val > high_pb ? high_pb : adjust_val;
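+    // e.g. (illustrative): low_pb = 0.3 gives adjust_val = min(0.18, 0.2) =
+    // 0.18, which the second line then clamps to high_pb when high_pb is smaller.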
@@ -1291,18 +1414,20 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
     //***---                                   ---***//
   }
 
-  DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: "
-        "%f.\n",
-        ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
-  DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: "
-        "%f.\n\n",
-        ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
+  DEBUG(
+      "**---- adjustTargetConfiguration: upper conf = %s with probability: "
+      "%f.\n",
+      ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
+  DEBUG(
+      "**---- adjustTargetConfiguration: lower conf = %s with probability: "
+      "%f.\n\n",
+      ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
 
   // Select a random number from 0 to 1
   // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1)
   // to the upper
   // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ;
-  // float rd = pseudo_rd;
+  //float rd = pseudo_rd;
   float rd = distr(generator);
   if (rd < low_pb) {
     // If the probability is in the low range
@@ -1322,8 +1447,8 @@ double RuntimeController::getBaselineTime() { return baseline_time; }
 Slowdowns *RuntimeController::getSlowdowns() { return slowdowns; }
 
 // Functions to be inserted with initializeTensorRT and clearTensorRT
-extern "C" void llvm_hpvm_initializeRuntimeController(const char *ConfigFile,
-                                                      const char *QRangeFile) {
+extern "C" void llvm_hpvm_initializeRuntimeController(
+    const char *ConfigFile, const char *QRangeFile) {
   RC = new RuntimeController();
   RC->init(ConfigFile, QRangeFile);
   return;
@@ -1337,8 +1462,8 @@ extern "C" void llvm_hpvm_clearRuntimeController() {
 //*** Methods to compute accuracy of a tensor by the runtime controller   ***//
 uint32_t *labels_from_file = NULL;
 
-uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
-                                         int end) {
+uint32_t *
+hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) {
 
   // Initialize buffer
   if (!labels_from_file) {
@@ -1423,12 +1548,13 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
   return accuracy;
 }
 
+
 //#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl
-#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
-//#define llvm_hpvm_invokeRtControl_ADJUST llvm_hpvm_invokeRtControl
+//#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
+#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl
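+// Exactly one of the defines above should be active at a time: it selects
+// which of the policy implementations below is exported under the public name
+// llvm_hpvm_invokeRtControl.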
 
-extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str,
-                                               int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_BASE(
+    void *result, const char *str, int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1445,15 +1571,16 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str,
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n\n",
+      current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str,
-                                                  int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ITERATE(
+    void *result, const char *str, int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1477,15 +1604,16 @@ extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str,
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n\n",
+      current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str,
-                                                 int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST(
+    void *result, const char *str, int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1528,17 +1656,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str,
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n",
+      current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
-                                                    const char *str, int start,
-                                                    int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(
+    void *result, const char *str, int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1582,17 +1710,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n",
+      current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result,
-                                                   const char *str, int start,
-                                                   int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(
+    void *result, const char *str, int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1619,20 +1747,21 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result,
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n",
+      current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
+  INFO(
+      "Swapping to next configuration: %s with speedup %f\n\n",
+      next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result,
-                                                      const char *str,
-                                                      int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
+    void *result, const char *str, int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1660,19 +1789,21 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result,
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n",
+      current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
+  INFO(
+      "Swapping to next configuration: %s with speedup %f\n\n",
+      next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str,
-                                               int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_RAND(
+    void *result, const char *str, int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1690,8 +1821,9 @@ extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str,
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n\n",
+      current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
@@ -1702,13 +1834,12 @@ static void writeVectorToFile(const char *path, const std::vector<T> &vec) {
   std::ofstream of(path, std::ofstream::out | std::ofstream::app);
   if (!of.good())
     ERROR("Cannot write to %s file", path);
-  for (float f : vec)
+  for (float f: vec)
     of << f << ' ';
   of << '\n';
 }
 
-extern "C" void llvm_hpvm_imgInvokeRtControl(void *result, void *gold,
-                                             int start, int end) {
+extern "C" void llvm_hpvm_imgInvokeRtControl(void* result, void *gold, int start, int end) {
   RC->resume_profiler();
 
   if (gold != nullptr) {
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 98fd30ba9ee0ec7e0b81cfcaa9b3a699ec8e57b0..898d92c18cb8ad0b2df7a6d0c9d905c9649c53c1 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -1,12 +1,10 @@
-/* This file includes the API implementation of the HPVM tensor runtime built
-*for CPU
+/* This file includes the API implementation of the HPVM tensor runtime built for CPU
 **
 **  Author: Hashim Sharif
 **  Email: hsharif3@illinois.edu
 */
 
 #include <algorithm>
-#include <bits/stdc++.h>
 #include <cfloat>
 #include <cmath>
 #include <cstdio>
@@ -16,1152 +14,1101 @@
 #include <iostream>
 #include <limits>
 #include <map>
-#include <math.h>
+#include <cmath>
 #include <memory>
-#include <omp.h>
-#include <pthread.h>
 #include <sstream>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string>
 #include <vector>
+#include <bits/stdc++.h>
+#include <pthread.h>
+#include <omp.h>
 
 // Tensor runtime header files
-#include "tensor_cpu.h"
+//#include "tensor_cpu.h"
+#include "tensor.h"
 #include "tensor_cpu_runtime.h"
 
-void llvm_hpvm_initTensorRt(int) {
-  // NOTE: Do Nothing
+void llvm_hpvm_initTensorRtCPU() {
+    // NOTE: Do Nothing
 }
 
-void llvm_hpvm_cleanupTensorRt() {
-  // NOTE: Do Nothing
+void llvm_hpvm_cleanupTensorRtCPU() {
+    // NOTE: Do Nothing
 }
 
-void hpvm_request_tensor(void *tensor, int destination) {
-  // NOTE: Do Nothing
+void hpvm_request_tensorCPU(void *tensor, int destination) {
+    // NOTE: Do Nothing
 }
-
+  
 std::vector<void *> PtrVect;
 
 void freeBatchMemory() {
-  for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
-    free(*it);
-  }
-  PtrVect.erase(PtrVect.begin(), PtrVect.end());
+    for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
+        free(*it);
+    }
+    PtrVect.erase(PtrVect.begin(), PtrVect.end());
 }
 
-inline int getTypeSize(int data_type) {
-  return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
+
+int getTypeSizeCPU(int data_type)  __attribute__((always_inline));
+inline int getTypeSizeCPU(int data_type) {
+    return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
 }
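+// (Illustrative mapping: data_type 0 -> 4-byte float, 1 -> 2-byte half,
+// anything else -> 1 byte.)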
 
-void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems)
-    __attribute__((always_inline));
-inline void setSizeInBytes(struct Tensor *tensor, int data_type,
-                           size_t num_elems) {
-  int type_size = getTypeSize(data_type);
-  size_t size_in_bytes = type_size * num_elems;
-  tensor->size_in_bytes = size_in_bytes;
+void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline));
+inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) {
+    int type_size = getTypeSizeCPU(data_type);
+    size_t size_in_bytes = type_size * num_elems;
+    tensor->size_in_bytes = size_in_bytes;
 }
 
-void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
-                    bool freeMemory = true) __attribute__((always_inline));
-inline void allocateMemCPU(struct Tensor *tensor, int data_type,
-                           size_t num_elems, bool freeMemory) {
-  setSizeInBytes(tensor, data_type, num_elems);
-  tensor->data_type = data_type;
-  tensor->num_elems = num_elems;
-  tensor->host_data =
-      (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
-  if (freeMemory)
-    PtrVect.push_back(tensor->host_data);
+void allocateMemCPU(struct Tensor *tensor, int data_type, 
+                    size_t num_elems, bool freeMemory = true) __attribute__((always_inline));
+inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) {
+    setSizeInBytesCPU(tensor, data_type, num_elems);
+    tensor->data_type = data_type;
+    tensor->num_elems = num_elems;
+    tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
+    if(freeMemory)
+        PtrVect.push_back(tensor->host_data);
 }
 
-void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
-  Tensor *tensor = (Tensor *)tensor_ptr;
-  if (tensor->size_in_bytes != size_in_bytes) {
-    printf("The destination and source sizes don't match");
-  }
-  memcpy(tensor->host_data, data_ptr,
-         size_in_bytes); // Is this efficient enough?
+void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) __attribute__((always_inline));
+inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
+    Tensor *tensor = (Tensor *)tensor_ptr;
+    if (tensor->size_in_bytes != size_in_bytes) {
+        printf("The destination and source sizes don't match");
+    }
+    memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough?
 }
 
-// void *create4DTensor(int data_type, int data_format, size_t dim1_size,
-//  size_t dim2_size, size_t dim3_size, size_t dim4_size,
-// bool freeMemory = true) __attribute__((always_inline));
-inline void *create4DTensor(int data_type, int data_format, size_t dim1_size,
-                            size_t dim2_size, size_t dim3_size,
-                            size_t dim4_size, bool freeMemory) {
-  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
-  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  if (freeMemory)
-    PtrVect.push_back(tensor);
-  allocateMemCPU(tensor, data_type, num_elems, freeMemory);
-
-  // Setting the tensor dimensions
-  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
-  dim_sizes[0] = dim1_size;
-  dim_sizes[1] = dim2_size;
-  dim_sizes[2] = dim3_size;
-  dim_sizes[3] = dim4_size;
-  tensor->dims.dim_sizes = dim_sizes;
-  tensor->dims.num_dims = 4;
-
-  return tensor;
+void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
+                        size_t dim2_size, size_t dim3_size, size_t dim4_size,
+                        bool freeMemory = true) __attribute__((always_inline));
+inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
+                               size_t dim2_size, size_t dim3_size,
+                               size_t dim4_size, bool freeMemory) {
+    struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+    size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+    if(freeMemory)
+        PtrVect.push_back(tensor);
+    allocateMemCPU(tensor, data_type, num_elems, freeMemory);
+    
+    // Setting the tensor dimensions
+    size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
+    dim_sizes[0] = dim1_size;
+    dim_sizes[1] = dim2_size;
+    dim_sizes[2] = dim3_size;
+    dim_sizes[3] = dim4_size;
+    tensor->dims.dim_sizes = dim_sizes;
+    tensor->dims.num_dims = 4;
+    tensor->data_placement = HOST;    
+    return tensor;
 }
 
-void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
-                                  int vertical_pad, int horizontal_pad,
-                                  int vertical_stride, int horizontal_stride,
-                                  int conv_mode, int compute_precision) {
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
-
-  float *__restrict__ host_image = (float *)input->host_data;
-  float *__restrict__ host_filter = (float *)filter->host_data;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-  int num_filters = filter->dims.dim_sizes[0];
-  int kernel_height = filter->dims.dim_sizes[2];
-  int kernel_width = filter->dims.dim_sizes[3];
-  int output_height =
-      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-  int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
-                          horizontal_stride);
-  int num_filter_elem = kernel_height * kernel_width * channels;
-  int output_size = output_width * output_height;
-
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
-                                            output_height, output_width);
-  float *__restrict__ output_data = (float *)output->host_data;
-
-  long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
-                            output_width * batch_size;
-  float *host_data = (float *)malloc(conv_data_size);
-  // printf("number of batches: %d\n", batch_size);
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      for (int h = 0; h < output_height; h++) {
-        for (int w = 0; w < output_width; w++) {
-          const int inH = h * vertical_stride - vertical_pad;
-          const int inW = w * horizontal_stride - horizontal_pad;
-          for (int i = 0; i < kernel_height; i++) {
-            for (int j = 0; j < kernel_width; j++) {
-              const int filter_elem_num =
-                  (ch * kernel_height + i) * kernel_width + j;
-              const int output_index = h * output_width + w;
-              const int out_index = b * num_filter_elem * output_size +
-                                    output_index * num_filter_elem +
-                                    filter_elem_num;
-              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
-                  inW + j < image_width) {
-                host_data[out_index] =
-                    host_image[((b * channels + ch) * image_height +
-                                (inH + i)) *
-                                   image_width +
-                               (inW + j)];
-              } else {
-                host_data[out_index] = 0;
-              }
+void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                                    int horizontal_pad, int vertical_stride,
+                                    int horizontal_stride, int conv_mode,
+                                    int compute_precision) {
+    Tensor *input = (Tensor *)input_ptr;
+    Tensor *filter = (Tensor *)filter_ptr;
+    
+    float * __restrict__ host_image = (float *)input->host_data;
+    float * __restrict__ host_filter = (float *)filter->host_data;
+
+    int batch_size = input->dims.dim_sizes[0];
+    int channels = input->dims.dim_sizes[1];
+    int image_height = input->dims.dim_sizes[2];
+    int image_width = input->dims.dim_sizes[3];
+    int num_filters = filter->dims.dim_sizes[0];
+    int kernel_height = filter->dims.dim_sizes[2];
+    int kernel_width = filter->dims.dim_sizes[3];
+    int output_height = 
+        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+    int output_width = 
+        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+    int num_filter_elem = kernel_height * kernel_width * channels;
+    int output_size = output_width * output_height;
+    printf("--CREATE 4D TENSOR\n");    
+    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
+                                                    output_height, output_width);
+    float * __restrict__ output_data = (float *)output->host_data;
+    printf("CREATED 4D TENSOR\n");
+    long int conv_data_size = 
+        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
+    float *host_data = (float *) malloc(conv_data_size);
+    printf("host data: %p\n", host_data);
+    printf("number of batches: %d\n", batch_size);
+    omp_set_num_threads(4);
+     #pragma omp parallel for
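+    // Gather (im2col): copy every receptive-field patch of the input into
+    // host_data so the convolution below reduces to per-batch matrix multiplies.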
+    for(int b = 0; b < batch_size; b++) {
+        for(int ch = 0; ch < channels; ch++) {
+            for(int h = 0; h < output_height; h++) {
+                for(int w = 0; w < output_width; w++) {
+                    const int inH = h * vertical_stride - vertical_pad;
+                    const int inW = w * horizontal_stride - horizontal_pad;
+                    for(int i = 0; i < kernel_height; i++) {
+                        for(int j = 0; j < kernel_width; j++) {
+                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
+                            const int output_index = h * output_width + w;
+                            const int out_index = b * num_filter_elem * output_size 
+                                        + output_index * num_filter_elem + filter_elem_num;
+                            if(inH + i >= 0 && inH + i < image_height 
+                                && inW + j >= 0 && inW + j < image_width) {
+                                host_data[out_index] = 
+                                    host_image[((b * channels + ch) * image_height 
+                                        + (inH + i)) * image_width + (inW + j)];
+                            } else {
+                                host_data[out_index] = 0;
+                            }
+                        }
+                    }
+                }
             }
-          }
         }
-      }
-    }
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
-        float sum = 0;
-#pragma omp simd reduction(+ : sum)
-        for (int k = 0; k < num_filter_elem; ++k) {
-          int input_index =
-              k + num_filter_elem * m + b * num_filter_elem * output_size;
-          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
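+        // Tensor Multiply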
+        for (int p = 0; p < num_filters; ++p) {
+            for (int m = 0; m < output_size; ++m) {
+                float sum = 0;
+                #pragma omp simd reduction(+:sum)
+                for (int k = 0; k < num_filter_elem; ++k) {
+                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
+                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
+                }
+                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
+            }
         }
-        output_data[b * (output_size * num_filters) + p * output_size + m] =
-            sum;
-      }
     }
-  }
-  free(host_data);
-  printf("END: %p\n", output);
-  return output;
+    free(host_data);
+    return output;
 }
 
-void *tensorRegularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
-
-  float *__restrict__ host_image = (float *)input->host_data;
-  float *__restrict__ host_filter = (float *)filter->host_data;
-
-  const int batch_size = input->dims.dim_sizes[0];
-  const int channels = input->dims.dim_sizes[1];
-  const int image_height = input->dims.dim_sizes[2];
-  const int image_width = input->dims.dim_sizes[3];
-  const int num_filters = filter->dims.dim_sizes[0];
-  const int kernel_height = filter->dims.dim_sizes[2];
-  const int kernel_width = filter->dims.dim_sizes[3];
-  const int output_height =
-      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-  const int output_width =
-      1 +
-      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-  const int num_filter_elem = kernel_height * kernel_width * channels;
-
-  const int remainder = ((num_filter_elem - start) % skip_every > 0);
-  const int reduced_num_filter_elem =
-      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-  const int output_size = output_width * output_height;
-
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
-                                            output_height, output_width);
-  float *__restrict__ output_data = (float *)output->host_data;
-
-  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
-                                  output_height * output_width * batch_size;
-  float *host_data = (float *)malloc(host_data_size);
-
-  const int reduced_filer_size =
-      sizeof(float) * num_filters * reduced_num_filter_elem;
-  float *reduced_kernels = (float *)malloc(reduced_filer_size);
-
-  float fac = (((float)skip_every) / ((float)skip_every - 1));
-  int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-  // Create reduced filter
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int f = 0; f < num_filters; f++) {
-    for (int i = 0; i < reduced_num_filter_elem; i++) {
-      int ch = i / reduced_filter_dim;
-      int offset = (start + ch) % skip_every;
-      int in_index;
-      if (i < offset) {
-        in_index = i;
-      } else {
-        in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) +
-                   (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) +
-                   offset - 1;
-      }
-      reduced_kernels[f * reduced_num_filter_elem + i] =
-          fac * host_filter[num_filter_elem * f + in_index];
-    }
-  }
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int h = 0; h < output_height; h++) {
-      for (int w = 0; w < output_width; w++) {
-        const int inH = h * vertical_stride - vertical_pad;
-        const int inW = w * horizontal_stride - horizontal_pad;
-        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
-          int in_index;
-          const int ch = fi / reduced_filter_dim;
-          const int offset = (start + ch) % skip_every;
-          if (fi < offset) {
-            in_index = fi;
-          } else {
-            in_index =
-                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
-                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
-                offset - 1;
-          }
-          const int i =
-              (in_index % (kernel_width * kernel_height)) / kernel_width;
-          const int j = in_index % kernel_width;
-          const int output_index = h * output_width + w;
-          const int out_index = b * reduced_num_filter_elem * output_size +
-                                output_index * reduced_num_filter_elem + fi;
-          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
-              inW + j < image_width) {
-            host_data[out_index] =
-                host_image[((b * channels + ch) * image_height + (inH + i)) *
-                               image_width +
-                           (inW + j)];
-          } else {
-            host_data[out_index] = 0;
-          }
+void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
+                                                int vertical_pad, int horizontal_pad, 
+                                                int vertical_stride, int horizontal_stride, 
+                                                int conv_mode, int compute_precision, 
+                                                int skip_every, int start) {
+    Tensor *input = (Tensor *)input_ptr;
+    Tensor *filter = (Tensor *)filter_ptr;
+    
+    float * __restrict__ host_image = (float *)input->host_data;
+    float * __restrict__ host_filter = (float *)filter->host_data;
+
+    const int batch_size = input->dims.dim_sizes[0];
+    const int channels = input->dims.dim_sizes[1];
+    const int image_height = input->dims.dim_sizes[2];
+    const int image_width = input->dims.dim_sizes[3];
+    const int num_filters = filter->dims.dim_sizes[0];
+    const int kernel_height = filter->dims.dim_sizes[2];
+    const int kernel_width = filter->dims.dim_sizes[3];
+    const int output_height = 
+        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+    const int output_width = 
+        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+    const int num_filter_elem = kernel_height * kernel_width * channels;
+
+    const int remainder = ((num_filter_elem - start) % skip_every > 0);
+    const int reduced_num_filter_elem = 
+            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+    const int output_size = output_width * output_height;
+    
+    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
+                                                    output_height, output_width);
+    float * __restrict__ output_data = (float *)output->host_data;
+    
+    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
+                                    * output_height * output_width * batch_size;
+    float *host_data = (float *) malloc(host_data_size);
+   
+    const int reduced_filter_size = sizeof(float) * num_filters * reduced_num_filter_elem;
+    float *reduced_kernels = (float *) malloc(reduced_filter_size);
+
+    float fac = (((float) skip_every) / ((float) skip_every - 1));
+    int reduced_filter_dim = reduced_num_filter_elem / channels;
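+    // fac = skip_every / (skip_every - 1) scales the kept filter weights up to
+    // approximately compensate for the weight mass dropped by sampling.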
+
+    // Create reduced filter
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for(int f = 0; f < num_filters; f++) {
+        for(int i = 0; i < reduced_num_filter_elem; i++) {
+            int ch = i / reduced_filter_dim;
+            int offset = (start + ch) % skip_every;
+            int in_index;
+            if(i < offset) {
+                in_index = i;
+            } else {
+                in_index = ((i - offset + 1) * skip_every) / (skip_every - 1)
+                        + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
+            }
+            reduced_kernels[f * reduced_num_filter_elem + i] = 
+                                fac * host_filter[num_filter_elem * f + in_index];
         }
-      }
     }
 
-    // Tensor Multiply
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
-        float sum = 0;
-#pragma omp simd reduction(+ : sum)
-        for (int k = 0; k < reduced_num_filter_elem; ++k) {
-          int input_index = k + reduced_num_filter_elem * m +
-                            b * reduced_num_filter_elem * output_size;
-          sum += host_data[input_index] *
-                 reduced_kernels[p * reduced_num_filter_elem + k];
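+    // Gather (im2col) over the sampled filter positions only: for each output
+    // pixel, copy just the input values the reduced filter touches; out-of-bounds
+    // positions contribute zero (padding).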
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for(int b = 0; b < batch_size; b++) {
+        for(int h = 0; h < output_height; h++) {
+            for(int w = 0; w < output_width; w++) {
+                const int inH = h * vertical_stride - vertical_pad;
+                const int inW = w * horizontal_stride - horizontal_pad;
+                for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
+                    int in_index;
+                    const int ch = fi / reduced_filter_dim;
+                    const int offset = (start + ch) % skip_every;
+                    if(fi < offset) {
+                        in_index = fi;
+                    } else {
+                        in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+                            + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
+                    }
+                    const int i = (in_index % (kernel_width * kernel_height)) / kernel_width;
+                    const int j = in_index % kernel_width;
+                    const int output_index = h * output_width + w;
+                    const int out_index = b * reduced_num_filter_elem * output_size
+                                        + output_index * reduced_num_filter_elem + fi;
+                    if(inH + i >= 0 && inH + i < image_height
+                       && inW + j >= 0 && inW + j < image_width) {
+                        host_data[out_index] =
+                            host_image[((b * channels + ch) * image_height
+                                        + (inH + i)) * image_width + (inW + j)];
+                    } else {
+                        host_data[out_index] = 0;
+                    }
+                }
+            }
         }
-        output_data[b * (output_size * num_filters) + p * output_size + m] =
-            sum;
-      }
-    }
-  }
-  free(reduced_kernels);
-  free(host_data);
-
-  return output;
-}
 
-void *tensorIrregularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
-
-  float *__restrict__ host_image = (float *)input->host_data;
-  float *__restrict__ host_filter = (float *)filter->host_data;
-
-  const int batch_size = input->dims.dim_sizes[0];
-  const int channels = input->dims.dim_sizes[1];
-  const int image_height = input->dims.dim_sizes[2];
-  const int image_width = input->dims.dim_sizes[3];
-  const int num_filters = filter->dims.dim_sizes[0];
-  const int kernel_height = filter->dims.dim_sizes[2];
-  const int kernel_width = filter->dims.dim_sizes[3];
-  const int output_height =
-      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-  const int output_width =
-      1 +
-      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-  const int num_filter_elem = kernel_height * kernel_width * channels;
-
-  const int remainder = ((num_filter_elem - start) % skip_every > 0);
-  const int reduced_num_filter_elem =
-      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-  const int output_size = output_width * output_height;
-
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
-                                            output_height, output_width);
-  float *__restrict__ output_data = (float *)output->host_data;
-
-  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
-                                  output_height * output_width * batch_size;
-  float *host_data = (float *)malloc(host_data_size);
-
-  const int reduced_filer_size =
-      sizeof(float) * num_filters * reduced_num_filter_elem;
-  float *reduced_kernels = (float *)malloc(reduced_filer_size);
-
-  float fac = (((float)skip_every) / ((float)skip_every - 1));
-  int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-  // Create Reduced filter
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int f = 0; f < num_filters; f++) {
-    for (int i = 0; i < start; i++) {
-      reduced_kernels[f * reduced_num_filter_elem + i] =
-          host_filter[num_filter_elem * f + i];
-    }
-#pragma omp simd
-    for (int i = start; i < reduced_num_filter_elem; i++) {
-      int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) +
-                     (((i - start + 1) * skip_every) % (skip_every - 1) > 0) +
-                     start - 1;
-      reduced_kernels[f * reduced_num_filter_elem + i] =
-          fac * host_filter[num_filter_elem * f + in_index];
-    }
-  }
-
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int h = 0; h < output_height; h++) {
-      for (int w = 0; w < output_width; w++) {
-        const int inH = h * vertical_stride - vertical_pad;
-        const int inW = w * horizontal_stride - horizontal_pad;
-        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
-          int in_index;
-          int offset = start;
-          if (fi < offset) {
-            in_index = fi;
-          } else {
-            in_index =
-                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
-                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
-                offset - 1;
-          }
-          const int ch = in_index / (kernel_width * kernel_height);
-          const int i =
-              (in_index % (kernel_width * kernel_height)) / kernel_width;
-          const int j = in_index % kernel_width;
-          const int output_index = h * output_width + w;
-          const int out_index = b * reduced_num_filter_elem * output_size +
-                                output_index * reduced_num_filter_elem + fi;
-          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
-              inW + j < image_width) {
-            host_data[out_index] =
-                host_image[((b * channels + ch) * image_height + (inH + i)) *
-                               image_width +
-                           (inW + j)];
-          } else {
-            host_data[out_index] = 0;
-          }
+        // Tensor Multiply
+        for (int p = 0; p < num_filters; ++p) {
+            for (int m = 0; m < output_size; ++m) {
+                float sum = 0;
+                #pragma omp simd reduction(+:sum)
+                for (int k = 0; k < reduced_num_filter_elem; ++k) {
+                    int input_index = k + reduced_num_filter_elem * m 
+                                    + b * reduced_num_filter_elem * output_size;
+                    sum += host_data[input_index] 
+                            * reduced_kernels[p * reduced_num_filter_elem + k];
+                }
+                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
+            }
         }
-      }
-    }
 
-    // Tensor Multiply
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
-        float sum = 0;
-#pragma omp simd reduction(+ : sum)
-        for (int k = 0; k < reduced_num_filter_elem; ++k) {
-          int input_index = k + reduced_num_filter_elem * m +
-                            b * reduced_num_filter_elem * output_size;
-          sum += host_data[input_index] *
-                 reduced_kernels[p * reduced_num_filter_elem + k];
-        }
-        output_data[b * (output_size * num_filters) + p * output_size + m] =
-            sum;
-      }
     }
-  }
-  free(reduced_kernels);
-  free(host_data);
-
-  return output;
+    free(reduced_kernels);
+    free(host_data);
+  
+    return output;
 }
 
-void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
-                                  int vertical_pad, int horizontal_pad,
-                                  int vertical_stride, int horizontal_stride,
-                                  int conv_mode, int compute_precision, int row,
-                                  int start) {
-
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
-
-  float *__restrict__ host_image = (float *)input->host_data;
-  float *__restrict__ host_filter = (float *)filter->host_data;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-  int num_filters = filter->dims.dim_sizes[0];
-  int kernel_height = filter->dims.dim_sizes[2];
-  int kernel_width = filter->dims.dim_sizes[3];
-
-  int full_output_height =
-      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-  int full_output_width =
-      1 +
-      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-  int num_filter_elem = kernel_height * kernel_width * channels;
-  int full_output_size = full_output_height * full_output_width;
-
-  Tensor *full_output = (Tensor *)create4DTensor(
-      0, 0, batch_size, num_filters, full_output_height, full_output_width);
-  float *__restrict__ full_output_data = (float *)full_output->host_data;
-
-  int remainder = (full_output_height - start) % row > 0;
-  int output_height =
-      full_output_height - ((full_output_height - start) / row) - remainder;
-
-  int output_width = full_output_width;
-  float *output_data = (float *)malloc(
-      sizeof(float) * batch_size * num_filters * output_height * output_width);
-  int output_size = output_width * output_height;
-  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
-                            output_width * batch_size;
-  float *host_data = (float *)malloc(host_data_size);
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      for (int h = 0; h < output_height; h++) {
-        int inH;
-        if (h < start) {
-          inH = h * vertical_stride - vertical_pad;
-        } else {
-          int h_index = ((h - start + 1) * row) / (row - 1) +
-                        (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
-          inH = h_index * vertical_stride - vertical_pad;
+void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
+                                                  int vertical_pad, int horizontal_pad, 
+                                                  int vertical_stride, int horizontal_stride, 
+                                                  int conv_mode, int compute_precision, 
+                                                  int skip_every, int start) {
+    Tensor *input = (Tensor *)input_ptr;
+    Tensor *filter = (Tensor *)filter_ptr;
+    
+    float * __restrict__ host_image = (float *)input->host_data;
+    float * __restrict__ host_filter = (float *)filter->host_data;
+
+    const int batch_size = input->dims.dim_sizes[0];
+    const int channels = input->dims.dim_sizes[1];
+    const int image_height = input->dims.dim_sizes[2];
+    const int image_width = input->dims.dim_sizes[3];
+    const int num_filters = filter->dims.dim_sizes[0];
+    const int kernel_height = filter->dims.dim_sizes[2];
+    const int kernel_width = filter->dims.dim_sizes[3];
+    const int output_height = 
+        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+    const int output_width = 
+        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+    const int num_filter_elem = kernel_height * kernel_width * channels;
+
+    const int remainder = ((num_filter_elem - start) % skip_every > 0);
+    const int reduced_num_filter_elem = 
+            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+    const int output_size = output_width * output_height;
+    
+    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
+                                                    output_height, output_width);
+    float * __restrict__ output_data = (float *)output->host_data;
+    
+    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
+                                    * output_height * output_width * batch_size;
+    float *host_data = (float *) malloc(host_data_size);
+   
+    const int reduced_filter_size = sizeof(float) * num_filters * reduced_num_filter_elem;
+    float *reduced_kernels = (float *) malloc(reduced_filter_size);
+
+    float fac = (((float) skip_every) / ((float) skip_every - 1));
+    int reduced_filter_dim = reduced_num_filter_elem / channels;
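+    // fac = skip_every / (skip_every - 1) scales the kept filter weights up to
+    // approximately compensate for the weight mass dropped by sampling.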
+
+    // Create Reduced filter
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for(int f = 0; f < num_filters; f++) {
+        for(int i = 0; i < start; i++) {
+            reduced_kernels[f * reduced_num_filter_elem + i] = 
+                                        host_filter[num_filter_elem * f + i];
         }
-        for (int w = 0; w < output_width; w++) {
-          int inW = w * horizontal_stride - horizontal_pad;
-          for (int i = 0; i < kernel_height; i++) {
-            for (int j = 0; j < kernel_width; j++) {
-              const int filter_elem_num =
-                  (ch * kernel_height + i) * kernel_width + j;
-              const int output_index = h * output_width + w;
-              const int out_index = b * num_filter_elem * output_size +
-                                    output_index * num_filter_elem +
-                                    filter_elem_num;
-              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
-                  inW + j < image_width) {
-                host_data[out_index] =
-                    host_image[((b * channels + ch) * image_height +
-                                (inH + i)) *
-                                   image_width +
-                               (inW + j)];
-              } else {
-                host_data[out_index] = 0;
-              }
-            }
-          }
+        #pragma omp simd
+        for(int i = start; i < reduced_num_filter_elem; i++) {
+            int in_index = ((i - start + 1) * skip_every) / (skip_every - 1)
+                    + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1;
+            reduced_kernels[f * reduced_num_filter_elem + i] = 
+                            fac * host_filter[num_filter_elem * f + in_index];
         }
-      }
     }
 
-    // Tensor Multiply
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
-        float sum = 0;
-#pragma omp simd reduction(+ : sum)
-        for (int k = 0; k < num_filter_elem; ++k) {
-          int input_index =
-              k + num_filter_elem * m + b * num_filter_elem * output_size;
-          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
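+    // Gather (im2col) over the sampled filter positions; unlike the regular
+    // variant, the sampling offset here is the same (`start`) for every channel.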
+    #pragma omp parallel for
+    for(int b = 0; b < batch_size; b++) {
+        for(int h = 0; h < output_height; h++) {
+            for(int w = 0; w < output_width; w++) {
+                const int inH = h * vertical_stride - vertical_pad;
+                const int inW = w * horizontal_stride - horizontal_pad;
+                for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
+                    int in_index;
+                    int offset = start;
+                    if(fi < offset) {
+                        in_index = fi;
+                    } else {
+                        in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+                            + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
+                    }
+                    const int ch = in_index / (kernel_width * kernel_height);
+                    const int i = (in_index % (kernel_width * kernel_height)) / kernel_width;
+                    const int j = in_index % kernel_width;
+                    const int output_index = h * output_width + w;
+                    const int out_index = b * reduced_num_filter_elem * output_size
+                                        + output_index * reduced_num_filter_elem + fi;
+                    if(inH + i >= 0 && inH + i < image_height
+                       && inW + j >= 0 && inW + j < image_width) {
+                        host_data[out_index] =
+                            host_image[((b * channels + ch) * image_height
+                                        + (inH + i)) * image_width + (inW + j)];
+                    } else {
+                        host_data[out_index] = 0;
+                    }
+                }
+            }
         }
-        output_data[b * (output_size * num_filters) + p * output_size + m] =
-            sum;
-      }
-    }
 
-    // Interpolate
-    for (int p = 0; p < num_filters; ++p) {
-      for (int h = 0; h < full_output_height; h++) {
-        for (int w = 0; w < full_output_width; w++) {
-          int full_output_index = b * num_filters * full_output_size +
-                                  p * full_output_size + h * full_output_width +
-                                  w;
-          if (h < start) {
-            int output_index = b * num_filters * output_size + p * output_size +
-                               h * output_width + w;
-            full_output_data[full_output_index] = output_data[output_index];
-          } else if (h == full_output_height - 1) {
-            int output_index = b * num_filters * output_size + p * output_size +
-                               (output_height - 1) * output_width + w;
-            full_output_data[full_output_index] = output_data[output_index];
-          } else if (h == 0) {
-            int output_index = b * num_filters * output_size + p * output_size +
-                               0 * output_width + w;
-            full_output_data[full_output_index] = output_data[output_index];
-          } else if ((h - start) % row == 0) {
-            int row_index = h - ((h + 1 - start) / row);
-            int output_index = b * num_filters * output_size + p * output_size +
-                               row_index * output_width + w;
-            full_output_data[full_output_index] =
-                (output_data[output_index] +
-                 output_data[output_index - output_width]) /
-                2;
-          } else {
-            int remainder = ((h + 1 - start) % row) > 0;
-            int row_index = h - ((h + 1 - start) / row) - remainder;
-            int output_index = b * num_filters * output_size + p * output_size +
-                               row_index * output_width + w;
-            full_output_data[full_output_index] = output_data[output_index];
-          }
+        // Tensor Multiply
+        for (int p = 0; p < num_filters; ++p) {
+            for (int m = 0; m < output_size; ++m) {
+                float sum = 0;
+                #pragma omp simd reduction(+:sum)
+                for (int k = 0; k < reduced_num_filter_elem; ++k) {
+                    int input_index = k + reduced_num_filter_elem * m 
+                                    + b * reduced_num_filter_elem * output_size;
+                    sum += host_data[input_index] 
+                                * reduced_kernels[p * reduced_num_filter_elem + k];
+                }
+                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
+            }
         }
-      }
-    }
-  }
-  free(output_data);
-  free(host_data);
 
-  return full_output;
+    }
+    free(reduced_kernels);
+    free(host_data);
+  
+    return output;
 }
 
-void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
-                                  int vertical_pad, int horizontal_pad,
-                                  int vertical_stride, int horizontal_stride,
-                                  int conv_mode, int compute_precision, int col,
-                                  int start) {
-
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
-
-  float *__restrict__ host_image = (float *)input->host_data;
-  float *__restrict__ host_filter = (float *)filter->host_data;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-  int num_filters = filter->dims.dim_sizes[0];
-  int kernel_height = filter->dims.dim_sizes[2];
-  int kernel_width = filter->dims.dim_sizes[3];
-  int full_output_height =
-      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-  int full_output_width =
-      1 +
-      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-  int num_filter_elem = kernel_height * kernel_width * channels;
-  int full_output_size = full_output_height * full_output_width;
-
-  Tensor *full_output = (Tensor *)create4DTensor(
-      0, 0, batch_size, num_filters, full_output_height, full_output_width);
-  float *__restrict__ full_output_data = (float *)full_output->host_data;
-
-  int remainder = (full_output_width - start) % col > 0;
-  int output_width =
-      full_output_width - ((full_output_width - start) / col) - remainder;
-
-  int output_height = full_output_height;
-  float *output_data = (float *)malloc(
-      sizeof(float) * batch_size * num_filters * output_height * output_width);
-  int output_size = output_width * output_height;
-  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
-                            output_width * batch_size;
-  float *host_data = (float *)malloc(host_data_size);
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      for (int h = 0; h < output_height; h++) {
-        int inH = h * vertical_stride - vertical_pad;
-        for (int w = 0; w < output_width; w++) {
-          int inW;
-          if (w < start) {
-            inW = w * horizontal_stride - horizontal_pad;
-          } else {
-            int w_index = ((w - start + 1) * col) / (col - 1) +
-                          (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
-            inW = w_index * horizontal_stride - horizontal_pad;
-          }
-          for (int i = 0; i < kernel_height; i++) {
-            for (int j = 0; j < kernel_width; j++) {
-              const int filter_elem_num =
-                  (ch * kernel_height + i) * kernel_width + j;
-              const int output_index = h * output_width + w;
-              const int out_index = b * num_filter_elem * output_size +
-                                    output_index * num_filter_elem +
-                                    filter_elem_num;
-              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
-                  inW + j < image_width) {
-                host_data[out_index] =
-                    host_image[((b * channels + ch) * image_height +
-                                (inH + i)) *
-                                   image_width +
-                               (inW + j)];
-              } else {
-                host_data[out_index] = 0;
-              }
+void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
+                                int conv_mode, int compute_precision, int row, int start) {
+    
+    Tensor *input = (Tensor *)input_ptr;
+    Tensor *filter = (Tensor *)filter_ptr;
+    
+    float * __restrict__ host_image = (float *)input->host_data;
+    float * __restrict__ host_filter = (float *)filter->host_data;
+
+    int batch_size = input->dims.dim_sizes[0];
+    int channels = input->dims.dim_sizes[1];
+    int image_height = input->dims.dim_sizes[2];
+    int image_width = input->dims.dim_sizes[3];
+    int num_filters = filter->dims.dim_sizes[0];
+    int kernel_height = filter->dims.dim_sizes[2];
+    int kernel_width = filter->dims.dim_sizes[3];
+
+    int full_output_height = 
+        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+    int full_output_width = 
+        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+    int num_filter_elem = kernel_height * kernel_width * channels;
+    int full_output_size = full_output_height * full_output_width;
+
+    Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
+                                            full_output_height, full_output_width);
+    float * __restrict__ full_output_data = (float *)full_output->host_data;
+   
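+    // Row perforation: skip one of every `row` output rows after `start`,
+    // compute the remaining rows, and interpolate the skipped ones below.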
+    int remainder = (full_output_height - start) % row > 0;
+    int output_height = 
+            full_output_height - ((full_output_height - start) / row) - remainder;
+
+    int output_width = full_output_width;
+    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
+                                                * output_height * output_width);   
+    int output_size = output_width * output_height;
+    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
+                                                        * output_width * batch_size;
+    float *host_data = (float *) malloc(host_data_size);
+
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for(int b = 0; b < batch_size; b++) {
+        for(int ch = 0; ch < channels; ch++) {
+            for(int h = 0; h < output_height; h++) {
+                int inH;
+                if(h < start) {
+                    inH = h * vertical_stride - vertical_pad;
+                } else {
+                    int h_index = ((h - start + 1) * row) / (row - 1) 
+                                + (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
+                    inH = h_index * vertical_stride - vertical_pad;
+                }
+                for(int w = 0; w < output_width; w++) {
+                    int inW = w * horizontal_stride - horizontal_pad;
+                    for(int i = 0; i < kernel_height; i++) {
+                        for(int j = 0; j < kernel_width; j++) {
+                            const int filter_elem_num = 
+                                        (ch * kernel_height + i) * kernel_width + j;
+                            const int output_index = h * output_width + w;
+                            const int out_index = b * num_filter_elem * output_size 
+                                    + output_index * num_filter_elem + filter_elem_num;
+                            if(inH + i >= 0 && inH + i < image_height 
+                            && inW + j >= 0 && inW + j < image_width) {
+                                host_data[out_index] = 
+                                    host_image[((b * channels + ch) * image_height 
+                                            + (inH + i)) * image_width + (inW + j)];
+                            } else {
+                                host_data[out_index] = 0;
+                            }
+                        }
+                    }
+                }
             }
-          }
         }
-      }
-    }
 
-    // Tensor Multiply
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
-        float sum = 0;
-#pragma omp simd reduction(+ : sum)
-        for (int k = 0; k < num_filter_elem; ++k) {
-          int input_index =
-              k + num_filter_elem * m + b * num_filter_elem * output_size;
-          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
+        // Tensor Multiply
+        for (int p = 0; p < num_filters; ++p) {
+            for (int m = 0; m < output_size; ++m) {
+                float sum = 0;
+                #pragma omp simd reduction(+:sum)
+                for (int k = 0; k < num_filter_elem; ++k) {
+                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
+                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
+                }
+                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
+            }
         }
-        output_data[b * (output_size * num_filters) + p * output_size + m] =
-            sum;
-      }
-    }
 
-    // Interpolate
-    for (int p = 0; p < num_filters; ++p) {
-      for (int h = 0; h < full_output_height; h++) {
-        for (int w = 0; w < full_output_width; w++) {
-          int full_output_index = b * num_filters * full_output_size +
-                                  p * full_output_size + h * full_output_width +
-                                  w;
-          if (w < start) {
-            int output_index = b * num_filters * output_size + p * output_size +
-                               h * output_width + w;
-            full_output_data[full_output_index] = output_data[output_index];
-          } else if (w == full_output_width - 1) {
-            int output_index = b * num_filters * output_size + p * output_size +
-                               h * output_width + output_width - 1;
-            full_output_data[full_output_index] = output_data[output_index];
-          } else if (w == 0) {
-            int output_index = b * num_filters * output_size + p * output_size +
-                               h * output_width + 0;
-            full_output_data[full_output_index] = output_data[output_index];
-          } else if ((w - start) % col == 0) {
-            int col_index = w - ((w + 1 - start) / col);
-            int output_index = b * num_filters * output_size + p * output_size +
-                               h * output_width + col_index;
-            full_output_data[full_output_index] =
-                (output_data[output_index] + output_data[output_index - 1]) / 2;
-          } else {
-            int remainder = ((w + 1 - start) % col) > 0;
-            int col_index = w - ((w + 1 - start) / col) - remainder;
-            int output_index = b * num_filters * output_size + p * output_size +
-                               h * output_width + col_index;
-            full_output_data[full_output_index] = output_data[output_index];
-          }
-        }
-      }
+        // Interpolate
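+        // Copy computed rows into the full output; reconstruct each skipped row
+        // as the average of its two neighboring computed rows (edge rows copy
+        // the nearest computed row).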
+        for (int p = 0; p < num_filters; ++p) {
+            for(int h = 0; h < full_output_height; h++) {
+                for(int w = 0; w < full_output_width; w++) {
+                    int full_output_index = b * num_filters * full_output_size
+                                + p * full_output_size + h * full_output_width + w;
+                    if(h < start) {
+                        int output_index = b * num_filters * output_size
+                                        + p * output_size + h * output_width + w;
+                        full_output_data[full_output_index] = output_data[output_index];
+                    } else if(h == full_output_height - 1) {
+                        int output_index = b * num_filters * output_size + p * output_size
+                                        + (output_height - 1) * output_width + w;
+                        full_output_data[full_output_index] = output_data[output_index];
+                    } else if(h == 0) {
+                        int output_index = b * num_filters * output_size
+                                        + p * output_size + 0 * output_width + w;
+                        full_output_data[full_output_index] = output_data[output_index];
+                    } else if((h - start) % row == 0) {
+                        int row_index = h - ((h + 1 - start) / row);
+                        int output_index = b * num_filters * output_size + p * output_size
+                                        + row_index * output_width + w;
+                        full_output_data[full_output_index] =
+                            (output_data[output_index] + output_data[output_index - output_width]) / 2;
+                    } else {
+                        int remainder = ((h + 1 - start) % row) > 0;
+                        int row_index = h - ((h + 1 - start) / row) - remainder;
+                        int output_index = b * num_filters * output_size + p * output_size
+                                        + row_index * output_width + w;
+                        full_output_data[full_output_index] = output_data[output_index];
+                    }
+                }
+            }
+        }
     }
-  }
-  free(output_data);
-  free(host_data);
+    free(output_data);
+    free(host_data);
 
-  return full_output;
+    return full_output;
 }
 
-void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
-                       int horizontal_pad, int vertical_stride,
-                       int horizontal_stride, int conv_mode,
-                       int compute_precision, int row, int col, int skip_every,
-                       int start) {
-  if (row > 1) {
-    printf("ROW PERFORATION\n");
-    return tensorRowPerfConvolutionCPU(
-        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
-        horizontal_stride, conv_mode, compute_precision, row, start);
-  }
-  if (col > 1) {
-    printf("COL PERFORATION\n");
-    return tensorColPerfConvolutionCPU(
-        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
-        horizontal_stride, conv_mode, compute_precision, col, start);
-  }
-  if (skip_every > 1) {
-    printf("INPUT FILTERING\n");
+void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
+                                int conv_mode, int compute_precision, int col, int start) {
+    
     Tensor *input = (Tensor *)input_ptr;
     Tensor *filter = (Tensor *)filter_ptr;
+    
+    float * __restrict__ host_image = (float *)input->host_data;
+    float * __restrict__ host_filter = (float *)filter->host_data;
+    
+    int batch_size = input->dims.dim_sizes[0];
+    int channels = input->dims.dim_sizes[1];
+    int image_height = input->dims.dim_sizes[2];
+    int image_width = input->dims.dim_sizes[3];
+    int num_filters = filter->dims.dim_sizes[0];
+    int kernel_height = filter->dims.dim_sizes[2];
+    int kernel_width = filter->dims.dim_sizes[3];
+    int full_output_height = 
+        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+    int full_output_width = 
+        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+    int num_filter_elem = kernel_height * kernel_width * channels;
+    int full_output_size = full_output_height * full_output_width;
+
+    Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
+                                                    full_output_height, full_output_width);
+    float * __restrict__ full_output_data = (float *)full_output->host_data;
+
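+    // Column perforation: skip one of every `col` output columns after `start`,
+    // compute the remaining columns, and interpolate the skipped ones below.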
+    int remainder = (full_output_width - start) % col > 0;
+    int output_width = full_output_width - ((full_output_width - start) / col) - remainder;
+
+    int output_height = full_output_height;
+    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
+                                                    * output_height * output_width);
+    int output_size = output_width * output_height;
+    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
+                                                        * output_width * batch_size;
+    float *host_data = (float *) malloc(host_data_size);
 
-    const int kernel_height = filter->dims.dim_sizes[2];
-    const int kernel_width = filter->dims.dim_sizes[3];
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for(int b = 0; b < batch_size; b++) {
+        for(int ch = 0; ch < channels; ch++) {
+            for(int h = 0; h < output_height; h++) {
+                int inH = h * vertical_stride - vertical_pad;
+                for(int w = 0; w < output_width; w++) {
+                    int inW;
+                    if(w < start) {
+                        inW = w * horizontal_stride - horizontal_pad;
+                    } else {
+                        int w_index = ((w - start + 1) * col) / (col - 1) 
+                                + (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
+                        inW = w_index * horizontal_stride - horizontal_pad;
+                    }
+                    for(int i = 0; i < kernel_height; i++) {
+                        for(int j = 0; j < kernel_width; j++) {
+                            const int filter_elem_num = 
+                                        (ch * kernel_height + i) * kernel_width + j;
+                            const int output_index = h * output_width + w;
+                            const int out_index = b * num_filter_elem * output_size 
+                                    + output_index * num_filter_elem + filter_elem_num;
+                            if(inH + i >= 0 && inH + i < image_height 
+                            && inW + j >= 0 && inW + j < image_width) {
+                                host_data[out_index] = 
+                                    host_image[((b * channels + ch) * image_height 
+                                            + (inH + i)) * image_width + (inW + j)];
+                            } else {
+                                host_data[out_index] = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
 
-    if (!(kernel_height * kernel_width % skip_every)) {
-      return tensorRegularFilterSamplingConvolutionCPU(
-          input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
-          horizontal_stride, conv_mode, compute_precision, skip_every, start);
-    }
-    return tensorIrregularFilterSamplingConvolutionCPU(
-        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
-        horizontal_stride, conv_mode, compute_precision, skip_every, start);
-  }
-  printf("REGULAR CONV\n");
-  return tensorRegularConvolutionCPU(
-      input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
-      horizontal_stride, conv_mode, compute_precision);
-}
+        // Tensor Multiply
+        for (int p = 0; p < num_filters; ++p) {
+            for (int m = 0; m < output_size; ++m) {
+                float sum = 0;
+                #pragma omp simd reduction(+:sum)
+                for (int k = 0; k < num_filter_elem; ++k) {
+                    int input_index = k + num_filter_elem * m 
+                                            + b * num_filter_elem * output_size;
+                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
+                }
+                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
+            }
+        }
 
-void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                           int horizontal_pad, int vertical_stride,
-                           int horizontal_stride, int conv_mode,
-                           int conv_groups) {
-
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
-
-  float *__restrict__ host_image = (float *)input->host_data;
-  float *__restrict__ host_filter = (float *)filter->host_data;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-  int num_filters = filter->dims.dim_sizes[0];
-  int kernel_height = filter->dims.dim_sizes[2];
-  int kernel_width = filter->dims.dim_sizes[3];
-  int output_height =
-      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-  int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
-                          horizontal_stride);
-  int num_filter_elem = kernel_height * kernel_width * channels;
-  int output_size = output_width * output_height;
-
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
-                                            output_height, output_width);
-  float *__restrict__ output_data = (float *)output->host_data;
-
-  long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
-                            output_width * batch_size;
-  float *host_data = (float *)malloc(conv_data_size);
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      for (int h = 0; h < output_height; h++) {
-        for (int w = 0; w < output_width; w++) {
-          const int inH = h * vertical_stride - vertical_pad;
-          const int inW = w * horizontal_stride - horizontal_pad;
-          for (int i = 0; i < kernel_height; i++) {
-            for (int j = 0; j < kernel_width; j++) {
-              const int filter_elem_num =
-                  (ch * kernel_height + i) * kernel_width + j;
-              const int output_index = h * output_width + w;
-              const int out_index = b * num_filter_elem * output_size +
-                                    output_index * num_filter_elem +
-                                    filter_elem_num;
-              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
-                  inW + j < image_width) {
-                host_data[out_index] =
-                    host_image[((b * channels + ch) * image_height +
-                                (inH + i)) *
-                                   image_width +
-                               (inW + j)];
-              } else {
-                host_data[out_index] = 0;
-              }
+        // Interpolate
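+        // Rebuild the full-width output: columns skipped by the perforation
+        // above are approximated as the average of their two computed
+        // neighbours; all other positions copy through with a shifted index.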
+        for (int p = 0; p < num_filters; ++p) {
+            for(int h = 0; h < full_output_height; h++) {
+                for(int w = 0; w < full_output_width; w++) {
+                    int full_output_index = b * num_filters * full_output_size 
+                                + p * full_output_size + h * full_output_width  + w;
+                    if(w < start) {
+                         int output_index = b * num_filters * output_size 
+                                        + p * output_size + h * output_width + w;
+                         full_output_data[full_output_index] = output_data[output_index];
+                    } else if(w == full_output_width - 1) {
+                        int output_index = b * num_filters * output_size + p * output_size 
+                                                    + h * output_width  + output_width - 1;
+                        full_output_data[full_output_index] = output_data[output_index];
+                    } else if(w == 0) {
+                        int output_index = b * num_filters * output_size + p * output_size 
+                                                                + h * output_width  + 0;
+                        full_output_data[full_output_index] = output_data[output_index];
+                    } else if((w - start) % col == 0) {
+                        int col_index = w - ((w + 1 - start) / col);
+                        int output_index = b * num_filters * output_size + p * output_size 
+                                                            + h * output_width + col_index;
+                        full_output_data[full_output_index] = 
+                            (output_data[output_index] + output_data[output_index - 1]) / 2;
+                    } else {
+                        int remainder = ((w + 1 - start) % col) > 0;
+                        int col_index = w - ((w + 1 - start) / col) - remainder;
+                        int output_index = b * num_filters * output_size + p * output_size 
+                                                            + h * output_width + col_index;
+                        full_output_data[full_output_index] = output_data[output_index];
+                    }
+                }
             }
-          }
         }
-      }
     }
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
-        float sum = 0;
-#pragma omp simd reduction(+ : sum)
-        for (int k = 0; k < num_filter_elem; ++k) {
-          int input_index =
-              k + num_filter_elem * m + b * num_filter_elem * output_size;
-          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
+    free(output_data);
+    free(host_data);
+
+    return full_output;
+}
+
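+// Dispatch an approximate convolution based on the knob values: row/col > 1
+// selects row/column perforation, skip_every > 1 selects filter sampling
+// (regular when skip_every divides the filter size evenly, irregular
+// otherwise), and the default falls through to the regular convolution.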
+void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, 
+                          int vertical_pad, int horizontal_pad, 
+                          int vertical_stride, int horizontal_stride, 
+                          int conv_mode, int compute_precision, 
+                          int row, int col, int skip_every, int start) {
+    if(row > 1) {
+        printf("ROW PERFORATION\n");
+        return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
+                        horizontal_pad, vertical_stride, horizontal_stride, conv_mode, 
+                        compute_precision, row, start);
+    } 
+    if(col > 1) {
+        printf("COL PERFORATION\n");
+        return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
+                        horizontal_pad, vertical_stride, horizontal_stride, conv_mode,
+                        compute_precision, col, start);
+    }
+    if(skip_every > 1) {
+        printf("INPUT FILTERING\n");
+        Tensor *input = (Tensor *)input_ptr;
+        Tensor *filter = (Tensor *)filter_ptr;
+
+        const int kernel_height = filter->dims.dim_sizes[2];
+        const int kernel_width = filter->dims.dim_sizes[3];
+
+        if(!(kernel_height * kernel_width % skip_every)) {
+            return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
+                                    vertical_pad, horizontal_pad, vertical_stride,
+                                    horizontal_stride, conv_mode, 
+                                    compute_precision, skip_every, start);
         }
-        output_data[b * (output_size * num_filters) + p * output_size + m] =
-            sum;
-      }
+        return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
+                                    vertical_pad, horizontal_pad, vertical_stride, 
+                                    horizontal_stride, conv_mode, 
+                                    compute_precision, skip_every, start);
     }
-  }
-  free(host_data);
-  return output;
+    printf("---REGULAR CONV\n");
+    return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
+                                 horizontal_pad, vertical_stride, 
+                                 horizontal_stride, conv_mode, compute_precision);
 }
 
-void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
-  Tensor *x = (Tensor *)x_ptr;
-  Tensor *bias = (Tensor *)bias_ptr;
-
-  float *__restrict__ x_data = (float *)x->host_data;
-  float *__restrict__ bias_data = (float *)bias->host_data;
-  int n = x->dims.dim_sizes[0];
-  int c = x->dims.dim_sizes[1];
-  int h = x->dims.dim_sizes[2];
-  int w = x->dims.dim_sizes[3];
-
-  if (x->num_elems == bias->num_elems) {
-    int const1 = c * h * w;
-    int const2 = h * w;
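+// CPU analogue of the grouped (Cutlass-style) convolution: each input
+// channel's contribution is kept as a separate slice of the output, laid out
+// as [batch][filter][channel][H*W], instead of being summed across channels.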
+void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
+                           int vertical_pad, int horizontal_pad,
+                           int vertical_stride, int horizontal_stride,
+                           int conv_mode, int conv_groups) {
+
+    Tensor *input = (Tensor *)input_ptr;
+    Tensor *filter = (Tensor *)filter_ptr;
+    
+    float * __restrict__ host_image = (float *)input->host_data;
+    float * __restrict__ host_filter = (float *)filter->host_data;
+
+    const int batch_size = input->dims.dim_sizes[0];
+    const int channels = input->dims.dim_sizes[1];
+    const int image_height = input->dims.dim_sizes[2];
+    const int image_width = input->dims.dim_sizes[3];
+    const int num_filters = filter->dims.dim_sizes[0];
+    const int kernel_height = filter->dims.dim_sizes[2];
+    const int kernel_width = filter->dims.dim_sizes[3];
+    const int output_height = 
+        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+    const int output_width = 
+        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+    const int filter_dim = kernel_height * kernel_width;
+    const int num_filter_elem = filter_dim * channels;
+    const int output_size = output_width * output_height;
+    
+    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, channels, 
+                                                    output_height * output_width);
+    float * __restrict__ output_data = (float *)output->host_data;
+    
+    const long int conv_data_size = 
+        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
+    float *host_data = (float *) malloc(conv_data_size);
+   
     omp_set_num_threads(4);
-#pragma omp parallel for
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < c; j++) {
-#pragma omp simd collapse(2)
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            x_data[i * const1 + j * const2 + (k * w) + l] +=
-                bias_data[i * const1 + j * const2 + (k * w) + l];
-          }
+    #pragma omp parallel for
+    for(int b = 0; b < batch_size; b++) {
+        for(int ch = 0; ch < channels; ch++) {
+            for(int h = 0; h < output_height; h++) {
+                for(int w = 0; w < output_width; w++) {
+                    const int inH = h * vertical_stride - vertical_pad;
+                    const int inW = w * horizontal_stride - horizontal_pad;
+                    for(int i = 0; i < kernel_height; i++) {
+                        for(int j = 0; j < kernel_width; j++) {
+                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
+                            const int output_index = h * output_width + w;
+                            const int out_index = b * num_filter_elem * output_size 
+                                        + output_index * num_filter_elem + filter_elem_num;
+                            if(inH + i >= 0 && inH + i < image_height 
+                                && inW + j >= 0 && inW + j < image_width) {
+                                host_data[out_index] = 
+                                    host_image[((b * channels + ch) * image_height 
+                                        + (inH + i)) * image_width + (inW + j)];
+                            } else {
+                                host_data[out_index] = 0;
+                            }
+                        }
+                    }
+                }
+            }
         }
-      }
-    }
-  } else {
-    omp_set_num_threads(4);
-#pragma omp parallel for
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < c; j++) {
-#pragma omp simd collapse(2)
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
-          }
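+        // Per-channel GEMM: each sum reduces over a single channel's
+        // filter_dim taps; channels are written to separate output slices.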
+        for (int p = 0; p < num_filters; ++p) {
+            for (int m = 0; m < output_size; ++m) {
+                for (int ch = 0; ch < channels; ch++) {
+                    float sum = 0;
+                    #pragma omp simd reduction(+:sum)
+                    for (int k = 0; k < filter_dim; ++k) {
+                        int input_index = k + ch * filter_dim + num_filter_elem * m
+                                            + b * num_filter_elem * output_size;
+                        sum += host_data[input_index]
+                                * host_filter[p * num_filter_elem + ch * filter_dim + k];
+                    }
+                    output_data[b * (output_size * num_filters * channels)
+                            + p * output_size * channels + ch * output_size + m] = sum;
+                }
+            }
+        }
-      }
     }
-  }
 
-  return x;
+    free(host_data);
+    return output;
+}
+
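+// Elementwise add when the operands match in size; otherwise bias is
+// broadcast per channel (bias_data[j] added to every element of channel j).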
+void* tensorAddCPU(void *x_ptr, void *bias_ptr) {
+    Tensor *x = (Tensor *)x_ptr;
+    Tensor *bias = (Tensor *)bias_ptr;
+    
+    float * __restrict__ x_data = (float *)x->host_data;
+    float * __restrict__ bias_data = (float *)bias->host_data;
+    int n = x->dims.dim_sizes[0];
+    int c = x->dims.dim_sizes[1];
+    int h = x->dims.dim_sizes[2];
+    int w = x->dims.dim_sizes[3];
+    
+    if(x->num_elems == bias->num_elems) {
+        int const1 = c * h * w;
+        int const2 = h * w;
+        omp_set_num_threads(4);
+        #pragma omp parallel for
+        for (int i = 0; i < n; i++) {
+            for (int j = 0; j < c; j++) {
+                #pragma omp simd collapse(2)
+                for (int k = 0; k < h; k++) {
+                    for (int l = 0; l < w; l++) {
+                        x_data[i * const1 + j * const2 + (k * w)  + l] += 
+                                bias_data[i * const1 + j * const2 + (k*w) + l];
+                    }
+                }
+            }
+        }
+    } else {
+        omp_set_num_threads(4);
+        #pragma omp parallel for
+        for (int i = 0; i < n; i++) {
+            for (int j = 0; j < c; j++) {
+                #pragma omp simd collapse(2)
+                for (int k = 0; k < h; k++) {
+                    for (int l = 0; l < w; l++) {
+                        x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
+                    }
+                }
+            }
+        }   
+    }
+    
+    return x;
 }
 
 float max(float v1, float v2) __attribute__((always_inline));
-inline float maximum(float v1, float v2) { return (v1 < v2) ? v2 : v1; }
+inline float maximum(float v1, float v2) {
+    return (v1 < v2) ? v2 : v1;
+}
 
 void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
-                       int window_width, int vertical_pad, int horizontal_pad,
-                       int vertical_stride, int horizontal_stride) {
-
-  Tensor *input = (Tensor *)input_ptr;
-  float *__restrict__ input_data = (float *)input->host_data;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-
-  int output_height =
-      1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
-  int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) /
-                          horizontal_stride);
-
-  int center_x = (window_width - 1) / 2 - horizontal_pad;
-  int center_y = (window_height - 1) / 2 - vertical_pad;
-  int x_radius = (window_width - 1) / 2;
-  int y_radius = (window_height - 1) / 2;
-
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
-                                            output_height, output_width);
-  float *__restrict__ output_data = (float *)output->host_data;
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      int ii = 0, jj = 0;
-      for (int r = center_y; r < image_height + vertical_pad - y_radius;
-           r += vertical_stride) {
-        for (int c = center_x; c < image_width + horizontal_pad - x_radius;
-             c += horizontal_stride) {
-          float val = (poolFunction == 0) ? -3.40282e+38 : 0;
-          int y_radius_var = y_radius - r;
-          int y_radius_var_max = y_radius_var + image_height;
-          int x_radius_var = x_radius - c;
-          int x_radius_var_max = x_radius_var + image_width;
-          int ki_min =
-              (y_radius_var > 0)
-                  ? ((y_radius_var < window_height) ? y_radius_var : -1)
-                  : 0;
-          int ki_max = (y_radius_var_max < window_height)
-                           ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
-                           : window_height;
-          int kj_min = (x_radius_var > 0)
-                           ? ((x_radius_var < window_width) ? x_radius_var : -1)
-                           : 0;
-          int kj_max = (x_radius_var_max < window_width)
-                           ? ((x_radius_var_max >= 0) ? x_radius_var_max : -1)
-                           : window_width;
-
-          if (ki_min != ki_max && kj_min != kj_max && ki_min != -1 &&
-              ki_max != -1 && kj_min != -1 && kj_max != -1) {
-            if (!poolFunction) {
-              for (int ki = 0; ki < window_height; ki++) {
-                for (int kj = 0; kj < window_width; kj++) {
-                  val = maximum(
-                      val,
-                      input_data[b * (channels * image_height * image_width) +
-                                 ch * (image_height * image_width) +
-                                 (r - y_radius + ki) * image_width +
-                                 (c - x_radius + kj)]);
-                }
-              }
-            } else {
-              for (int ki = 0; ki < window_height; ki++) {
-                for (int kj = 0; kj < window_width; kj++) {
-                  val +=
-                      input_data[b * (channels * image_height * image_width) +
-                                 ch * (image_height * image_width) +
-                                 (r - y_radius + ki) * image_width +
-                                 (c - x_radius + kj)];
+                       int window_width, int vertical_pad, int horizontal_pad,
+                       int vertical_stride, int horizontal_stride) {
+   
+    Tensor *input = (Tensor *)input_ptr;
+    float * __restrict__ input_data = (float *)input->host_data;
+    
+    int batch_size = input->dims.dim_sizes[0];
+    int channels = input->dims.dim_sizes[1];
+    int image_height = input->dims.dim_sizes[2];
+    int image_width = input->dims.dim_sizes[3];
+    
+    int output_height = 
+        1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
+    int output_width = 
+        1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride);
+    
+    int center_x = (window_width - 1) / 2 - horizontal_pad;
+    int center_y = (window_height - 1) / 2 - vertical_pad;
+    int x_radius = (window_width - 1) / 2;
+    int y_radius = (window_height - 1) / 2;
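+    // center_* is the image-space position of the first window's center;
+    // *_radius is the half-window extent. The ki_/kj_ bounds computed below
+    // give the range of window taps that fall inside the image.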
+    
+    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, channels, 
+                                                output_height, output_width);
+    float * __restrict__ output_data = (float *)output->host_data;
+   
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for (int b = 0; b < batch_size; b++) {
+        for (int ch = 0; ch < channels; ch++) {
+            int ii = 0, jj = 0;
+            for (int r = center_y; r < image_height + vertical_pad - y_radius; 
+                                                        r += vertical_stride) {
+                for (int c = center_x; c < image_width + horizontal_pad - x_radius; 
+                                                            c += horizontal_stride) {
+                    float val = (poolFunction == 0) ? -3.40282e+38 : 0; // -FLT_MAX seed for max pool, 0 for avg
+                    int y_radius_var = y_radius - r;
+                    int y_radius_var_max = y_radius_var + image_height;
+                    int x_radius_var = x_radius - c;
+                    int x_radius_var_max = x_radius_var + image_width;
+                    int ki_min = (y_radius_var > 0) ? 
+                        ((y_radius_var < window_height) ? y_radius_var : -1) : 0;
+                    int ki_max = (y_radius_var_max < window_height) ? 
+                                 ((y_radius_var_max >= 0) ?  y_radius_var_max : -1) : window_height;
+                    int kj_min = (x_radius_var > 0) ? 
+                                ((x_radius_var < window_width) ? x_radius_var : -1) : 0;
+                    int kj_max = (x_radius_var_max < window_width) ? 
+                                    ((x_radius_var_max >= 0) ?  x_radius_var_max : -1) : window_width;
+                                        
+                    // Accumulate only over window taps that fall inside the
+                    // image; taps in the padded border are skipped.
+                    if(ki_min != ki_max && kj_min != kj_max && ki_min != -1
+                            && ki_max != -1 && kj_min != -1 && kj_max != -1) {
+                        if(!poolFunction) {
+                            for (int ki = ki_min; ki < ki_max; ki++) {
+                                for (int kj = kj_min; kj < kj_max; kj++) {
+                                    val = maximum(val,
+                                        input_data[b * (channels * image_height * image_width)
+                                            + ch * (image_height * image_width)
+                                            + (r - y_radius + ki) * image_width
+                                            + (c - x_radius + kj)]);
+                                }
+                            }
+                        } else {
+                            for (int ki = ki_min; ki < ki_max; ki++) {
+                                for (int kj = kj_min; kj < kj_max; kj++) {
+                                    val += input_data[b * (channels * image_height * image_width)
+                                            + ch * (image_height * image_width)
+                                            + (r - y_radius + ki) * image_width
+                                            + (c - x_radius + kj)];
+                                }
+                            }
+                        }
+                    }
+                    if (poolFunction == 1) {
+                        val /= window_height * window_width;
+                    }
+                    output_data[b * (channels * output_height * output_width) +
+                        ch * (output_height * output_width) + ii * output_width + jj] = val;
+                    jj++;
+                    if (jj == output_width) {
+                        jj = 0;
+                        ii++;
+                    }
                 }
-              }
             }
-          }
-          if (poolFunction == 1) {
-            val /= window_height * window_width;
-          }
-          output_data[b * (channels * output_height * output_width) +
-                      ch * (output_height * output_width) + ii * output_width +
-                      jj] = val;
-          jj++;
-          if (jj == output_width) {
-            jj = 0;
-            ii++;
-          }
         }
-      }
     }
-  }
-
-  return output;
+  
+    return output;
 }
 
 void *tensorTanhCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
-
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (size_t i = 0; i < num_elems; i++) {
-    input_data[i] = tanhf(input_data[i]);
-  }
-
-  return input;
+    Tensor *input = (Tensor *)input_ptr;
+    
+    float *input_data = (float *)input->host_data;
+    size_t num_elems = input->num_elems;
+    
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for (size_t i = 0; i < num_elems; i++) {
+        input_data[i] = tanhf(input_data[i]);
+    }
+   
+    return input;
 }
 
 void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
-  Tensor *lhs = (Tensor *)lhs_ptr;
-  Tensor *rhs = (Tensor *)rhs_ptr;
-  // printf("GEMM lhs_ptr: %p\n", lhs_ptr);
-  // printf("GEMM rhs_ptr: %p\n", rhs_ptr);
-
-  int m = lhs->dims.dim_sizes[0];
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
-
-  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
-
-  float *__restrict__ lhs_arr = (float *)lhs->host_data;
-  float *__restrict__ rhs_arr = (float *)rhs->host_data;
-  float *__restrict__ output_arr = (float *)output->host_data;
-
-  int k = 1;
-#pragma unroll 4 // Can we unroll more???
-  for (int j = 1; j < lhs->dims.num_dims; j++) {
-    k = k * lhs->dims.dim_sizes[j]; // input neurons
-  }
-  // printf("unroll\n");
-  float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
-  // printf("tran_rhs: %p\n", tran_rhs);
-  // printf("rhs_arr: %p\n", rhs_arr);
-  // printf("lhs_arr: %p\n", lhs_arr);
-  omp_set_num_threads(4);
-#pragma omp parallel for simd
-  for (int l = 0; l < k; l++) {
-    for (int j = 0; j < n; j++) {
-      tran_rhs[j * k + l] = rhs_arr[l * n + j];
+    Tensor *lhs = (Tensor *)lhs_ptr;
+    Tensor *rhs = (Tensor *)rhs_ptr;
+    
+    int m = lhs->dims.dim_sizes[0];
+    int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
+    int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
+    
+    Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
+
+    float * __restrict__ lhs_arr = (float *)lhs->host_data;
+    float * __restrict__ rhs_arr = (float *)rhs->host_data;
+    float * __restrict__ output_arr = (float *)output->host_data;
+    
+    int k = 1;
+    #pragma unroll 4   // Can we unroll more???
+    for (int j = 1; j < lhs->dims.num_dims; j++) {
+        k = k * lhs->dims.dim_sizes[j]; // input neurons
     }
-  }
-// printf("TRANS\n");
-#pragma omp parallel for
-  for (int i = 0; i < m; i++) {
-    for (int j = 0; j < n; j++) {
-      float sum = 0.0;
-#pragma omp simd reduction(+ : sum)
-      for (int l = 0; l < k; l++) {
-        sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
-      }
-      output_arr[i * n + j] = sum;
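+    // Transpose rhs so the inner reduction below walks both operands
+    // contiguously (a row of lhs against a row of tran_rhs).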
+    float *tran_rhs = (float *) malloc(sizeof(float) * k * n);
+    omp_set_num_threads(4);
+    #pragma omp parallel for simd
+    for (int l = 0; l < k; l++) {
+        for (int j = 0; j < n; j++) {
+            tran_rhs[j * k + l] = rhs_arr[l * n + j];
+        }   
+    }
+    
+    #pragma omp parallel for
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            float sum = 0.0;
+            #pragma omp simd reduction(+:sum)
+            for (int l = 0; l < k; l++) {
+                sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
+            }
+            output_arr[i * n + j] = sum;
+        }
     }
-  }
-  free(tran_rhs);
-  // printf("GEMM OUTPUT: %p\n", output);
-  return output;
+    free(tran_rhs);
+    return output;
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
-
-  float *logits = (float *)input->host_data;
-  int n = input->dims.dim_sizes[0];
-  int c = input->dims.dim_sizes[1];
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int i = 0; i < n; i++) {
-    float x = 0;
-    for (int j = i * c; j < c + i * c; j++) {
-      logits[j] = expf(logits[j]);
-    }
-
-#pragma omp simd reduction(+ : x)
-    for (int j = i * c; j < i * c + c; j++) {
-      x += logits[j];
-    }
-
-#pragma omp simd
-    for (int j = i * c; j < i * c + c; j++) {
-      logits[j] /= x;
+    Tensor *input = (Tensor *)input_ptr;
+    
+    float *logits = (float *)input->host_data;
+    int n = input->dims.dim_sizes[0];
+    int c = input->dims.dim_sizes[1];
+    
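+    // Row-wise softmax in three passes: exponentiate in place, accumulate
+    // the row sum, then normalize. Note there is no max-subtraction, so
+    // very large logits can overflow expf.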
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+        float x = 0;
+        for(int j = i*c; j < c + i*c; j++) {
+            logits[j] = expf(logits[j]);
+        }
+       
+        #pragma omp simd reduction(+:x)
+        for(int j = i*c; j < i*c+c; j++) {
+            x += logits[j];
+        }
+        
+        #pragma omp simd
+        for(int j = i*c; j < i*c + c; j++) {
+            logits[j] /= x;
+        }
     }
-  }
 
-  return input;
+    return input;
 }
 
-void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
-                         void *mean_ptr, void *variance_ptr, double epsilon) {
-
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *gamma = (Tensor *)gamma_ptr;
-  Tensor *beta = (Tensor *)beta_ptr;
-  Tensor *mean = (Tensor *)mean_ptr;
-  Tensor *variance = (Tensor *)variance_ptr;
-
-  float *__restrict__ host_image = (float *)input->host_data;
-  float *__restrict__ host_beta = (float *)beta->host_data;
-  float *__restrict__ host_gamma = (float *)gamma->host_data;
-  float *__restrict__ host_mean = (float *)mean->host_data;
-  float *__restrict__ host_variance = (float *)variance->host_data;
-
-  float alpha_val = 1.0f, beta_val = 0.0f;
-  size_t num_elems = input->num_elems;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-  int image_dim = image_height * image_width;
-
-  omp_set_num_threads(4);
-#pragma omp parallel for
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      float mean = 0;
-#pragma omp simd reduction(+ : mean)
-      for (int i = 0; i < image_dim; i++) {
-        int index = b * channels * image_dim + ch * image_dim + i;
-        mean += host_image[index];
-      }
-      mean = mean / channels;
-
-      float variance = 0;
-#pragma omp simd reduction(+ : variance)
-      for (int i = 0; i < image_dim; i++) {
-        int index = b * channels * image_dim + ch * image_dim + i;
-        float tmp = host_image[index] - mean;
-        variance += (tmp * tmp);
-      }
-      variance = variance / channels;
-
-#pragma omp simd
-      for (int i = 0; i < image_dim; i++) {
-        int index = b * channels * image_dim + ch * image_dim + i;
-        host_image[index] =
-            host_beta[ch] + (host_gamma[ch] * ((host_image[index] - mean) /
-                                               sqrt(epsilon + variance)));
-      }
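+// Batch norm over each (batch, channel) plane: statistics are recomputed
+// from the current activations, so the mean/variance tensors passed in are
+// not read on this CPU path.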
+void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
+                         void* mean_ptr, void* variance_ptr, double epsilon) {
+    
+    Tensor* input = (Tensor*) input_ptr;
+    Tensor* gamma = (Tensor*) gamma_ptr;
+    Tensor* beta = (Tensor*) beta_ptr;
+    Tensor* mean = (Tensor*) mean_ptr;
+    Tensor* variance = (Tensor*) variance_ptr;
+    
+    float * __restrict__ host_image = (float *)input->host_data;
+    float * __restrict__ host_beta = (float *)beta->host_data;
+    float * __restrict__ host_gamma = (float *)gamma->host_data;
+    float * __restrict__ host_mean = (float *)mean->host_data;
+    float * __restrict__ host_variance = (float *)variance->host_data;
+    
+
+    int batch_size = input->dims.dim_sizes[0];
+    int channels = input->dims.dim_sizes[1];
+    int image_height = input->dims.dim_sizes[2];
+    int image_width = input->dims.dim_sizes[3];
+    int image_dim = image_height * image_width;
+
+    omp_set_num_threads(4);
+    #pragma omp parallel for
+    for(int b = 0; b < batch_size; b++) {
+        for(int ch = 0; ch < channels; ch++) {
+            float mean = 0;
+            #pragma omp simd reduction(+:mean)
+            for(int i = 0; i < image_dim; i++) {
+                int index = b * channels * image_dim + ch * image_dim + i;
+                mean += host_image[index];
+            }
+            mean = mean / image_dim;  // average over the H*W values summed above
+         
+            float variance = 0;
+            #pragma omp simd reduction(+:variance)
+            for(int i = 0; i < image_dim; i++) {
+                int index = b * channels * image_dim + ch * image_dim + i;
+                float tmp = host_image[index] - mean;
+                variance += (tmp * tmp);  
+            }
+            variance = variance / image_dim;
+            
+            #pragma omp simd
+            for(int i = 0; i < image_dim; i++) {
+                int index = b * channels * image_dim + ch * image_dim + i;
+                host_image[index] = host_beta[ch]
+                    + (host_gamma[ch] * ((host_image[index] - mean) / sqrt(epsilon + variance)));
+            }
+        }
     }
-  }
-  return input;
+    return input;
 }
 
-void *tensorReluCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-
-#pragma omp simd
-  for (size_t i = 0; i < num_elems; i++) {
-    input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
-  }
+void *tensorReluCPU(void *input_ptr) {
+    Tensor *input = (Tensor *)input_ptr;
+    float *input_data = (float *)input->host_data;
+    size_t num_elems = input->num_elems;
+
+    #pragma omp simd
+    for (size_t i = 0; i < num_elems; i++) {
+        input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+    }
 
-  return input;
+    return input;
 }
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
-  Tensor *input = (Tensor *)input_ptr;
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-
-#pragma omp simd
-  for (size_t i = 0; i < num_elems; i++) {
-    input_data[i] = (input_data[i] < min)
-                        ? min
-                        : ((input_data[i] > max) ? max : input_data[i]);
-  }
-
-  return input;
-}
\ No newline at end of file
+    Tensor *input = (Tensor *)input_ptr;
+    float *input_data = (float *)input->host_data;
+    size_t num_elems = input->num_elems;
+    
+    #pragma omp simd
+    for (size_t i = 0; i < num_elems; i++) {
+        input_data[i] = (input_data[i] < min) ? min
+                        : ((input_data[i] > max) ? max : input_data[i]);
+    }
+
+    return input;
+}