diff --git a/llvm/projects/.gitignore b/llvm/projects/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e45ff9f4750ef9c2823589f4fcfe03b5e686aab8
--- /dev/null
+++ b/llvm/projects/.gitignore
@@ -0,0 +1,2 @@
+soc_simulator/lib
+gpu_profiler/lib
diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
index bf6f0f3a2668af7c25c955042d257f4e32c75b33..74ec43d03292045e7e0b745996dba0e9437bc813 100644
--- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -46,43 +46,62 @@ include_directories(../gpu_profiler/include)
 include_directories(../soc_simulator/include)
 link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64)
 
-# Adding new rule for building a cuDNN runtime library
-cuda_add_library(tensor_runtime      tensor_runtime/src/tensor_runtime.cu  tensor_runtime/src/half_precision_api.cu  tensor_runtime/src/tensor_utils.cu  tensor_runtime/src/fp16_gemm.cu  tensor_runtime/src/debug.cc tensor_runtime/src/global_data.cc  tensor_runtime/src/approx_techniques.cu  tensor_runtime/src/approx_techniques2.cu  tensor_runtime/src/approx_simulation.cu  tensor_runtime/src/profiling.cc  tensor_runtime/src/op_overheads.cc   tensor_runtime/src/error.cu  tensor_runtime/src/wrapper_runtime.cu )
-
-cuda_add_cublas_to_target(tensor_runtime)
-
+set(
+  RUNTIME_SRCS_FILENAME
+  approx_simulation.cu
+  approx_techniques.cu
+  approx_techniques2.cu
+  common.cpp
+  configuration.cpp
+  debug.cc
+  device_math.cu
+  error.cu
+  fp16_gemm.cu
+  global_data.cc
+  half_precision_api.cu
+  hpvm-rt-controller.cpp
+  img_tensor_runtime.cu
+  img_tensor_utils.cpp
+  op_overheads.cc
+  profiling.cc
+  tensor_runtime.cu
+  tensor_utils.cu
+  wrapper_runtime.cu
+)
+foreach(FILE ${RUNTIME_SRCS_FILENAME})
+  list(APPEND RUNTIME_SRCS "tensor_runtime/src/${FILE}")
+endforeach()
 
 find_library(GPU_PROFILER_LIB
     NAMES libgpu_profiler.a
     HINTS ../gpu_profiler/lib
 )
-
 find_library(SOC_SIMULATOR_LIB
     NAMES libpromise_profiler.a
     HINTS ../soc_simulator/lib
 )
-
-target_link_libraries(tensor_runtime cudnn cufft stdc++fs -lcurand)
+set(LINK_LIBS cudnn cufft stdc++fs -lcurand)
 if(USE_GFLAGS)
-  target_link_libraries(tensor_runtime gflags)
+  list(APPEND LINK_LIBS gflags)
 endif()
 
+# Adding new rule for building a cuDNN runtime library
+# Offline version
+cuda_add_library(tensor_runtime ${RUNTIME_SRCS})
+cuda_add_cublas_to_target(tensor_runtime)
+target_link_libraries(tensor_runtime ${LINK_LIBS})
 
+# Online version
 remove_definitions(-DONLINE_PROFILING=false)
 add_definitions(-DONLINE_PROFILING=true)
-
-cuda_add_library(tensor_runtime_online   tensor_runtime/src/tensor_runtime.cu  tensor_runtime/src/half_precision_api.cu  tensor_runtime/src/tensor_utils.cu  tensor_runtime/src/fp16_gemm.cu  tensor_runtime/src/debug.cc tensor_runtime/src/global_data.cc  tensor_runtime/src/approx_techniques.cu  tensor_runtime/src/approx_techniques2.cu  tensor_runtime/src/approx_simulation.cu  tensor_runtime/src/profiling.cc  tensor_runtime/src/op_overheads.cc   tensor_runtime/src/error.cu  tensor_runtime/src/wrapper_runtime.cu )
-
+cuda_add_library(tensor_runtime_online ${RUNTIME_SRCS})
 cuda_add_cublas_to_target(tensor_runtime_online)
-
 target_link_libraries(tensor_runtime_online cudnn -lcurand)
 
 # Adding new rule for building a cuDNN runtime library
 cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc)
 target_link_libraries(tensor_cpu_runtime)
 
-
-
 ### TODO: Remove unsued CMake rules after careful consideration
 
 # Adding rule for the debugging source
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index e2905db99ae142b013215dcf90e0e0cbb9c5f70e..22a6b5ca951793d26003f0c5ff4dc1e7d4c39f95 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -1,5 +1,4 @@
-
-#include "tensor_utils.cu"
+#include "tensor_utils.h"
 
 
 //produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index 99c465434a2879d85624b7ff6bb4141dd8fe4634..75e60a1c4a5d6fda8bf32fcaee3e2b9d192cbbcd 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -11,24 +11,15 @@
 // Describes the internal choices made for an ApproxHPVM node
 class NodeConfiguration {
 public:
-  enum NODE_CONFIGURATION_TARGET
-  {
-    PROMISE,
-    GPU,
-    END
-  };
+  enum NODE_CONFIGURATION_TARGET { PROMISE, GPU, END };
 
 protected:
   enum NODE_CONFIGURATION_TARGET NODE_CONFIGURATION_TARGET_ID;
 
 public:
-  bool isPROMISENodeConfiguration() {
-    return NODE_CONFIGURATION_TARGET_ID == PROMISE;
-  }
+  bool isPROMISENodeConfiguration();
 
-  bool isGPUNodeConfiguration() {
-    return NODE_CONFIGURATION_TARGET_ID == GPU;
-  }
+  bool isGPUNodeConfiguration();
 
   virtual void print() = 0;
 };
@@ -36,30 +27,20 @@ public:
 class PROMISENodeConfiguration : public NodeConfiguration {
 public:
   // Approximation methods available for this HW type
-  enum APPROX
-  {
-    SWING_LEVEL,
-    END
-  };
+  enum APPROX { SWING_LEVEL, END };
 
 private:
   // A vector, containing pairs of approximation method and tunable parameter
   // (expressed as int, or ignored when not applicable)
-  std::vector< std::pair<enum APPROX, int> > ApproxChoices;
+  std::vector<std::pair<enum APPROX, int>> ApproxChoices;
 
 public:
-  void pushNewApproximationChoice(enum APPROX approx, int u) {
-    ApproxChoices.push_back(std::make_pair(approx, u));
-  }
+  void pushNewApproximationChoice(enum APPROX approx, int u);
 
-  std::vector<std::pair<enum APPROX, int> > &getApproxChoices() {
-    return ApproxChoices;
-  }
+  std::vector<std::pair<enum APPROX, int>> &getApproxChoices();
 
-  PROMISENodeConfiguration() {
-    NODE_CONFIGURATION_TARGET_ID = PROMISE;
-  }
-  ~PROMISENodeConfiguration() {}
+  PROMISENodeConfiguration();
+  ~PROMISENodeConfiguration();
 
   void print() override;
 };
@@ -67,20 +48,18 @@ public:
 class GPUNodeConfiguration : public NodeConfiguration {
 public:
   // Approximation methods available for this HW type
-  enum APPROX
-  {
+  enum APPROX {
     FP32,
     FP16,
     PERFORATION,
     INPUT_SAMPLING,
     REDUCTION_SAMPLING,
-//  ADDITIONAL_APPROXIMATION_METHOD
+    //  ADDITIONAL_APPROXIMATION_METHOD
     APPROX_END
   };
 
   // Operations to be approximated in the node using this configuration
-  enum TENSOR_OP
-  {
+  enum TENSOR_OP {
     ADD,
     BATCHNORM,
     CONV,
@@ -99,37 +78,30 @@ public:
     MAP1,
     MAP2,
     MAP3,
-//    STENCIL,
-//    COSINE_T,
-//  ADDITIONAL_TENSOR_OPERATION
+    //    STENCIL,
+    //    COSINE_T,
+    //  ADDITIONAL_TENSOR_OPERATION
     TENSOR_OP_END
   };
 
 private:
   // A vector, containing pairs of approximation method and tunable parameter
   // (expressed as int, or ignored when not applicable) for each operation
-  std::vector< std::pair< enum TENSOR_OP, std::vector< std::pair<enum APPROX, int> > > > ApproxChoices;
+  std::vector<
+      std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>>
+      ApproxChoices;
 
 public:
-  void pushNewTensorOperation(enum TENSOR_OP top) {
-    std::vector< std::pair<enum APPROX, int> > emptyVec;
-    ApproxChoices.push_back(std::make_pair(top, emptyVec));
-  }
-
-  void pushNewApproximationChoiceForOperation(enum APPROX approx, int u) {
-    unsigned size = ApproxChoices.size();
-    CUSTOM_ASSERT(size >=1 && "Cannot apply approximation choice to non existent operation.");
-    ApproxChoices[size-1].second.push_back(std::make_pair(approx, u));
-  }
-
-  std::vector< std::pair< enum TENSOR_OP, std::vector< std::pair<enum APPROX, int> > > > &getApproxChoices() {
-    return ApproxChoices;
-  }
-
-  GPUNodeConfiguration() {
-    NODE_CONFIGURATION_TARGET_ID = GPU;
-  }
-  ~GPUNodeConfiguration() {}
+  void pushNewTensorOperation(enum TENSOR_OP top);
+
+  void pushNewApproximationChoiceForOperation(enum APPROX approx, int u);
+
+  std::vector<
+      std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>> &
+  getApproxChoices();
+
+  GPUNodeConfiguration();
+  ~GPUNodeConfiguration();
 
   void print() override;
 };
@@ -140,33 +112,25 @@ public:
 // - energy
 // - accuracy (compared to golden output)
 // - accuracy loss (compared to baseline)
-// - a hardware choice and set or operations-approximation choices, described in setup
+// - a hardware choice and set or operations-approximation choices, described in
+// setup
 struct Configuration {
   std::string name;
   float speedup;
   float energy;
   float accuracy;
   float accuracyLoss;
-  std::map<std::string, NodeConfiguration * > setup;
+  std::map<std::string, NodeConfiguration *> setup;
 
-  Configuration(std::string &n, float f, float e, float a, float al) :
-    name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {}
+  Configuration(std::string &n, float f, float e, float a, float al);
 
-  float getSpeedup() {
-    return speedup;
-  }
+  float getSpeedup();
 
-  float getEnergy() {
-    return energy;
-  }
+  float getEnergy();
 
-  float getAccuracy() {
-    return accuracy;
-  }
+  float getAccuracy();
 
-  float getAccuracyLoss() {
-    return accuracyLoss;
-  }
+  float getAccuracyLoss();
 
   void print();
 };
@@ -174,187 +138,28 @@ struct Configuration {
 // Comparison operator definition, in increasing accuracy loss
 // (for std sort, used in pareto optimal computation)
 struct ConfigurationLessThan {
-  bool operator()(const struct Configuration &a, const struct Configuration &b) const {
-    return (a.accuracyLoss < b.accuracyLoss) ;
-  }
+  bool operator()(
+      const struct Configuration &a, const struct Configuration &b) const;
 };
 
 // Comparison operator definition, in increasing accuracy loss
 // (for std lower bound, used in pareto optimal frontier search)
 struct ConfigurationLessThan_AL {
-  bool operator()(const struct Configuration *a, const float &b) const {
-    return (a->accuracyLoss < b) ;
-  }
+  bool operator()(const struct Configuration *a, const float &b) const;
 };
 
 // Comparison operator definition, in increasing speedup
 // (for std lower bound, used in pareto optimal frontier search)
 struct ConfigurationLessThan_SP {
-  bool operator()(const struct Configuration *a, const float &b) const {
-    return (a->speedup < b) ;
-  }
+  bool operator()(const struct Configuration *a, const float &b) const;
 };
 
 // Comparison operator definition, in decreasing energy
 // (for std lower bound, used in pareto optimal frontier search)
 struct ConfigurationLessThan_E {
-  bool operator()(const struct Configuration *a, const float &b) const {
-    return (a->energy < b) ;
-  }
+  bool operator()(const struct Configuration *a, const float &b) const;
 };
 
-enum SEARCH_KIND
-  {
-    SPEEDUP,
-    ENERGY,
-    ACCURACY_LOSS,
-    END
-  };
-
-//****** HEADER Ends - Source Starts
-
-
-// Helper configuration print methods
-
-void PROMISENodeConfiguration::print() {
-
-  printf(" promise");
-  for (auto &it : ApproxChoices) {
-    printf(" ");
-    switch (it.first) {
-      case APPROX::SWING_LEVEL :
-        printf("swing_level");
-        break;
-      default:
-        ERROR("Unknown approximation option");
-        break;
-      // TODO additional approx methods to be printed here
-    }
-    printf(" %d", it.second);
-  }
-
-  printf("\n");
-
-}
-
-void GPUNodeConfiguration::print() {
-
-  printf(" gpu");
-  for (auto &it : ApproxChoices) {
-
-    printf(" ");
-    switch (it.first) {
-      case TENSOR_OP::ADD :
-        printf("add");
-        break;
-      case TENSOR_OP::BATCHNORM :
-        printf("batchnorm");
-        break;
-      case TENSOR_OP::CONV :
-        printf("conv");
-        break;
-      case TENSOR_OP::GROUP_CONV :
-        printf("group_conv");
-        break;
-      case TENSOR_OP::MUL :
-        printf("mul");
-        break;
-      case TENSOR_OP::RELU :
-        printf("relu");
-        break;
-      case TENSOR_OP::CLIPPED_RELU :
-        printf("clipped_relu");
-        break;
-      case TENSOR_OP::TANH :
-        printf("tanh");
-        break;
-      case TENSOR_OP::POOL_MAX :
-        printf("pool_max");
-        break;
-      case TENSOR_OP::POOL_MEAN :
-        printf("pool_mean");
-        break;
-      case TENSOR_OP::POOL_MIN :
-        printf("pool_min");
-        break;
-      case TENSOR_OP::SOFTMAX :
-        printf("softmax");
-        break;
-      case TENSOR_OP::FFT :
-        printf("fft");
-        break;
-      case TENSOR_OP::REDUCE :
-        printf("reduce");
-        break;
-      case TENSOR_OP::PROJECTIVE_T :
-        printf("projectiveT");
-        break;
-      case TENSOR_OP::MAP1 :
-        printf("map1");
-        break;
-      case TENSOR_OP::MAP2 :
-        printf("map2");
-        break;
-      case TENSOR_OP::MAP3 :
-        printf("map3");
-        break;
-      default :
-        ERROR("Unknown tensor operation.");
-        break;
-      // TODO additional operations to be printed here
-    }
-
-    auto &approxVec = it.second;
-    for (auto &inner_it : approxVec) {
-      printf(" ");
-      switch (inner_it.first) {
-        case APPROX::FP32 :
-          printf("fp32");
-          break;
-        case APPROX::FP16 :
-          printf("fp16");
-          break;
-        case APPROX::PERFORATION :
-          printf("perf");
-          break;
-        case APPROX::INPUT_SAMPLING :
-          printf("samp");
-          break;
-        case APPROX::REDUCTION_SAMPLING :
-          printf("red_samp");
-          break;
-        default:
-          ERROR("Unknown approximation option");
-          break;
-        // TODO additional approx methods to be printed here
-      }
-      
-      printf(" %d", inner_it.second);
-    }
-
-  }
-  
-  printf("\n");
-
-}
-
-void Configuration::print() {
-
-  printf("+++++\n");
-  printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, accuracyLoss);
-  for  (std::map<std::string, NodeConfiguration* >::const_iterator it = setup.begin();
-    it != setup.end(); ++it) {
-    printf("%s :", it->first.c_str());
-
-    it->second->print();
-  }
-
-  printf("-----\n");
-}
-
-
-
-
-
+enum SEARCH_KIND { SPEEDUP, ENERGY, ACCURACY_LOSS, END };
 
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
index 8c0922ba67451cdbb32e7af7925cf8b95ef0f647..235ac088da8bc9b19785f2e0f00bbb81afba42a6 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
@@ -2,6 +2,8 @@
 #define DEVICE_MATH_H
 
 #include <device_launch_parameters.h>
+#include <cuda_fp16.h>
+#include <stdexcept>
 #include <limits>
 
 enum class MathOp {
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
index 6cffc38201f43874de02a12a0fcc77369dc1786f..0b4d7d056550c6f7f32dbc2c276388328fce6e60 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
@@ -9,6 +9,7 @@
 
 #include "debug.h"
 #include "tensor.h"
+#include "profiling.h"
 
 template <typename T> __host__ __device__ __forceinline__ T ceilDiv(T a, T b) {
   return (a + b - 1) / b;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
index 5e2f672161f3fcfcc1c59c6ba6437bfa963c7a90..7727ffc34962a338e69f08b626ad504f2071244b 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
@@ -11,7 +11,7 @@
 #include "debug.h"
 #include "map_typing.h"
 #include "tensor.h"
-#include "tensor_utils.cu"
+#include "tensor_utils.h"
 
 template <size_t N> void mapPrecheck(const std::array<Tensor *, N> &srcs) {
   for (Tensor *src : srcs) {
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
index 111b50725e4ffc9d676762118d7c46124858c3a8..de5402942cf7be965a342ebfedf80d21475c49d3 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
@@ -5,7 +5,7 @@
 #include "common.h"
 #include "debug.h"
 #include "tensor.h"
-#include "tensor_utils.cu"
+#include "tensor_utils.h"
 
 // Between CUDA compute capability 1.0 and 7.5,
 // Least "max # threads per block" is 512, so 512 is used to be compatible;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
index cc3d785985ffac4ffc3863f19f27dae3bc0d3b52..a14fe22700de6a12017294dfec9f73e28c7bd0b9 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
@@ -47,7 +47,7 @@ extern int total_ops;
 extern std::vector<int> op_accuracies;
 extern std::vector<Range*> quant_ranges;
 
-std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr;
+extern std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr;
 
 extern std::unordered_map<void*, int> tracked_tensors;
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
index 6a3f9b610938b19ca33a3544ac30e78b7c264710..4e3542fdfae1ae3e5d8f074f791e2de05b21f9ad 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
@@ -1,4 +1,3 @@
-
 #ifndef LLVM_HPVM_RT_CONTROLLER_H
 #define LLVM_HPVM_RT_CONTROLLER_H
 
@@ -23,10 +22,7 @@
  * Check if a file exists
  * Return true if the file exists, false else
  */
-bool fileExists(const std::string& file) {
-  struct stat buf;
-  return (stat(file.c_str(), &buf) == 0);
-}
+bool fileExists(const std::string& file);
 
 class ProfileInfo {
   private:
@@ -77,184 +73,40 @@ class ProfileInfo {
   std::string out_file_name;
 
   // Functions
-  void resetCurrentIterationTime() {
-    time_compute_current_iteration = 0.0;
-    time_control_current_iteration = 0.0;
-    time_config_current_iteration = 0.0;
-  }
-
-  void resetCurrentIterationEnergy() {
-    energy_compute_current_iteration = 0.0;
-    energy_control_current_iteration = 0.0;
-    energy_config_current_iteration = 0.0;
-  }
-
-  void start_iteration() {
-    if (!in_iteration) {
-      resetCurrentIterationTime();
-      resetCurrentIterationEnergy();
-      tensor_time_info.push_back(std::vector< std::pair< std::string, double > > ());
-      tensor_energy_info.push_back(std::vector< std::pair< std::string, double > > ());
-      in_iteration = true;
-    }
-  }
+  void resetCurrentIterationTime();
+
+  void resetCurrentIterationEnergy();
+
+  void start_iteration();
 
   public:
-  void end_iteration() {
-    // Update time counters
-    time_compute += time_compute_current_iteration;
-    time_control += time_control_current_iteration;
-    time_config  += time_config_current_iteration;
-
-    time_total += (time_compute_current_iteration +
-                   time_control_current_iteration +
-                   time_config_current_iteration);
-
-    // Update energy counters
-    energy_compute += energy_compute_current_iteration;
-    energy_control += energy_control_current_iteration;
-    energy_config  += energy_config_current_iteration;
-
-    energy_total += (energy_compute_current_iteration +
-                     energy_control_current_iteration +
-                     energy_config_current_iteration);
-
-    // Save current iteration counters
-    compute_time_info.push_back(time_compute_current_iteration);
-    compute_energy_info.push_back(energy_compute_current_iteration);
-    control_time_info.push_back(time_control_current_iteration);
-    control_energy_info.push_back(energy_control_current_iteration);
-    config_time_info.push_back(time_config_current_iteration);
-    config_energy_info.push_back(energy_config_current_iteration);
-
-    // Note end of iteration
-    in_iteration = false;
-  }
-
-  void addToCurrentIterationComputeTime(const char *s, double t) {
-    start_iteration();
-    time_compute_current_iteration += t;
-    tensor_time_info.back().push_back(std::make_pair(std::string(s), t));
-  }
-
-  void addToCurrentIterationControlTime(double t) {
-    start_iteration();
-    time_control_current_iteration += t;
-  }
-
-  void addToCurrentIterationConfigTime(double t) {
-    start_iteration();
-    time_config_current_iteration += t;
-  }
-
-  void addToCurrentIterationComputeEnergy(const char *s, double e) {
-    start_iteration();
-    energy_compute_current_iteration += e;
-    tensor_energy_info.back().push_back(std::make_pair(std::string(s), e));
-  }
-
-  void addToCurrentIterationControlEnergy(double e) {
-    start_iteration();
-    energy_control_current_iteration += e;
-  }
-
-  void addToCurrentIterationConfigEnergy(double e) {
-    start_iteration();
-    energy_config_current_iteration += e;
-  }
-
-  double getTotalTime() {
-    return time_total;
-  }
-
-  double getTotalEnergy() {
-    return energy_total;
-  }
-
-  double getCurrentIterationComputeTime() {
-    return time_compute_current_iteration;
-  }
-
-  double getCurrentIterationComputeEnergy() {
-    return energy_compute_current_iteration;
-  }
-
-  void set_out_file_name(std::string &str) {
-    out_file_name = str;
-  }
-
-  void printToFile() {
-
-    INFO("Writing Runtime Profile Info File...\n");
-    std::ofstream s_out(out_file_name.c_str());
-
-    if (!s_out) {
-      ERROR("Failed to open output file.");
-      abort();
-    }
-
-    // By construction, tensor_time_info and tensor_energy_info are expected 
-    // to have equal sizes, in outer and inner vectors both,
-    // and all time_info and energy_info vectors must have the same size.
-    unsigned iterations = tensor_time_info.size();
-    CUSTOM_ASSERT((tensor_time_info.size() == iterations) &&
-                  (tensor_energy_info.size() == iterations) &&
-                  (control_time_info.size() == iterations) &&
-                  (control_energy_info.size() == iterations) &&
-                  (config_time_info.size() == iterations) &&
-                  (config_energy_info.size() == iterations) &&
-                  "time_info and energy_info size: \
-                   iteration number does not match.");
-
-    for (unsigned i = 0; i < tensor_time_info.size(); i++ ) {
-      // time_info.size() == energy_info.size(), since we passed the assertion
-      s_out << "Iteration " << i << "\n";
-      
-      CUSTOM_ASSERT((tensor_time_info[i].size() == tensor_energy_info[i].size()) &&
-                    "time_info and energy_info size: operation number does not match.");
-      for (unsigned j = 0; j < tensor_time_info[i].size(); j++) {
-        // time_info[i].size() == energy_info[i].size(), we passed the assertion
-        CUSTOM_ASSERT((tensor_time_info[i][j].first == tensor_energy_info[i][j].first) &&
-                      "time_info and energy_info: operation does not match.");
-        s_out << tensor_time_info[i][j].first << " "
-              << tensor_time_info[i][j].second << " "
-              << tensor_energy_info[i][j].second << "\n";
-      }
-
-      s_out << "\nIteration Compute Time  : " << compute_time_info[i] << "\n";
-      s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n";
-      s_out << "Iteration Control Time  : " << control_time_info[i] << "\n";
-      s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n";
-      s_out << "Iteration Config Time  : " << config_time_info[i] << "\n";
-      s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n\n\n";
-
-    }
-    s_out << "\n\nTotal Compute Time  : " << time_compute << "\n";
-    s_out << "Total Compute Energy: " << energy_compute << "\n";
-
-    s_out << "\nTotal Control Time  : " << time_control << "\n";
-    s_out << "Total Control Energy: " << energy_control << "\n";
-
-    s_out << "\nTotal Config Time  : " << time_config << "\n";
-    s_out << "Total Config Energy: " << energy_config << "\n";
-
-    s_out << "\nTotal Time  : " << time_total << "\n";
-    s_out << "Total Energy: " << energy_total << "\n";
-
-    s_out.close();
-
-    INFO("Done writing profile.\n");
- 
-  }
-
-  ProfileInfo() : time_total(0.0), energy_total(0.0),
-                  time_compute_current_iteration(0.0),
-                  time_control_current_iteration(0.0),
-                  time_config_current_iteration(0.0),
-                  energy_compute_current_iteration(0.0),
-                  energy_control_current_iteration(0.0),
-                  energy_config_current_iteration(0.0),
-                  in_iteration(false) {}
+  void end_iteration();
+
+  void addToCurrentIterationComputeTime(const char *s, double t);
+
+  void addToCurrentIterationControlTime(double t);
+
+  void addToCurrentIterationConfigTime(double t);
+
+  void addToCurrentIterationComputeEnergy(const char *s, double e);
+
+  void addToCurrentIterationControlEnergy(double e);
+
+  void addToCurrentIterationConfigEnergy(double e);
+
+  double getTotalTime();
+
+  double getTotalEnergy();
+
+  double getCurrentIterationComputeTime();
+
+  double getCurrentIterationComputeEnergy();
+
+  void set_out_file_name(std::string &str);
+
+  void printToFile();
+
+  ProfileInfo();
 
 };
 
@@ -264,32 +116,11 @@ class Slowdowns {
   unsigned idx;
 
   public:
-    Slowdowns() {
-      idx = 0;
-
-      std::ifstream s_in("slowdowns.txt");
-      if (!s_in) {
-        DEBUG("slowdowns file not found. Initializing slowdowns randomly.\n");
-        for (unsigned i = 0; i < 10; i++) {
-          slowdowns.push_back( 1.0 + (rand()/(RAND_MAX/(5.0-1.0))) );
-        }
-      } else {
-        for (std::string line; std::getline(s_in, line); ) {
-          float s = std::stof(line);
-          slowdowns.push_back(s);
-        }
-      }
-    }
-
-  unsigned getSlowdownsNumber() {
-    return slowdowns.size();
-  }
-
-  float getNextSlowdown() {
-    float tmp = slowdowns[idx];
-    idx = (idx + 1) % slowdowns.size();
-    return tmp;
-  }
+    Slowdowns();
+
+  unsigned getSlowdownsNumber();
+
+  float getNextSlowdown();
 
 };
 
@@ -333,14 +164,8 @@ class RuntimeController {
   //Functions
 
   // Private functions of profiler
-  void start_profiler() {
-    if (profiler)
-      profiler->start_profiler();
-  }
-  void stop_profiler() {
-    if (profiler)
-      profiler->stop_profiler();
-  }
+  void start_profiler();
+  void stop_profiler();
 
   void setProfileInfoFilename(const char *);
   void readQuantizationFile(const char *);
@@ -351,33 +176,17 @@ class RuntimeController {
 
   public:
   // For testing purposes only - do not use widely
-  std::vector<struct Configuration *> &getSpeedupConfigurations() {
-    return SpeedupConfigurations;
-  }
+  std::vector<struct Configuration *> &getSpeedupConfigurations();
   // For testing purposes only - do not use widely
-  std::vector<struct Configuration *> &getEnergyConfigurations() {
-    return EnergyConfigurations;
-  }
+  std::vector<struct Configuration *> &getEnergyConfigurations();
   // For testing purposes only - do not use widely
-  std::vector<struct Configuration *> &getThreeDCurveConfigurations() {
-    return ThreeDCurveConfigurations;
-  }
+  std::vector<struct Configuration *> &getThreeDCurveConfigurations();
   // For testing purposes only - do not use widely
-  unsigned getConfigurationIdx() {
-    return configurationIdx;
-  }
-
-  std::vector<float> &getQuantizationRanges(const char *data) {
-    std::string s(data);
-    // All nodes are expected to have quantization ranges
-    return QuantizationMap.at(s);
-  }
-
-  NodeConfiguration *getNodeConfiguration(const char *data) {
-    std::string s(data);
-    // All nodes are expected to have a configuration
-    return (*Configurations)[configurationIdx]->setup.at(s);
-  }
+  unsigned getConfigurationIdx();
+
+  std::vector<float> &getQuantizationRanges(const char *data);
+
+  NodeConfiguration *getNodeConfiguration(const char *data);
 
   // Functions for runtime control
   void findNextConfiguration();
@@ -387,105 +196,37 @@ class RuntimeController {
   double getBaselineTime();
   Slowdowns *getSlowdowns();
 
-  void init(const char *Cstr, const char *Qstr) {
-    // We initialize the path to the profile info output file,
-    // based on the path given for the configuration file
-    setProfileInfoFilename(Cstr);
-
-    readQuantizationFile(Qstr);
-    readConfigurationFile(Cstr);
-    Configurations = NULL;
-    computeParetoConfigurationPoints();
-//    compute3DParetoConfigurationPoints(); Not using 3D curve
-    INFO("Speedup Configurations\n");
-    printConfigurations(SpeedupConfigurations);
-//    INFO("Energy Configurations\n");
-//    printConfigurations(EnergyConfigurations);
-//    INFO("3D Configurations\n");
-//    printConfigurations(ThreeDCurveConfigurations);
-    configurationIdx = 0; //TODO: initialize using pareto curve - findTargetConfiguration ?
-    Configurations = &SpeedupConfigurations;
-
-    // Initializations for different runtime control strategies
-    srand(static_cast <unsigned> (time(0)));
-    slowdowns = new Slowdowns();
-    pseudo_rd = 0.0;
-
-    // Start profiling thread in the background, ready to time
-    start_profiler();
-    pause_profiler();
-    reset_profiler();
- }
+  void init(const char *Cstr, const char *Qstr);
 
   // Exposing functionality of ProfileInfo
-  void end_iteration() {
-    if (PI)
-      PI->end_iteration();
-  }
-
-  void addToCurrentIterationComputeTime(const char *s, double t) {
-    if (PI)
-      PI->addToCurrentIterationComputeTime(s, t);
-  }
-
-  void addToCurrentIterationControlTime(double t) {
-    if (PI)
-      PI->addToCurrentIterationControlTime(t);
-  }
-
-  void addToCurrentIterationConfigTime(double t) {
-    if (PI)
-      PI->addToCurrentIterationConfigTime(t);
-  }
-
-  void addToCurrentIterationComputeEnergy(const char *s, double e) {
-    if (PI)
-      PI->addToCurrentIterationComputeEnergy(s, e);
-  }
-
-  void addToCurrentIterationControlEnergy(double e) {
-    if (PI)
-      PI->addToCurrentIterationControlEnergy(e);
-  }
-
-  void addToCurrentIterationConfigEnergy(double e) {
-    if (PI)
-      PI->addToCurrentIterationConfigEnergy(e);
-  }
-
-  double getCurrentIterationComputeTime() {
-    return (PI ? PI->getCurrentIterationComputeTime() : 0.0) ;
-  }
-
-  double getCurrentIterationComputeEnergy() {
-    return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0) ;
-  }
-
-  void writeProfileInfo() {
-    if (PI)
-      PI->printToFile();
-  }
+  void end_iteration();
+
+  void addToCurrentIterationComputeTime(const char *s, double t);
+
+  void addToCurrentIterationControlTime(double t);
+
+  void addToCurrentIterationConfigTime(double t);
+
+  void addToCurrentIterationComputeEnergy(const char *s, double e);
+
+  void addToCurrentIterationControlEnergy(double e);
+
+  void addToCurrentIterationConfigEnergy(double e);
+
+  double getCurrentIterationComputeTime();
+
+  double getCurrentIterationComputeEnergy();
+
+  void writeProfileInfo();
 
   // Exposing functionality of (gpu) profiler
-  void resume_profiler() {
-    if (profiler)
-      profiler->resume_profiler();
-  }
-
-  void pause_profiler() {
-    if (profiler)
-      profiler->pause_profiler();
-  }
-
-  void reset_profiler() {
-    if (profiler)
-      profiler->reset();
-  }
-
-  std::pair<double, double> get_time_energy() const {
-    return (profiler ? profiler->get_time_energy()
-                     : std::make_pair(0.0, 0.0)) ;
-  }
+  void resume_profiler();
+
+  void pause_profiler();
+
+  void reset_profiler();
+
+  std::pair<double, double> get_time_energy() const;
 
   // Exposing functionality of promise simulator
   std::pair<double, double> fc_profile(const unsigned num_rows_a,
@@ -493,15 +234,7 @@ class RuntimeController {
                             const unsigned num_rows_b,
                             const unsigned num_cols_b,
                             const unsigned voltage_swing,
-                            const unsigned patch_factor) {
-    return (promise ? promise->fc_profile(num_rows_a,
-                                          num_cols_a,
-                                          num_rows_b,
-                                          num_cols_b,
-                                          voltage_swing,
-                                          patch_factor)
-                    : std::make_pair(0.0, 0.0)) ;
-  }
+                            const unsigned patch_factor);
 
   std::pair<double, double> conv_profile(const unsigned n,
                             const unsigned c,
@@ -514,58 +247,12 @@ class RuntimeController {
                             const unsigned s_h,
                             const unsigned s_w,
                             const unsigned voltage_swing,
-                            const unsigned patch_factor) {
-    return (promise ? promise->conv_profile(n, c, h, w,
-                                            c_out, c_in, k_h, k_w,
-                                            s_h, s_w,
-                                            voltage_swing, patch_factor)
-                    : std::make_pair(0.0, 0.0)) ;
-  }
+                            const unsigned patch_factor);
 
   // Constructor and descructor
-  RuntimeController() {
-    configurationIdx = 0;
-#ifdef ACTIVE_PROFILING
-    PI = new ProfileInfo();
-    profiler = new Profiler();
-    promise = new Promise();
-#else
-    PI = NULL;
-    profiler = NULL;
-    promise = NULL;
-#endif
+  RuntimeController();
 
-  }
-
-  ~RuntimeController() {
-
-    stop_profiler();
-    writeProfileInfo();
-
-    if (PI) {
-      delete PI;
-    }
-    if (profiler) {
-      delete profiler;
-    }
-    if (promise) {
-      delete promise;
-    }
-
-    for (std::vector<struct Configuration>::iterator it = InitialConfigurations.begin(),
-	   ie = InitialConfigurations.end(); it != ie; ++it) {
-      std::map<std::string, NodeConfiguration * > ConfSetup = it->setup;
-      for (std::map<std::string, NodeConfiguration* >::const_iterator it = ConfSetup.begin();
-	   it != ConfSetup.end(); ++it) {
-        delete it->second;
-      }
-    }
-  // Handle freeing memory, for all configurations
-  // A way to do that is to not free the initial configurations in the pareto curve,
-  // and free all at once in the end
-  // This is done because configurations are stored in different containers, but
-  // share the node setup
-  }
+  ~RuntimeController();
 
   // Helper Functions
   void printQuantizationMap();
@@ -573,1037 +260,35 @@ class RuntimeController {
   void printConfigurations(std::vector<struct Configuration *> &);
 
 };
-
-void RuntimeController::setProfileInfoFilename(const char *str) {
-
-  if (PI) {
-    std::string file_path = std::string(str);
-    size_t idx = file_path.find_last_of("/");
-    file_path.erase(idx + 1);
-    file_path.append("profile_info_");
-
-    bool found = false;
-    std::string profile_filename;
-    for (unsigned i = 0; !found; i++) {
-      profile_filename = file_path;
-      profile_filename.append(std::to_string(i));
-      profile_filename.append(".txt");
-      found = !fileExists(profile_filename);
-    }
-
-    PI->set_out_file_name(profile_filename);
-  }
-}
-
 #define NODE_NAME_BUFFER_SIZE 10
-  
-void RuntimeController::readQuantizationFile(const char *str) {
-
-  INFO("Reading Quantization Ranges File...\n");
- 
-  if (std::string(str).empty()) {
-    INFO("Empty quantization file string.\n");
-    return;
-  } 
-
-  std::ifstream qin(str);
-
-  if (!qin) {
-    ERROR("Failed to open PROMISE quantization file.");
-    abort();
-  }
-
-  while (!qin.eof()) {
-    char NodeName[NODE_NAME_BUFFER_SIZE];
-    std::vector<float> QuantRangeVector;
-
-    qin >> NodeName;
-    
-    float qrange;
-    for (unsigned i = 0; i < 8; i++ ) {
-      qin >> qrange;
-      QuantRangeVector.push_back(qrange);
-    }
-    // See if we need to insert this in map instead - my lookup test seemed to work without it
-    // std::string s(NodeName);
-    QuantizationMap.insert(std::pair< std::string, std::vector<float> > (NodeName,
-									 QuantRangeVector));
-  }
-
-  qin.close();
-  INFO("DONE.\n");
-}
-
-void RuntimeController::printQuantizationMap() {
-
-  DEBUG("Quantization Ranges Map:\n");
-
-  for  (std::map<std::string, std::vector<float> >::const_iterator it = QuantizationMap.begin();
-	it != QuantizationMap.end(); ++it) {
-    DEBUG("%s :", it->first.c_str());
-
-    for (unsigned i = 0; i < it->second.size() ; i++) {
-      DEBUG(" %f", it->second[i]);
-    }
-
-    DEBUG("\n");
-  }
-
-}
-
-
-void RuntimeController::readConfigurationFile(const char *str) {
-
-  INFO("Reading Configuration File...\n");
-
-  std::ifstream qin(str);
-
-  if (!qin) {
-    ERROR("Failed to open configuration file.");
-    abort();
-  }
-
-  bool readingConfiguration = false;
-  bool readingFirstLine = false;
-
-  // Read baseline_time from first line of configuration file
-  std::string first_line;
-  std::getline(qin, first_line);
-  DEBUG("first_line: %s\n", first_line.c_str());
-  baseline_time = std::stod(first_line);
-  DEBUG("Baseline time: %lf\n\n", baseline_time);
-
-  for (std::string line; std::getline(qin, line); ) {
-    DEBUG("line: %s\n", line.c_str());
-
-    // Tokenize using ' ' as delimiter
-    // Vector to store tokens
-    std::vector<std::string> tokens;
-
-    for (auto i = strtok(&line[0], " "); i != NULL; i = strtok(NULL, " "))
-      tokens.push_back(i);
-
-    for (unsigned i = 0; i < tokens.size(); i++ )
-      DEBUG("t: %s\n", tokens[i].c_str());
-
-    DEBUG("\n");
-
-    if (tokens[0] == "+++++") { // Found new configuration start token
-      // Mark the start of a new configuration
-      readingConfiguration = true;
-      readingFirstLine = true;
-      continue;
-    }
-
-    if (tokens[0] == "-----") { // Found configuration end token
-      readingConfiguration = false;
-      // Mark the end of current configuration
-      continue;
-    }
-
-    if (readingFirstLine) {
-      // Read first line, to create the new configuration struct
-      readingFirstLine = false;
-      InitialConfigurations.push_back(Configuration(tokens[0],
-                                      std::stof(tokens[1]),
-                                      std::stof(tokens[2]),
-                                      std::stof(tokens[3]),
-                                      std::stof(tokens[4])));
-      continue;
-    }
-
-    if (tokens[1] == "promise") {
-      DEBUG("Found promise configuration\n");
-
-      // There must be at least one approximation option
-      CUSTOM_ASSERT((tokens.size() >= 2) && "Not enough approximation options.");
-
-      PROMISENodeConfiguration *NodeConf = new PROMISENodeConfiguration();
-      InitialConfigurations.back().setup.insert(std::make_pair(tokens[0], NodeConf));
-
-      // In increments of two, to handle pairs of approx option - tunable parameter
-      for (unsigned idx = 2; idx < tokens.size(); idx += 2) {
-	if (tokens[idx] == "swing_level") {
-	  DEBUG("Found swing voltage option\n");
-	  int vswing = std::stoi(tokens[idx+1]);
-	  DEBUG("vswing: %d\n", vswing);
-	  NodeConf->pushNewApproximationChoice(PROMISENodeConfiguration::APPROX::SWING_LEVEL,
-					       vswing);
-	}
-	// TODO: other approximation options handled here
-      }
-
-    } else if (tokens[1] == "gpu") {
-      DEBUG("Found gpu configuration\n");
-
-      // There must be at least one operation, with an approximation option
-      CUSTOM_ASSERT((tokens.size() >= 5) && "Not enough operations - approximation options.");
-
-      GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration();
-      InitialConfigurations.back().setup.insert(std::make_pair(tokens[0], NodeConf));
-
-      unsigned idx = 2;
-      while (idx < tokens.size()) {
-	if (tokens[idx] == "add") {
-	  DEBUG("Found add operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::ADD);
-	  idx++;
-	} else if (tokens[idx] == "batchnorm") {
-	  DEBUG("Found batchnorm operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::BATCHNORM);
-	  idx++;
-	} else if (tokens[idx] == "conv") {
-	  DEBUG("Found conv operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::CONV);
-	  idx++;
-	} else if (tokens[idx] == "group_conv") {
-	  DEBUG("Found group_conv operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::GROUP_CONV);
-	  idx++;
-	} else if (tokens[idx] == "mul") {
-	  DEBUG("Found mul operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MUL);
-	  idx++;
-	} else if (tokens[idx] == "relu") {
-	  DEBUG("Found relu operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::RELU);
-	  idx++;
-	} else if (tokens[idx] == "clipped_relu") {
-	  DEBUG("Found clipped_relu operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU);
-	  idx++;
-	} else if (tokens[idx] == "tanh") {
-	  DEBUG("Found tanh operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::TANH);
-	  idx++;
-	} else if (tokens[idx] == "pool_max") {
-	  DEBUG("Found pool_max operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::POOL_MAX);
-	  idx++;
-	} else if (tokens[idx] == "pool_mean") {
-	  DEBUG("Found pool_mean operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::POOL_MEAN);
-	  idx++;
-	} else if (tokens[idx] == "pool_min") {
-	  DEBUG("Found pool_min operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::POOL_MIN);
-	  idx++;
-	} else if (tokens[idx] == "softmax") {
-	  DEBUG ("Found softmax operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::SOFTMAX);
-	  idx++;
-	} else if (tokens[idx] == "fft") {
-	  DEBUG ("Found fft operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::FFT);
-	  idx++;
-	} else if (tokens[idx] == "reduce") {
-	  DEBUG ("Found reduce operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::REDUCE);
-	  idx++;
-	} else if (tokens[idx] == "projectiveT") {
-	  DEBUG ("Found projectiveT operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T);
-	  idx++;
-	} else if (tokens[idx] == "map1") {
-	  DEBUG ("Found map1 operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP1);
-	  idx++;
-	} else if (tokens[idx] == "map2") {
-	  DEBUG ("Found map2 operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP2);
-	  idx++;
-	} else if (tokens[idx] == "map3") {
-	  DEBUG ("Found map3 operation\n");
-	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP3);
-	  idx++;
-	} else /*Not a new operation. This means an approximation option*/
-	  if (tokens[idx] == "fp32") {
-	    DEBUG("Found fp32 option\n");
-	    int fp32 = std::stoi(tokens[idx+1]);
-	    DEBUG("fp32 parameter: %d, ignoring\n", fp32);
-	    NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::FP32,
-							     fp32);
-	    idx += 2;
-	  } else if (tokens[idx] == "fp16") {
-	    DEBUG("Found fp16 option\n");
-	    int fp16 = std::stoi(tokens[idx+1]);
-	    DEBUG("fp16 parameter: %d, ignoring\n", fp16);
-	    NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::FP16,
-							     fp16);
-	    idx += 2;
-	  } else if (tokens[idx] == "perf") {
-	    DEBUG("Found perf option\n");
-        int perf = std::stoi(tokens[idx+1]);
-	    DEBUG("perf parameter: %d\n", perf);
-        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::PERFORATION, perf);
-          idx += 2;
-        } else if (tokens[idx] == "samp") {
-	    DEBUG("Found samp option\n");
-        int samp = std::stoi(tokens[idx+1]);
-	    DEBUG("samp parameter: %d\n", samp);
-        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::INPUT_SAMPLING, samp);
-          idx += 2;
-        } else if (tokens[idx] == "red_samp") {
-	    DEBUG("Found red_samp option\n");
-        int red_samp = std::stoi(tokens[idx+1]);
-	    DEBUG("red_samp parameter: %d\n", red_samp);
-        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp);
-          idx += 2;
-        }
-	// TODO: other approximation options handled here
-
-      }
-
-    } else {
-      DEBUG ("Invalid Configuration File\n");
-      exit(1);
-    }
-
-  }
-
-  qin.close();
-  DEBUG("DONE.\n");
-
-}
-
 #define AL_THRESHOLD 0.01
-void RuntimeController::computeParetoConfigurationPoints() {
-
-  // Keep indices of pareto optimal points (configurations from
-  // InitialConfigurations vector that were copied to Configurations vector.)
-  // The others' setup pointer needs to be deleted
-  std::vector<unsigned> Indices;
-
-  // Baseline configuration (first one we read) always belongs to the curve
-  SpeedupConfigurations.push_back(&InitialConfigurations[0]);
-  EnergyConfigurations.push_back(&InitialConfigurations[0]);
-
-  // Sort the configurations according to accuracy loss
-  INFO("Sorting autotuner configurations...\n");
-  std::sort(InitialConfigurations.begin()+1,
-            InitialConfigurations.end(),
-            ConfigurationLessThan());
-  INFO("Done sorting.\n");
-
-  for (unsigned start_idx = 1; start_idx < InitialConfigurations.size(); ) {
-    // Points to first Configuration with different (higher) accuracy loss
-    // compared to the one pointed by start_idx 
-    unsigned end_idx = start_idx + 1;
-    while ((end_idx < InitialConfigurations.size()) &&
-           (InitialConfigurations[end_idx].accuracyLoss -
-            InitialConfigurations[start_idx].accuracyLoss < AL_THRESHOLD)) {
-      end_idx++;
-    }
-    DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx);
-    // Now, all elements in [start_idx, end_idx) have equal accuracy loss,
-    // that is lower from later ones.
-
-    // Find the best speedup and energy between them as well
-    float sp = -1.0; //FLT_MIN
-    unsigned sp_idx = 0;
-
-    float en = -1.0; //FLT_MIN
-    unsigned en_idx = 0;
-
-    for (unsigned i = start_idx; i < end_idx; i++) {
-      if (InitialConfigurations[i].speedup > sp) {
-        sp = InitialConfigurations[i].speedup;
-        sp_idx = i;
-      }
-      if (InitialConfigurations[i].energy > en) {
-        en = InitialConfigurations[i].energy;
-        en_idx = i;
-      }
-    }
-    DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
-          InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
-    // Found best speedup for this accuracy point (not dominated by any of these).
-    DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n",
-          InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
-    // Found best energy for this accuracy point (not dominated by any of these).
-
-    // Now, we need to check that it is not dominated.
-    // - better accuracy loss of all in initial configurations out of
-    // start_idx, end_idx range
-    // - better or equal speedup to the ones within this range
-    // We only need to check the points already in Configurations, that have
-    // already been inserted in pareto frontier. These have better accuracy
-    // loss, so this one will only be added if it shows better speedup
-    // The one in curve with best speedup so far is the last one (with worst
-    // = highest accuracy loss), so compare only with that one.
-
-    // Similar handling of energy vector
-
-    bool sp_notDominated = true;
-    if (!SpeedupConfigurations.empty()) {
-      if (SpeedupConfigurations.back()->speedup >= sp)
-        sp_notDominated = false;
-    }
-
-    bool en_notDominated = true;
-    if (!EnergyConfigurations.empty()) {
-      if (EnergyConfigurations.back()->energy >= en)
-        en_notDominated = false;
-    }
-
-    DEBUG("sp_notDominated = %d\n", sp_notDominated);
-    DEBUG("en_notDominated = %d\n", en_notDominated);
-
-    // If not dominated, insert in pareto frontier set
-    if (sp_notDominated) {
-      SpeedupConfigurations.push_back(&InitialConfigurations[sp_idx]);
-    }
-    if (en_notDominated) {
-      EnergyConfigurations.push_back(&InitialConfigurations[en_idx]);
-    }
-
-    // Keep track of unnecessary configurations
-    for (unsigned i = start_idx; i < end_idx; i++) {
-      if (((i != sp_idx) || (!sp_notDominated)) &&
-          ((i != en_idx) || (!en_notDominated)))
-        Indices.push_back(i);
-    }
-
-    // Continue from next accuracy loss level
-    start_idx = end_idx;
-
-  }
-
-  // All elements in InitialConfigurations whose index is in Indices are no
-  // longer needed.
-//  for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e = Indices.end();
-//       idx_it != idx_e; ++idx_it) {
-//    std::map<std::string, NodeConfiguration * > ConfSetup =
-//      InitialConfigurations[*idx_it].setup;
-//    for (std::map<std::string, NodeConfiguration* >::const_iterator it = ConfSetup.begin();
-//     it != ConfSetup.end(); ++it) {
-//      delete it->second;
-//    }
-//  }
-//  InitialConfigurations.clear();
-
-}
-
-void RuntimeController::compute3DParetoConfigurationPoints() {
-
-  // Sort the configurations according to accuracy loss
-  INFO("Sorting autotuner configurations...\n");
-  std::sort(InitialConfigurations.begin(),
-            InitialConfigurations.end(),
-            ConfigurationLessThan());
-  INFO("Done sorting.\n");
-
-  for (unsigned start_idx = 0; start_idx < InitialConfigurations.size(); ) {
-    // Points to first Configuration with different (higher) accuracy loss
-    // compared to the one pointed by start_idx 
-    unsigned end_idx = start_idx + 1;
-    while ((end_idx < InitialConfigurations.size()) &&
-           (InitialConfigurations[end_idx].accuracyLoss -
-            InitialConfigurations[start_idx].accuracyLoss < AL_THRESHOLD)) {
-      end_idx++;
-    }
-    DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx);
-    // Now, all elements in [start_idx, end_idx) have equal accuracy loss,
-    // that is lower from later ones and worse than those already in curve
-    // (so they cannot displace them).
-
-    // Find candidates from [start_idx, end_idx) to be inserted
-    // Keep their indices. If a point is dominated (strictly worse),
-    // its index will not be inserted
-    std::vector<unsigned> Indices;
-
-    for (unsigned i = start_idx; i < end_idx; i++) {
-      bool dominated = false;
-      for (unsigned j = i+1; (j < end_idx) && !dominated; j++) {
-        if ((InitialConfigurations[i].speedup < InitialConfigurations[j].speedup) &&
-            (InitialConfigurations[i].energy < InitialConfigurations[j].energy)) {
-          dominated = true;
-        }
-      }
-      if (!dominated) {
-        DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
-              InitialConfigurations[i].accuracyLoss,
-              InitialConfigurations[i].speedup,
-              InitialConfigurations[i].energy,
-              i);
-        Indices.push_back(i);
-      }
-    }
-
-    for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e = Indices.end();
-         idx_it != idx_e; ++idx_it) {
-      Configuration &CandidateConfiguration = InitialConfigurations[*idx_it];
-
-      if (!ThreeDCurveConfigurations.empty()) {
-        bool notDominated = true;
-        for (unsigned i = 0; (i < ThreeDCurveConfigurations.size()) && notDominated; i++) {
-          if ((CandidateConfiguration.speedup <= ThreeDCurveConfigurations[i]->speedup) &&
-              (CandidateConfiguration.energy <= ThreeDCurveConfigurations[i]->energy)) {
-            // This configuration is not better, in at least one characteristic,
-            // compared to the existing ones in the curve.
-            notDominated = false;
-          }
-        }
-        if (notDominated) {
-          ThreeDCurveConfigurations.push_back(&CandidateConfiguration);
-        }
-      } else {
-        // If the curve is empty, we know that this is a point that must be
-        // inserted. It has the best accuracy loss, and belongs here because
-        // it is not dominated by any point in this accuracy range.
-        ThreeDCurveConfigurations.push_back(&CandidateConfiguration);
-      }
-    }
-
-    // Continue from next accuracy loss level
-    start_idx = end_idx;
-  }
-
-}
-
-
-void RuntimeController::printConfigurations(std::vector<struct Configuration> &Confs) {
-
-  for (std::vector<struct Configuration>::iterator it = Confs.begin(),
-       ie = Confs.end(); it != ie; ++it) {
-    it->print();
-  }
-
-}
-
-void RuntimeController::printConfigurations(std::vector<struct Configuration *> &Confs) {
-
-  for (std::vector<struct Configuration *>::iterator it = Confs.begin(),
-       ie = Confs.end(); it != ie; ++it) {
-    (*it)->print();
-  }
-
-}
-
-void RuntimeController::findNextConfiguration() {
-  configurationIdx = (configurationIdx + 1) % Configurations->size() ;
-  DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n", configurationIdx);
-}
-
-void RuntimeController::findTargetConfiguration(float goal, enum SEARCH_KIND sk) {
-  // We search in range begin(), end()-1 . It is OK to decrement end(), because
-  // the configurations vector always points to one of the pareto curves, and
-  // they are never empty - we have always pushed at least one configuration.
-
-  DEBUG("findTargetConfiguration: goalVal: %f, search kind: %d.\n", goal, sk);
-  std::vector<struct Configuration *>::iterator low_it;
-  switch (sk) {
-    case SPEEDUP:
-      {
-        Configurations = &SpeedupConfigurations;
-        low_it = std::lower_bound(Configurations->begin(),
-                                  Configurations->end()-1,
-                                  goal,
-                                  ConfigurationLessThan_SP());
-        configurationIdx = low_it - Configurations->begin();
-        break;
-      }
-    case ENERGY:
-      {
-        Configurations = &EnergyConfigurations;
-        low_it = std::lower_bound(Configurations->begin(),
-                                  Configurations->end()-1,
-                                  goal,
-                                  ConfigurationLessThan_E());
-        configurationIdx = low_it - Configurations->begin();
-        break;
-      }
-    case ACCURACY_LOSS:
-      {
-        Configurations = &SpeedupConfigurations;
-        low_it = std::lower_bound(Configurations->begin(),
-                                  Configurations->end()-1,
-                                  goal,
-                                  ConfigurationLessThan_AL());
-        if ((*low_it)->accuracyLoss > goal)
-          --low_it;
-        configurationIdx = low_it - Configurations->begin();
-        break;
-      }
-    default:
-      {
-        CUSTOM_ASSERT(false && "Unknown search option for optimization target");
-        ERROR("Unknown search option for optimization target.");
-        abort();
-      }
-  }
-  // After search, low_it points to the Configuration to the element with the
-  // goal value or the immediately lower value if it does not exist
-
-  DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n", configurationIdx);
-
-}
-
-void RuntimeController::adjustTargetConfiguration(float goal) {
-
-  DEBUG("adjustTargetConfiguration: goalVal: %f.\n\n", goal);
-
-  // Find configuration before the selected one. There is always one.
-  // Get the two configurations' speedup, and compute the appropriate ranges
-  float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup;
-  float prev_conf_speedup = (*Configurations)[configurationIdx-1]->speedup;
-  float sp_diff = curr_conf_speedup - prev_conf_speedup;
-
-  float high_range = curr_conf_speedup - goal;
-  float low_range = goal - prev_conf_speedup;
-
-  // These represent how likely we are to pick the upper or lower configuration
-  float high_pb = low_range / sp_diff;
-  float low_pb = high_range / sp_diff;
-
-  DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: %f.\n",
-        ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
-  DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: %f.\n\n",
-        ((*Configurations)[configurationIdx-1]->name).c_str(), low_pb);
-
-  // Select a random number from 0 to 1
-  // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1) to the upper
-//  float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ;
-  pseudo_rd += 0.1f;
-  float rd = pseudo_rd;
-  if (rd < low_pb) {
-    // If the probability is in the low range
-    configurationIdx--;
-  }
-
-  DEBUG("adjustTargetConfiguration: rand: %f : Updated configurationIdx to %u.\n",
-        rd, configurationIdx);
-}
-
 #define MAX_GOAL_SPEEDUP 9
-float RuntimeController::getGoalSpeedup() {
-  return 1.0 + (rand() / (RAND_MAX / (MAX_GOAL_SPEEDUP - 1.0) ) );
-}
-
-double RuntimeController::getBaselineTime() {
-  return baseline_time;
-}
-
-Slowdowns *RuntimeController::getSlowdowns() {
-  return slowdowns;
-}
 
 // Functions to be inserted with initializeTensorRT and clearTensorRT
-void llvm_hpvm_initializeRuntimeController(const char *ConfigFile, const char *QRangeFile) {
-  RC = new RuntimeController();
-  RC->init(ConfigFile, QRangeFile);
-  return;
-}
+void llvm_hpvm_initializeRuntimeController(const char *ConfigFile, const char *QRangeFile);
 
-void llvm_hpvm_clearRuntimeController() {
-  delete RC;
-  return;
-}
+void llvm_hpvm_clearRuntimeController();
 
 //*** Methods to compute accuracy of a tensor by the runtime controller   ***//
-uint32_t* labels_from_file = NULL;
-
-uint32_t* hpvm_rt_readLabelsBatch_cached(const char* labels_file, int start, int end) {
-
-  // Initialize buffer
-  if (!labels_from_file) {
-    FILE* file = fopen(labels_file, "rb");
-    if (file == NULL) {
-      ERROR("Data file %s is not found. Aborting...\n", labels_file);
-      abort();
-    }
-    // Get number of labels
-    fseek(file, 0, SEEK_END);
-    long size = ftell(file);
-    fseek(file, 0, SEEK_SET); // return file pointer to beginning
-
-    // Allocate memory for labels
-    labels_from_file = (uint32_t*) malloc(size);
-    if (labels_from_file == NULL) {
-      ERROR("Memory allocation for labels unsucessfull. Aborting...\n");
-      abort();
-    }
-
-    // Copy the labels file into the allocated buffer
-    size_t result = fread(labels_from_file, 1, size, file);
-    if (result != size) {
-      // We did not read as many elemets as there are in the file
-      ERROR("Reading labels file unsucessfull. Aborting...\n");
-      abort();
-    }
-
-    fclose(file);
-  }
-
-//  int num_labels = end - start; 
-//  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-//  for (unsigned i = start; i < end; i++) {
-//    labels[i-start] = labels_from_file[i];
-//  }
-//  return labels;
-
-  // Return pointer to labels
-  return &labels_from_file[start];
-
-}
+extern uint32_t* labels_from_file;
+
+uint32_t* hpvm_rt_readLabelsBatch_cached(const char* labels_file, int start, int end);
 
 //*** Copied from dnn_sources/include/utils.h                             ***//
-float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr) {
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  size_t batch_dim = result->dims.dim_sizes[0];
-  size_t num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
-  int num_errors = 0;
-
-  printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
-  
-  for(int i = 0; i < batch_dim; i++){
-  
-    int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
-    }
-    
-    if(chosen != labels[i])
-      num_errors++;
-  }
-
-  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
-  printf("****** Accuracy = %f \n\n", accuracy);
-
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
-
-    std::ostringstream ss;
-    ss << std::fixed << accuracy;
-    std::string print_str = ss.str();
-  
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  }
-
-  fclose(fp);
-
-  return accuracy;    
-}
+float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr);
 
 #define llvm_hpvm_invokeRtControl_SLOWDOWN_PR llvm_hpvm_invokeRtControl
 
-void llvm_hpvm_invokeRtControl_BASE(void* result, const char* str, int start, int end) {
-
-  RC->resume_profiler();
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->pause_profiler();
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-void llvm_hpvm_invokeRtControl_ITERATE(void* result, const char* str, int start, int end) {
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->resume_profiler();
-  RC->findNextConfiguration();
-  // Still use findNext configuration, to update the configurationIdx,
-  // to point to next location
-  enum SEARCH_KIND k = ACCURACY_LOSS;
-  float goalVal = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss;
-  RC->findTargetConfiguration(goalVal, k);
-
-  RC->pause_profiler();
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
+void llvm_hpvm_invokeRtControl_BASE(void* result, const char* str, int start, int end);
 
-void llvm_hpvm_invokeRtControl_ADJUST(void* result, const char* str, int start, int end) {
+void llvm_hpvm_invokeRtControl_ITERATE(void* result, const char* str, int start, int end);
 
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
+void llvm_hpvm_invokeRtControl_ADJUST(void* result, const char* str, int start, int end);
 
-  // Read stats for iteration that was just completed
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+void llvm_hpvm_invokeRtControl_SLOWDOWN(void* result, const char* str, int start, int end);
 
-  RC->resume_profiler();
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double baseline_time = RC->getBaselineTime();
-  double target_speedup = current_iteration_time / baseline_time;
-  RC->findTargetConfiguration(target_speedup, SPEEDUP);
-  RC->pause_profiler();
+void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void* result, const char* str, int start, int end);
 
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
+void llvm_hpvm_invokeRtControl_RAND(void* result, const char* str, int start, int end);
 
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("target speedup = %lf\n\n", target_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-void llvm_hpvm_invokeRtControl_SLOWDOWN(void* result, const char* str, int start, int end) {
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  std::string prev_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-
-  RC->resume_profiler();
-  float slowdown = RC->getSlowdowns()->getNextSlowdown();
-  RC->findTargetConfiguration(slowdown, SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  std::string next_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-  float next_conf_speedup =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("slowdown (target speedup) = %f\n", slowdown);
-  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void* result, const char* str, int start, int end) {
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  std::string prev_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-
-  RC->resume_profiler();
-  float slowdown = RC->getSlowdowns()->getNextSlowdown();
-  RC->findTargetConfiguration(slowdown, SPEEDUP);
-  RC->adjustTargetConfiguration(slowdown);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  std::string next_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-  float next_conf_speedup =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("slowdown (target speedup) = %f\n", slowdown);
-  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-void llvm_hpvm_invokeRtControl_RAND(void* result, const char* str, int start, int end) {
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->resume_profiler();
-  RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->resume_profiler();
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double baseline_time = RC->getBaselineTime();
-  double target_speedup = current_iteration_time / baseline_time;
-  RC->findTargetConfiguration(target_speedup, SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("target speedup = %lf\n\n", target_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-void llvm_hpvm_invokeRtControl_SLOWDOWN(void* result, const char* str, int start, int end) {
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  std::string prev_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-
-  RC->resume_profiler();
-  float slowdown = RC->getSlowdowns()->getNextSlowdown();
-  RC->findTargetConfiguration(slowdown, SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  std::string next_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-  float next_conf_speedup =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("slowdown (target speedup) = %f\n", slowdown);
-  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void* result, const char* str, int start, int end) {
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  std::string prev_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-
-  RC->resume_profiler();
-  float slowdown = RC->getSlowdowns()->getNextSlowdown();
-  RC->findTargetConfiguration(slowdown, SPEEDUP);
-  RC->adjustTargetConfiguration(slowdown);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  std::string next_conf_name =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-  float next_conf_speedup =
-    RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("slowdown (target speedup) = %f\n", slowdown);
-  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-void llvm_hpvm_invokeRtControl_RAND(void* result, const char* str, int start, int end) {
-
-  uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->resume_profiler();
-  RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-
-
-#endif
\ No newline at end of file
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
index b1c472dc61665144c09743629192ce069c43a2d2..a35fd3eae58ac708ea10386416072736ddd98d91 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
@@ -1,4 +1,5 @@
 #include "functional/common.h"
+#include "tensor_utils.h"
 
 #include <algorithm>
 #include <functional>
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..63dd927cd09f25ed995a8dd3ec2fd3f91c1a6032
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
@@ -0,0 +1,221 @@
+#include "configuration.h"
+
+using P_APPROX = PROMISENodeConfiguration::APPROX;
+using G_APPROX = GPUNodeConfiguration::APPROX;
+using G_TENSOR_OP = GPUNodeConfiguration::TENSOR_OP;
+
+bool NodeConfiguration::isPROMISENodeConfiguration() {
+  return NODE_CONFIGURATION_TARGET_ID == PROMISE;
+}
+
+bool NodeConfiguration::isGPUNodeConfiguration() {
+  return NODE_CONFIGURATION_TARGET_ID == GPU;
+}
+
+void PROMISENodeConfiguration::pushNewApproximationChoice(
+    P_APPROX approx, int u) {
+  ApproxChoices.push_back(std::make_pair(approx, u));
+}
+
+std::vector<std::pair<P_APPROX, int>> &
+PROMISENodeConfiguration::getApproxChoices() {
+  return ApproxChoices;
+}
+
+PROMISENodeConfiguration::PROMISENodeConfiguration() {
+  NODE_CONFIGURATION_TARGET_ID = PROMISE;
+}
+
+PROMISENodeConfiguration::~PROMISENodeConfiguration() {}
+
+void GPUNodeConfiguration::pushNewTensorOperation(enum TENSOR_OP top) {
+  std::vector<std::pair<G_APPROX, int>> emptyVec;
+  ApproxChoices.push_back(std::make_pair(top, emptyVec));
+}
+
+void GPUNodeConfiguration::pushNewApproximationChoiceForOperation(
+    G_APPROX approx, int u) {
+  unsigned size = ApproxChoices.size();
+  CUSTOM_ASSERT(
+      size >= 1 &&
+      "Cannot apply approximation choice to non existent operation.");
+  ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
+}
+
+std::vector<std::pair<G_TENSOR_OP, std::vector<std::pair<G_APPROX, int>>>> &
+GPUNodeConfiguration::getApproxChoices() {
+  return ApproxChoices;
+}
+
+GPUNodeConfiguration::GPUNodeConfiguration() {
+  NODE_CONFIGURATION_TARGET_ID = GPU;
+}
+GPUNodeConfiguration::~GPUNodeConfiguration() {}
+
+Configuration::Configuration(
+    std::string &n, float f, float e, float a, float al)
+    : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {}
+
+float Configuration::getSpeedup() { return speedup; }
+
+float Configuration::getEnergy() { return energy; }
+
+float Configuration::getAccuracy() { return accuracy; }
+
+float Configuration::getAccuracyLoss() { return accuracyLoss; }
+bool ConfigurationLessThan::
+operator()(const struct Configuration &a, const struct Configuration &b) const {
+  return (a.accuracyLoss < b.accuracyLoss);
+}
+bool ConfigurationLessThan_AL::
+operator()(const struct Configuration *a, const float &b) const {
+  return (a->accuracyLoss < b);
+}
+bool ConfigurationLessThan_SP::
+operator()(const struct Configuration *a, const float &b) const {
+  return (a->speedup < b);
+}
+bool ConfigurationLessThan_E::
+operator()(const struct Configuration *a, const float &b) const {
+  return (a->energy < b);
+}
+
+//****** HEADER Ends - Source Starts
+
+// Helper configuration print methods
+
+void PROMISENodeConfiguration::print() {
+
+  printf(" promise");
+  for (auto &it : ApproxChoices) {
+    printf(" ");
+    switch (it.first) {
+    case P_APPROX::SWING_LEVEL:
+      printf("swing_level");
+      break;
+    default:
+      ERROR("Unknown approximation option");
+      break;
+      // TODO additional approx methods to be printed here
+    }
+    printf(" %d", it.second);
+  }
+
+  printf("\n");
+}
+
+void GPUNodeConfiguration::print() {
+
+  printf(" gpu");
+  for (auto &it : ApproxChoices) {
+
+    printf(" ");
+    switch (it.first) {
+    case TENSOR_OP::ADD:
+      printf("add");
+      break;
+    case TENSOR_OP::BATCHNORM:
+      printf("batchnorm");
+      break;
+    case TENSOR_OP::CONV:
+      printf("conv");
+      break;
+    case TENSOR_OP::GROUP_CONV:
+      printf("group_conv");
+      break;
+    case TENSOR_OP::MUL:
+      printf("mul");
+      break;
+    case TENSOR_OP::RELU:
+      printf("relu");
+      break;
+    case TENSOR_OP::CLIPPED_RELU:
+      printf("clipped_relu");
+      break;
+    case TENSOR_OP::TANH:
+      printf("tanh");
+      break;
+    case TENSOR_OP::POOL_MAX:
+      printf("pool_max");
+      break;
+    case TENSOR_OP::POOL_MEAN:
+      printf("pool_mean");
+      break;
+    case TENSOR_OP::POOL_MIN:
+      printf("pool_min");
+      break;
+    case TENSOR_OP::SOFTMAX:
+      printf("softmax");
+      break;
+    case TENSOR_OP::FFT:
+      printf("fft");
+      break;
+    case TENSOR_OP::REDUCE:
+      printf("reduce");
+      break;
+    case TENSOR_OP::PROJECTIVE_T:
+      printf("projectiveT");
+      break;
+    case TENSOR_OP::MAP1:
+      printf("map1");
+      break;
+    case TENSOR_OP::MAP2:
+      printf("map2");
+      break;
+    case TENSOR_OP::MAP3:
+      printf("map3");
+      break;
+    default:
+      ERROR("Unknown tensor operation.");
+      break;
+      // TODO additional operations to be printed here
+    }
+
+    auto &approxVec = it.second;
+    for (auto &inner_it : approxVec) {
+      printf(" ");
+      switch (inner_it.first) {
+      case G_APPROX::FP32:
+        printf("fp32");
+        break;
+      case G_APPROX::FP16:
+        printf("fp16");
+        break;
+      case G_APPROX::PERFORATION:
+        printf("perf");
+        break;
+      case G_APPROX::INPUT_SAMPLING:
+        printf("samp");
+        break;
+      case G_APPROX::REDUCTION_SAMPLING:
+        printf("red_samp");
+        break;
+      default:
+        ERROR("Unknown approximation option");
+        break;
+        // TODO additional approx methods to be printed here
+      }
+
+      printf(" %d", inner_it.second);
+    }
+  }
+
+  printf("\n");
+}
+
+void Configuration::print() {
+
+  printf("+++++\n");
+  printf(
+      "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
+      accuracyLoss);
+  for (std::map<std::string, NodeConfiguration *>::const_iterator it =
+           setup.begin();
+       it != setup.end(); ++it) {
+    printf("%s :", it->first.c_str());
+
+    it->second->print();
+  }
+
+  printf("-----\n");
+}
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
index 6a8e5cb0f4fae763328114576213cb66f131e00c..41be0d65a46214daffac44719140a8faca505946 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
@@ -1,6 +1,6 @@
 #include "device_math.h"
+#include "error.h"
 
-#include <cuda_fp16.h>
 #include <thrust/complex.h>
 
 #define DEF_FUNC_PTR(fname) __device__ void *fname##_ptr = (void *)(fname);
@@ -18,6 +18,18 @@
 
 namespace _internal {
 
+// The following functions are not used, but they reference their cuda counterpart
+// which is necessary, otherwise ptx compilation breaks
+__device__ float hypotf_(float x, float y) { return hypotf(x, y); }
+
+__device__ float atan2f_(float x, float y) { return atan2f(x, y); }
+
+__device__ float sqrtf_(float x) { return sqrtf(x); }
+
+__device__ float fmax_(float x, float y) { return fmax(x, y); }
+
+__device__ float fmin_(float x, float y) { return fmin(x, y); }
+
 __device__ float add(float x, float y) { return x + y; }
 
 __device__ float sub(float x, float y) { return x - y; }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
index 951cfbfdb0097e3376a7c0038ed3e550ce795c07..98ea25df2ed793220857e9da944054543472a724 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
@@ -1,8 +1,3 @@
-
-#ifndef GLOBAL_DATA_HEADER
-#define GLOBAL_DATA_HEADER
-
-
 #include <stdio.h>
 #include <stdarg.h>
 #include <cstdio>
@@ -14,11 +9,13 @@
 #include <cublas_v2.h>
 #include <cudnn.h>
 #include <cublas_api.h>
-#include "tensor.h"
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "tensor.h"
+#include "global_data.h"
+
 /* Data declarations */
 cudnnHandle_t cudnnHandle;
 cublasHandle_t cublasHandle;
@@ -35,9 +32,7 @@ int total_ops = 0;
 std::vector<int> op_accuracies;
 std::vector<Range*> quant_ranges;
 
-std::vector<void*> tensors_ptr;
-std::vector<void*> host_ptr;
-std::vector<void*> obj_ptr;
+std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr;
 
 std::unordered_map<void*, int> tracked_tensors;
 
@@ -47,6 +42,3 @@ std::unordered_map<int, int> skip_tensors;
 // Profiling Data
 std::unordered_map<std::string, int> func_counters;
 std::string profile_data = "";
-
-
-#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2730e2f2786987eb77527011bf29de1a87b3f582
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -0,0 +1,1357 @@
+#include "hpvm-rt-controller.h"
+
+/*
+ * Check if a file exists
+ * Return true if the file exists, false else
+ */
+bool fileExists(const std::string &file) {
+  struct stat buf;
+  return (stat(file.c_str(), &buf) == 0);
+}
+
+// Functions
+void ProfileInfo::resetCurrentIterationTime() {
+  time_compute_current_iteration = 0.0;
+  time_control_current_iteration = 0.0;
+  time_config_current_iteration = 0.0;
+}
+
+void ProfileInfo::resetCurrentIterationEnergy() {
+  energy_compute_current_iteration = 0.0;
+  energy_control_current_iteration = 0.0;
+  energy_config_current_iteration = 0.0;
+}
+
+void ProfileInfo::start_iteration() {
+  if (!in_iteration) {
+    resetCurrentIterationTime();
+    resetCurrentIterationEnergy();
+    tensor_time_info.push_back(std::vector<std::pair<std::string, double>>());
+    tensor_energy_info.push_back(std::vector<std::pair<std::string, double>>());
+    in_iteration = true;
+  }
+}
+void ProfileInfo::end_iteration() {
+  // Update time counters
+  time_compute += time_compute_current_iteration;
+  time_control += time_control_current_iteration;
+  time_config += time_config_current_iteration;
+
+  time_total +=
+      (time_compute_current_iteration + time_control_current_iteration +
+       time_config_current_iteration);
+
+  // Update energy counters
+  energy_compute += energy_compute_current_iteration;
+  energy_control += energy_control_current_iteration;
+  energy_config += energy_config_current_iteration;
+
+  energy_total +=
+      (energy_compute_current_iteration + energy_control_current_iteration +
+       energy_config_current_iteration);
+
+  // Save current iteration counters
+  compute_time_info.push_back(time_compute_current_iteration);
+  compute_energy_info.push_back(energy_compute_current_iteration);
+  control_time_info.push_back(time_control_current_iteration);
+  control_energy_info.push_back(energy_control_current_iteration);
+  config_time_info.push_back(time_config_current_iteration);
+  config_energy_info.push_back(energy_config_current_iteration);
+
+  // Note end of iteration
+  in_iteration = false;
+}
+
+void ProfileInfo::addToCurrentIterationComputeTime(const char *s, double t) {
+  start_iteration();
+  time_compute_current_iteration += t;
+  tensor_time_info.back().push_back(std::make_pair(std::string(s), t));
+}
+
+void ProfileInfo::addToCurrentIterationControlTime(double t) {
+  start_iteration();
+  time_control_current_iteration += t;
+}
+
+void ProfileInfo::addToCurrentIterationConfigTime(double t) {
+  start_iteration();
+  time_config_current_iteration += t;
+}
+
+void ProfileInfo::addToCurrentIterationComputeEnergy(const char *s, double e) {
+  start_iteration();
+  energy_compute_current_iteration += e;
+  tensor_energy_info.back().push_back(std::make_pair(std::string(s), e));
+}
+
+void ProfileInfo::addToCurrentIterationControlEnergy(double e) {
+  start_iteration();
+  energy_control_current_iteration += e;
+}
+
+void ProfileInfo::addToCurrentIterationConfigEnergy(double e) {
+  start_iteration();
+  energy_config_current_iteration += e;
+}
+
+double ProfileInfo::getTotalTime() { return time_total; }
+
+double ProfileInfo::getTotalEnergy() { return energy_total; }
+
+double ProfileInfo::getCurrentIterationComputeTime() {
+  return time_compute_current_iteration;
+}
+
+double ProfileInfo::getCurrentIterationComputeEnergy() {
+  return energy_compute_current_iteration;
+}
+
+void ProfileInfo::set_out_file_name(std::string &str) { out_file_name = str; }
+
+void ProfileInfo::printToFile() {
+
+  INFO("Writing Runtime Profile Info File...\n");
+  std::ofstream s_out(out_file_name.c_str());
+
+  if (!s_out) {
+    ERROR("Failed to open output file.");
+    abort();
+  }
+
+  // By construction, tensor_time_info and tensor_energy_info are expected
+  // to have equal sizes, in outer and inner vectors both,
+  // and all time_info and energy_info vectors must have the same size.
+  unsigned iterations = tensor_time_info.size();
+  CUSTOM_ASSERT(
+      (tensor_time_info.size() == iterations) &&
+      (tensor_energy_info.size() == iterations) &&
+      (control_time_info.size() == iterations) &&
+      (control_energy_info.size() == iterations) &&
+      (config_time_info.size() == iterations) &&
+      (config_energy_info.size() == iterations) &&
+      "time_info and energy_info size: \
+                   iteration number does not match.");
+
+  for (unsigned i = 0; i < tensor_time_info.size(); i++) {
+    // time_info.size() == energy_info.size(), since we passed the assertion
+    s_out << "Iteration " << i << "\n";
+
+    CUSTOM_ASSERT(
+        (tensor_time_info[i].size() == tensor_energy_info[i].size()) &&
+        "time_info and energy_info size: operation number does not match.");
+    for (unsigned j = 0; j < tensor_time_info[i].size(); j++) {
+      // time_info[i].size() == energy_info[i].size(), we passed the assertion
+      CUSTOM_ASSERT(
+          (tensor_time_info[i][j].first == tensor_energy_info[i][j].first) &&
+          "time_info and energy_info: operation does not match.");
+      s_out << tensor_time_info[i][j].first << " "
+            << tensor_time_info[i][j].second << " "
+            << tensor_energy_info[i][j].second << "\n";
+    }
+
+    s_out << "\nIteration Compute Time  : " << compute_time_info[i] << "\n";
+    s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n";
+    s_out << "Iteration Control Time  : " << control_time_info[i] << "\n";
+    s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n";
+    s_out << "Iteration Config Time  : " << config_time_info[i] << "\n";
+    s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n\n\n";
+  }
+  s_out << "\n\nTotal Compute Time  : " << time_compute << "\n";
+  s_out << "Total Compute Energy: " << energy_compute << "\n";
+
+  s_out << "\nTotal Control Time  : " << time_control << "\n";
+  s_out << "Total Control Energy: " << energy_control << "\n";
+
+  s_out << "\nTotal Config Time  : " << time_config << "\n";
+  s_out << "Total Config Energy: " << energy_config << "\n";
+
+  s_out << "\nTotal Time  : " << time_total << "\n";
+  s_out << "Total Energy: " << energy_total << "\n";
+
+  s_out.close();
+
+  INFO("Done writing profile.\n");
+}
+
+ProfileInfo::ProfileInfo()
+    : time_total(0.0), energy_total(0.0), time_compute_current_iteration(0.0),
+      time_control_current_iteration(0.0), time_config_current_iteration(0.0),
+      energy_compute_current_iteration(0.0),
+      energy_control_current_iteration(0.0),
+      energy_config_current_iteration(0.0), in_iteration(false) {}
+Slowdowns::Slowdowns() {
+  idx = 0;
+
+  std::ifstream s_in("slowdowns.txt");
+  if (!s_in) {
+    DEBUG("slowdowns file not found. Initializing slowdowns randomly.\n");
+    for (unsigned i = 0; i < 10; i++) {
+      slowdowns.push_back(1.0 + (rand() / (RAND_MAX / (5.0 - 1.0))));
+    }
+  } else {
+    for (std::string line; std::getline(s_in, line);) {
+      float s = std::stof(line);
+      slowdowns.push_back(s);
+    }
+  }
+}
+
+unsigned Slowdowns::getSlowdownsNumber() { return slowdowns.size(); }
+
+float Slowdowns::getNextSlowdown() {
+  float tmp = slowdowns[idx];
+  idx = (idx + 1) % slowdowns.size();
+  return tmp;
+}
+
+// Functions
+
+// Private functions of profiler
+void RuntimeController::start_profiler() {
+  if (profiler)
+    profiler->start_profiler();
+}
+void RuntimeController::stop_profiler() {
+  if (profiler)
+    profiler->stop_profiler();
+}
+// For testing purposes only - do not use widely
+std::vector<struct Configuration *> &RuntimeController::
+getSpeedupConfigurations() {
+  return SpeedupConfigurations;
+}
+// For testing purposes only - do not use widely
+std::vector<struct Configuration *> &RuntimeController::
+getEnergyConfigurations() {
+  return EnergyConfigurations;
+}
+// For testing purposes only - do not use widely
+std::vector<struct Configuration *> &RuntimeController::
+getThreeDCurveConfigurations() {
+  return ThreeDCurveConfigurations;
+}
+// For testing purposes only - do not use widely
+unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; }
+
+std::vector<float> &RuntimeController::getQuantizationRanges(const char *data) {
+  std::string s(data);
+  // All nodes are expected to have quantization ranges
+  return QuantizationMap.at(s);
+}
+
+NodeConfiguration *RuntimeController::getNodeConfiguration(const char *data) {
+  std::string s(data);
+  // All nodes are expected to have a configuration
+  return (*Configurations)[configurationIdx]->setup.at(s);
+}
+
+void RuntimeController::init(const char *Cstr, const char *Qstr) {
+  // We initialize the path to the profile info output file,
+  // based on the path given for the configuration file
+  setProfileInfoFilename(Cstr);
+
+  readQuantizationFile(Qstr);
+  readConfigurationFile(Cstr);
+  Configurations = NULL;
+  computeParetoConfigurationPoints();
+  //    compute3DParetoConfigurationPoints(); Not using 3D curve
+  INFO("Speedup Configurations\n");
+  printConfigurations(SpeedupConfigurations);
+  //    INFO("Energy Configurations\n");
+  //    printConfigurations(EnergyConfigurations);
+  //    INFO("3D Configurations\n");
+  //    printConfigurations(ThreeDCurveConfigurations);
+  configurationIdx =
+      0; // TODO: initialize using pareto curve - findTargetConfiguration ?
+  Configurations = &SpeedupConfigurations;
+
+  // Initializations for different runtime control strategies
+  srand(static_cast<unsigned>(time(0)));
+  slowdowns = new Slowdowns();
+  pseudo_rd = 0.0;
+
+  // Start profiling thread in the background, ready to time
+  start_profiler();
+  pause_profiler();
+  reset_profiler();
+}
+
+// Exposing functionality of ProfileInfo
+void RuntimeController::end_iteration() {
+  if (PI)
+    PI->end_iteration();
+}
+
+void RuntimeController::addToCurrentIterationComputeTime(
+    const char *s, double t) {
+  if (PI)
+    PI->addToCurrentIterationComputeTime(s, t);
+}
+
+void RuntimeController::addToCurrentIterationControlTime(double t) {
+  if (PI)
+    PI->addToCurrentIterationControlTime(t);
+}
+
+void RuntimeController::addToCurrentIterationConfigTime(double t) {
+  if (PI)
+    PI->addToCurrentIterationConfigTime(t);
+}
+
+void RuntimeController::addToCurrentIterationComputeEnergy(
+    const char *s, double e) {
+  if (PI)
+    PI->addToCurrentIterationComputeEnergy(s, e);
+}
+
+void RuntimeController::addToCurrentIterationControlEnergy(double e) {
+  if (PI)
+    PI->addToCurrentIterationControlEnergy(e);
+}
+
+void RuntimeController::addToCurrentIterationConfigEnergy(double e) {
+  if (PI)
+    PI->addToCurrentIterationConfigEnergy(e);
+}
+
+double RuntimeController::getCurrentIterationComputeTime() {
+  return (PI ? PI->getCurrentIterationComputeTime() : 0.0);
+}
+
+double RuntimeController::getCurrentIterationComputeEnergy() {
+  return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0);
+}
+
+void RuntimeController::writeProfileInfo() {
+  if (PI)
+    PI->printToFile();
+}
+
+// Exposing functionality of (gpu) profiler
+void RuntimeController::resume_profiler() {
+  if (profiler)
+    profiler->resume_profiler();
+}
+
+void RuntimeController::pause_profiler() {
+  if (profiler)
+    profiler->pause_profiler();
+}
+
+void RuntimeController::reset_profiler() {
+  if (profiler)
+    profiler->reset();
+}
+
+std::pair<double, double> RuntimeController::get_time_energy() const {
+  return (profiler ? profiler->get_time_energy() : std::make_pair(0.0, 0.0));
+}
+
+// Exposing functionality of promise simulator
+std::pair<double, double> RuntimeController::fc_profile(
+    const unsigned num_rows_a, const unsigned num_cols_a,
+    const unsigned num_rows_b, const unsigned num_cols_b,
+    const unsigned voltage_swing, const unsigned patch_factor) {
+  return (
+      promise ? promise->fc_profile(
+                    num_rows_a, num_cols_a, num_rows_b, num_cols_b,
+                    voltage_swing, patch_factor)
+              : std::make_pair(0.0, 0.0));
+}
+
+std::pair<double, double> RuntimeController::conv_profile(
+    const unsigned n, const unsigned c, const unsigned h, const unsigned w,
+    const unsigned c_out, const unsigned c_in, const unsigned k_h,
+    const unsigned k_w, const unsigned s_h, const unsigned s_w,
+    const unsigned voltage_swing, const unsigned patch_factor) {
+  return (
+      promise ? promise->conv_profile(
+                    n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing,
+                    patch_factor)
+              : std::make_pair(0.0, 0.0));
+}
+
+// Constructor and descructor
+RuntimeController::RuntimeController() {
+  configurationIdx = 0;
+#ifdef ACTIVE_PROFILING
+  PI = new ProfileInfo();
+  profiler = new Profiler();
+  promise = new Promise();
+#else
+  PI = NULL;
+  profiler = NULL;
+  promise = NULL;
+#endif
+}
+
+RuntimeController::~RuntimeController() {
+
+  stop_profiler();
+  writeProfileInfo();
+
+  if (PI) {
+    delete PI;
+  }
+  if (profiler) {
+    delete profiler;
+  }
+  if (promise) {
+    delete promise;
+  }
+
+  for (std::vector<struct Configuration>::iterator
+           it = InitialConfigurations.begin(),
+           ie = InitialConfigurations.end();
+       it != ie; ++it) {
+    std::map<std::string, NodeConfiguration *> ConfSetup = it->setup;
+    for (std::map<std::string, NodeConfiguration *>::const_iterator it =
+             ConfSetup.begin();
+         it != ConfSetup.end(); ++it) {
+      delete it->second;
+    }
+  }
+  // Handle freeing memory, for all configurations
+  // A way to do that is to not free the initial configurations in the pareto
+  // curve, and free all at once in the end This is done because configurations
+  // are stored in different containers, but share the node setup
+}
+
+void RuntimeController::setProfileInfoFilename(const char *str) {
+
+  if (PI) {
+    std::string file_path = std::string(str);
+    size_t idx = file_path.find_last_of("/");
+    file_path.erase(idx + 1);
+    file_path.append("profile_info_");
+
+    bool found = false;
+    std::string profile_filename;
+    for (unsigned i = 0; !found; i++) {
+      profile_filename = file_path;
+      profile_filename.append(std::to_string(i));
+      profile_filename.append(".txt");
+      found = !fileExists(profile_filename);
+    }
+
+    PI->set_out_file_name(profile_filename);
+  }
+}
+void RuntimeController::readQuantizationFile(const char *str) {
+
+  INFO("Reading Quantization Ranges File...\n");
+
+  if (std::string(str).empty()) {
+    INFO("Empty quantization file string.\n");
+    return;
+  }
+
+  std::ifstream qin(str);
+
+  if (!qin) {
+    ERROR("Failed to open PROMISE quantization file.");
+    abort();
+  }
+
+  while (!qin.eof()) {
+    char NodeName[NODE_NAME_BUFFER_SIZE];
+    std::vector<float> QuantRangeVector;
+
+    qin >> NodeName;
+
+    float qrange;
+    for (unsigned i = 0; i < 8; i++) {
+      qin >> qrange;
+      QuantRangeVector.push_back(qrange);
+    }
+    // See if we need to insert this in map instead - my lookup test seemed to
+    // work without it std::string s(NodeName);
+    QuantizationMap.insert(
+        std::pair<std::string, std::vector<float>>(NodeName, QuantRangeVector));
+  }
+
+  qin.close();
+  INFO("DONE.\n");
+}
+
+void RuntimeController::printQuantizationMap() {
+
+  DEBUG("Quantization Ranges Map:\n");
+
+  for (std::map<std::string, std::vector<float>>::const_iterator it =
+           QuantizationMap.begin();
+       it != QuantizationMap.end(); ++it) {
+    DEBUG("%s :", it->first.c_str());
+
+    for (unsigned i = 0; i < it->second.size(); i++) {
+      DEBUG(" %f", it->second[i]);
+    }
+
+    DEBUG("\n");
+  }
+}
+
+void RuntimeController::readConfigurationFile(const char *str) {
+
+  INFO("Reading Configuration File...\n");
+
+  std::ifstream qin(str);
+
+  if (!qin) {
+    ERROR("Failed to open configuration file.");
+    abort();
+  }
+
+  bool readingConfiguration = false;
+  bool readingFirstLine = false;
+
+  // Read baseline_time from first line of configuration file
+  std::string first_line;
+  std::getline(qin, first_line);
+  DEBUG("first_line: %s\n", first_line.c_str());
+  baseline_time = std::stod(first_line);
+  DEBUG("Baseline time: %lf\n\n", baseline_time);
+
+  for (std::string line; std::getline(qin, line);) {
+    DEBUG("line: %s\n", line.c_str());
+
+    // Tokenize using ' ' as delimiter
+    // Vector to store tokens
+    std::vector<std::string> tokens;
+
+    for (auto i = strtok(&line[0], " "); i != NULL; i = strtok(NULL, " "))
+      tokens.push_back(i);
+
+    for (unsigned i = 0; i < tokens.size(); i++)
+      DEBUG("t: %s\n", tokens[i].c_str());
+
+    DEBUG("\n");
+
+    if (tokens[0] == "+++++") { // Found new configuration start token
+      // Mark the start of a new configuration
+      readingConfiguration = true;
+      readingFirstLine = true;
+      continue;
+    }
+
+    if (tokens[0] == "-----") { // Found configuration end token
+      readingConfiguration = false;
+      // Mark the end of current configuration
+      continue;
+    }
+
+    if (readingFirstLine) {
+      // Read first line, to create the new configuration struct
+      readingFirstLine = false;
+      InitialConfigurations.push_back(Configuration(
+          tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
+          std::stof(tokens[3]), std::stof(tokens[4])));
+      continue;
+    }
+
+    if (tokens[1] == "promise") {
+      DEBUG("Found promise configuration\n");
+
+      // There must be at least one approximation option
+      CUSTOM_ASSERT(
+          (tokens.size() >= 2) && "Not enough approximation options.");
+
+      PROMISENodeConfiguration *NodeConf = new PROMISENodeConfiguration();
+      InitialConfigurations.back().setup.insert(
+          std::make_pair(tokens[0], NodeConf));
+
+      // In increments of two, to handle pairs of approx option - tunable
+      // parameter
+      for (unsigned idx = 2; idx < tokens.size(); idx += 2) {
+        if (tokens[idx] == "swing_level") {
+          DEBUG("Found swing voltage option\n");
+          int vswing = std::stoi(tokens[idx + 1]);
+          DEBUG("vswing: %d\n", vswing);
+          NodeConf->pushNewApproximationChoice(
+              PROMISENodeConfiguration::APPROX::SWING_LEVEL, vswing);
+        }
+        // TODO: other approximation options handled here
+      }
+
+    } else if (tokens[1] == "gpu") {
+      DEBUG("Found gpu configuration\n");
+
+      // There must be at least one operation, with an approximation option
+      CUSTOM_ASSERT(
+          (tokens.size() >= 5) &&
+          "Not enough operations - approximation options.");
+
+      GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration();
+      InitialConfigurations.back().setup.insert(
+          std::make_pair(tokens[0], NodeConf));
+
+      unsigned idx = 2;
+      while (idx < tokens.size()) {
+        if (tokens[idx] == "add") {
+          DEBUG("Found add operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::ADD);
+          idx++;
+        } else if (tokens[idx] == "batchnorm") {
+          DEBUG("Found batchnorm operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::BATCHNORM);
+          idx++;
+        } else if (tokens[idx] == "conv") {
+          DEBUG("Found conv operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::CONV);
+          idx++;
+        } else if (tokens[idx] == "group_conv") {
+          DEBUG("Found group_conv operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::GROUP_CONV);
+          idx++;
+        } else if (tokens[idx] == "mul") {
+          DEBUG("Found mul operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::MUL);
+          idx++;
+        } else if (tokens[idx] == "relu") {
+          DEBUG("Found relu operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::RELU);
+          idx++;
+        } else if (tokens[idx] == "clipped_relu") {
+          DEBUG("Found clipped_relu operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU);
+          idx++;
+        } else if (tokens[idx] == "tanh") {
+          DEBUG("Found tanh operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::TANH);
+          idx++;
+        } else if (tokens[idx] == "pool_max") {
+          DEBUG("Found pool_max operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::POOL_MAX);
+          idx++;
+        } else if (tokens[idx] == "pool_mean") {
+          DEBUG("Found pool_mean operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::POOL_MEAN);
+          idx++;
+        } else if (tokens[idx] == "pool_min") {
+          DEBUG("Found pool_min operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::POOL_MIN);
+          idx++;
+        } else if (tokens[idx] == "softmax") {
+          DEBUG("Found softmax operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::SOFTMAX);
+          idx++;
+        } else if (tokens[idx] == "fft") {
+          DEBUG("Found fft operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::FFT);
+          idx++;
+        } else if (tokens[idx] == "reduce") {
+          DEBUG("Found reduce operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::REDUCE);
+          idx++;
+        } else if (tokens[idx] == "projectiveT") {
+          DEBUG("Found projectiveT operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T);
+          idx++;
+        } else if (tokens[idx] == "map1") {
+          DEBUG("Found map1 operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::MAP1);
+          idx++;
+        } else if (tokens[idx] == "map2") {
+          DEBUG("Found map2 operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::MAP2);
+          idx++;
+        } else if (tokens[idx] == "map3") {
+          DEBUG("Found map3 operation\n");
+          NodeConf->pushNewTensorOperation(
+              GPUNodeConfiguration::TENSOR_OP::MAP3);
+          idx++;
+        } else /*Not a new operation. This means an approximation option*/
+            if (tokens[idx] == "fp32") {
+          DEBUG("Found fp32 option\n");
+          int fp32 = std::stoi(tokens[idx + 1]);
+          DEBUG("fp32 parameter: %d, ignoring\n", fp32);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              GPUNodeConfiguration::APPROX::FP32, fp32);
+          idx += 2;
+        } else if (tokens[idx] == "fp16") {
+          DEBUG("Found fp16 option\n");
+          int fp16 = std::stoi(tokens[idx + 1]);
+          DEBUG("fp16 parameter: %d, ignoring\n", fp16);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              GPUNodeConfiguration::APPROX::FP16, fp16);
+          idx += 2;
+        } else if (tokens[idx] == "perf") {
+          DEBUG("Found perf option\n");
+          int perf = std::stoi(tokens[idx + 1]);
+          DEBUG("perf parameter: %d\n", perf);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              GPUNodeConfiguration::APPROX::PERFORATION, perf);
+          idx += 2;
+        } else if (tokens[idx] == "samp") {
+          DEBUG("Found samp option\n");
+          int samp = std::stoi(tokens[idx + 1]);
+          DEBUG("samp parameter: %d\n", samp);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              GPUNodeConfiguration::APPROX::INPUT_SAMPLING, samp);
+          idx += 2;
+        } else if (tokens[idx] == "red_samp") {
+          DEBUG("Found red_samp option\n");
+          int red_samp = std::stoi(tokens[idx + 1]);
+          DEBUG("red_samp parameter: %d\n", red_samp);
+          NodeConf->pushNewApproximationChoiceForOperation(
+              GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp);
+          idx += 2;
+        }
+        // TODO: other approximation options handled here
+      }
+
+    } else {
+      DEBUG("Invalid Configuration File\n");
+      exit(1);
+    }
+  }
+
+  qin.close();
+  DEBUG("DONE.\n");
+}
+void RuntimeController::computeParetoConfigurationPoints() {
+
+  // Keep indices of pareto optimal points (configurations from
+  // InitialConfigurations vector that were copied to Configurations vector.)
+  // The others' setup pointer needs to be deleted
+  std::vector<unsigned> Indices;
+
+  // Baseline configuration (first one we read) always belongs to the curve
+  SpeedupConfigurations.push_back(&InitialConfigurations[0]);
+  EnergyConfigurations.push_back(&InitialConfigurations[0]);
+
+  // Sort the configurations according to accuracy loss
+  INFO("Sorting autotuner configurations...\n");
+  std::sort(
+      InitialConfigurations.begin() + 1, InitialConfigurations.end(),
+      ConfigurationLessThan());
+  INFO("Done sorting.\n");
+
+  for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) {
+    // Points to first Configuration with different (higher) accuracy loss
+    // compared to the one pointed by start_idx
+    unsigned end_idx = start_idx + 1;
+    while ((end_idx < InitialConfigurations.size()) &&
+           (InitialConfigurations[end_idx].accuracyLoss -
+                InitialConfigurations[start_idx].accuracyLoss <
+            AL_THRESHOLD)) {
+      end_idx++;
+    }
+    DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx);
+    // Now, all elements in [start_idx, end_idx) have equal accuracy loss,
+    // that is lower from later ones.
+
+    // Find the best speedup and energy between them as well
+    float sp = -1.0; // FLT_MIN
+    unsigned sp_idx = 0;
+
+    float en = -1.0; // FLT_MIN
+    unsigned en_idx = 0;
+
+    for (unsigned i = start_idx; i < end_idx; i++) {
+      if (InitialConfigurations[i].speedup > sp) {
+        sp = InitialConfigurations[i].speedup;
+        sp_idx = i;
+      }
+      if (InitialConfigurations[i].energy > en) {
+        en = InitialConfigurations[i].energy;
+        en_idx = i;
+      }
+    }
+    DEBUG(
+        "accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
+        InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
+    // Found best speedup for this accuracy point (not dominated by any of
+    // these).
+    DEBUG(
+        "accuracy loss = %f, energy = %f, at en_idx = %d\n",
+        InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
+    // Found best energy for this accuracy point (not dominated by any of
+    // these).
+
+    // Now, we need to check that it is not dominated.
+    // - better accuracy loss of all in initial configurations out of
+    // start_idx, end_idx range
+    // - better or equal speedup to the ones within this range
+    // We only need to check the points already in Configurations, that have
+    // already been inserted in pareto frontier. These have better accuracy
+    // loss, so this one will only be added if it shows better speedup
+    // The one in curve with best speedup so far is the last one (with worst
+    // = highest accuracy loss), so compare only with that one.
+
+    // Similar handling of energy vector
+
+    bool sp_notDominated = true;
+    if (!SpeedupConfigurations.empty()) {
+      if (SpeedupConfigurations.back()->speedup >= sp)
+        sp_notDominated = false;
+    }
+
+    bool en_notDominated = true;
+    if (!EnergyConfigurations.empty()) {
+      if (EnergyConfigurations.back()->energy >= en)
+        en_notDominated = false;
+    }
+
+    DEBUG("sp_notDominated = %d\n", sp_notDominated);
+    DEBUG("en_notDominated = %d\n", en_notDominated);
+
+    // If not dominated, insert in pareto frontier set
+    if (sp_notDominated) {
+      SpeedupConfigurations.push_back(&InitialConfigurations[sp_idx]);
+    }
+    if (en_notDominated) {
+      EnergyConfigurations.push_back(&InitialConfigurations[en_idx]);
+    }
+
+    // Keep track of unnecessary configurations
+    for (unsigned i = start_idx; i < end_idx; i++) {
+      if (((i != sp_idx) || (!sp_notDominated)) &&
+          ((i != en_idx) || (!en_notDominated)))
+        Indices.push_back(i);
+    }
+
+    // Continue from next accuracy loss level
+    start_idx = end_idx;
+  }
+
+  // All elements in InitialConfigurations whose index is in Indices are no
+  // longer needed.
+  //  for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e =
+  //  Indices.end();
+  //       idx_it != idx_e; ++idx_it) {
+  //    std::map<std::string, NodeConfiguration * > ConfSetup =
+  //      InitialConfigurations[*idx_it].setup;
+  //    for (std::map<std::string, NodeConfiguration* >::const_iterator it =
+  //    ConfSetup.begin();
+  //     it != ConfSetup.end(); ++it) {
+  //      delete it->second;
+  //    }
+  //  }
+  //  InitialConfigurations.clear();
+}
+
+void RuntimeController::compute3DParetoConfigurationPoints() {
+
+  // Sort the configurations according to accuracy loss
+  INFO("Sorting autotuner configurations...\n");
+  std::sort(
+      InitialConfigurations.begin(), InitialConfigurations.end(),
+      ConfigurationLessThan());
+  INFO("Done sorting.\n");
+
+  for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) {
+    // Points to first Configuration with different (higher) accuracy loss
+    // compared to the one pointed by start_idx
+    unsigned end_idx = start_idx + 1;
+    while ((end_idx < InitialConfigurations.size()) &&
+           (InitialConfigurations[end_idx].accuracyLoss -
+                InitialConfigurations[start_idx].accuracyLoss <
+            AL_THRESHOLD)) {
+      end_idx++;
+    }
+    DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx);
+    // Now, all elements in [start_idx, end_idx) have equal accuracy loss,
+    // that is lower from later ones and worse than those already in curve
+    // (so they cannot displace them).
+
+    // Find candidates from [start_idx, end_idx) to be inserted
+    // Keep their indices. If a point is dominated (strictly worse),
+    // its index will not be inserted
+    std::vector<unsigned> Indices;
+
+    for (unsigned i = start_idx; i < end_idx; i++) {
+      bool dominated = false;
+      for (unsigned j = i + 1; (j < end_idx) && !dominated; j++) {
+        if ((InitialConfigurations[i].speedup <
+             InitialConfigurations[j].speedup) &&
+            (InitialConfigurations[i].energy <
+             InitialConfigurations[j].energy)) {
+          dominated = true;
+        }
+      }
+      if (!dominated) {
+        DEBUG(
+            "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
+            InitialConfigurations[i].accuracyLoss,
+            InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
+            i);
+        Indices.push_back(i);
+      }
+    }
+
+    for (std::vector<unsigned>::iterator idx_it = Indices.begin(),
+                                         idx_e = Indices.end();
+         idx_it != idx_e; ++idx_it) {
+      Configuration &CandidateConfiguration = InitialConfigurations[*idx_it];
+
+      if (!ThreeDCurveConfigurations.empty()) {
+        bool notDominated = true;
+        for (unsigned i = 0;
+             (i < ThreeDCurveConfigurations.size()) && notDominated; i++) {
+          if ((CandidateConfiguration.speedup <=
+               ThreeDCurveConfigurations[i]->speedup) &&
+              (CandidateConfiguration.energy <=
+               ThreeDCurveConfigurations[i]->energy)) {
+            // This configuration is not better, in at least one characteristic,
+            // compared to the existing ones in the curve.
+            notDominated = false;
+          }
+        }
+        if (notDominated) {
+          ThreeDCurveConfigurations.push_back(&CandidateConfiguration);
+        }
+      } else {
+        // If the curve is empty, we know that this is a point that must be
+        // inserted. It has the best accuracy loss, and belongs here because
+        // it is not dominated by any point in this accuracy range.
+        ThreeDCurveConfigurations.push_back(&CandidateConfiguration);
+      }
+    }
+
+    // Continue from next accuracy loss level
+    start_idx = end_idx;
+  }
+}
+
+void RuntimeController::printConfigurations(
+    std::vector<struct Configuration> &Confs) {
+
+  for (std::vector<struct Configuration>::iterator it = Confs.begin(),
+                                                   ie = Confs.end();
+       it != ie; ++it) {
+    it->print();
+  }
+}
+
+void RuntimeController::printConfigurations(
+    std::vector<struct Configuration *> &Confs) {
+
+  for (std::vector<struct Configuration *>::iterator it = Confs.begin(),
+                                                     ie = Confs.end();
+       it != ie; ++it) {
+    (*it)->print();
+  }
+}
+
+void RuntimeController::findNextConfiguration() {
+  configurationIdx = (configurationIdx + 1) % Configurations->size();
+  DEBUG(
+      "findNextConfiguration: Updated configurationIdx to %u.\n",
+      configurationIdx);
+}
+
+void RuntimeController::findTargetConfiguration(
+    float goal, enum SEARCH_KIND sk) {
+  // We search in range begin(), end()-1 . It is OK to decrement end(), because
+  // the configurations vector always points to one of the pareto curves, and
+  // they are never empty - we have always pushed at least one configuration.
+
+  DEBUG("findTargetConfiguration: goalVal: %f, search kind: %d.\n", goal, sk);
+  std::vector<struct Configuration *>::iterator low_it;
+  switch (sk) {
+  case SPEEDUP: {
+    Configurations = &SpeedupConfigurations;
+    low_it = std::lower_bound(
+        Configurations->begin(), Configurations->end() - 1, goal,
+        ConfigurationLessThan_SP());
+    configurationIdx = low_it - Configurations->begin();
+    break;
+  }
+  case ENERGY: {
+    Configurations = &EnergyConfigurations;
+    low_it = std::lower_bound(
+        Configurations->begin(), Configurations->end() - 1, goal,
+        ConfigurationLessThan_E());
+    configurationIdx = low_it - Configurations->begin();
+    break;
+  }
+  case ACCURACY_LOSS: {
+    Configurations = &SpeedupConfigurations;
+    low_it = std::lower_bound(
+        Configurations->begin(), Configurations->end() - 1, goal,
+        ConfigurationLessThan_AL());
+    if ((*low_it)->accuracyLoss > goal)
+      --low_it;
+    configurationIdx = low_it - Configurations->begin();
+    break;
+  }
+  default: {
+    CUSTOM_ASSERT(false && "Unknown search option for optimization target");
+    ERROR("Unknown search option for optimization target.");
+    abort();
+  }
+  }
+  // After search, low_it points to the Configuration to the element with the
+  // goal value or the immediately lower value if it does not exist
+
+  DEBUG(
+      "findTargetConfiguration: Updated configurationIdx to %u.\n",
+      configurationIdx);
+}
+
+void RuntimeController::adjustTargetConfiguration(float goal) {
+
+  DEBUG("adjustTargetConfiguration: goalVal: %f.\n\n", goal);
+
+  // Find configuration before the selected one. There is always one.
+  // Get the two configurations' speedup, and compute the appropriate ranges
+  float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup;
+  float prev_conf_speedup = (*Configurations)[configurationIdx - 1]->speedup;
+  float sp_diff = curr_conf_speedup - prev_conf_speedup;
+
+  float high_range = curr_conf_speedup - goal;
+  float low_range = goal - prev_conf_speedup;
+
+  // These represent how likely we are to pick the upper or lower configuration
+  float high_pb = low_range / sp_diff;
+  float low_pb = high_range / sp_diff;
+
+  DEBUG(
+      "**---- adjustTargetConfiguration: upper conf = %s with probability: "
+      "%f.\n",
+      ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
+  DEBUG(
+      "**---- adjustTargetConfiguration: lower conf = %s with probability: "
+      "%f.\n\n",
+      ((*Configurations)[configurationIdx - 1]->name).c_str(), low_pb);
+
+  // Select a random number from 0 to 1
+  // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1)
+  // to the upper
+  //  float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ;
+  pseudo_rd += 0.1f;
+  float rd = pseudo_rd;
+  if (rd < low_pb) {
+    // If the probability is in the low range
+    configurationIdx--;
+  }
+
+  DEBUG(
+      "adjustTargetConfiguration: rand: %f : Updated configurationIdx to %u.\n",
+      rd, configurationIdx);
+}
+float RuntimeController::getGoalSpeedup() {
+  return 1.0 + (rand() / (RAND_MAX / (MAX_GOAL_SPEEDUP - 1.0)));
+}
+
+double RuntimeController::getBaselineTime() { return baseline_time; }
+
+Slowdowns *RuntimeController::getSlowdowns() { return slowdowns; }
+
+// Functions to be inserted with initializeTensorRT and clearTensorRT
+void llvm_hpvm_initializeRuntimeController(
+    const char *ConfigFile, const char *QRangeFile) {
+  RC = new RuntimeController();
+  RC->init(ConfigFile, QRangeFile);
+  return;
+}
+
+void llvm_hpvm_clearRuntimeController() {
+  delete RC;
+  return;
+}
+
+//*** Methods to compute accuracy of a tensor by the runtime controller   ***//
+uint32_t *labels_from_file = NULL;
+
+uint32_t *
+hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) {
+
+  // Initialize buffer
+  if (!labels_from_file) {
+    FILE *file = fopen(labels_file, "rb");
+    if (file == NULL) {
+      ERROR("Data file %s is not found. Aborting...\n", labels_file);
+      abort();
+    }
+    // Get number of labels
+    fseek(file, 0, SEEK_END);
+    long size = ftell(file);
+    fseek(file, 0, SEEK_SET); // return file pointer to beginning
+
+    // Allocate memory for labels
+    labels_from_file = (uint32_t *)malloc(size);
+    if (labels_from_file == NULL) {
+      ERROR("Memory allocation for labels unsucessfull. Aborting...\n");
+      abort();
+    }
+
+    // Copy the labels file into the allocated buffer
+    size_t result = fread(labels_from_file, 1, size, file);
+    if (result != size) {
+      // We did not read as many elemets as there are in the file
+      ERROR("Reading labels file unsucessfull. Aborting...\n");
+      abort();
+    }
+
+    fclose(file);
+  }
+
+  //  int num_labels = end - start;
+  //  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
+  //  for (unsigned i = start; i < end; i++) {
+  //    labels[i-start] = labels_from_file[i];
+  //  }
+  //  return labels;
+
+  // Return pointer to labels
+  return &labels_from_file[start];
+}
+
+//*** Copied from dnn_sources/include/utils.h                             ***//
+float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
+
+  struct Tensor *result = (struct Tensor *)result_ptr;
+
+  size_t batch_dim = result->dims.dim_sizes[0];
+  size_t num_classes = result->dims.dim_sizes[1];
+  float *data = (float *)result->host_data;
+  int num_errors = 0;
+
+  printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
+
+  for (int i = 0; i < batch_dim; i++) {
+
+    int chosen = 0;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
+    }
+
+    if (chosen != labels[i])
+      num_errors++;
+  }
+
+  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
+  printf("****** Accuracy = %f \n\n", accuracy);
+
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
+
+    std::ostringstream ss;
+    ss << std::fixed << accuracy;
+    std::string print_str = ss.str();
+
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+  }
+
+  fclose(fp);
+
+  return accuracy;
+}
+void llvm_hpvm_invokeRtControl_BASE(
+    void *result, const char *str, int start, int end) {
+
+  RC->resume_profiler();
+
+  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
+  hpvm_rt_computeAccuracy3(labels_cached, result);
+
+  // Read stats for iteration that was just completed
+  double current_iteration_time = RC->getCurrentIterationComputeTime();
+  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+
+  RC->pause_profiler();
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationControlTime(pinfo.first);
+  RC->addToCurrentIterationControlEnergy(pinfo.second);
+
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n\n",
+      current_iteration_time, current_iteration_energy);
+
+  // Note the end of iteration
+  RC->end_iteration();
+}
+
+void llvm_hpvm_invokeRtControl_ITERATE(
+    void *result, const char *str, int start, int end) {
+
+  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
+  hpvm_rt_computeAccuracy3(labels_cached, result);
+
+  // Read stats for iteration that was just completed
+  double current_iteration_time = RC->getCurrentIterationComputeTime();
+  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+
+  RC->resume_profiler();
+  RC->findNextConfiguration();
+  // Still use findNext configuration, to update the configurationIdx,
+  // to point to next location
+  enum SEARCH_KIND k = ACCURACY_LOSS;
+  float goalVal =
+      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss;
+  RC->findTargetConfiguration(goalVal, k);
+
+  RC->pause_profiler();
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationControlTime(pinfo.first);
+  RC->addToCurrentIterationControlEnergy(pinfo.second);
+
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n\n",
+      current_iteration_time, current_iteration_energy);
+
+  // Note the end of iteration
+  RC->end_iteration();
+}
+
+void llvm_hpvm_invokeRtControl_ADJUST(
+    void *result, const char *str, int start, int end) {
+
+  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
+  hpvm_rt_computeAccuracy3(labels_cached, result);
+
+  // Read stats for iteration that was just completed
+  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+
+  RC->resume_profiler();
+  double current_iteration_time = RC->getCurrentIterationComputeTime();
+  double baseline_time = RC->getBaselineTime();
+  double target_speedup = current_iteration_time / baseline_time;
+  RC->findTargetConfiguration(target_speedup, SPEEDUP);
+  RC->pause_profiler();
+
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationControlTime(pinfo.first);
+  RC->addToCurrentIterationControlEnergy(pinfo.second);
+
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n",
+      current_iteration_time, current_iteration_energy);
+  INFO("target speedup = %lf\n\n", target_speedup);
+
+  // Note the end of iteration
+  RC->end_iteration();
+}
+
+void llvm_hpvm_invokeRtControl_SLOWDOWN(
+    void *result, const char *str, int start, int end) {
+
+  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
+  hpvm_rt_computeAccuracy3(labels_cached, result);
+
+  // Read stats for iteration that was just completed
+  double current_iteration_time = RC->getCurrentIterationComputeTime();
+  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+
+  std::string prev_conf_name =
+      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
+
+  RC->resume_profiler();
+  float slowdown = RC->getSlowdowns()->getNextSlowdown();
+  RC->findTargetConfiguration(slowdown, SPEEDUP);
+  RC->pause_profiler();
+
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationControlTime(pinfo.first);
+  RC->addToCurrentIterationControlEnergy(pinfo.second);
+
+  std::string next_conf_name =
+      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
+  float next_conf_speedup =
+      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
+
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n",
+      current_iteration_time, current_iteration_energy);
+  INFO("slowdown (target speedup) = %f\n", slowdown);
+  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
+  INFO(
+      "Swapping to next configuration: %s with speedup %f\n\n",
+      next_conf_name.c_str(), next_conf_speedup);
+
+  // Note the end of iteration
+  RC->end_iteration();
+}
+
+void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
+    void *result, const char *str, int start, int end) {
+
+  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
+  hpvm_rt_computeAccuracy3(labels_cached, result);
+
+  // Read stats for iteration that was just completed
+  double current_iteration_time = RC->getCurrentIterationComputeTime();
+  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+
+  std::string prev_conf_name =
+      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
+
+  RC->resume_profiler();
+  float slowdown = RC->getSlowdowns()->getNextSlowdown();
+  RC->findTargetConfiguration(slowdown, SPEEDUP);
+  RC->adjustTargetConfiguration(slowdown);
+  RC->pause_profiler();
+
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationControlTime(pinfo.first);
+  RC->addToCurrentIterationControlEnergy(pinfo.second);
+
+  std::string next_conf_name =
+      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
+  float next_conf_speedup =
+      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
+
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n",
+      current_iteration_time, current_iteration_energy);
+  INFO("slowdown (target speedup) = %f\n", slowdown);
+  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
+  INFO(
+      "Swapping to next configuration: %s with speedup %f\n\n",
+      next_conf_name.c_str(), next_conf_speedup);
+
+  // Note the end of iteration
+  RC->end_iteration();
+}
+
+void llvm_hpvm_invokeRtControl_RAND(
+    void *result, const char *str, int start, int end) {
+
+  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
+  hpvm_rt_computeAccuracy3(labels_cached, result);
+
+  // Read stats for iteration that was just completed
+  double current_iteration_time = RC->getCurrentIterationComputeTime();
+  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+
+  RC->resume_profiler();
+  RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP);
+  RC->pause_profiler();
+
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationControlTime(pinfo.first);
+  RC->addToCurrentIterationControlEnergy(pinfo.second);
+
+  INFO(
+      "current iteration time = %f, current iteration energy = %f\n\n",
+      current_iteration_time, current_iteration_energy);
+
+  // Note the end of iteration
+  RC->end_iteration();
+}
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
index 6929785df3532d917bff726e6d0af0b2dbb93842..7f10ee0acdb3da4ec7008b23cc3322ab3a0b13fa 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
@@ -4,7 +4,7 @@
 
 #include "functional/map.cuh"
 #include "functional/reduce.cuh"
-#include "tensor_utils.cu"
+#include "tensor_utils.h"
 
 #include <cufft.h>
 #include <cufftXt.h>
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
index dc65b8acdc31242879b4b1f1e34af1b1ccc661c2..de254b218853047b5f2147054c7a18a2faa71cf5 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
@@ -1,13 +1,17 @@
+#include <algorithm>
 #include <cmath>
 #include <cstring>
 #include <experimental/filesystem>
-#include <string>
+#include <numeric>
 #include <sstream>
+#include <string>
 
 #include "debug.h"
 #include "device_math.h"
+#include "functional/common.h"
 #include "img_tensor_runtime.h"
 #include "img_tensor_utils.h"
+#include "tensor_utils.h"
 
 // Image I/O utilities
 #define STB_IMAGE_IMPLEMENTATION
@@ -18,12 +22,12 @@
 
 static inline uint8_t *float_to_uint8(const float *fl, size_t count) {
   auto *ret = new uint8_t[count];
-  float max_v = *std::max_element(fl, fl + count), min_v = *std::min_element(fl, fl + count);
+  float max_v = *std::max_element(fl, fl + count),
+        min_v = *std::min_element(fl, fl + count);
   if (max_v - min_v < 1e-3) {
     for (size_t i = 0; i < count; i++)
       ret[i] = 0;
-  }
-  else {
+  } else {
     float frac = 255 / (max_v - min_v);
     for (size_t i = 0; i < count; i++)
       ret[i] = uint8_t(frac * (fl[i] - min_v));
@@ -149,7 +153,8 @@ readDataSet(const char *path, size_t start, size_t count, size_t n_color) {
       throw std::runtime_error("Image load failed");
     if (x != h || y != w) {
       std::ostringstream os;
-      os << "Image file " << path << " have different shape (" << x << ", " << y << ")";
+      os << "Image file " << path << " have different shape (" << x << ", " << y
+         << ")";
       throw std::runtime_error(os.str());
     }
     float *converted = uint8_to_float(data, n_floats);
@@ -398,7 +403,7 @@ float violationRate(
 
 float mean(const std::vector<float> &values) {
   std::vector<float> non_nan;
-  for (float f: values)
+  for (float f : values)
     if (!std::isnan(f))
       non_nan.push_back(f);
   if (non_nan.empty())
@@ -428,5 +433,6 @@ void reshape(void *t, const std::vector<size_t> &shape) {
   free(tensor->dims.dim_sizes);
   tensor->dims.dim_sizes = (size_t *)malloc(sizeof(size_t) * shape.size());
   std::copy(shape.begin(), shape.end(), tensor->dims.dim_sizes);
-  set4DTensorDescriptor(tensor, tensor->data_format, shape[0], shape[1], shape[2], shape[3]);
+  set4DTensorDescriptor(
+      tensor, tensor->data_format, shape[0], shape[1], shape[2], shape[3]);
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index 4210d5cfcee8b38c426c418d611e3e5264ec50b8..1d848430330c346f0e10efef407324f23defd911 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -1,8 +1,3 @@
-
-#ifndef TENSOR_UTILS_HEADER
-#define TENSOR_UTILS_HEADER
-
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -30,6 +25,7 @@
 #include <cublas_api.h>
 #include <vector>
 
+#include "tensor_utils.h"
 #include "tensor_runtime.h"
 #include "debug.h"
 #include "tensor.h"
@@ -58,7 +54,7 @@ void freeTensor(void* tensor_ptr){
 
 
 // Returns the size of the target datatype
-static int getTypeSize(int data_type){
+int getTypeSize(int data_type){
   // TODO: Add support for more data types
   switch (data_type) {
     case float_type:
@@ -530,14 +526,4 @@ void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement
     ERROR("Tensor == NULL");
   tensor->data_placement = data_placement;
 }
-
-
- 
-
-  
 } // end of Extern"C"
-
-
-#endif
-
-