diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
index cbabc8bbe0111a0ec6c99520176a8b37a530a4fb..be42ebec07cc5aebad8ad975155f86cc25715dab 100644
--- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -140,8 +140,12 @@ add_tensor_runtime(tensor_runtime_online -DONLINE_PROFILING=true -DFP16_tuning=f
 add_dependencies(tensor_runtime_online tensor_runtime)
 
 # Adding rule for the debugging source
-add_executable(unit_tests tests/unit_tests.cc)
-target_link_libraries(unit_tests tensor_runtime_online)
+add_executable(sampling_tests tests/sampling_tests.cc)
+target_link_libraries(sampling_tests tensor_runtime_online)
+
+add_executable(perforation_tests tests/perforation_tests.cc)
+target_link_libraries(perforation_tests tensor_runtime_online)
+
 
 # -- Compile tensor_runtime.ll if possible
 if(INDEP_BUILD)
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/jetson_freq_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/jetson_freq_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..7caee936a232516d6b8a4bd5531d09aa3e939ab9
--- /dev/null
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/jetson_freq_utils.h
@@ -0,0 +1,125 @@
+
+/****  
+
+   This file contains frequency-setting routines specific to the Jetson TX2.
+
+   NOTE: These routines are not used directly in the current code.
+
+   Users testing frequency changes on the Jetson TX2 (or similar devices) can use or repurpose these routines.
+
+***/
+
+#include <fstream>
+
+
+const int available_freqs[] = {
+    140250000,  // 0
+    229500000,  // 1
+    318750000,  // 2
+    408000000,  // 3
+    497250000,  // 4
+    586500000,  // 5
+    675750000,  // 6
+    765000000,  // 7
+    854250000,  // 8
+    943500000,  // 9
+    1032750000, // 10
+    1122000000, // 11
+    1211250000, // 12
+    1300500000  // 13
+};
+
+
+// Sets frequency
+void setFreq(unsigned freq_index) {
+
+  unsigned target_freq = available_freqs[freq_index];
+
+  const char *const min_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
+  const char *const max_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
+
+  std::ofstream min_stream;
+  std::ofstream max_stream;
+
+  min_stream.open(min_freq_file, std::ofstream::out);
+  max_stream.open(max_freq_file, std::ofstream::out);
+
+  min_stream << target_freq << std::flush;
+  max_stream << target_freq << std::flush;
+
+  min_stream.close();
+  max_stream.close();
+}
+
+// Records frequency
+unsigned recordFreq() {
+
+  // Current frequency file
+  const char *const cur_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
+  std::ifstream cur_stream;
+  cur_stream.open(cur_freq_file, std::ifstream::in);
+
+  // Get starting frequency
+  unsigned cur_freq;
+  cur_stream >> cur_freq;
+  std::cout << "Starting frequency = " << cur_freq << "\n";
+  cur_stream.close();
+
+  return cur_freq;
+}
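+
+// Illustrative usage sketch (hypothetical; writing the sysfs files above
+// normally requires root privileges on the board):
+//
+//   setFreq(13);                  // pin min/max GPU frequency to 1300500000 Hz
+//   unsigned freq = recordFreq(); // read back the current GPU frequency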
+
+// There will be no frequency request for the first batch
+// Therefore, we skip the first element by initializing to 1, not 0.
+FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf)
+    : idx_list(il), rep_factor(rf), count(1), idx(0) {}
+
+unsigned FrequencyIndexList::getNextIndex() {
+  if (count == rep_factor) {
+    count = 0;
+    idx = (idx + 1) % idx_list.size();
+  }
+  count++;
+  return idx_list[idx];
+}
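+
+// Worked example: with idx_list = {13, 12, 11, ...} and rep_factor = 10,
+// calls 1-9 return idx_list[0] (the first batch is skipped because count
+// starts at 1), calls 10-19 return idx_list[1], calls 20-29 return
+// idx_list[2], and so on, wrapping around at the end of the list.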
+
+
+void RuntimeController::readIterationFrequency() {
+  if (PI)
+    PI->readIterationFrequency();
+}
+
+unsigned long RuntimeController::getIterationFrequency() {
+  return (PI ? PI->getIterationFrequency() : 0);
+}
+
+void RuntimeController::updateFrequency() {
+#ifdef JETSON_EXECUTION
+  unsigned freq_idx = FIL->getNextIndex();
+  //--- updateJetsonGPUFreq(freq_idx);
+
+  setFreq(freq_idx);
+
+#endif // JETSON_EXECUTION
+}
+
+unsigned long RuntimeController::getLastFrequency() { return g_freq; }
+
+void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; }
+
+
+
+void ProfileInfo::readIterationFrequency() {
+#ifdef JETSON_EXECUTION
+  //----- frequency_current_iteration = readJetsonGPUFreq();
+  frequency_current_iteration = recordFreq();
+#else
+  frequency_current_iteration = 0;
+#endif // JETSON_EXECUTION
+}
+
+unsigned long ProfileInfo::getIterationFrequency() {
+  return frequency_current_iteration;
+}
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index bea66370ba073490fe7970014f1005f123e58988..0332313c573bcd28215a4277cd788e63a7820b2a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -3,155 +3,35 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file contains code for that allows the tensor runtime to adapt
-// in response to external changes in conditions (such as frequency changes)
-// by helping to choose correct approximation configurations. It also provides
-// routines for the rest of the runtime to get performance and energy profiling.
+//  This file contains code for HPVM Dynamic Approximation Control.
+//
+//  The runtime controller:
+//    * Reads in the configuration file passed to the HPVM binary
+//    * Constructs a Pareto curve
+//    * Based on the selected mode, it switches configurations at runtime
+//
+//   Author: Maria Kotsifakou
 //
 //===----------------------------------------------------------------------===//
 
-#include "hpvm-rt-controller.h"
-#include "global_data.h"
-#include <fstream>
-
-//-------- Functionality to read and update frequency on Jetson board -------//
-/*const char* available_freqs[] = {"140250000", "229500000", "318750000",
-                                 "408000000", "497250000", "586500000",
-                                 "675750000", "765000000", "854250000",
-                                 "943500000", "1032750000", "1122000000",
-                                 "1211250000", "1300500000"};
-
-*/
-
-const int available_freqs[] = {
-    140250000,  // 0
-    229500000,  // 1
-    318750000,  // 2
-    408000000,  // 3
-    497250000,  // 4
-    586500000,  // 5
-    675750000,  // 6
-    765000000,  // 7
-    854250000,  // 8
-    943500000,  // 9
-    1032750000, // 10
-    1122000000, // 11
-    1211250000, // 12
-    1300500000  // 13
-};
-
-/*void updateJetsonGPUFreq(int freq_level) {
-
-  if (freq_level < 0 || freq_level > 13) {
-    printf("ERROR: Provide freq level between {0, 13}  \n\n\n");
-    abort();
-  }
-
-  const char* freq_val = available_freqs[freq_level];
-  printf("freq-val[0] = %s \n", freq_val);
-
-  FILE* max_file =
-    fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq", "w+");
-  if (max_file == NULL) {
-    printf("Could not min_freq file \n");
-  }
-  fwrite(freq_val, strlen(freq_val), 1, max_file);
-  fclose(max_file);
-
-  FILE* min_file =
-    fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+");
-  if (min_file == NULL){
-    printf("Could not min_freq file \n");
-    abort();
-  }
-  fwrite(freq_val, strlen(freq_val), 1, min_file);
-  fclose(min_file);
-}
-
-unsigned long int readJetsonGPUFreq() {
-  FILE* cur_freq_file =
-    fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq", "r");
-//    fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "r");
-  if (cur_freq_file == NULL) {
-    printf("Could not open cur_freq file \n");
-  }
-
-  char buf[50];
-  char* ptr;
-
-  fread(buf, 50, 1, cur_freq_file);
-  unsigned long cur_freq = strtoul(buf, &ptr, 10);
-  fclose(cur_freq_file);
-  return cur_freq;
-}
-
-*/
-
-// Sets frequency
-void setFreq(unsigned freq_index) {
-
-  unsigned target_freq = available_freqs[freq_index];
-
-  const char *const min_freq_file =
-      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
-  const char *const max_freq_file =
-      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
-
-  std::ofstream min_stream;
-  std::ofstream max_stream;
-
-  min_stream.open(min_freq_file, std::ofstream::out);
-  max_stream.open(max_freq_file, std::ofstream::out);
 
-  min_stream << target_freq << std::flush;
-  max_stream << target_freq << std::flush;
+// ***NOTE*** The macro definitions below control the runtime policy
 
-  min_stream.close();
-  max_stream.close();
-}
+//--- llvm_hpvm_invokeRtControl_BASE is the baseline (default) policy: it always uses the first configuration in the configuration file
+#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl
+//--- llvm_hpvm_invokeRtControl_ADJUST_PR selects configurations probabilistically from the Pareto curve - uncomment to use it
+//#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
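+//
+// For example, to switch from the BASE policy to the probabilistic one, flip the
+// two definitions above:
+//
+//   //#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl
+//   #define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl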
 
-// Records frequency
-unsigned recordFreq() {
 
-  // Current frequency file
-  const char *const cur_freq_file =
-      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
-  std::ifstream cur_stream;
-  cur_stream.open(cur_freq_file, std::ifstream::in);
 
-  // Get starting frequency
-  unsigned cur_freq;
-  cur_stream >> cur_freq;
-  std::cout << "Starting frequency = " << cur_freq << "\n";
-  cur_stream.close();
+#include "hpvm-rt-controller.h"
+#include "global_data.h"
+#include "jetson_freq_utils.h"
+#include <fstream>
 
-  return cur_freq;
-}
 
 //---------------------------------------------------------------------------//
 
-/*
- * Check if a file exists
- * Return true if the file exists, false else
- */
-bool fileExists(const std::string &file) {
-  struct stat buf;
-  return (stat(file.c_str(), &buf) == 0);
-}
-
-// There will be no frequency request for the first batch
-// Therefore, we skip the first element by initializing to 1, not 0.
-FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf)
-    : idx_list(il), rep_factor(rf), count(1), idx(0) {}
-
-unsigned FrequencyIndexList::getNextIndex() {
-  if (count == rep_factor) {
-    count = 0;
-    idx = (idx + 1) % idx_list.size();
-  }
-  count++;
-  return idx_list[idx];
-}
 
 // Functions
 void ProfileInfo::resetCurrentIterationTime() {
@@ -208,18 +88,6 @@ void ProfileInfo::end_iteration() {
   in_iteration = false;
 }
 
-void ProfileInfo::readIterationFrequency() {
-#ifdef JETSON_EXECUTION
-  //----- frequency_current_iteration = readJetsonGPUFreq();
-  frequency_current_iteration = recordFreq();
-#else
-  frequency_current_iteration = 0;
-#endif // JETSON_EXECUTION
-}
-
-unsigned long ProfileInfo::getIterationFrequency() {
-  return frequency_current_iteration;
-}
 
 void ProfileInfo::addToCurrentIterationComputeTime(const char *s, double t) {
   start_iteration();
@@ -346,8 +214,8 @@ ProfileInfo::ProfileInfo()
       in_iteration(false) {}
 
 Slowdowns::Slowdowns() {
-  idx = 0;
 
+  idx = 0;
   std::ifstream s_in("slowdowns.txt");
   if (!s_in) {
     DEBUG("slowdowns file not found. Initializing slowdowns randomly.\n");
@@ -446,12 +314,8 @@ void RuntimeController::init(const char *Cstr) {
   //    compute3DParetoConfigurationPoints(); Not using 3D curve
   INFO("Speedup Configurations\n");
   printConfigurations(SpeedupConfigurations);
-  //    INFO("Energy Configurations\n");
-  //    printConfigurations(EnergyConfigurations);
-  //    INFO("3D Configurations\n");
-  //    printConfigurations(ThreeDCurveConfigurations);
-  configurationIdx =
-      0; // TODO: initialize using pareto curve - findTargetConfiguration ?
+
+  configurationIdx = 0; 
   Configurations = &SpeedupConfigurations;
 
   // Initializations for different runtime control strategies
@@ -461,10 +325,8 @@ void RuntimeController::init(const char *Cstr) {
   // Pseudo random variable (when we did few experiments)
   // or true random numbers for probabilistic control
   pseudo_rd = 0.0;
-  std::random_device
-      rd; // Will be used to obtain a seed for the random number engine
-  generator =
-      std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd()
+  std::random_device rd; // Will be used to obtain a seed for the random number engine
+  generator = std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd()
   distr = std::uniform_real_distribution<>(0.0, 1.0);
 
   g_freq = available_freqs[13];
@@ -526,24 +388,6 @@ double RuntimeController::getCurrentIterationComputeEnergy() {
   return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0);
 }
 
-void RuntimeController::readIterationFrequency() {
-  if (PI)
-    PI->readIterationFrequency();
-}
-
-unsigned long RuntimeController::getIterationFrequency() {
-  return (PI ? PI->getIterationFrequency() : 0);
-}
-
-void RuntimeController::updateFrequency() {
-#ifdef JETSON_EXECUTION
-  unsigned freq_idx = FIL->getNextIndex();
-  //--- updateJetsonGPUFreq(freq_idx);
-
-  setFreq(freq_idx);
-
-#endif // JETSON_EXECUTION
-}
 
 void RuntimeController::writeProfileInfo() {
   if (PI)
@@ -575,6 +419,7 @@ std::pair<double, double> RuntimeController::fc_profile(
     const unsigned num_rows_a, const unsigned num_cols_a,
     const unsigned num_rows_b, const unsigned num_cols_b,
     const unsigned voltage_swing, const unsigned patch_factor) {
+
   return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b,
                                         num_cols_b, voltage_swing, patch_factor)
                   : std::make_pair(0.0, 0.0));
@@ -585,6 +430,7 @@ std::pair<double, double> RuntimeController::conv_profile(
     const unsigned c_out, const unsigned c_in, const unsigned k_h,
     const unsigned k_w, const unsigned s_h, const unsigned s_w,
     const unsigned voltage_swing, const unsigned patch_factor) {
+  
   return (promise ? promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w,
                                           s_h, s_w, voltage_swing, patch_factor)
                   : std::make_pair(0.0, 0.0));
@@ -593,8 +439,12 @@ std::pair<double, double> RuntimeController::conv_profile(
 // Constructor and descructor
 RuntimeController::RuntimeController() {
   configurationIdx = 0;
+
+  // NOTE: The 14 frequency levels are specific to the NVIDIA Jetson TX2.
+  // More frequency utilities (not used by default) are in include/jetson_freq_utils.h.
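+  // With the arguments below, getNextIndex() walks the index list from 13 (the
+  // highest frequency) down to 0, repeating each index for rep_factor = 10
+  // consecutive calls before moving on (the very first index is repeated one
+  // time fewer, since the first batch issues no frequency request).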
   FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
                                10);
+  
 #ifdef ACTIVE_PROFILING
   PI = new ProfileInfo();
   profiler = new Profiler();
@@ -1052,20 +902,7 @@ void RuntimeController::computeParetoConfigurationPoints() {
     start_idx = end_idx;
   }
 
-  // All elements in InitialConfigurations whose index is in Indices are no
-  // longer needed.
-  //  for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e =
-  //  Indices.end();
-  //       idx_it != idx_e; ++idx_it) {
-  //    std::map<std::string, NodeConfiguration * > ConfSetup =
-  //      InitialConfigurations[*idx_it].setup;
-  //    for (std::map<std::string, NodeConfiguration* >::const_iterator it =
-  //    ConfSetup.begin();
-  //     it != ConfSetup.end(); ++it) {
-  //      delete it->second;
-  //    }
-  //  }
-  //  InitialConfigurations.clear();
+
 }
 
 void RuntimeController::compute3DParetoConfigurationPoints() {
@@ -1153,8 +990,9 @@ void RuntimeController::printConfigurations(
     std::vector<struct Configuration> &Confs) {
 
   for (std::vector<struct Configuration>::iterator it = Confs.begin(),
-                                                   ie = Confs.end();
+       ie = Confs.end();
        it != ie; ++it) {
+    
     it->print();
   }
 }
@@ -1163,15 +1001,13 @@ void RuntimeController::printConfigurations(
     std::vector<struct Configuration *> &Confs) {
 
   for (std::vector<struct Configuration *>::iterator it = Confs.begin(),
-                                                     ie = Confs.end();
+       ie = Confs.end();
        it != ie; ++it) {
+    
     (*it)->print();
   }
 }
 
-unsigned long RuntimeController::getLastFrequency() { return g_freq; }
-
-void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; }
 
 double RuntimeController::getLastSpeedup() { return g_speedup; }
 
@@ -1196,24 +1032,24 @@ void RuntimeController::findTargetConfiguration(float goal,
     // Assigning one of Pareto configs to 'Configurations' class attribute
     Configurations = &SpeedupConfigurations;
     low_it =
-        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
-                         goal, ConfigurationLessThan_SP());
+      std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+		       goal, ConfigurationLessThan_SP());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ENERGY: {
     Configurations = &EnergyConfigurations;
     low_it =
-        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
-                         goal, ConfigurationLessThan_E());
+      std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+		       goal, ConfigurationLessThan_E());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ACCURACY_LOSS: {
     Configurations = &SpeedupConfigurations;
     low_it =
-        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
-                         goal, ConfigurationLessThan_AL());
+      std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+		       goal, ConfigurationLessThan_AL());
     if ((*low_it)->accuracyLoss > goal)
       --low_it;
     configurationIdx = low_it - Configurations->begin();
@@ -1232,6 +1068,11 @@ void RuntimeController::findTargetConfiguration(float goal,
         configurationIdx);
 }
 
+/***  This routine takes as input a goal (target speedup) and computes the probability of selecting the higher
+     configuration (the one with speedup above the target) and the probability of selecting the lower
+     configuration (the one with speedup below the target).
+
+     Motivation: The Pareto curve often does not have a configuration providing the exact target speedup.
+***/
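+//
+// Worked example (hypothetical numbers): if the lower configuration gives a 1.5x
+// speedup, the higher one gives 2.0x, and the goal is 1.6x, then sp_diff = 0.5,
+// high_range = 2.0 - 1.6 = 0.4, and low_range = 1.6 - 1.5 = 0.1, so the higher
+// config is picked with probability 0.1 / 0.5 = 0.2 and the lower one with
+// probability 0.4 / 0.5 = 0.8; the expected speedup is then
+// 0.2 * 2.0 + 0.8 * 1.5 = 1.6, matching the goal.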
 void RuntimeController::adjustTargetConfiguration(float goal) {
 
   DEBUG("adjustTargetConfiguration: goalVal: %f.\n\n", goal);
@@ -1245,53 +1086,22 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
   // Get the two configurations' speedup, and compute the appropriate ranges
   float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup;
   float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup;
-  float sp_diff = curr_conf_speedup - prev_conf_speedup;
 
+  // Compute how far the target speedup is from the lower- and higher-speedup configurations
+  float sp_diff = curr_conf_speedup - prev_conf_speedup;
   float high_range = curr_conf_speedup - goal;
   float low_range = goal - prev_conf_speedup;
 
   // These represent how likely we are to pick the upper or lower configuration
   float high_pb = 0.0, low_pb = 0.0;
+  
   if (configurationIdx == prev_conf_idx) {
     high_pb = low_pb = 1.0;
-  } else {
+  }
+  else {
+    // Compute the probability of selecting the higher and the lower configuration
     high_pb = low_range / sp_diff;
     low_pb = high_range / sp_diff;
-
-    //***--- Probability adjustment strategy 1 ---***//
-    // No big adjustments at edges of probability range
-    //    float adjust_val = 0.0;
-    //    if (low_pb < high_pb) {
-    //      adjust_val = low_pb * 0.2;
-    //    } else {
-    //      adjust_val = high_pb * 0.2;
-    //    }
-    //    low_pb -= adjust_val;
-    //    high_pb += adjust_val;
-    //***---                                   ---***//
-
-    //***--- Probability adjustment strategy 2 ---***//
-    // No big adjustment at high edge of probability range
-    //    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
-    //    low_pb -= adjust_val;
-    //    high_pb += adjust_val;
-    //***---                                   ---***//
-
-    //***--- Probability adjustment strategy 3 ---***//
-    // Similar to 2, but higher always increases, more significantly
-    //    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
-    //    low_pb -= adjust_val;
-    //    high_pb += adjust_val;
-    //***---                                   ---***//
-
-    //***--- Probability adjustment strategy 4 ---***//
-    // Similar to 2, but higher always increases, more significantly
-    // Low end, high end a bit less aggressive than total range
-    float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6;
-    adjust_val = adjust_val > high_pb ? high_pb : adjust_val;
-    low_pb -= adjust_val;
-    high_pb += adjust_val;
-    //***---                                   ---***//
   }
 
   DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: "
@@ -1373,13 +1183,6 @@ uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
     fclose(file);
   }
 
-  //  int num_labels = end - start;
-  //  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  //  for (unsigned i = start; i < end; i++) {
-  //    labels[i-start] = labels_from_file[i];
-  //  }
-  //  return labels;
-
   // Return pointer to labels
   return &labels_from_file[start];
 }
@@ -1387,7 +1190,7 @@ uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
 static float average_accuracy = 0.0;
 static int num_executations = 0;
 
-//*** Copied from dnn_sources/include/utils.h                             ***//
+
 float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
   struct Tensor *result = (struct Tensor *)result_ptr;
@@ -1433,10 +1236,8 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
   return accuracy;
 }
 
-#define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl
-//#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
-//#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl
-
+// This routine is used when the llvm_hpvm_invokeRtControl macro is set to llvm_hpvm_invokeRtControl_BASE.
+// It is the default config selection routine: it always selects the first configuration in the config file.
 extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str,
                                                int start, int end) {
 
@@ -1462,92 +1263,12 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str,
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str,
-                                                  int start, int end) {
-
-  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->resume_profiler();
-  RC->findNextConfiguration();
-  // Still use findNext configuration, to update the configurationIdx,
-  // to point to next location
-  enum SEARCH_KIND k = ACCURACY_LOSS;
-  float goalVal =
-      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss;
-  RC->findTargetConfiguration(goalVal, k);
-
-  RC->pause_profiler();
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str,
-                                                 int start, int end) {
-
-  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-  RC->readIterationFrequency();
-
-  RC->resume_profiler();
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double target_speedup;
-  if (RC->getLastFrequency() == RC->getIterationFrequency()) {
-    target_speedup = RC->getLastSpeedup();
-  } else {
-    double baseline_time = RC->getBaselineTime();
-    // Relative to current configuration
-    target_speedup = current_iteration_time / baseline_time;
-    // Adjust to baseline
-    target_speedup *= RC->getCurrentConfigurationSpeedup();
-    RC->setLastFrequency(RC->getIterationFrequency());
-    RC->setLastSpeedup(target_speedup);
-  }
-  RC->findTargetConfiguration(target_speedup, SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  //*                                                                        *
-  //*Needed for the frequency variation experiment - not part of the control *
-  RC->resume_profiler();
-  RC->updateFrequency();
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo2 = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationConfigTime(pinfo2.first);
-  RC->addToCurrentIterationConfigEnergy(pinfo2.second);
-  //*                                                                        */
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("target speedup = %lf\n\n", target_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
 
+/// This routine is used when the `llvm_hpvm_invokeRtControl` macro is set to `llvm_hpvm_invokeRtControl_ADJUST_PR`.
+/// It performs probabilistic selection of configurations from the Pareto curve.
 extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
-                                                    const char *str, int start,
+                                                    const char *str,
+						    int start,
                                                     int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
@@ -1555,22 +1276,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
 
   // Read stats for iteration that was just completed
   double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-  RC->readIterationFrequency();
 
   RC->resume_profiler();
   double current_iteration_time = RC->getCurrentIterationComputeTime();
   double target_speedup;
-  if (RC->getLastFrequency() == RC->getIterationFrequency()) {
-    target_speedup = RC->getLastSpeedup();
-  } else {
-    double baseline_time = RC->getBaselineTime();
-    // Relative to current configuration
-    target_speedup = current_iteration_time / baseline_time;
-    // Adjust to baseline
-    target_speedup *= RC->getCurrentConfigurationSpeedup();
-    RC->setLastFrequency(RC->getIterationFrequency());
-    RC->setLastSpeedup(target_speedup);
-  }
+
+  double baseline_time = RC->getBaselineTime();
+  // Relative to current configuration
+  target_speedup = current_iteration_time / baseline_time;
+  // Adjust to baseline
+  target_speedup *= RC->getCurrentConfigurationSpeedup();
+  
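+  // Example (hypothetical numbers): with baseline_time = 100 ms, a current
+  // iteration time of 60 ms, and a current configuration speedup of 2.0x,
+  // target_speedup = (60 / 100) * 2.0 = 1.2, which is then passed to
+  // findTargetConfiguration() and adjustTargetConfiguration() below.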
   RC->findTargetConfiguration(target_speedup, SPEEDUP);
   RC->adjustTargetConfiguration(target_speedup);
   RC->pause_profiler();
@@ -1580,18 +1296,14 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  //*                                                                        *
-  //*Needed for the frequency variation experiment - not part of the control *
   RC->resume_profiler();
-  RC->updateFrequency();
   RC->pause_profiler();
 
   std::pair<double, double> pinfo2 = RC->get_time_energy();
   RC->reset_profiler();
   RC->addToCurrentIterationConfigTime(pinfo2.first);
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
-  //*                                                                        */
-
+ 
   INFO("current iteration time = %f, current iteration energy = %f\n",
        current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
@@ -1600,119 +1312,4 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result,
-                                                   const char *str, int start,
-                                                   int end) {
-
-  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  std::string prev_conf_name =
-      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-
-  RC->resume_profiler();
-  float slowdown = RC->getSlowdowns()->getNextSlowdown();
-  RC->findTargetConfiguration(slowdown, SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  std::string next_conf_name =
-      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-  float next_conf_speedup =
-      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("slowdown (target speedup) = %f\n", slowdown);
-  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result,
-                                                      const char *str,
-                                                      int start, int end) {
-
-  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  std::string prev_conf_name =
-      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-
-  RC->resume_profiler();
-  float slowdown = RC->getSlowdowns()->getNextSlowdown();
-  RC->findTargetConfiguration(slowdown, SPEEDUP);
-  RC->adjustTargetConfiguration(slowdown);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  std::string next_conf_name =
-      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name;
-  float next_conf_speedup =
-      RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
-
-  INFO("current iteration time = %f, current iteration energy = %f\n",
-       current_iteration_time, current_iteration_energy);
-  INFO("slowdown (target speedup) = %f\n", slowdown);
-  INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO("Swapping to next configuration: %s with speedup %f\n\n",
-       next_conf_name.c_str(), next_conf_speedup);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
 
-extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str,
-                                               int start, int end) {
-
-  uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
-  hpvm_rt_computeAccuracy3(labels_cached, result);
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->resume_profiler();
-  RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP);
-  RC->pause_profiler();
-
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
-
-template <typename T>
-static void writeVectorToFile(const char *path, const std::vector<T> &vec) {
-  std::ofstream of(path, std::ofstream::out | std::ofstream::app);
-  if (!of.good())
-    ERROR("Cannot write to %s file", path);
-  for (float f : vec)
-    of << f << ' ';
-  of << '\n';
-}
diff --git a/hpvm/projects/hpvm-tensor-rt/tests/perforation_tests.cc b/hpvm/projects/hpvm-tensor-rt/tests/perforation_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c36a2c81d04a64398e106006c49746e0dd70037b
--- /dev/null
+++ b/hpvm/projects/hpvm-tensor-rt/tests/perforation_tests.cc
@@ -0,0 +1,16 @@
+
+#include "tests.h"
+
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  UnitTestResults unitTestResults;
+  
+  testPerforation(unitTestResults);
+  
+  unitTestResults.printSummary();
+
+  return 0;
+}
diff --git a/hpvm/projects/hpvm-tensor-rt/tests/sampling_tests.cc b/hpvm/projects/hpvm-tensor-rt/tests/sampling_tests.cc
new file mode 100644
index 0000000000000000000000000000000000000000..087511413e56c7b8653ea5cb5d9798839af88ebb
--- /dev/null
+++ b/hpvm/projects/hpvm-tensor-rt/tests/sampling_tests.cc
@@ -0,0 +1,16 @@
+
+#include "tests.h"
+
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  UnitTestResults unitTestResults;
+  
+  testSampling(unitTestResults); 
+  
+  unitTestResults.printSummary();
+
+  return 0;
+}
diff --git a/hpvm/projects/hpvm-tensor-rt/tests/tests.h b/hpvm/projects/hpvm-tensor-rt/tests/tests.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2cf1d70de2e640568232c61abe39248d1eb9bc6
--- /dev/null
+++ b/hpvm/projects/hpvm-tensor-rt/tests/tests.h
@@ -0,0 +1,451 @@
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <vector>
+#include <string.h>
+#include "tensor_runtime.h"
+#include "tensor_cpu_runtime.h"
+#include "tensorUtils.h"
+#include "tensor_custom_ops_cpu.h"
+
+using namespace std;
+
+
+class UnitTestResults {
+
+private:
+  unsigned int total_tests;
+  unsigned int failed_tests;
+  unsigned int passed_tests;
+  std::vector<string> failed_test_ids;
+
+public:
+  UnitTestResults() {
+    total_tests = 0;
+    failed_tests = 0;
+    passed_tests = 0;
+  }
+
+  void evalTestResult(Tensor *res, const float *expected_result,
+                      size_t num_elems, float epsilon, string test_name) {
+
+    total_tests += 1;
+    if (res->num_elems != num_elems) {
+      failed_tests += 1;
+      failed_test_ids.push_back(test_name);
+      return;
+    }
+
+    float *data_ptr = (float *)res->host_data;
+    for (unsigned int i = 0; i < res->num_elems; i++) {
+      if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) {
+        failed_tests += 1;
+        failed_test_ids.push_back(test_name);
+        return;
+      }
+    }
+
+    passed_tests += 1;
+  }
+
+  void compareTensors(Tensor *res, Tensor *gold_res, float epsilon,
+                      string test_name) {
+
+    const float *expected_result = (float *)gold_res->host_data;
+    unsigned int num_elems = res->num_elems;
+
+    evalTestResult(res, expected_result, num_elems, epsilon, test_name);
+  }
+
+  void printSummary() {
+
+    printf("\n\n\n ************* Printing Results Summary ********** \n\n");
+    printf("-- Total tests :=  %d \n", total_tests);
+    printf("-- Tests Passed := %d \n", passed_tests);
+    printf("-- Tests Failed := %d \n", failed_tests);
+
+    printf("\n\n Tests that failed : \n\n");
+    for (int i = 0; i < failed_test_ids.size(); i++) {
+      printf("*** Test = %s \n", failed_test_ids[i].c_str());
+    }
+
+    if (failed_test_ids.size() > 0) {
+      printf("Some tests failed. Aborting.\n");
+      exit(1);
+    }
+    
+  }
+};
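+
+// Typical usage (see sampling_tests.cc and perforation_tests.cc):
+//
+//   llvm_hpvm_initTensorRt(0);
+//   UnitTestResults results;
+//   testSampling(results);    // or testPerforation(results)
+//   results.printSummary();   // exits with status 1 if any test failed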
+
+
+
+
+
+void testSampleFilter() {
+
+  printf("***** Tensor Sample Filter ***** \n\n");
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+
+  fillWithOnesAndTwos(input);
+
+  Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
+                                            3, 2, 32, 32);
+  fillTensorWithVal(input2, 1);
+  printTensorValues(input);
+
+  void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1);
+  printTensorValues(exact_res);
+
+  void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0);
+
+  printTensorValues(res);
+}
+
+void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
+                          int stride_h, int stride_w, int row, int col,
+                          UnitTestResults &unitTestResults) {
+
+  for (int offset = 0; offset < 2; offset++) {
+
+    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
+           "row = %d col = %d  offset= %d \n\n",
+           pad_h, pad_w, stride_h, stride_w, row, col, offset);
+
+    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1);
+
+    printf("tensorConvolution Result :");
+    printTensorValues(res_exact);
+
+    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, 1, 1, 1, 1);
+
+    printf("\nBaseline Result :");
+    printTensorValues(res_exact2);
+
+    void *res_exact3 = tensorConvApproxHalf2(
+        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
+    convertToFP32((struct Tensor *)res_exact3);
+
+    printf("\nFP16_Baseline Result :");
+    printTensorValues(res_exact3);
+
+    void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h,
+                                       stride_w, 1, 1, row, col, offset);
+
+    printf("\nConvPerfCuda Result :");
+    printTensorValues(res_sim);
+
+    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                  stride_w, 1, 1, row, col, 1, offset);
+
+    printf("\nConvApprox Result :");
+    printTensorValues(res);
+
+    hpvm_request_tensor(input, HOST);
+    hpvm_request_tensor(filter, HOST);
+
+    void *res_cpu = tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, row, col, 1, offset);
+
+    printf("\nConvApproxCPU Result :");
+    printTensorValues(res_cpu);
+
+    void *res_half =
+        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
+                              1, 1, row, col, 1, offset);
+
+    convertToFP32((struct Tensor *)res_half);
+
+    printf("\nConvApproxHalf2 Result :");
+    printTensorValues(res_half);
+
+    std::string suffix =
+        std::string(" pad_h = ") + std::to_string(pad_h) +
+        std::string(" pad_w = ") + std::to_string(pad_w) +
+        std::string(" stride_h = ") + std::to_string(stride_h) +
+        std::string(" stride_w = ") + std::to_string(stride_w) +
+        std::string(" row = ") + std::to_string(row) + std::string(" col = ") +
+        std::to_string(col) + std::string(" offset = ") +
+        std::to_string(offset);
+
+    std::string test_name = std::string("PERF_FP32 ") + suffix;
+
+    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05,
+                                   test_name);
+
+    std::string fp16_test_name = std::string("PERF_FP16 ") + suffix;
+    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1,
+                                    fp16_test_name);
+
+    std::string cpu_test_name = std::string("PERF_CPU ") + suffix;
+    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
+                                   cpu_test_name);
+  }
+
+  printf("\n\n\n--- End of Test \n\n\n");
+}
+
+/**** Tests Perforation for a set of different inputs */
+void testPerforation(UnitTestResults &unitTestResults) {
+
+  printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  fillTensorWithVal(input, 1);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  fillTensorWithVal(filter, 1);
+
+
+  testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2, unitTestResults);
+
+  testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1, unitTestResults);
+
+  testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3, unitTestResults);
+
+  testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1, unitTestResults);
+
+  testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4, unitTestResults);
+
+  testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1, unitTestResults);
+}
+
+void testSampling() {
+
+  printf("***** Testing Sampling ***** \n\n");
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  fillTensorWithVal(input, 1);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  fillTensorWithVal(filter, 1);
+
+  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
+  host_ptr[0] = 2;
+  host_ptr[2] = 2;
+  host_ptr[4] = 2;
+  host_ptr[6] = 2;
+  host_ptr[8] = 2;
+  host_ptr[10] = 2;
+  host_ptr[12] = 2;
+  host_ptr[14] = 2;
+  host_ptr[16] = 2;
+  host_ptr[18] = 2;
+  host_ptr[20] = 2;
+  host_ptr[22] = 2;
+  host_ptr[24] = 2;
+  host_ptr[26] = 2;
+  // printTensorValues(input);
+
+  void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
+
+  printTensorValues(res);
+
+  void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1);
+
+  printTensorValues(res2);
+
+  void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0);
+
+  printTensorValues(res2_sim);
+
+  void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0);
+
+  printTensorValues(res3);
+
+  void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
+
+  printTensorValues(res4);
+
+  void *res4_half =
+      tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
+
+  convertToFP32((struct Tensor *)res4_half);
+
+  printTensorValues(res4_half);
+}
+
+void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
+                       int stride_h, int stride_w, int skip_every,
+                       std::string filter_string,
+                       UnitTestResults &unitTestResults) {
+
+  float interpolation_rate = 1.0;
+  for (int offset = 0; offset < 2; offset++) {
+
+    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
+           "skip_every = %d offset= %d interpolation_rate = %f \n\n",
+           pad_h, pad_w, stride_h, stride_w, skip_every, offset,
+           interpolation_rate);
+
+    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1);
+
+    printf("tensorConvolution Result :");
+    printTensorValues(res_exact);
+
+    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, 1, 1, 1, 1);
+
+    printf("\nBaseline Result :");
+    printTensorValues(res_exact2);
+
+    void *res_exact3 = tensorConvApproxHalf2(
+        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
+    convertToFP32((struct Tensor *)res_exact3);
+
+    printf("\nFP16_Baseline Result :");
+    printTensorValues(res_exact3);
+
+    void *res_sim =
+        tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
+                           1, skip_every, offset, interpolation_rate);
+
+    printf("\nConvSampSim Result :");
+    printTensorValues(res_sim);
+
+    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                 stride_w, 1, 1, 1, 1, skip_every, offset);
+
+    printf("\nConvApprox Result :");
+    printTensorValues(res);
+
+    hpvm_request_tensor(input, HOST);
+    hpvm_request_tensor(filter, HOST);
+
+    void *res_cpu =
+        tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
+                            1, 1, 1, skip_every, offset);
+
+    printf("\nConvApproxCPU Result :");
+    printTensorValues(res_cpu);
+
+    void *res_half =
+        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
+                              1, 1, 1, 1, skip_every, offset);
+
+    convertToFP32((struct Tensor *)res_half);
+
+    printf("\nConvApproxHalf2 Result :");
+    printTensorValues(res_half);
+
+    std::string suffix =
+        "filter = " + std::string(filter_string) + std::string(" pad_h = ") +
+        std::to_string(pad_h) + std::string(" pad_w = ") +
+        std::to_string(pad_w) + std::string(" stride_h = ") +
+        std::to_string(stride_h) + std::string(" stride_w = ") +
+        std::to_string(stride_w) + std::string(" skip_every = ") +
+        std::to_string(skip_every) + std::string(" offset = ") +
+        std::to_string(offset);
+
+    std::string test_name = std::string("SAMP_FP32 ") + suffix;
+
+    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05,
+                                   test_name);
+
+    std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix;
+    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1,
+                                   fp16_test_name);
+
+    std::string cpu_test_name = std::string("SAMP_CPU ") + suffix;
+    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
+                                   cpu_test_name);
+  }
+
+  printf("\n\n\n --- End of Test \n\n\n");
+}
+
+/**** Tests Sampling with a 3 * 3 filter */
+void testSampling_3_3(UnitTestResults &unitTestResults) {
+
+  printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  fillTensorWithVal(input, 1);
+  // fillWithOnesAndTwos(input);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  fillTensorWithVal(filter, 1);
+
+  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
+  host_ptr[0] = 10;
+  host_ptr[2] = 2;
+  host_ptr[4] = 2;
+  host_ptr[6] = 2;
+  host_ptr[8] = 2;
+  host_ptr[10] = 2;
+  host_ptr[12] = 2;
+  host_ptr[14] = 2;
+  host_ptr[16] = 2;
+  host_ptr[18] = 2;
+  host_ptr[20] = 2;
+  host_ptr[22] = 2;
+  host_ptr[24] = 2;
+  host_ptr[26] = 10;
+
+  // Tests with padding = 0 stride = 1
+  testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "3_3", unitTestResults);
+
+  testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "3_3", unitTestResults);
+
+  testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "3_3", unitTestResults);
+
+  // Tests with padding = 1 stride = 1
+  testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "3_3", unitTestResults);
+
+  testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "3_3", unitTestResults);
+
+  testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "3_3", unitTestResults);
+
+  // Tests with padding = 1 stride = 2
+  testSamplingCalls(input, filter, 1, 1, 2, 2, 2, "3_3", unitTestResults);
+
+  testSamplingCalls(input, filter, 1, 1, 2, 2, 3, "3_3", unitTestResults);
+
+  testSamplingCalls(input, filter, 1, 1, 2, 2, 4, "3_3", unitTestResults);
+}
+
+/**** Tests Sampling with a 1 * 1 filter */
+void testSampling_1_1(UnitTestResults &unitTestResults) {
+
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2);
+  fillTensorWithVal(input, 2);
+  // fillWithOnesAndTwos(input);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1);
+  fillTensorWithVal(filter, 2);
+
+  // Tests with padding = 0 stride = 1
+  testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "1_1", unitTestResults);
+
+  testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "1_1", unitTestResults);
+
+  testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "1_1", unitTestResults);
+
+  // Tests with padding = 1 stride = 1
+  testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "1_1", unitTestResults);
+
+  testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "1_1", unitTestResults);
+
+  testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "1_1", unitTestResults);
+}
+
+
+
+void testSampling(UnitTestResults &unitTestResults){
+
+  testSampling_3_3(unitTestResults);
+  testSampling_1_1(unitTestResults);
+}
+
diff --git a/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc
deleted file mode 100644
index ffb4c3a809b3e936f6c27ebd7c11aef5c4460104..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc
+++ /dev/null
@@ -1,1120 +0,0 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <vector>
-#include <string.h>
-#include "tensor_runtime.h"
-#include "tensor_cpu_runtime.h"
-#include "tensorUtils.h"
-#include "tensor_custom_ops_cpu.h"
-
-using namespace std;
-
-class UnitTestResults {
-
-private:
-  unsigned int total_tests;
-  unsigned int failed_tests;
-  unsigned int passed_tests;
-  std::vector<string> failed_test_ids;
-
-public:
-  UnitTestResults() {
-    total_tests = 0;
-    failed_tests = 0;
-    passed_tests = 0;
-  }
-
-  void evalTestResult(Tensor *res, const float *expected_result,
-                      size_t num_elems, float epsilon, string test_name) {
-
-    total_tests += 1;
-    if (res->num_elems != num_elems) {
-      failed_tests += 1;
-      failed_test_ids.push_back(test_name);
-      return;
-    }
-
-    float *data_ptr = (float *)res->host_data;
-    for (unsigned int i = 0; i < res->num_elems; i++) {
-      // printf("**diff value = %f ", std::abs(data_ptr[i] -
-      // expected_result[i]));
-      if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) {
-        failed_tests += 1;
-        failed_test_ids.push_back(test_name);
-        return;
-      }
-    }
-
-    passed_tests += 1;
-  }
-
-  void compareTensors(Tensor *res, Tensor *gold_res, float epsilon,
-                      string test_name) {
-
-    const float *expected_result = (float *)gold_res->host_data;
-    unsigned int num_elems = res->num_elems;
-
-    evalTestResult(res, expected_result, num_elems, epsilon, test_name);
-  }
-
-  void printSummary() {
-
-    printf("\n\n\n ************* Printing Results Summary ********** \n\n");
-    printf("-- Total tests :=  %d \n", total_tests);
-    printf("-- Tests Passed := %d \n", passed_tests);
-    printf("-- Tests Failed := %d \n", failed_tests);
-
-    printf("\n\n Tests that failed : \n\n");
-    for (int i = 0; i < failed_test_ids.size(); i++) {
-      printf("*** Test = %s \n", failed_test_ids[i].c_str());
-    }
-  }
-};
-
-void testTensorHgemm(UnitTestResults &unitTestResults) {
-
-  printf("***** TensorHgemm ***** \n\n");
-  void *lhs_ptr =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
-  struct Tensor *lhs = (struct Tensor *)lhs_ptr;
-  fillTensorWithOnes(lhs);
-
-  float *data_arr = (float *)lhs->host_data;
-  for (int i = 0; i < lhs->num_elems; i++) {
-    data_arr[i] = (i / 4) + 1;
-  }
-
-  void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
-  fillTensorWithOnes(rhs);
-
-  void *output = tensorHalfGemm(lhs, rhs);
-  convertToFP32((struct Tensor *)output);
-
-  printTensorValues(output);
-
-  const float expected_result[15] = {4,  4,  4,  8,  8,  8,  12, 12,
-                                     12, 16, 16, 16, 20, 20, 20};
-
-  unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01,
-                                 "Hgemm");
-}
-
-void testTensorSgemm(UnitTestResults &unitTestResults) {
-
-  printf("***** TensorSgemm ***** \n\n");
-  void *lhs_ptr =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
-  struct Tensor *lhs = (struct Tensor *)lhs_ptr;
-  fillTensorWithOnes(lhs);
-
-  float *data_arr = (float *)lhs->host_data;
-  for (int i = 0; i < lhs->num_elems; i++) {
-    data_arr[i] = (i / 4) + 1;
-  }
-
-  void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
-  fillTensorWithOnes(rhs);
-
-  void *output = tensorGemmGPU(lhs, rhs);
-  printTensorValues(output);
-
-  const float expected_result[15] = {4,  4,  4,  8,  8,  8,  12, 12,
-                                     12, 16, 16, 16, 20, 20, 20};
-
-  unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01,
-                                 "Sgemm");
-}
-
-void testTensorConcatAndSplit() {
-
-  int conv_mode = 1;         // CROSS_CORRELATION mode
-  int compute_precision = 0; // floating point precision
-
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
-  fillWithOnesAndTwos(input);
-  void **splits = tensorSplit(input, 2, 1);
-
-  void *conv2W =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
-  fillTensorWithOnes(conv2W);
-
-  void **conv2fils = tensorSplit(conv2W, 2, 0);
-
-  void *conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, 1, 1,
-                                       conv_mode, compute_precision);
-  printTensorDims(conv2a_out);
-
-  void *conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, 1, 1,
-                                       conv_mode, compute_precision);
-  printTensorDims(conv2b_out);
-
-  void *conv2_outs[2];
-  conv2_outs[0] = conv2a_out;
-  conv2_outs[1] = conv2b_out;
-
-  void *conv2_concat_out = tensorConcat(conv2_outs, 2, 1);
-  printTensorDims(conv2_concat_out);
-  printTensorValues(conv2_concat_out);
-}
-
-void testLRN() {
-
-  void *input =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20);
-  fillTensorWithOnes(input);
-
-  unsigned LRN_window = 5;
-  double LRN_alpha = 2e-05;
-  printf("LRN_alpha = %f \n", LRN_alpha);
-
-  double LRN_beta = 0.75;
-  double LRN_k = 1.0;
-
-  // TEST-point - Compare TF vs CUDNN
-  void *lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k);
-  printTensorDims(lrn1out);
-  dumpWeightsToFile("tensors_out/lrn1_test.out", lrn1out);
-
-  void *input2 =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7);
-  fillTensorWithOnes(input2);
-
-  LRN_window = 5;
-  LRN_alpha = 0.5 * LRN_window;
-
-  LRN_beta = 0.75;
-  LRN_k = 1.0;
-
-  void *lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k);
-  printTensorDims(lrn2out);
-  dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out);
-}
-
-void testTensorAdd() {
-
-  // Tensor add with equal dimensions
-  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
-  void *bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
-  fillTensorWithOnes(x);
-  fillTensorWithOnes(bias);
-
-  printTensorValues(x);
-  printTensorValues(bias);
-
-  tensorAdd(x, bias);
-  printTensorValues(x);
-
-  // Tensor addd with matching channel dimension
-  void *x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2);
-  void *bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1);
-  fillTensorWithOnes(x2);
-  fillTensorWithOnes(bias2);
-
-  tensorAdd(x2, bias2);
-  printTensorValues(x2);
-}
-
-void testTensorConv() {
-
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void *filter =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
-
-  fillTensorWithOnes(input);
-  fillTensorWithOnes(filter);
-
-  int conv_mode = 1;         // NOTE: uses CROSS_CORRELATION
-  int compute_precision = 0; // floating point precision for conv
-
-  void *conv_out = tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode,
-                                     compute_precision);
-  printTensorValues(conv_out);
-}
-
-void testTensorHalfConv() {
-
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void *filter =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
-
-  fillTensorWithOnes(input);
-  fillTensorWithOnes(filter);
-
-  int conv_mode = 1;         // NOTE: uses CROSS_CORRELATION
-  int compute_precision = 0; // floating point precision for conv
-
-  void *conv_out = tensorHalfConvolution(input, filter, 0, 0, 1, 1, conv_mode,
-                                         compute_precision);
-  printTensorValues(conv_out);
-}
-
-void testTensorGroupConv() {
-
-  // NOTE: The input channel count value (param2 to Tensor and Filter) must be
-  // the same
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void *filter =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
-
-  // FIXIT: fillTensor* calls should be replaced with initTensorValue(tensor,
-  // val)
-  fillTensorWithOnes(input);
-  fillTensorWithOnes(filter);
-
-  int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
-  int conv_groups = 2;
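-  // With conv_groups = 2, each single-channel 3x3 filter convolves one input
-  // channel, so every value of conv_out should be 1 * 3 * 3 = 9.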
-
-  void *conv_out =
-      tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups);
-  printTensorValues(conv_out);
-}
-
-void testTensorHalfGroupConv() {
-
-  // NOTE: The input channel count value (param2 to Tensor and Filter) must be
-  // the same
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void *filter =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
-
-  fillTensorWithOnes(input);
-  fillTensorWithOnes(filter);
-
-  int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
-  int conv_groups = 2;
-
-  void *conv_out =
-      tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups);
-
-  convertToFP32((struct Tensor *)conv_out);
-
-  printTensorValues(conv_out);
-}
-
-void testTensorPooling() {
-
-  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
-  fillTensorWithOnes(x);
-
-  float *data_arr = (float *)((Tensor *)x)->host_data;
-  for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) {
-    data_arr[i] = i;
-  }
-
-  void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
-  printTensorValues(output);
-}
-
-void testTensorHalfPooling() {
-
-  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
-  fillTensorWithOnes(x);
-
-  float *data_arr = (float *)((Tensor *)x)->host_data;
-  for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) {
-    data_arr[i] = i;
-  }
-
-  void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
-  convertToFP32((struct Tensor *)output);
-
-  printTensorValues(output);
-}
-
-void testTensorBatchNorm() {
-
-  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
-  fillTensorWithVal(x, 3);
-
-  void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(gamma, 1);
-
-  void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(beta, 0);
-
-  void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(mean, 1);
-
-  void *variance =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(variance, 1);
-
-  double epsilon = 1;
-  // NOTE: result = (X - mean) / sqrt(epsilon + variance)
-  void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
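-  // With x = 3, gamma = 1, beta = 0, mean = 1, variance = 1, epsilon = 1, the
-  // formula above gives (3 - 1) / sqrt(1 + 1), i.e. roughly 1.41 per element.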
-
-  printTensorValues(output);
-}
-
-void testTensorHalfBatchNorm() {
-
-  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
-  fillTensorWithVal(x, 3);
-
-  void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(gamma, 1);
-
-  void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(beta, 0);
-
-  void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(mean, 1);
-
-  void *variance =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
-  fillTensorWithVal(variance, 1);
-
-  double epsilon = 1;
-  // NOTE: result = (X - mean) / sqrt(epsilon + variance)
-  void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
-  convertToFP32((struct Tensor *)output);
-
-  printTensorValues(output);
-}
-
-void testTensorRelu() {
-
-  // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match
-  printf("***** TensorRelu ***** \n\n");
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
-  fillTensorWithNegOnes(input);
-
-  void *output = tensorRelu(input);
-  printTensorValues(output);
-}
-
-void testTensorSoftmax() {
-
-  printf("***** TensorSoftmax ***** \n\n");
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1);
-
-  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
-  host_ptr[0] = 0.1;
-  host_ptr[1] = 0.2;
-  host_ptr[2] = 0.3;
-  host_ptr[3] = 0.4;
-  host_ptr[4] = 0.5;
-  host_ptr[5] = 0.6;
-  host_ptr[6] = 0.7;
-  host_ptr[7] = 2.5;
-
-  void *output = tensorSoftmax(input);
-  printTensorValues(output);
-}
-
-void testSoftmaxOutput(void *output_ptr) {
-
-  struct Tensor *output = (struct Tensor *)output_ptr;
-
-  size_t batch_dim = output->dims.dim_sizes[0];
-  size_t channels = output->dims.dim_sizes[1];
-
-  float *data = (float *)output->host_data;
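-  // Each batch row of a well-formed softmax output should sum to ~1.0.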
-  for (int i = 0; i < batch_dim; i++) {
-    float sum = 0.0;
-    for (int j = 0; j < channels; j++) {
-      sum += data[i * channels + j];
-    }
-    printf("output_sum = %f \n", sum);
-  }
-}
-
-void testPromiseError() {
-
-  printf("***** TensorQuantize ***** \n\n");
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
-  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
-
-  void *gold_tensor =
-      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
-  float *gold_ptr = (float *)((struct Tensor *)gold_tensor)->host_data;
-
-  gold_ptr[0] = -1;
-  gold_ptr[1] = -2;
-  gold_ptr[2] = -3;
-  gold_ptr[3] = -4;
-  gold_ptr[4] = -5;
-  gold_ptr[5] = 0;
-  gold_ptr[6] = 5;
-  gold_ptr[7] = 4;
-  gold_ptr[8] = 3;
-  gold_ptr[9] = 2;
-  gold_ptr[10] = 1;
-  gold_ptr[11] = 1;
-
-  int num_elems = 12;
-  int num_runs = 1000;
-
-  float *result_ptr = (float *)malloc(sizeof(float) * num_elems);
-
-  for (int swing = 1; swing <= 7; swing++) {
-
-    for (int j = 0; j < num_elems; j++) {
-      result_ptr[j] = 0;
-    }
-
-    float error_sum = 0.0;
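-    // error_sum accumulates the squared element-wise deviation from the gold
-    // values across all runs; mean_error below is error_sum / num_runs.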
-
-    for (int i = 0; i < num_runs; i++) {
-      host_ptr[0] = -1;
-      host_ptr[1] = -2;
-      host_ptr[2] = -3;
-      host_ptr[3] = -4;
-      host_ptr[4] = -5;
-      host_ptr[5] = 0;
-      host_ptr[6] = 5;
-      host_ptr[7] = 4;
-      host_ptr[8] = 3;
-      host_ptr[9] = 2;
-      host_ptr[10] = 1;
-      host_ptr[11] = 1;
-
-      void *error_out = addPromiseError(input, swing);
-      // printTensorValues(error_out);
-
-      // Move result data back to the host
-      hpvm_request_tensor(input, 0);
-      float *error_out_ptr = (float *)((struct Tensor *)input)->host_data;
-
-      for (int j = 0; j < num_elems; j++) {
-        result_ptr[j] += error_out_ptr[j];
-        error_sum +=
-            (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]);
-      }
-    }
-
-    printf("\n\n - Swing %d results : \n", swing);
-    for (int j = 0; j < num_elems; j++) {
-      result_ptr[j] = result_ptr[j] / num_runs;
-      printf(" %f ", result_ptr[j]);
-    }
-
-    printf("mean_error = %f \n", error_sum / num_runs);
-
-    printf(" \n");
-  }
-}
-
-void testQuantization() {
-
-  printf("***** TensorQuantize ***** \n\n");
-  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
-
-  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
-  host_ptr[0] = -0.1;
-  host_ptr[1] = -25;
-  host_ptr[2] = 0.2;
-  host_ptr[3] = -0.4;
-  host_ptr[4] = 1.7;
-  host_ptr[5] = -2.9;
-  host_ptr[6] = 0.7;
-  host_ptr[7] = 0.99;
-  host_ptr[8] = 7;
-  host_ptr[9] = 7.2;
-  host_ptr[10] = 2.5;
-  host_ptr[11] = 3;
-
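-  // NOTE (assumption): values outside the quantization range (e.g., -25 with
-  // range [-4, 6]) are expected to be clamped to [min, max] in the output.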
-  void *quantize_result1 = quantizeTensorPromise(input, -4, 6);
-
-  printf("\n ** quantizing with range min = %d max = %d \n", -4, 6);
-  printTensorValues(quantize_result1);
-
-  host_ptr[0] = -0.1;
-  host_ptr[1] = -25;
-  host_ptr[2] = 0.2;
-  host_ptr[3] = -0.4;
-  host_ptr[4] = 1.7;
-  host_ptr[5] = -2.9;
-  host_ptr[6] = 0.7;
-  host_ptr[7] = 0.99;
-  host_ptr[8] = 7;
-  host_ptr[9] = 7.2;
-  host_ptr[10] = 2.5;
-  host_ptr[11] = 3;
-
-  void *quantize_result2 = quantizeTensorPromise(input, -2, 2);
-
-  printf("\n ** quantizing with range min = %d max = %d \n", -2, 2);
-  printTensorValues(quantize_result2);
-
-  host_ptr[0] = -0.1;
-  host_ptr[1] = -25;
-  host_ptr[2] = 0.2;
-  host_ptr[3] = -0.4;
-  host_ptr[4] = 1.7;
-  host_ptr[5] = -2.9;
-  host_ptr[6] = 0.7;
-  host_ptr[7] = 0.99;
-  host_ptr[8] = 7;
-  host_ptr[9] = 7.2;
-  host_ptr[10] = 2.5;
-  host_ptr[11] = 3;
-
-  void *quantize_result3 = quantizeTensorPromise(input, -25, 8);
-
-  printf("\n ** quantizing with range min = %d max = %d \n", -25, 8);
-  printTensorValues(quantize_result3);
-
-  printf("\n ** quantizing with range min = %d max = %d \n", -10, 10);
-
-  host_ptr[0] = -0.1;
-  host_ptr[1] = -25;
-  host_ptr[2] = 0.2;
-  host_ptr[3] = -0.4;
-  host_ptr[4] = 1.7;
-  host_ptr[5] = -2.9;
-  host_ptr[6] = 0.7;
-  host_ptr[7] = 0.99;
-  host_ptr[8] = 7;
-  host_ptr[9] = 7.2;
-  host_ptr[10] = 2.5;
-  host_ptr[11] = 3;
-
-  void *quantize_result4 = quantizeTensorPromise(input, -10, 10);
-  printTensorValues(quantize_result4);
-
-  void *quantize_result5 = quantizeTensorPromise(input, -10, 10);
-  printTensorValues(quantize_result5);
-
-  // void* error_out = addPromiseError(quantize_result, 1);
-  // printTensorValues(error_out);
-}
-
-void testSampleFilter() {
-
-  printf("***** Tensor Sample Filter ***** \n\n");
-  Tensor *input =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
-  // fillTensorWithVal(input, 3);
-  fillWithOnesAndTwos(input);
-
-  Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
-                                            3, 2, 32, 32);
-  fillTensorWithVal(input2, 1);
-
-  /*  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
-  host_ptr[0] = -0.1;
-  host_ptr[1] = -25;
-  host_ptr[2] = 0.2;
-  host_ptr[3] = -0.4;
-  host_ptr[4] = 1.7;
-  host_ptr[5] = -2.9;
-  host_ptr[6] = 0.7;
-  host_ptr[7] = 0.99;
-  */
-
-  printTensorValues(input);
-
-  /*  printf("\n\n");
-
-  hpvm_request_tensor(input, DEVICE);
-
-  sampleFilter(input, 2, 1);
-
-  hpvm_request_tensor(input, HOST);
-
-  printTensorValues(input);
-  */
-
-  void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1);
-  printTensorValues(exact_res);
-
-  void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0);
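-  // Sampled-convolution simulation (last two args assumed to be skip_every = 4
-  // and offset = 0, matching the tensorConvSampSim2 usage below); compare
-  // against exact_res above to gauge the sampling error.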
-
-  // void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3);
-
-  printTensorValues(res);
-}
-
-void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
-                          int stride_h, int stride_w, int row, int col,
-                          UnitTestResults &unitTestResults) {
-
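-  // For each perforation offset (0 and 1): compute the exact convolution, the
-  // FP32/FP16 baselines, and the perforated GPU/CPU/FP16 variants, then
-  // compare each approximate result against the tensorConvPerfCuda reference.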
-  float interpolation_rate = 1.0;
-  for (int offset = 0; offset < 2; offset++) {
-
-    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
-           "row = %d col = %d offset = %d \n\n",
-           pad_h, pad_w, stride_h, stride_w, row, col, offset);
-
-    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
-                                        stride_w, 1, 1);
-
-    printf("tensorConvolution Result :");
-    printTensorValues(res_exact);
-
-    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
-                                        stride_w, 1, 1, 1, 1, 1, 1);
-
-    printf("\nBaseline Result :");
-    printTensorValues(res_exact2);
-
-    void *res_exact3 = tensorConvApproxHalf2(
-        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
-    convertToFP32((struct Tensor *)res_exact3);
-
-    printf("\nFP16_Baseline Result :");
-    printTensorValues(res_exact3);
-
-    void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h,
-                                       stride_w, 1, 1, row, col, offset);
-
-    printf("\nConvPerfCuda Result :");
-    printTensorValues(res_sim);
-
-    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
-                                 stride_w, 1, 1, row, col, 1, offset);
-
-    printf("\nConvApprox Result :");
-    printTensorValues(res);
-
-    hpvm_request_tensor(input, HOST);
-    hpvm_request_tensor(filter, HOST);
-
-    void *res_cpu = tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h,
-                                        stride_w, 1, 1, row, col, 1, offset);
-
-    printf("\nConvApproxCPU Result :");
-    printTensorValues(res_cpu);
-
-    void *res_half =
-        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
-                              1, 1, row, col, 1, offset);
-
-    convertToFP32((struct Tensor *)res_half);
-
-    printf("\nConvApproxHalf2 Result :");
-    printTensorValues(res_half);
-
-    std::string suffix =
-        std::string(" pad_h = ") + std::to_string(pad_h) +
-        std::string(" pad_w = ") + std::to_string(pad_w) +
-        std::string(" stride_h = ") + std::to_string(stride_h) +
-        std::string(" stride_w = ") + std::to_string(stride_w) +
-        std::string(" row = ") + std::to_string(row) + std::string(" col = ") +
-        std::to_string(col) + std::string(" offset = ") +
-        std::to_string(offset);
-
-    std::string test_name = std::string("PERF_FP32 ") + suffix;
-
-    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05,
-                                   test_name);
-
-    std::string fp16_test_name = std::string("PERF_FP16 ") + suffix;
-    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1,
-                                   fp16_test_name);
-
-    std::string cpu_test_name = std::string("PERF_CPU ") + suffix;
-    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
-                                   cpu_test_name);
-  }
-
-  printf("\n\n\n--- End of Test \n\n\n");
-}
-
-/**** Tests Perforation for a set of different inputs */
-void testPerforation(UnitTestResults &unitTestResults) {
-
-  printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
-  Tensor *input =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
-  fillTensorWithVal(input, 1);
-
-  Tensor *filter =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
-  fillTensorWithVal(filter, 1);
-
-  /*
-  float* host_ptr = (float*) ((struct Tensor*) filter)->host_data;
-  host_ptr[0] = 2;
-  host_ptr[2] = 2;
-  host_ptr[4] = 2;
-  host_ptr[6] = 2;
-  host_ptr[8] = 2;
-  host_ptr[10] = 2;
-  host_ptr[12] = 2;
-  host_ptr[14] = 2;
-  host_ptr[16] = 2;
-  host_ptr[18] = 2;
-  host_ptr[20] = 2;
-  host_ptr[22] = 2;
-  host_ptr[24] = 2;
-  host_ptr[26] = 2;
-  */
-
-  testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2, unitTestResults);
-
-  testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1, unitTestResults);
-
-  testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3, unitTestResults);
-
-  testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1, unitTestResults);
-
-  testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4, unitTestResults);
-
-  testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1, unitTestResults);
-}
-
-void testSampling() {
-
-  printf("***** Testing Sampling ***** \n\n");
-  Tensor *input =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
-  fillTensorWithVal(input, 1);
-  // fillWithOnesAndTwos(input);
-
-  Tensor *filter =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
-  fillTensorWithVal(filter, 1);
-
-  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
-  host_ptr[0] = 2;
-  host_ptr[2] = 2;
-  host_ptr[4] = 2;
-  host_ptr[6] = 2;
-  host_ptr[8] = 2;
-  host_ptr[10] = 2;
-  host_ptr[12] = 2;
-  host_ptr[14] = 2;
-  host_ptr[16] = 2;
-  host_ptr[18] = 2;
-  host_ptr[20] = 2;
-  host_ptr[22] = 2;
-  host_ptr[24] = 2;
-  host_ptr[26] = 2;
-  // printTensorValues(input);
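-  // The calls below vary only the last two tensorConvApprox arguments
-  // (skip_every and offset, following the argument order used in
-  // testSamplingCalls) while keeping the perforation factors row = col = 1.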
-
-  void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
-
-  printTensorValues(res);
-
-  void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1);
-
-  printTensorValues(res2);
-
-  void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0);
-
-  printTensorValues(res2_sim);
-
-  void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0);
-
-  printTensorValues(res3);
-
-  void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
-
-  printTensorValues(res4);
-
-  void *res4_half =
-      tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
-
-  convertToFP32((struct Tensor *)res4_half);
-
-  printTensorValues(res4_half);
-}
-
-void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
-                       int stride_h, int stride_w, int skip_every,
-                       std::string filter_string,
-                       UnitTestResults &unitTestResults) {
-
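-  // For each sampling offset (0 and 1): compute the exact convolution, the
-  // FP32/FP16 baselines, and the sampled GPU/CPU/FP16 variants, then compare
-  // each approximate result against the tensorConvSampSim2 reference.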
-  float interpolation_rate = 1.0;
-  for (int offset = 0; offset < 2; offset++) {
-
-    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
-           "skip_every = %d offset = %d interpolation_rate = %f \n\n",
-           pad_h, pad_w, stride_h, stride_w, skip_every, offset,
-           interpolation_rate);
-
-    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
-                                        stride_w, 1, 1);
-
-    printf("tensorConvolution Result :");
-    printTensorValues(res_exact);
-
-    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
-                                        stride_w, 1, 1, 1, 1, 1, 1);
-
-    printf("\nBaseline Result :");
-    printTensorValues(res_exact2);
-
-    void *res_exact3 = tensorConvApproxHalf2(
-        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
-    convertToFP32((struct Tensor *)res_exact3);
-
-    printf("\nFP16_Baseline Result :");
-    printTensorValues(res_exact3);
-
-    void *res_sim =
-        tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
-                           1, skip_every, offset, interpolation_rate);
-
-    printf("\nConvSampSim Result :");
-    printTensorValues(res_sim);
-
-    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
-                                 stride_w, 1, 1, 1, 1, skip_every, offset);
-
-    printf("\nConvApprox Result :");
-    printTensorValues(res);
-
-    hpvm_request_tensor(input, HOST);
-    hpvm_request_tensor(filter, HOST);
-
-    void *res_cpu =
-        tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
-                            1, 1, 1, skip_every, offset);
-
-    printf("\nConvApproxCPU Result :");
-    printTensorValues(res_cpu);
-
-    void *res_half =
-        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
-                              1, 1, 1, 1, skip_every, offset);
-
-    convertToFP32((struct Tensor *)res_half);
-
-    printf("\nConvApproxHalf2 Result :");
-    printTensorValues(res_half);
-
-    std::string suffix =
-        "filter = " + std::string(filter_string) + std::string(" pad_h = ") +
-        std::to_string(pad_h) + std::string(" pad_w = ") +
-        std::to_string(pad_w) + std::string(" stride_h = ") +
-        std::to_string(stride_h) + std::string(" stride_w = ") +
-        std::to_string(stride_w) + std::string(" skip_every = ") +
-        std::to_string(skip_every) + std::string(" offset = ") +
-        std::to_string(offset);
-
-    std::string test_name = std::string("SAMP_FP32 ") + suffix;
-
-    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05,
-                                   test_name);
-
-    std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix;
-    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.1,
-                                   fp16_test_name);
-
-    std::string cpu_test_name = std::string("SAMP_CPU ") + suffix;
-    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
-                                   cpu_test_name);
-  }
-
-  printf("\n\n\n --- End of Test \n\n\n");
-}
-
-/**** Tests sampling with a 3 * 3 filter */
-void testSampling_3_3(UnitTestResults &unitTestResults) {
-
-  printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
-  Tensor *input =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
-  fillTensorWithVal(input, 1);
-  // fillWithOnesAndTwos(input);
-
-  Tensor *filter =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
-  fillTensorWithVal(filter, 1);
-
-  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
-  host_ptr[0] = 10;
-  host_ptr[2] = 2;
-  host_ptr[4] = 2;
-  host_ptr[6] = 2;
-  host_ptr[8] = 2;
-  host_ptr[10] = 2;
-  host_ptr[12] = 2;
-  host_ptr[14] = 2;
-  host_ptr[16] = 2;
-  host_ptr[18] = 2;
-  host_ptr[20] = 2;
-  host_ptr[22] = 2;
-  host_ptr[24] = 2;
-  host_ptr[26] = 10;
-
-  // Tests with padding = 0 stride = 1
-  testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "3_3", unitTestResults);
-
-  testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "3_3", unitTestResults);
-
-  testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "3_3", unitTestResults);
-
-  // Tests with padding = 1 stride = 1
-  testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "3_3", unitTestResults);
-
-  testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "3_3", unitTestResults);
-
-  testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "3_3", unitTestResults);
-
-  // Tests with padding = 1 stride = 2
-  testSamplingCalls(input, filter, 1, 1, 2, 2, 2, "3_3", unitTestResults);
-
-  testSamplingCalls(input, filter, 1, 1, 2, 2, 3, "3_3", unitTestResults);
-
-  testSamplingCalls(input, filter, 1, 1, 2, 2, 4, "3_3", unitTestResults);
-}
-
-/**** Tests sampling with a 1 * 1 filter */
-void testSampling_1_1(UnitTestResults &unitTestResults) {
-
-  Tensor *input =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2);
-  fillTensorWithVal(input, 2);
-  // fillWithOnesAndTwos(input);
-
-  Tensor *filter =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1);
-  fillTensorWithVal(filter, 2);
-
-  // Tests with padding = 0 stride = 1
-  testSamplingCalls(input, filter, 0, 0, 1, 1, 2, "1_1", unitTestResults);
-
-  testSamplingCalls(input, filter, 0, 0, 1, 1, 3, "1_1", unitTestResults);
-
-  testSamplingCalls(input, filter, 0, 0, 1, 1, 4, "1_1", unitTestResults);
-
-  // Tests with padding = 1 stride = 1
-  testSamplingCalls(input, filter, 1, 1, 1, 1, 2, "1_1", unitTestResults);
-
-  testSamplingCalls(input, filter, 1, 1, 1, 1, 3, "1_1", unitTestResults);
-
-  testSamplingCalls(input, filter, 1, 1, 1, 1, 4, "1_1", unitTestResults);
-}
-
-void *testTensorArgMax() {
-
-  Tensor *input =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1);
-
-  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
-
-  // Input 0
-  host_ptr[0] = 1;
-  host_ptr[1] = 7; // highest - max index = 1
-  host_ptr[2] = 3;
-
-  // Input 1
-  host_ptr[3] = 3;
-  host_ptr[4] = 3;
-  host_ptr[5] = 8; // highest - max index = 2
-
-  // Input 2
-  host_ptr[6] = 2;
-  host_ptr[7] = 5;
-  host_ptr[8] = 9; // highest - max index = 2
-
-  // Input 3
-  host_ptr[9] = 11; // highest - max index = 0
-  host_ptr[10] = 2;
-  host_ptr[11] = 8;
-
-  void *argmax_out = tensorArgMax(input);
-
-  // Expected argmax output (one index per input, printed below):
-  //   1    2    2    0
-  printTensorValues(argmax_out);
-
-  return argmax_out;
-}
-
-void *testTensorSelect(void *argmax_out) {
-
-  void *select_out = tensorSelect(argmax_out, 2);
-  printf("***** tensorSelect output \n");
-
-  printTensorValues(select_out);
-
-  return select_out;
-}
-
-void testTensorContract(void *select_out) {
-
-  Tensor *input =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1);
-  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
-
-  // Input 0
-  host_ptr[0] = 1;
-  host_ptr[1] = 1;
-  host_ptr[2] = 1;
-  host_ptr[3] = 1;
-
-  // Input 1
-  host_ptr[4] = 2;
-  host_ptr[5] = 2;
-  host_ptr[6] = 2;
-  host_ptr[7] = 2;
-
-  // Input 2
-  host_ptr[8] = 3;
-  host_ptr[9] = 3;
-  host_ptr[10] = 3;
-  host_ptr[11] = 3;
-
-  // Input 3
-  host_ptr[12] = 4;
-  host_ptr[13] = 4;
-  host_ptr[14] = 4;
-  host_ptr[15] = 4;
-
-  void *contract_out = tensorContract(input, select_out);
-  printf("***** tensorContract output \n");
-
-  printTensorValues(contract_out);
-}
-
-void testNewTensorOps() {
-
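-  // Chains the three new tensor ops end-to-end:
-  // tensorArgMax -> tensorSelect -> tensorContract.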
-  void *argmax_out = testTensorArgMax();
-  void *select_out = testTensorSelect(argmax_out);
-  testTensorContract(select_out);
-}
-
-int main() {
-
-  llvm_hpvm_initTensorRt(0);
-
-  UnitTestResults unitTestResults;
-
-  // Function call per unit test
-  testTensorHgemm(unitTestResults);
-  testTensorSgemm(unitTestResults);
-
-  /*
-  testTensorConv();
-  testTensorHalfConv();
-
-  testTensorGroupConv();
-  testTensorHalfGroupConv();
-
-  testTensorBatchNorm();
-  testTensorHalfBatchNorm();
-
-  testTensorPooling();
-  testTensorHalfPooling();
-
-  */
-
-  testSampling_3_3(unitTestResults);
-  testSampling_1_1(unitTestResults);
-
-  testPerforation(unitTestResults);
-
-  unitTestResults.printSummary();
-
-  // testTensorError();
-  // testQuantization();
-  // testTensorGemm();
-  // testTensorGemmGPU();
-  // testTensorGemmBias();
-  // testTensorConv2();
-  // testTensorConv3();
-  // testLRN();
-  // testSampleFilter();
-  // testNewTensorOps();
-  // testQuantization();
-  // testPromiseError();
-
-  return 0;
-}