diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index ff4f28aed4e0c4986a9249bb26937f43e351b607..0000000000000000000000000000000000000000
--- a/.gitattributes
+++ /dev/null
@@ -1 +0,0 @@
-hpvm/test/dnn_benchmarks/model_params/**/*.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index d2c654528dbfbcf8065eaff49f07627902f20615..605cade94d773b32c3d3c79d2340e247aad51886 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,20 +1,3 @@
-<<<<<<< HEAD
@@ -22,4 +5,143 @@ hpvm/llvm-*.src.tar.xz
->>>>>>> a9f09dd1d7c769b6e9bef9dca334a9c0761f2136
+# Below is taken from Python.gitignore: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
+# Byte-compiled / optimized / DLL files
+# C extensions
+# Distribution / packaging
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+# Installer logs
+# Unit test / coverage reports
+# Translations
+# Django stuff:
+# Flask stuff:
+# Scrapy stuff:
+# Sphinx documentation
+# PyBuilder
+# Jupyter Notebook
+# IPython
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+# Celery stuff
+# SageMath parsed files
+# Environments
+# Spyder project settings
+# Rope project settings
+# mkdocs documentation
+# mypy
+# Pyre type checker
+# pytype static type analyzer
+# Cython debug symbols
diff --git a/README.md b/README.md
index 5c37358a49b45335d04b30d13ae4caf2a6e729c8..d1a28fae2ee717a765ab80ee9eef34b1cc0cb738 100644
--- a/README.md
+++ b/README.md
@@ -31,10 +31,18 @@ The following components are required to be installed on your machine to build H
   * In addition, each version of CUDA-nvcc requires GCC to be not newer than a certain version.
     See [here](https://gist.github.com/ax3l/9489132) for the support matrix.
 * CMake (>=3.17)
-* Python (>=3.7) with Pip
 * GNU Make (>=3.79)
 * OpenCL (>=1.0.0)
 * CUDA (>=9.1)
+* Python (==3.6) with pip (>=20)
+Python must be strictly 3.6 (any patch release from 3.6.0 through 3.6.13).
+Alternatively, if you use Anaconda for package management,
+we provide a conda environment file that covers all Python and Python package requirements:
+conda env create -n hpvm -f hpvm/env.yaml
 ## Supported Targets
@@ -65,8 +73,8 @@ HPVM has not been tested but might work on other CPUs supported by LLVM Backend,
 Checkout HPVM and go to directory `./hpvm` under project root:
-git clone --recursive -b approx_hpvm_reorg_keras --single-branch https://gitlab.engr.illinois.edu/llvm/hpvm.git
-cd hpvm/hpvm/
+git clone --recursive -b approx_hpvm_reorg --single-branch https://gitlab.engr.illinois.edu/llvm/hpvm.git
+cd hpvm/
 HPVM needs to be able to find CUDA.
diff --git a/hpvm/CMakeLists.txt b/hpvm/CMakeLists.txt
index a882be7d0ab9468ae7e1bb66912590b0f3ed58a5..809a30cfa52e16f436dac4e22843f4c5a3add3d9 100644
--- a/hpvm/CMakeLists.txt
+++ b/hpvm/CMakeLists.txt
+# find_package will use the auxiliary cmake/Find*.cmake we provide
 # Generate TENSOR_RT_PREFIX into config.h
diff --git a/hpvm/cmake/TestFile.cmake b/hpvm/cmake/TestFile.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..fda27adb0863507130c9be8fa0d394fce28a4655
--- /dev/null
+++ b/hpvm/cmake/TestFile.cmake
@@ -0,0 +1,97 @@
+# This file is taken from llvm-testsuite:
+# https://github.com/llvm/llvm-test-suite/blob/main/cmake/modules/TestFile.cmake
+##===- TestSuite.cmake ----------------------------------------------------===##
+# Defines helper functions to create .test files that describe how to run a
+# benchmark and optionally how to prepare before running, how to verify the
+# results and how to extract metrics from the output.
+# Specify a "RUN: " line to be put in a .test file. See also llvm_add_test().
+  # If no executable is specified use $EXECUTABLE$ placeholder which will be
+  # replaced later.
+  endif()
+  endif()
+  endif()
+  # ARGS_UNPARSED_ARGUMENTS is a semicolon-separated list. Change it into a
+  # whitespace-separated string.
+  endif()
+# Specify a "VERIFY: " line to be put in a .test file. See also llvm_add_test().
+  endif()
+  # ARGS_UNPARSED_ARGUMENTS is a semicolon-separated list. Change it into a
+  # whitespace-separated string.
+  endif()
+  endif()
+# Specify a "PREPARE: " line to be put in a .test file. See also
+# llvm_add_test().
+  endif()
+  # ARGS_UNPARSED_ARGUMENTS is a semicolon-separated list. Change it into a
+  # whitespace-separated string.
+  endif()
+  endif()
+# Specify a "METRIC: " line to be put in a .test file. See also llvm_add_test().
+  endif()
+  # ARGS_UNPARSED_ARGUMENTS is a semicolon-separated list. Change it into a
+  # whitespace-separated string.
+  endif()
+# Create a .test driver file suitable for LIT.
+# The file contents come from previous llvm_test_XXX() calls.
+function(llvm_add_test testfile executable)
+  # Replace $EXECUTABLE$ placeholder.
+  string(REPLACE "$EXECUTABLE$" "${executable}" TESTSCRIPT "${TESTSCRIPT}")
+  # Produce .test file
+  # flush the test script
+function(llvm_add_test_for_target target)
+  llvm_add_test($<TARGET_FILE:${target}>.test $<TARGET_FILE:${target}>)
\ No newline at end of file
diff --git a/hpvm/env.yaml b/hpvm/env.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d2dcaa3de27105ac07efd81cfdf9a37655c33af7
--- /dev/null
+++ b/hpvm/env.yaml
@@ -0,0 +1,26 @@
+name: hpvm
+  - pytorch
+  - defaults
+  - jinja2=2.11
+  - jsonpickle=2
+  - keras==2.1.6
+  - matplotlib=3.3
+  - networkx=2.5
+  - pandas=1.1
+  - python==3.6.13
+  - pip
+  - pytorch==1.6.0
+  - torchvision=0.8
+  - tqdm=4.59
+  - scipy==1.1.0
+  - h5py==2.10.0
+  - pip:
+    - argparse==1.4
+    - onnx==1.8
+    - onnx-simplifier==0.3
+    - opentuner==0.8.3
+    - sqlalchemy==1.3.0
+    - tensorflow==1.14.0
+    - tensorflow-gpu==1.14.0
diff --git a/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp b/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
index 9b6ca06f631104d5d65711495e18f64babbcf6e7..5117cc23d30a7392ee53107e63e7c2d13a4f9692 100644
--- a/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
+++ b/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
@@ -9,14 +9,12 @@
 // This pass is uses fuses HPVM nodes based on the tensor operations contained
 // the nodes. This helps create the groundwork for indicating to the compiler
-// that a set of tensor operations in a node are fusionable and it can have 
+// that a set of tensor operations in a node are fusionable and it can have
 // implications on performance and energy consumption of set of tensor
 // operations in question.
 #define DEBUG_TYPE "FuseTensorNodes"
 #include "llvm/IR/ValueMap.h"
@@ -141,6 +139,9 @@ static DFNode *findNextNodeInSequence(DFNode *SrcN) {
+  if (!DstN)
+    return NULL;
   // If we reach this point, DstN is the unique successor of SrcN
   // Now, test that the DstN has a single predeccessor except Root (dummy)
@@ -672,7 +673,9 @@ void FuseHPVMTensorNodes::updateParentNodeFunction(IntrinsicInst *II1,
     } break;
     case Intrinsic::hpvm_bind_output: {
-      assert(false && "Source node of node fusion not expected in bind.out\n");
+      // Replace BindOut node argument with fused function node.
+      II->setArgOperand(0, IInew);
     } break;
       llvm_unreachable("Unknown use of HPVM createNode handle\n");
@@ -822,7 +825,7 @@ void FindFusionTargetsTraversal::codeGen(DFLeafNode *N) {
   errs() << "THIS IS NOT A DUMMY NODE\n";
   errs() << "INTRINSIC: " << *isValidHPVMTensorNode(N) << "\n";
-  if(!preferredTargetIncludes(N, hpvm::TENSOR_TARGET)) {
+  if (!preferredTargetIncludes(N, hpvm::TENSOR_TARGET)) {
     // Only fuse if we plan to target PROMISE/Layers API
     // The CUDNN backend would be able to generate calls for the fused node,
     // but not the other way around
@@ -987,13 +990,24 @@ bool FuseHPVMTensorNodesWrapper::runOnModule(Module &M) {
   FindFusionTargetsTraversal *FTTVisitor =
       new FindFusionTargetsTraversal(M, DFG);
+  // Visit each DFG only once
+  std::set<Function *> Visited;
   errs() << "Find targets\n";
   // Iterate over all the DFGs and produce code for each one of them
   for (auto rootNode : Roots) {
+    Function *rootFunc = rootNode->getFuncPointer();
+    if (Visited.find(rootFunc) != Visited.end())
+      continue;
     // Initiate code generation for root DFNode
+    Visited.insert(rootFunc);
+  errs() << "Finished visiting DFGs ...\n";
   FuseHPVMTensorNodes::FusionTargets &FTs = FTTVisitor->getFusionTargets();
   FuseHPVMTensorNodes Fuse;
diff --git a/hpvm/projects/gpu_profiler/.gitignore b/hpvm/projects/gpu_profiler/.gitignore
deleted file mode 100644
index dd2c293453382269c150c372d926f287a74edea5..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
diff --git a/hpvm/projects/gpu_profiler/CMakeLists.txt b/hpvm/projects/gpu_profiler/CMakeLists.txt
deleted file mode 100644
index c6cf3041eee354609b3999e5a8dcd424990f75ec..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-cmake_minimum_required(VERSION 3.5)
-set(libsrc src/profiler.cpp)
-add_library(gpu_profiler STATIC ${libsrc})
-target_include_directories(gpu_profiler PRIVATE include)
diff --git a/hpvm/projects/gpu_profiler/Makefile b/hpvm/projects/gpu_profiler/Makefile
deleted file mode 100644
index 412d38265ab5c9408d4ac444ded9d6bd8b72f1b7..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-	g++ -std=c++11 -O3 profiler.cpp -o pp -lpthread
-	rm -rf pp
diff --git a/hpvm/projects/gpu_profiler/README.md b/hpvm/projects/gpu_profiler/README.md
deleted file mode 100644
index 59891da04cf3d28c9b854230255a4d935ab40724..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Tegra GPU Profiler
-## Build
-mkdir lib
-cmake ../
diff --git a/hpvm/projects/gpu_profiler/offline_profiler.cpp b/hpvm/projects/gpu_profiler/offline_profiler.cpp
deleted file mode 100644
index 6b9f37ef62cc2c8600d11474100f27873bc36d7a..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/offline_profiler.cpp
+++ /dev/null
@@ -1,595 +0,0 @@
-#include <cmath>
-#include <chrono>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <boost/algorithm/string.hpp>
-#include <vector>
-#include <map>
-#include <thread>
-#include <atomic>
-#include <sched.h>
-#define NUM_ARGS 4
-// This is a simple power profiler that can sample the power of the various
-// components in a Jetson TX2. The usage is simple: profile() measures power
-// for the specified program, and then dumpOutput() prints the readings to a
-// file. profile() can be called as many times as desired - the internal state
-// is reset each time and thus the measurements are not cumulative.
-class Profiler {
-    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
-    // we can't use them.
-    const unsigned core0 = 0;
-    const unsigned core1 = 3;
-    const unsigned core2 = 4;
-    const unsigned core3 = 5;
-    // sysfs paths for i2c buses of various components
-    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
-    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
-    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
-    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
-    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";
-    // It takes some time for the GPU's power to return to idle (ms)
-    const unsigned gpu_idle_time = 0;
-    // An individual power reading
-    struct PowerReading {
-        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
-        double cpu_;
-        double gpu_;
-        double ddr_;
-        double soc_;
-        double sys_;
-    };
-    // Individual tensor op
-    struct TensorOp {
-        std::string name_;
-        double start_;
-        double finish_;
-        double time_;
-        double energy_;
-        double gpu_energy_;
-        double ddr_energy_;
-        double power_;
-        double gpu_power_;
-        double ddr_power_;
-        TensorOp(std::string name, double start, double finish)
-            : name_(name), start_(start), finish_(finish), time_(finish - start),
-            energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
-            power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {
-        }
-    };
-    // Aggregate tensor info
-    struct AggTensorInfo {
-        // Op name
-        std::string name_;
-        // Averages
-        double average_time_;
-        double average_energy_;
-        double average_gpu_energy_;
-        double average_ddr_energy_;
-        double average_power_;
-        double average_gpu_power_;
-        double average_ddr_power_;
-        // Standard deviations
-        double time_std_;
-        double energy_std_;
-        double gpu_energy_std_;
-        double ddr_energy_std_;
-        double power_std_;
-        double gpu_power_std_;
-        double ddr_power_std_;
-    };
-    // Total time, energy, and power
-    struct TotalInfo {
-        double time_;
-        double energy_;
-        double gpu_energy_;
-        double ddr_energy_;
-        double power_;
-        double gpu_power_;
-        double ddr_power_;
-        void clear() {
-            time_ = 0.0;
-            energy_ = 0.0;
-            gpu_energy_ = 0.0;
-            ddr_energy_ = 0.0;
-            power_ = 0.0;
-            gpu_power_ = 0.0;
-            ddr_power_ = 0.0;
-        }
-    };
-    // For reading the i2c buses via sysfs
-    std::ifstream cpu_stream_;
-    std::ifstream gpu_stream_;
-    std::ifstream ddr_stream_;
-    std::ifstream soc_stream_;
-    std::ifstream sys_stream_;
-    // Start time (so graph begins from t=0)
-    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;
-    // Per-run info
-    std::vector<PowerReading> power_readings_;
-    // Aggregate (across all runs) info
-    std::map<std::string, std::vector<TensorOp>> tensor_info_;
-    std::vector<AggTensorInfo> agg_tensor_info_;
-    TotalInfo total_info_;
-    unsigned iterations_;
-    // Start and stop flags to synchronize the program and profiling threads
-    std::atomic_bool start_;
-    std::atomic_bool stop_;
-    // Resets tensor info and total time and energy
-    void resetGlobal() {
-        tensor_info_.clear();
-        agg_tensor_info_.clear();
-        total_info_.clear();
-    }
-    // Resets power readings and flags
-    void resetLocal() {
-        power_readings_.clear();
-        start_ = false;
-        stop_ = false;
-    }
-    // Pins the given thread to the specified core
-    void pinThread(std::thread &t, const unsigned core) const {
-        cpu_set_t cpuset;
-        CPU_ZERO(&cpuset);
-        CPU_SET(core, &cpuset);
-        if (pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset) != 0)
-            std::cout << "Couldn't set thread affinity\n";
-    }
-    // Adds a tensor op to the map
-    void addTensorOp(std::string &op_name, TensorOp &top) {
-        // Create a vector if this is the first entry
-        auto it = tensor_info_.find(op_name);
-        if (it == tensor_info_.end()) {
-            tensor_info_.insert(std::pair<std::string, std::vector<TensorOp>>(op_name, std::vector<TensorOp>()));
-        }
-        tensor_info_[op_name].push_back(top);
-    }
-    // Obtain's a single power reading from the GPU and DDR rails
-    void getPowerReading() {
-        PowerReading reading;
-        // The order matters here. All the reads have to happen together first
-        // and then all the seeks have to happen together at the end, otherwise
-        // there will be a significant time difference between the readings of
-        // the different rails.
-        reading.time_ = std::chrono::high_resolution_clock::now();
-        gpu_stream_ >> reading.gpu_;
-        ddr_stream_ >> reading.ddr_;
-        power_readings_.push_back(reading);
-        // Reset the input position of the files
-        gpu_stream_.seekg(0);
-        ddr_stream_.seekg(0);
-    }
-    // Executes the program to be profiled
-    void runProgram(const std::string& program) {
-        // Tell the profiling thread to start, execute the program that needs
-        // to be profiled, and then tell the profiling thread to stop.
-        start_ = true;
-        const auto result = std::system(program.c_str());
-        stop_ = true;
-    }
-    // Records power while the program is running
-    void recordPower() {
-        // Obtain the new start time, wait for the start signal, and keep
-        // profiling until the stop flag is set.
-        start_time_ = std::chrono::high_resolution_clock::now();
-        while (!start_);
-        while (!stop_)
-            getPowerReading();
-    }
-    // Calculates stats for the entire execution (CPU+GPU phase)
-    void updateTotalStats() {
-        double energy = 0.0;
-        double gpu_energy = 0.0;
-        double ddr_energy = 0.0;
-        std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_;
-        for (auto reading : power_readings_) {
-            std::chrono::duration<double> duration = reading.time_ - prev_time;
-            gpu_energy += reading.gpu_ * duration.count();
-            ddr_energy += reading.ddr_ * duration.count();
-            prev_time = reading.time_;
-        }
-        energy = gpu_energy + ddr_energy;
-        auto time = std::chrono::duration<double>(prev_time - start_time_).count();
-        total_info_.time_ += time;
-        total_info_.energy_ += (gpu_energy + ddr_energy);
-        total_info_.gpu_energy_ += gpu_energy;
-        total_info_.ddr_energy_ += ddr_energy;
-        total_info_.power_ += (energy / time);
-        total_info_.gpu_power_ += (gpu_energy / time);
-        total_info_.ddr_power_ += (ddr_energy / time);
-    }
-    // Calculates energy and power usage of the given tensor operation
-    void calculateTensorEP(TensorOp &top) const {
-        auto prev_time = top.start_;
-        unsigned i = 0;
-        // Skip until we hit the start time of the operation
-        for (; std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count() < top.start_; i++);
-        // Keep going until we hit the finish time of the operation or we run out of readings
-        for (double curr_time; ((curr_time = std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count()) <= top.finish_)
-                && (i < power_readings_.size()); i++) {
-            auto duration = curr_time - prev_time;
-            prev_time = curr_time;
-            top.gpu_energy_ += power_readings_[i].gpu_ * duration;
-            top.ddr_energy_ += power_readings_[i].ddr_ * duration;
-        }
-        top.energy_ = top.gpu_energy_ + top.ddr_energy_;
-        top.power_ = top.energy_ / top.time_;
-        top.gpu_power_ = top.gpu_energy_ / top.time_;
-        top.ddr_power_ = top.ddr_energy_ / top.time_;
-    }
-    // Calculates stats for all the tensors in the timestamp file
-    void updatePerOpStats() {
-        const char * const op_file = "profile_data.txt";
-        std::string line;
-        std::ifstream ifs(op_file, std::ios::in);
-        // Calculate time and energy for each tensor operation. There are two
-        // possibilities for the file format:
-        // If the line doesn't begin with #, we are looking at FP32 code
-        // without any conversions to/from FP16, and each operation occupies
-        // two consecutive lines in the timestamp file.
-        // If the line does begin with #, we are looking at FP16 code with
-        // conversion routines in the middle. In this case, *after* the current
-        // line, there will be two lines for F2H, two lines for H2F, and then
-        // one line for the end of the operation.
-        while (std::getline(ifs, line)) {
-            std::vector<std::string> tokens;
-            boost::split(tokens, line, boost::is_any_of("\t"));
-            std::string op_name = tokens[0];
-            // FP32
-            if (tokens[0][0] != '#') {
-                // First line with tensor op name and start time
-                std::string op_name = tokens[0];
-                const auto start = std::stod(tokens[1]);
-                // Second line with tensor op end time
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto finish = std::stod(tokens[1]);
-                TensorOp top(op_name, start, finish);
-                calculateTensorEP(top);
-                addTensorOp(op_name, top);
-            } else {
-                // First line with tensor op name and start time
-                std::string op_name = tokens[0].substr(1);
-                const auto start = std::stod(tokens[1]);
-                // Second line with f2h
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                std::string f2h_name = op_name + "_f2h";
-                const auto f2h_start = std::stod(tokens[1]);
-                // Third line with f2h
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto f2h_finish = std::stod(tokens[1]);
-                // Add f2h
-                TensorOp f2h(f2h_name, f2h_start, f2h_finish);
-                calculateTensorEP(f2h);
-                addTensorOp(f2h_name, f2h);
-                // Fourth line with h2f
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                std::string h2f_name = op_name + "_h2f";
-                const auto h2f_start = std::stod(tokens[1]);
-                // Fifth line with h2f
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto h2f_finish = std::stod(tokens[1]);
-                // Add h2f
-                TensorOp h2f(h2f_name, h2f_start, h2f_finish);
-                calculateTensorEP(h2f);
-                addTensorOp(h2f_name, h2f);
-                // Sixth and final line with tensor op end time
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto finish = std::stod(tokens[1]);
-                // Subtract f2h's and h2f's time and energy to get just the computation's info
-                TensorOp top(op_name, start, finish);
-                calculateTensorEP(top);
-                top.time_ -= (f2h.time_ + h2f.time_);
-                top.energy_ -= (f2h.energy_ + h2f.energy_);
-                top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
-                top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
-                top.power_ = top.energy_ / top.time_;
-                top.gpu_power_ = top.gpu_energy_ / top.time_;
-                top.ddr_power_ = top.ddr_energy_ / top.time_;
-                addTensorOp(op_name, top);
-            }
-        }
-        ifs.close();
-    }
-    void updateStats() {
-        updatePerOpStats();
-        updateTotalStats();
-    }
-    // Calculates the average and standard deviation of each metric of each tensor op
-    void calculateAggregateStats() {
-        for (auto it = tensor_info_.begin(); it != tensor_info_.end(); it++) {
-            AggTensorInfo ati;
-            ati.name_ = it->first;
-            auto topv = it->second;
-            double total_time = 0.0;
-            double total_energy = 0.0;
-            double total_gpu_energy = 0.0;
-            double total_ddr_energy = 0.0;
-            double total_power = 0.0;
-            double total_gpu_power = 0.0;
-            double total_ddr_power = 0.0;
-            double time_sum = 0.0;
-            double energy_sum = 0.0;
-            double gpu_energy_sum = 0.0;
-            double ddr_energy_sum = 0.0;
-            double power_sum = 0.0;
-            double gpu_power_sum = 0.0;
-            double ddr_power_sum = 0.0;
-            // Calculate average
-            for (const auto &top : topv) {
-                total_time += top.time_;
-                total_energy += top.energy_;
-                total_gpu_energy += top.gpu_energy_;
-                total_ddr_energy += top.ddr_energy_;
-                total_power += top.power_;
-                total_gpu_power += top.gpu_power_;
-                total_ddr_power += top.ddr_power_;
-            }
-            ati.average_time_ = total_time / iterations_;
-            ati.average_energy_ = total_energy / iterations_;
-            ati.average_gpu_energy_ = total_gpu_energy / iterations_;
-            ati.average_ddr_energy_ = total_ddr_energy / iterations_;
-            ati.average_power_ = total_power / iterations_;
-            ati.average_gpu_power_ = total_gpu_power / iterations_;
-            ati.average_ddr_power_ = total_ddr_power / iterations_;
-            // Calculate standard deviation
-            for (const auto &top : topv) {
-                auto time_diff = top.time_ - ati.average_time_;
-                time_sum += time_diff * time_diff;
-                auto energy_diff = top.energy_ - ati.average_energy_;
-                energy_sum += energy_diff * energy_diff;
-                auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
-                gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
-                auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
-                ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;
-                auto power_diff = top.power_ - ati.average_power_;
-                power_sum += power_diff * power_diff;
-                auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
-                gpu_power_sum += gpu_power_diff * gpu_power_diff;
-                auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
-                ddr_power_sum += ddr_power_diff * ddr_power_diff;
-            }
-            ati.time_std_ = std::sqrt(time_sum / iterations_);
-            ati.energy_std_ = std::sqrt(energy_sum / iterations_);
-            ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
-            ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
-            ati.power_std_ = std::sqrt(power_sum / iterations_);
-            ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
-            ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);
-            agg_tensor_info_.push_back(ati);
-        }
-    }
-    Profiler() {
-        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
-        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
-        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
-        soc_stream_.open(soc_power_rail, std::ifstream::in);
-        sys_stream_.open(sys_power_rail, std::ifstream::in);
-        if (!cpu_stream_.is_open() or !gpu_stream_.is_open() or !ddr_stream_.is_open()
-            or !soc_stream_.is_open() or !sys_stream_.is_open()) {
-            std::cout << "Failed to open one of the power rails for reading\n";
-            exit(1);
-        }
-    }
-    ~Profiler() {
-        cpu_stream_.close();
-        gpu_stream_.close();
-        ddr_stream_.close();
-        soc_stream_.close();
-        sys_stream_.close();
-    }
-    void profile(const std::string& program, const int iterations) {
-        iterations_ = iterations;
-        resetGlobal();
-        for (unsigned i = 0; i < iterations_; i++) {
-            resetLocal();
-            // Launch two threads: one for running the program and one for
-            // profiling it. Pin the threads to specific cores to remove migration
-            // overhead. Profiling showed that the sampling rate increases slightly
-            // with pinning.
-            std::thread prog(&Profiler::runProgram, this, program);
-            std::thread power(&Profiler::recordPower, this);
-            pinThread(prog, core1);
-            pinThread(power, core2);
-            prog.join();
-            power.join();
-            updateStats();
-            // Sleep for some time to bring the GPU back to idle
-            std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
-        }
-        calculateAggregateStats();
-    }
-    void dumpTensorInfo(const char * const filename) const {
-        std::cout<<"dumping to"<<filename<<'\n';
-        const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
-        std::ofstream ofs;
-        ofs.open(filename);
-        //ofs << header;
-        for (const auto &ati : agg_tensor_info_) {
-            ofs << ati.name_
-                << "," << ati.average_time_ * 1e3
-                << "," << ati.average_energy_
-                /*
-                << "," << ati.average_gpu_energy_
-                << "," << ati.average_ddr_energy_
-                << "," << ati.average_power_
-                << "," << ati.average_gpu_power_
-                << "," << ati.average_ddr_power_
-                << "," << ati.time_std_ * 1e3
-                << "," << ati.energy_std_
-                << "," << ati.gpu_energy_std_
-                << "," << ati.ddr_energy_std_
-                << "," << ati.power_std_
-                << "," << ati.gpu_power_std_
-                << "," << ati.ddr_power_std_*/
-                << "\n";
-            std::cout << ati.average_time_ * 1e3 << "," << ati.average_energy_ << "\n";
-        }
-        ofs.close();
-    }
-    void dumpPowerReadings(const char * const filename) const {
-        std::ofstream ofs;
-        ofs.open(filename);
-        for (const auto &reading : power_readings_) {
-            std::chrono::duration<double> duration = reading.time_ - start_time_;
-            //std::chrono::duration<double> duration = reading.time_.time_since_epoch();
-            ofs << std::to_string(duration.count())
-                << " " << reading.gpu_
-                << " " << reading.ddr_
-                << "\n";
-        }
-        ofs.close();
-    }
-    void dumpTotalInfo() const {
-        auto total_time = total_info_.time_ / iterations_;
-        auto total_energy = total_info_.energy_ / iterations_;
-        auto gpu_energy = total_info_.gpu_energy_ / iterations_;
-        auto ddr_energy = total_info_.ddr_energy_ / iterations_;
-        auto power = total_info_.power_ / iterations_;
-        auto gpu_power = total_info_.gpu_power_ / iterations_;
-        auto ddr_power = total_info_.ddr_power_ / iterations_;
-        std::cout << "-----------------------------------------------------\n";
-        std::cout << "Program info (average)\n";
-        std::cout << "-----------------------------------------------------\n";
-        std::cout << "\tExecution time: " << total_time << " seconds\n";
-        std::cout << "\tTotal energy:   " << total_energy << " mJ\n";
-        std::cout << "\t    GPU:        " << gpu_energy << " mJ\n";
-        std::cout << "\t    DDR:        " << ddr_energy << " mJ\n";
-        std::cout << "\tPower:          " << power << " mW\n";
-        std::cout << "\t    GPU:        " << gpu_power << " mW\n";
-        std::cout << "\t    DDR:        " << ddr_power << " mW\n";
-        std::cout << "-----------------------------------------------------\n";
-    }
-int main(int argc, char *argv[]) {
-    if (argc < NUM_ARGS) {
-        std::cout << "Usage: " << argv[0] << " <program> <params> END_PARAM <iterations> <tensor output file> [power output file]\n";
-        exit(1);
-    }
-    std::string program(argv[1]);
-    size_t i = 2;
-    for (; i < argc; i++){
-        if (std::string(argv[i]) == "END_PARAM"){
-            break;
-        }
-        program += " " + std::string(argv[i]);
-    }
-    i += 1;
-    Profiler pp;
-    pp.profile(program, std::stoi(argv[i]));
-    pp.dumpTensorInfo(argv[i + 1]);
-    if (argc > NUM_ARGS)
-        pp.dumpPowerReadings(argv[i + 2]);
-    return 0;
diff --git a/hpvm/projects/gpu_profiler/plot.sh b/hpvm/projects/gpu_profiler/plot.sh
deleted file mode 100755
index 8e4573b10c2fab993b4998d2040d10b0f7e9f9c5..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/plot.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-gnuplot -p << EOF
-    #set terminal png
-    #set output "$input.png"
-    set xlabel "Time (s)"
-    set ylabel "Power (mW)"
-    set title "Power usage of GPU and DDR over time"
-    plot "$input" using 1:2 title 'GPU' with lines,"$input" using 1:3 title 'DDR' with lines
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-1.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-1.pdf
deleted file mode 100644
index af57723b4091da6feffa9ef8f789698837b90bfa..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-1.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-2.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-2.pdf
deleted file mode 100644
index 57c5597c28e1028fa643bc5b03db8fc51d0f4b6b..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-2.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-3.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-3.pdf
deleted file mode 100644
index c7e0e3b2e7ff9d52c66b208321ecfa858ef5d9da..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-3.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-1.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-1.pdf
deleted file mode 100644
index 85bba9ee4c6dea2b1a7356d3847acb9aa5ea85aa..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-1.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-2.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-2.pdf
deleted file mode 100644
index a7ddb64b1e9e97f8ba93c52b38402dd2293725ef..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-2.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-3.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-3.pdf
deleted file mode 100644
index 5865ac69d6c5187fac0476f87d20e3a5154d516f..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-3.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-1.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-1.pdf
deleted file mode 100644
index f235128927672fdb46c54dd357cafb3c275a7144..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-1.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-2.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-2.pdf
deleted file mode 100644
index e80e4cde621b99ef074e1c97aadd191d6b9777c1..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-2.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-3.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-3.pdf
deleted file mode 100644
index 61f5d13e1cd040dbf7bdef058f34387f83a7df23..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-3.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-1.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-1.pdf
deleted file mode 100644
index fbd3ebd141c6eb0582496c85dcd5388a6a0bce7b..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-1.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-2.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-2.pdf
deleted file mode 100644
index f19c5a204ccc418c5af80c8953b9c28f39c3fd93..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-2.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-3.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-3.pdf
deleted file mode 100644
index 4ae009ab08d8368139c09a4f51e5cadc3623fefb..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-3.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-gpu-ddr.pdf b/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-gpu-ddr.pdf
deleted file mode 100644
index 5996b934a55fd90d9cd773d42e9cfa89429cab68..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/lenet/lenet-m3normal-gpu-ddr.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/merge1.pdf b/hpvm/projects/gpu_profiler/results/tests/merge1.pdf
deleted file mode 100644
index 5d3c3540c2e67aa7943f5993cfebf0cca40412fe..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/merge1.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/merge3.pdf b/hpvm/projects/gpu_profiler/results/tests/merge3.pdf
deleted file mode 100644
index c38e37e2d610175311fb3221fd5fa26892fdf1e8..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/merge3.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-1.pdf b/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-1.pdf
deleted file mode 100644
index 61bdcc890e0b5800ee4b3a8e19abdc724461a01b..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-1.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-2.pdf b/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-2.pdf
deleted file mode 100644
index bf80ebf1859640bdec386ef470fc04ada74ea822..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-2.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-3.pdf b/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-3.pdf
deleted file mode 100644
index 557bc295b6386ea73a0fcbd140de63a326f6aecd..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-3.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-tool.pdf b/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-tool.pdf
deleted file mode 100644
index d00032f35f77934432832cef4c00124327a14169..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-tool.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-1.pdf b/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-1.pdf
deleted file mode 100644
index 030d102341f58c57753d436554278c8d628137a8..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-1.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-2.pdf b/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-2.pdf
deleted file mode 100644
index e6f97075597d8080bf223a1fd1a35b8969b9b141..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-2.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-3.pdf b/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-3.pdf
deleted file mode 100644
index 6898f235ef85bd5148f5fcfdcf50b4f14ca2ab19..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/gpu_profiler/results/tests/mm300-gpu-3.pdf and /dev/null differ
diff --git a/hpvm/projects/gpu_profiler/run.pl b/hpvm/projects/gpu_profiler/run.pl
deleted file mode 100755
index 8674e63d9453fbb1e07371d99cf22c4745f234b3..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/run.pl
+++ /dev/null
@@ -1,64 +0,0 @@
-use strict;
-use warnings;
-my $time;
-my $iterations = 100;
-# FP32
-print "############### FP32 ##############\n";
-print "Running Lenet\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./lenet_tanh $iterations lenet-fp32.csv`;
-print "Running FC2\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./fc2_clipped $iterations fc2-fp32.csv`;
-print "Running FC3\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./fc3_clipped $iterations fc3-fp32.csv`;
-print "Running FC4\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./fc4_clipped $iterations fc4-fp32.csv`;
-print "Running CIFAR\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./cifar_keras $iterations cifar-fp32.csv`;
-# FP16
-print "############### FP16 ##############\n";
-print "Running Lenet\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./lenet_tanh_half $iterations lenet-fp16.csv`;
-print "Running FC2\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./fc2_half $iterations fc2-fp16.csv`;
-print "Running FC3\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./fc3_half $iterations fc3-fp16.csv`;
-print "Running FC4\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./fc4_half $iterations fc4-fp16.csv`;
-print "Running CIFAR\n";
-$time = `date`;
-print $time;
-`~/awesome_profiler/pp ./cifar_keras_half $iterations cifar-fp16.csv`;
diff --git a/hpvm/projects/gpu_profiler/run_dnns.pl b/hpvm/projects/gpu_profiler/run_dnns.pl
deleted file mode 100755
index 041f3e3cae8598d34ac8d38f65cd37d51e8aa0ba..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/run_dnns.pl
+++ /dev/null
@@ -1,29 +0,0 @@
-use strict;
-use warnings;
-my $time;
-my $iterations = 100;
-my @networks = ("alexnet", "alexnet2", "resnet18", "vgg16");
-# FP32
-print "############### FP32 ##############\n";
-foreach my $network (@networks) {
-    print "Running $network\n";
-    $time = `date`;
-    print $time;
-    `~/awesome_profiler/pp ./${network}_cifar10 $iterations ${network}_fp32.csv`;
-# FP16
-print "############### FP16 ##############\n";
-foreach my $network (@networks) {
-    print "Running $network\n";
-    $time = `date`;
-    print $time;
-    `~/awesome_profiler/pp ./${network}_cifar10_half $iterations ${network}_fp16.csv`;
diff --git a/hpvm/projects/gpu_profiler/run_image_pipelines.pl b/hpvm/projects/gpu_profiler/run_image_pipelines.pl
deleted file mode 100755
index 8e6df67d2e96d343cff3cc6a324693c14abaa3f3..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/run_image_pipelines.pl
+++ /dev/null
@@ -1,29 +0,0 @@
-use strict;
-use warnings;
-my $time;
-my $iterations = 100;
-my @pipelines = ("pipeline_GEMO", "pipeline_GEO", "pipeline_GEOM", "pipeline_GSM", "pipeline_GSME");
-# FP32
-print "############### FP32 ##############\n";
-foreach my $pipeline (@pipelines) {
-    print "Running $pipeline\n";
-    $time = `date`;
-    print $time;
-    `~/awesome_profiler/pp ./${pipeline} $iterations ${pipeline}_fp32.csv`;
-# FP16
-print "############### FP16 ##############\n";
-foreach my $pipeline (@pipelines) {
-    print "Running $pipeline\n";
-    $time = `date`;
-    print $time;
-    `~/awesome_profiler/pp ./${pipeline}_half $iterations ${pipeline}_fp16.csv`;
diff --git a/hpvm/projects/gpu_profiler/src/offline_profiler.cpp b/hpvm/projects/gpu_profiler/src/offline_profiler.cpp
deleted file mode 100644
index 25ca45241c29e7a0f8edb0518d8347a185caf5a4..0000000000000000000000000000000000000000
--- a/hpvm/projects/gpu_profiler/src/offline_profiler.cpp
+++ /dev/null
@@ -1,584 +0,0 @@
-#include <cmath>
-#include <chrono>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <boost/algorithm/string.hpp>
-#include <vector>
-#include <map>
-#include <thread>
-#include <atomic>
-#include <sched.h>
-#define NUM_ARGS 4
-// This is a simple power profiler that can sample the power of the various
-// components in a Jetson TX2. The usage is simple: profile() measures power
-// for the specified program, and then dumpOutput() prints the readings to a
-// file. profile() can be called as many times as desired - the internal state
-// is reset each time and thus the measurements are not cumulative.
-class Profiler {
-    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
-    // we can't use them.
-    const unsigned core0 = 0;
-    const unsigned core1 = 3;
-    const unsigned core2 = 4;
-    const unsigned core3 = 5;
-    // sysfs paths for i2c buses of various components
-    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
-    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
-    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
-    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
-    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";
-    // It takes some time for the GPU's power to return to idle (ms)
-    const unsigned gpu_idle_time = 0;
-    // An individual power reading
-    struct PowerReading {
-        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
-        double cpu_;
-        double gpu_;
-        double ddr_;
-        double soc_;
-        double sys_;
-    };
-    // Individual tensor op
-    struct TensorOp {
-        std::string name_;
-        double start_;
-        double finish_;
-        double time_;
-        double energy_;
-        double gpu_energy_;
-        double ddr_energy_;
-        double power_;
-        double gpu_power_;
-        double ddr_power_;
-        TensorOp(std::string name, double start, double finish)
-            : name_(name), start_(start), finish_(finish), time_(finish - start),
-            energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
-            power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {
-        }
-    };
-    // Aggregate tensor info
-    struct AggTensorInfo {
-        // Op name
-        std::string name_;
-        // Averages
-        double average_time_;
-        double average_energy_;
-        double average_gpu_energy_;
-        double average_ddr_energy_;
-        double average_power_;
-        double average_gpu_power_;
-        double average_ddr_power_;
-        // Standard deviations
-        double time_std_;
-        double energy_std_;
-        double gpu_energy_std_;
-        double ddr_energy_std_;
-        double power_std_;
-        double gpu_power_std_;
-        double ddr_power_std_;
-    };
-    // Total time, energy, and power
-    struct TotalInfo {
-        double time_;
-        double energy_;
-        double gpu_energy_;
-        double ddr_energy_;
-        double power_;
-        double gpu_power_;
-        double ddr_power_;
-        void clear() {
-            time_ = 0.0;
-            energy_ = 0.0;
-            gpu_energy_ = 0.0;
-            ddr_energy_ = 0.0;
-            power_ = 0.0;
-            gpu_power_ = 0.0;
-            ddr_power_ = 0.0;
-        }
-    };
-    // For reading the i2c buses via sysfs
-    std::ifstream cpu_stream_;
-    std::ifstream gpu_stream_;
-    std::ifstream ddr_stream_;
-    std::ifstream soc_stream_;
-    std::ifstream sys_stream_;
-    // Start time (so graph begins from t=0)
-    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;
-    // Per-run info
-    std::vector<PowerReading> power_readings_;
-    // Aggregate (across all runs) info
-    std::map<std::string, std::vector<TensorOp>> tensor_info_;
-    std::vector<AggTensorInfo> agg_tensor_info_;
-    TotalInfo total_info_;
-    unsigned iterations_;
-    // Start and stop flags to synchronize the program and profiling threads
-    std::atomic_bool start_;
-    std::atomic_bool stop_;
-    // Resets tensor info and total time and energy
-    void resetGlobal() {
-        tensor_info_.clear();
-        agg_tensor_info_.clear();
-        total_info_.clear();
-    }
-    // Resets power readings and flags
-    void resetLocal() {
-        power_readings_.clear();
-        start_ = false;
-        stop_ = false;
-    }
-    // Pins the given thread to the specified core
-    void pinThread(std::thread &t, const unsigned core) const {
-        cpu_set_t cpuset;
-        CPU_ZERO(&cpuset);
-        CPU_SET(core, &cpuset);
-        if (pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset) != 0)
-            std::cout << "Couldn't set thread affinity\n";
-    }
-    // Adds a tensor op to the map
-    void addTensorOp(std::string &op_name, TensorOp &top) {
-        // Create a vector if this is the first entry
-        auto it = tensor_info_.find(op_name);
-        if (it == tensor_info_.end()) {
-            tensor_info_.insert(std::pair<std::string, std::vector<TensorOp>>(op_name, std::vector<TensorOp>()));
-        }
-        tensor_info_[op_name].push_back(top);
-    }
-    // Obtain's a single power reading from the GPU and DDR rails
-    void getPowerReading() {
-        PowerReading reading;
-        // The order matters here. All the reads have to happen together first
-        // and then all the seeks have to happen together at the end, otherwise
-        // there will be a significant time difference between the readings of
-        // the different rails.
-        reading.time_ = std::chrono::high_resolution_clock::now();
-        gpu_stream_ >> reading.gpu_;
-        ddr_stream_ >> reading.ddr_;
-        power_readings_.push_back(reading);
-        // Reset the input position of the files
-        gpu_stream_.seekg(0);
-        ddr_stream_.seekg(0);
-    }
-    // Executes the program to be profiled
-    void runProgram(const char * const program) {
-        // Tell the profiling thread to start, execute the program that needs
-        // to be profiled, and then tell the profiling thread to stop.
-        start_ = true;
-        const auto result = std::system(program);
-        stop_ = true;
-    }
-    // Records power while the program is running
-    void recordPower() {
-        // Obtain the new start time, wait for the start signal, and keep
-        // profiling until the stop flag is set.
-        start_time_ = std::chrono::high_resolution_clock::now();
-        while (!start_);
-        while (!stop_)
-            getPowerReading();
-    }
-    // Calculates stats for the entire execution (CPU+GPU phase)
-    void updateTotalStats() {
-        double energy = 0.0;
-        double gpu_energy = 0.0;
-        double ddr_energy = 0.0;
-        std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_;
-        for (auto reading : power_readings_) {
-            std::chrono::duration<double> duration = reading.time_ - prev_time;
-            gpu_energy += reading.gpu_ * duration.count();
-            ddr_energy += reading.ddr_ * duration.count();
-            prev_time = reading.time_;
-        }
-        energy = gpu_energy + ddr_energy;
-        auto time = std::chrono::duration<double>(prev_time - start_time_).count();
-        total_info_.time_ += time;
-        total_info_.energy_ += (gpu_energy + ddr_energy);
-        total_info_.gpu_energy_ += gpu_energy;
-        total_info_.ddr_energy_ += ddr_energy;
-        total_info_.power_ += (energy / time);
-        total_info_.gpu_power_ += (gpu_energy / time);
-        total_info_.ddr_power_ += (ddr_energy / time);
-    }
-    // Calculates energy and power usage of the given tensor operation
-    void calculateTensorEP(TensorOp &top) const {
-        auto prev_time = top.start_;
-        unsigned i = 0;
-        // Skip until we hit the start time of the operation
-        for (; std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count() < top.start_; i++);
-        // Keep going until we hit the finish time of the operation or we run out of readings
-        for (double curr_time; ((curr_time = std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count()) <= top.finish_)
-                && (i < power_readings_.size()); i++) {
-            auto duration = curr_time - prev_time;
-            prev_time = curr_time;
-            top.gpu_energy_ += power_readings_[i].gpu_ * duration;
-            top.ddr_energy_ += power_readings_[i].ddr_ * duration;
-        }
-        top.energy_ = top.gpu_energy_ + top.ddr_energy_;
-        top.power_ = top.energy_ / top.time_;
-        top.gpu_power_ = top.gpu_energy_ / top.time_;
-        top.ddr_power_ = top.ddr_energy_ / top.time_;
-    }
-    // Calculates stats for all the tensors in the timestamp file
-    void updatePerOpStats() {
-        const char * const op_file = "profile_data.txt";
-        std::string line;
-        std::ifstream ifs(op_file, std::ios::in);
-        // Calculate time and energy for each tensor operation. There are two
-        // possibilities for the file format:
-        // If the line doesn't begin with #, we are looking at FP32 code
-        // without any conversions to/from FP16, and each operation occupies
-        // two consecutive lines in the timestamp file.
-        // If the line does begin with #, we are looking at FP16 code with
-        // conversion routines in the middle. In this case, *after* the current
-        // line, there will be two lines for F2H, two lines for H2F, and then
-        // one line for the end of the operation.
-        while (std::getline(ifs, line)) {
-            std::vector<std::string> tokens;
-            boost::split(tokens, line, boost::is_any_of("\t"));
-            std::string op_name = tokens[0];
-            // FP32
-            if (tokens[0][0] != '#') {
-                // First line with tensor op name and start time
-                std::string op_name = tokens[0];
-                const auto start = std::stod(tokens[1]);
-                // Second line with tensor op end time
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto finish = std::stod(tokens[1]);
-                TensorOp top(op_name, start, finish);
-                calculateTensorEP(top);
-                addTensorOp(op_name, top);
-            } else {
-                // First line with tensor op name and start time
-                std::string op_name = tokens[0].substr(1);
-                const auto start = std::stod(tokens[1]);
-                // Second line with f2h
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                std::string f2h_name = op_name + "_f2h";
-                const auto f2h_start = std::stod(tokens[1]);
-                // Third line with f2h
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto f2h_finish = std::stod(tokens[1]);
-                // Add f2h
-                TensorOp f2h(f2h_name, f2h_start, f2h_finish);
-                calculateTensorEP(f2h);
-                addTensorOp(f2h_name, f2h);
-                // Fourth line with h2f
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                std::string h2f_name = op_name + "_h2f";
-                const auto h2f_start = std::stod(tokens[1]);
-                // Fifth line with h2f
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto h2f_finish = std::stod(tokens[1]);
-                // Add h2f
-                TensorOp h2f(h2f_name, h2f_start, h2f_finish);
-                calculateTensorEP(h2f);
-                addTensorOp(h2f_name, h2f);
-                // Sixth and final line with tensor op end time
-                std::getline(ifs, line);
-                tokens.clear();
-                boost::split(tokens, line, boost::is_any_of("\t"));
-                const auto finish = std::stod(tokens[1]);
-                // Subtract f2h's and h2f's time and energy to get just the computation's info
-                TensorOp top(op_name, start, finish);
-                calculateTensorEP(top);
-                top.time_ -= (f2h.time_ + h2f.time_);
-                top.energy_ -= (f2h.energy_ + h2f.energy_);
-                top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
-                top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
-                top.power_ = top.energy_ / top.time_;
-                top.gpu_power_ = top.gpu_energy_ / top.time_;
-                top.ddr_power_ = top.ddr_energy_ / top.time_;
-                addTensorOp(op_name, top);
-            }
-        }
-        ifs.close();
-    }
-    void updateStats() {
-        updatePerOpStats();
-        updateTotalStats();
-    }
-    // Calculates the average and standard deviation of each metric of each tensor op
-    void calculateAggregateStats() {
-        for (auto it = tensor_info_.begin(); it != tensor_info_.end(); it++) {
-            AggTensorInfo ati;
-            ati.name_ = it->first;
-            auto topv = it->second;
-            double total_time = 0.0;
-            double total_energy = 0.0;
-            double total_gpu_energy = 0.0;
-            double total_ddr_energy = 0.0;
-            double total_power = 0.0;
-            double total_gpu_power = 0.0;
-            double total_ddr_power = 0.0;
-            double time_sum = 0.0;
-            double energy_sum = 0.0;
-            double gpu_energy_sum = 0.0;
-            double ddr_energy_sum = 0.0;
-            double power_sum = 0.0;
-            double gpu_power_sum = 0.0;
-            double ddr_power_sum = 0.0;
-            // Calculate average
-            for (const auto &top : topv) {
-                total_time += top.time_;
-                total_energy += top.energy_;
-                total_gpu_energy += top.gpu_energy_;
-                total_ddr_energy += top.ddr_energy_;
-                total_power += top.power_;
-                total_gpu_power += top.gpu_power_;
-                total_ddr_power += top.ddr_power_;
-            }
-            ati.average_time_ = total_time / iterations_;
-            ati.average_energy_ = total_energy / iterations_;
-            ati.average_gpu_energy_ = total_gpu_energy / iterations_;
-            ati.average_ddr_energy_ = total_ddr_energy / iterations_;
-            ati.average_power_ = total_power / iterations_;
-            ati.average_gpu_power_ = total_gpu_power / iterations_;
-            ati.average_ddr_power_ = total_ddr_power / iterations_;
-            // Calculate standard deviation
-            for (const auto &top : topv) {
-                auto time_diff = top.time_ - ati.average_time_;
-                time_sum += time_diff * time_diff;
-                auto energy_diff = top.energy_ - ati.average_energy_;
-                energy_sum += energy_diff * energy_diff;
-                auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
-                gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
-                auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
-                ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;
-                auto power_diff = top.power_ - ati.average_power_;
-                power_sum += power_diff * power_diff;
-                auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
-                gpu_power_sum += gpu_power_diff * gpu_power_diff;
-                auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
-                ddr_power_sum += ddr_power_diff * ddr_power_diff;
-            }
-            ati.time_std_ = std::sqrt(time_sum / iterations_);
-            ati.energy_std_ = std::sqrt(energy_sum / iterations_);
-            ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
-            ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
-            ati.power_std_ = std::sqrt(power_sum / iterations_);
-            ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
-            ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);
-            agg_tensor_info_.push_back(ati);
-        }
-    }
-    Profiler() {
-        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
-        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
-        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
-        soc_stream_.open(soc_power_rail, std::ifstream::in);
-        sys_stream_.open(sys_power_rail, std::ifstream::in);
-        if (!cpu_stream_.is_open() or !gpu_stream_.is_open() or !ddr_stream_.is_open()
-            or !soc_stream_.is_open() or !sys_stream_.is_open()) {
-            std::cout << "Failed to open one of the power rails for reading\n";
-            exit(1);
-        }
-    }
-    ~Profiler() {
-        cpu_stream_.close();
-        gpu_stream_.close();
-        ddr_stream_.close();
-        soc_stream_.close();
-        sys_stream_.close();
-    }
-    void profile(const char * const program, const int iterations) {
-        iterations_ = iterations;
-        resetGlobal();
-        for (unsigned i = 0; i < iterations_; i++) {
-            resetLocal();
-            // Launch two threads: one for running the program and one for
-            // profiling it. Pin the threads to specific cores to remove migration
-            // overhead. Profiling showed that the sampling rate increases slightly
-            // with pinning.
-            std::thread prog(&Profiler::runProgram, this, program);
-            std::thread power(&Profiler::recordPower, this);
-            pinThread(prog, core1);
-            pinThread(power, core2);
-            prog.join();
-            power.join();
-            updateStats();
-            // Sleep for some time to bring the GPU back to idle
-            std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
-        }
-        calculateAggregateStats();
-    }
-    void dumpTensorInfo(const char * const filename) const {
-        const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
-        std::ofstream ofs;
-        ofs.open(filename);
-        //ofs << header;
-        for (const auto &ati : agg_tensor_info_) {
-            ofs << ati.name_
-                << "," << ati.average_time_ * 1e3
-                << "," << ati.average_energy_
-                /*
-                << "," << ati.average_gpu_energy_
-                << "," << ati.average_ddr_energy_
-                << "," << ati.average_power_
-                << "," << ati.average_gpu_power_
-                << "," << ati.average_ddr_power_
-                << "," << ati.time_std_ * 1e3
-                << "," << ati.energy_std_
-                << "," << ati.gpu_energy_std_
-                << "," << ati.ddr_energy_std_
-                << "," << ati.power_std_
-                << "," << ati.gpu_power_std_
-                << "," << ati.ddr_power_std_*/
-                << "\n";
-            std::cout << ati.average_time_ * 1e3 << "," << ati.average_energy_ << "\n";
-        }
-        ofs.close();
-    }
-    void dumpPowerReadings(const char * const filename) const {
-        std::ofstream ofs;
-        ofs.open(filename);
-        for (const auto &reading : power_readings_) {
-            std::chrono::duration<double> duration = reading.time_ - start_time_;
-            //std::chrono::duration<double> duration = reading.time_.time_since_epoch();
-            ofs << std::to_string(duration.count())
-                << " " << reading.gpu_
-                << " " << reading.ddr_
-                << "\n";
-        }
-        ofs.close();
-    }
-    void dumpTotalInfo() const {
-        auto total_time = total_info_.time_ / iterations_;
-        auto total_energy = total_info_.energy_ / iterations_;
-        auto gpu_energy = total_info_.gpu_energy_ / iterations_;
-        auto ddr_energy = total_info_.ddr_energy_ / iterations_;
-        auto power = total_info_.power_ / iterations_;
-        auto gpu_power = total_info_.gpu_power_ / iterations_;
-        auto ddr_power = total_info_.ddr_power_ / iterations_;
-        std::cout << "-----------------------------------------------------\n";
-        std::cout << "Program info (average)\n";
-        std::cout << "-----------------------------------------------------\n";
-        std::cout << "\tExecution time: " << total_time << " seconds\n";
-        std::cout << "\tTotal energy:   " << total_energy << " mJ\n";
-        std::cout << "\t    GPU:        " << gpu_energy << " mJ\n";
-        std::cout << "\t    DDR:        " << ddr_energy << " mJ\n";
-        std::cout << "\tPower:          " << power << " mW\n";
-        std::cout << "\t    GPU:        " << gpu_power << " mW\n";
-        std::cout << "\t    DDR:        " << ddr_power << " mW\n";
-        std::cout << "-----------------------------------------------------\n";
-    }
-int main(int argc, char *argv[]) {
-    if (argc < NUM_ARGS) {
-        std::cout << "Usage: " << argv[0] << " <program> <iterations> <tensor output file> [power output file]\n";
-        exit(1);
-    }
-    Profiler pp;
-    pp.profile(argv[1], std::stoi(argv[2]));
-    pp.dumpTensorInfo(argv[3]);
-    if (argc > NUM_ARGS)
-        pp.dumpPowerReadings(argv[4]);
-    return 0;
diff --git a/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py b/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e007ca9277f9e584708488ee57fd08c693a00279
--- /dev/null
+++ b/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
@@ -0,0 +1,175 @@
+from pathlib import Path
+from typing import Iterable, List, Tuple, Union
+from dataclasses import dataclass
+PathLike = Union[Path, str]
+def profile_configs(
+    binary_path: PathLike,
+    config_path: PathLike,
+    output_config_path: PathLike,
+    profile_filename: str = "profile_info.txt",
+    qos_filename: str = "final_accuracy",
+) -> None:
+    """
+    Profile an HPVM configuration file with an HPVM binary.
+    The configuration file must have the baseline as the first configuration.
+
+    binary_path: Union[Path, str]
+        Path to binary to be executed in profiling.
+    config_path: Union[Path, str]
+        Path to config file (HPVM configuration format)
+        with configs to enumerate for profiling.
+    output_config_path: Union[Path, str]
+        Path where the output configs are written.
+        The output config file has the same configs as the input `config_path` file,
+        but the performance and energy readings are updated.
+    profile_filename: str
+        Name of profile file generated by the binary (in current directory).
+        This defaults to "profile_info.txt" and should not be changed for HPVM binaries.
+    qos_filename: str
+        Name of QoS file generated by the binary (in current directory).
+        It contains a single float number as the QoS of this run.
+        This defaults to "final_accuracy" and should not be changed for HPVM binaries.
+
+    Raises ValueError if the config file holds no configs, or if the measured
+    baseline QoS disagrees with the one recorded in a config.
+    Raises subprocess.CalledProcessError if the binary exits with a non-zero status.
+    """
+    from subprocess import check_call
+    from tempfile import NamedTemporaryFile
+
+    # Read first line ("the float") and configs in config file
+    header, configs = read_hpvm_configs(Path(config_path))
+    if not configs:
+        raise ValueError("Config file with no configs is unsupported.")
+    baseline_time, baseline_acc = None, None
+    # The context manager closes (and thereby deletes) the temp file even when
+    # the binary fails or one of its output files cannot be parsed.
+    with NamedTemporaryFile("w") as temp_file:
+        temp_config_path = Path(temp_file.name)
+        for idx, config in enumerate(configs):
+            # Write config to temp config file
+            # NOTE(review): the binary is launched without arguments, so nothing
+            # here tells it where this temp config lives -- confirm how the
+            # binary is expected to locate its configuration.
+            write_hpvm_config(header, [config], temp_config_path)
+            # Run binary_path binary,
+            # which generates `profile_filename` and `qos_filename` file in cwd.
+            check_call(str(binary_path))
+            # Read these two files for time and QoS info.
+            run_time = _read_profile_file(Path(profile_filename))
+            acc = _read_qos_file(Path(qos_filename))
+            if idx == 0:
+                # The first config is the baseline: it only provides the
+                # reference numbers and its own entry is left unchanged.
+                baseline_time, baseline_acc = run_time, acc
+                continue
+            assert baseline_time is not None and baseline_acc is not None
+            speedup = baseline_time / run_time
+            config.update_profile_results(speedup, acc, baseline_acc)
+    write_hpvm_config(header, configs, Path(output_config_path))
+def plot_hpvm_configs(
+    config_path: PathLike,
+    save_to: Union[PathLike, None] = None,
+    show_qos_loss: bool = True,
+    **fig_kwargs,
+):
+    """
+    Plot the QoS-speedup information in an HPVM configuration file.
+    It is recommended to profile the config file first (using `profile_configs`)
+    to obtain real speedup numbers.
+    This function creates a `matplotlib.pyplot.Figure`, plots on it, and returns it.
+
+    config_path: Union[Path, str]
+        Path to the config file (HPVM configuration format).
+    save_to: Union[Path, str]
+        File to save figure into. Default is None: don't save figure (just return it).
+    show_qos_loss: bool
+        Show the loss of QoS on x axis of the figure. Defaults to True.
+        If False, will use (absolute) QoS instead of QoS loss.
+    fig_kwargs:
+        Arguments to pass to `plt.subplots`.
+    """
+    import numpy as np
+    import matplotlib.pyplot as plt
+
+    _, configs = read_hpvm_configs(config_path)
+    # Build the two axes separately so that an empty config list yields an
+    # empty plot instead of failing on tuple unpacking.
+    qoses = np.array([c.qos_loss if show_qos_loss else c.qos for c in configs])
+    speedups = np.array([c.speedup for c in configs])
+    fig, ax = plt.subplots(**fig_kwargs)
+    ax.scatter(qoses, speedups)
+    # Axes objects expose set_xlabel/set_ylabel; plain xlabel/ylabel only exist
+    # as pyplot module-level functions.
+    ax.set_xlabel("QoS Loss" if show_qos_loss else "QoS")
+    ax.set_ylabel("Speedup (X)")
+    if save_to:
+        fig.savefig(save_to, dpi=300)
+    return fig
+@dataclass
+class Config:
+    """
+    One entry of an HPVM configuration file.
+
+    Only the first line of an entry (name plus four numbers) is parsed;
+    the remaining lines are carried along verbatim in `config_body`.
+    """
+
+    # Name token at the start of the entry's first line.
+    conf_name: str
+    # Speedup of this config; overwritten by `update_profile_results`.
+    speedup: float
+    # Energy field of the entry; read and written back, never recomputed here.
+    energy: float
+    # QoS of this config.
+    qos: float
+    # QoS loss of this config; `qos + qos_loss` is treated as the baseline QoS.
+    qos_loss: float
+    # We don't care about the information in this part, and we don't parse this.
+    config_body: List[str]
+
+    def update_profile_results(self, speedup: float, qos: float, base_qos: float):
+        """
+        Store freshly measured numbers in this config.
+
+        Raises ValueError when `base_qos` differs by more than 1e-3 from the
+        baseline implied by the recorded values (`qos + qos_loss`).
+        """
+        recorded_base_qos = self.qos + self.qos_loss
+        if abs(recorded_base_qos - base_qos) > 1e-3:
+            raise ValueError(
+                f"Baseline QoS mismatch. Original: {recorded_base_qos}, measured: {base_qos}"
+            )
+        self.speedup = speedup
+        self.qos = qos
+        self.qos_loss = base_qos - qos
+
+    def __repr__(self) -> str:
+        """
+        Render the entry as config-file text.
+
+        The entry is wrapped in the `+++++` / `-----` markers that
+        `read_hpvm_configs` splits on, and the body lines are emitted one per
+        line, so that written files can be read back.
+        """
+        header_fields = [
+            self.conf_name,
+            self.speedup,
+            self.energy,
+            self.qos,
+            self.qos_loss,
+        ]
+        header = " ".join(str(field) for field in header_fields)
+        return "\n".join(["+++++", header, *self.config_body, "-----"])
+
+    __str__ = __repr__
+def read_hpvm_configs(config_file: PathLike) -> Tuple[str, List[Config]]:
+    """
+    Parse an HPVM configuration file.
+
+    Returns the stripped text that precedes the first `+++++` marker, and one
+    `Config` per `+++++`-introduced entry, in file order.
+
+    Raises ValueError if an entry is empty, or if its first line is not a name
+    followed by exactly four numbers (speedup, energy, qos, qos loss).
+    """
+    with open(config_file) as f:
+        text = f.read()
+    opening, closing = "+++++", "-----"
+    # There's 1 float sitting on the first line of config file.
+    # We don't use it, but want to keep that intact.
+    header, *config_texts = text.split(opening)
+    ret_configs = []
+    for config_text in config_texts:
+        config_lines = config_text.replace(closing, "").strip().splitlines()
+        if not config_lines:
+            raise ValueError(f"Empty config entry in {config_file}")
+        config_header, *config_body = config_lines
+        # split() without an argument tolerates repeated spaces or tabs,
+        # which would otherwise produce empty fields that float() rejects.
+        conf_name, *number_fields = config_header.split()
+        if len(number_fields) != 4:
+            raise ValueError(
+                f"Malformed config header in {config_file}: {config_header!r}"
+            )
+        speedup, energy, qos, qos_drop = (float(s) for s in number_fields)
+        ret_configs.append(
+            Config(conf_name, speedup, energy, qos, qos_drop, config_body)
+        )
+    return header.strip(), ret_configs
+def write_hpvm_config(header: str, configs: Iterable[Config], to_file: PathLike):
+    """
+    Write `header` followed by the string form of every config to `to_file`.
+
+    The pieces are separated by single newlines; the file is overwritten.
+    """
+    rendered = "\n".join([header, *map(str, configs)])
+    with open(to_file, "w") as out:
+        out.write(rendered)
+def _read_profile_file(profile_file_path: Path):
+    """
+    Extract the total time from a profile file.
+
+    Exactly one line of the file must contain "Total Time"; the fourth
+    whitespace-separated token of that line is returned as a float.
+    Raises RuntimeError when zero or several such lines are found.
+    """
+    with profile_file_path.open() as profile:
+        matches = [row for row in profile if "Total Time" in row]
+    if len(matches) == 1:
+        # Whitespace splitting already ignores leading/trailing blanks.
+        return float(matches[0].split()[3])
+    raise RuntimeError(f"Profile {profile_file_path} malformed")
+def _read_qos_file(qos_file_path: Path):
+    """Return the content of the QoS file, parsed as a single float."""
+    raw_text = qos_file_path.read_text()
+    return float(raw_text.strip())
diff --git a/hpvm/projects/hpvm-profiler/setup.py b/hpvm/projects/hpvm-profiler/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7b3771a2ee3a67569092d9b956e67c309f9d08e
--- /dev/null
+++ b/hpvm/projects/hpvm-profiler/setup.py
@@ -0,0 +1,11 @@
+import setuptools
+
+# Packaging metadata for the hpvm_profiler package.
+setuptools.setup(
+    name="hpvm_profiler",
+    version="0.1",
+    author="Akash Kothari, Yifan Zhao",
+    author_email="akashk4@illinois.edu, yifanz16@illinois.edu",
+    description="A package for profiling of HPVM approximation configurations",
+    packages=["hpvm_profiler"],
+    install_requires=["numpy>=1.19", "matplotlib>=3"],
+    # The package imports the standard-library `dataclasses` module (Python 3.7+).
+    python_requires=">=3.7",
+)
diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
index 602c9327b946c9e7ccf07e7eec3519657c11a319..a142d524b69cb605b85c496aa140c806ad258dfd 100644
--- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -28,21 +28,20 @@ set(
   ./tensor_runtime/include ${CMAKE_CURRENT_BINARY_DIR}/tensor_runtime/include
-  ../gpu_profiler/include ../soc_simulator/include
+# Build gpu_profiler and soc_simulator (dependencies)
+# Resolve the thread library through CMake instead of naming `pthread` directly.
+find_package(Threads REQUIRED)
+add_library(gpu_profiler SHARED gpu_profiler/profiler.cpp)
+target_include_directories(gpu_profiler PUBLIC gpu_profiler/)
+# PUBLIC keeps the transitive linking that the keyword-less signature provided.
+target_link_libraries(gpu_profiler PUBLIC Threads::Threads)
+add_library(soc_simulator SHARED soc_simulator/promise_timing_model.cpp)
+target_include_directories(soc_simulator PUBLIC soc_simulator/)
 # -- Link libraries
 find_package(OpenMP REQUIRED)  # Provides ${OpenMP_CXX_FLAGS}
-# Configure gpu_profiler and soc_simulator, and setup all libs to link to
-# Conditionally add gpu_profiler project if we're building independently
-# (not building the whole hpvm)
-  message(STATUS "Also compiling gpu_profiler and soc_simulator")
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../gpu_profiler ${CMAKE_CURRENT_BINARY_DIR}/gpu_profiler)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../soc_simulator ${CMAKE_CURRENT_BINARY_DIR}/soc_simulator)
-set(LINK_LIBS gpu_profiler promise_profiler stdc++fs cudnn curand cublas)
+set(LINK_LIBS gpu_profiler soc_simulator stdc++fs cudnn curand cublas)
   list(APPEND LINK_LIBS gflags)
@@ -77,17 +76,16 @@ endforeach()
 # -- Adding tensor_runtime targets
 function(add_tensor_runtime target_name)
-  add_library(${target_name} ${RUNTIME_SRCS})
+  add_library(${target_name} SHARED ${RUNTIME_SRCS})
   set_property(TARGET ${target_name} PROPERTY CUDA_ARCHITECTURES 60)
     ${target_name} PRIVATE
-    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr -maxrregcount 32>
-    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:DEBUG>>:-lineinfo -Xcompiler -ggdb>
-    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}>
+    --expt-relaxed-constexpr -maxrregcount 32 -Xcompiler=${OpenMP_CXX_FLAGS}
+    $<$<CONFIG:DEBUG>:-lineinfo -Xcompiler=-ggdb>
   target_include_directories(${target_name} PUBLIC ${INCLUDES})
   target_link_directories(${target_name} PUBLIC ${LINK_DIR})
-  target_link_libraries(${target_name} PUBLIC ${LINK_LIBS})
+  target_link_libraries(${target_name} PUBLIC ${LINK_LIBS} ${OpenMP_CXX_FLAGS})
   target_compile_definitions(${target_name} PRIVATE ${DEFS} ${ARGN})
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
index e3e6a864fa5128ed21ca6a1a161b3593f7bc9948..61fd362afcc665e21a7ba8636c8df778ac95184e 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
@@ -16,7 +16,7 @@
 #include <string.h>
 std::vector<float> run_accuracies;
-std::string model_params_path =  "../../test/dnn_benchmarks/model_params/";
+std::string model_params_path = "../../test/dnn_benchmarks/model_params/";
 // FIXIT: Move this to debug.h and include in all files
 void dumpWeightsToFile(const char *file_name, void *weights_ptr) {
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
index 7fa76350b5ec8f95f0a27da2436b7cccbe3c21f3..0b344035296bdbab2744e32604f3a8881feb6230 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
@@ -12,7 +12,7 @@ void testCifarNet() {
       model_params_path + std::string("/alexnet2_cifar10/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
index 54cbbed01475ab0f13c3ff9b131bf174ba8e8e12..03dc905bbfcb07ad9a266fc153cd1a6a0db9837e 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
@@ -11,40 +11,46 @@ void testLenetTanh() {
   int test_batch_size = 5000;
-  std::string dir_prefix =
-     model_params_path + std::string("/lenet_mnist/");
+  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   // Loading Input Batch
   void *input =
       readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28);
-  uint32_t *labels = readLabelsBatch3(labels_path.c_str(), 0, test_batch_size);
-  void *conv1_filter = readTrainedWeights((dir_prefix + std::string("/conv2d_1_w.bin")).c_str(),
-					  float_type, 32, 1, 5, 5);
-  void *conv1_bias = readTrainedWeights((dir_prefix + std::string("/conv2d_1_b.bin")).c_str(),
-					float_type, 1, 32, 1, 1);
-  void *conv2_filter = readTrainedWeights((dir_prefix + std::string("/conv2d_2_w.bin")).c_str(),
-					   float_type, 64, 32, 5, 5);
-  void *conv2_bias = readTrainedWeights((dir_prefix + std::string("/conv2d_2_b.bin")).c_str(),
-					 float_type, 1, 64, 1, 1);
-  void *fc1_weights = readTrainedWeights((dir_prefix + std::string("/dense_1_w.bin")).c_str(),
-                                         float_type, 1, 1, 7 * 7 * 64, 1024);
-  void *fc1_bias = readTrainedWeights((dir_prefix + std::string("/dense_1_b.bin")).c_str(),
-				       float_type, 1, 1024, 1, 1);
-  void *fc2_weights = readTrainedWeights((dir_prefix + std::string("/dense_2_w.bin")).c_str(),
-                                         float_type, 1, 1, 1024, 10);
-  void *fc2_bias = readTrainedWeights((dir_prefix + std::string("/dense_2_b.bin")).c_str(),
-				       float_type, 1, 10, 1, 1);
+  uint32_t *labels = readLabelsBatch3(labels_path.c_str(), 0, test_batch_size);
+  void *conv1_filter =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_1_w.bin")).c_str(),
+                         float_type, 32, 1, 5, 5);
+  void *conv1_bias =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_1_b.bin")).c_str(),
+                         float_type, 1, 32, 1, 1);
+  void *conv2_filter =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_2_w.bin")).c_str(),
+                         float_type, 64, 32, 5, 5);
+  void *conv2_bias =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_2_b.bin")).c_str(),
+                         float_type, 1, 64, 1, 1);
+  void *fc1_weights =
+      readTrainedWeights((dir_prefix + std::string("/dense_1_w.bin")).c_str(),
+                         float_type, 1, 1, 7 * 7 * 64, 1024);
+  void *fc1_bias =
+      readTrainedWeights((dir_prefix + std::string("/dense_1_b.bin")).c_str(),
+                         float_type, 1, 1024, 1, 1);
+  void *fc2_weights =
+      readTrainedWeights((dir_prefix + std::string("/dense_2_w.bin")).c_str(),
+                         float_type, 1, 1, 1024, 10);
+  void *fc2_bias =
+      readTrainedWeights((dir_prefix + std::string("/dense_2_b.bin")).c_str(),
+                         float_type, 1, 10, 1, 1);
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
index 7340e52c8d60453a1fb6f68f718c1c08c478c058..d6eaef755743ce961d3d9c2f013eef26a77579f7 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
@@ -7,11 +7,12 @@ int main() {
-  std::string dir_prefix = model_params_path + std::string("/mobilenet_cifar10/");
+  std::string dir_prefix =
+      model_params_path + std::string("/mobilenet_cifar10/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
index 7ca01cd60e8c3d8cb1ce957f7016d2c492550537..40e128eb8a80f6e080c090589a3e91b80ffa082f 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
@@ -7,12 +7,13 @@ int main() {
-  std::string dir_prefix =  model_params_path + std::string("/resnet18_cifar10/");
+  std::string dir_prefix =
+      model_params_path + std::string("/resnet18_cifar10/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
index a5b703ab6e2461f67c2d05b077ca2bdade86a5f9..eb3275b83009ec4300e9cb713f3b182727661db4 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
@@ -8,10 +8,10 @@ int main() {
   std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
index 287943c0bf2417beccaebbee4f6f5cddfc667549..396e9f11cae92c2f6613b5acb799caecbf025a59 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
@@ -8,7 +8,8 @@ void testCifarNet() {
   printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
-  std::string dir_prefix = model_params_path + std::string("/alexnet2_cifar10/");
+  std::string dir_prefix =
+      model_params_path + std::string("/alexnet2_cifar10/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
index 3dfef856c6eeeb93458ee93f7bf4a8c4feb852ac..600512078563baf850f440ea97e78cb2d73be170 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
@@ -10,7 +10,7 @@ int main() {
   std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
-  //std::string labels_path = dir_prefix + std::string("labels.bin");
+  // std::string labels_path = dir_prefix + std::string("labels.bin");
   std::string labels32_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
index 94da804e8aade0cd296a431b026be637c823201f..29909e5938ca0f700c4ee22165ae2ad354e53a32 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
@@ -7,8 +7,9 @@ int main() {
-  std::string dir_prefix = model_params_path + std::string("/alexnet_imagenet/");
+  std::string dir_prefix =
+      model_params_path + std::string("/alexnet_imagenet/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
index e1550c715cb36dca8eec1fd24e82e038014039a0..61a0eeb441458ff6f91af8bc76ecc17a33428aec 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
@@ -5,50 +5,56 @@
 int total_runs = 1;
 void testLenetTanh() {
   printf("********* Lenet-2 Architecture ********** \n");
   int test_batch_size = 5000;
-  std::string dir_prefix =
-     model_params_path + std::string("/lenet_mnist/");
+  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   // Loading Input Batch
   void *input =
       readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28);
   uint32_t *labels = readLabelsBatch3(labels_path.c_str(), 0, test_batch_size);
-  void *conv1_filter = readTrainedWeights((dir_prefix + std::string("/conv2d_1_w.bin")).c_str(),
-					  float_type, 32, 1, 5, 5);
-  void *conv1_bias = readTrainedWeights((dir_prefix + std::string("/conv2d_1_b.bin")).c_str(),
-					float_type, 1, 32, 1, 1);
-  void *conv2_filter = readTrainedWeights((dir_prefix + std::string("/conv2d_2_w.bin")).c_str(),
-					   float_type, 64, 32, 5, 5);
-  void *conv2_bias = readTrainedWeights((dir_prefix + std::string("/conv2d_2_b.bin")).c_str(),
-					 float_type, 1, 64, 1, 1);
-  void *fc1_weights = readTrainedWeights((dir_prefix + std::string("/dense_1_w.bin")).c_str(),
-                                         float_type, 1, 1, 7 * 7 * 64, 1024);
-  void *fc1_bias = readTrainedWeights((dir_prefix + std::string("/dense_1_b.bin")).c_str(),
-				       float_type, 1, 1024, 1, 1);
-  void *fc2_weights = readTrainedWeights((dir_prefix + std::string("/dense_2_w.bin")).c_str(),
-                                         float_type, 1, 1, 1024, 10);
-  void *fc2_bias = readTrainedWeights((dir_prefix + std::string("/dense_2_b.bin")).c_str(),
-				       float_type, 1, 10, 1, 1);
+  void *conv1_filter =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_1_w.bin")).c_str(),
+                         float_type, 32, 1, 5, 5);
+  void *conv1_bias =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_1_b.bin")).c_str(),
+                         float_type, 1, 32, 1, 1);
+  void *conv2_filter =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_2_w.bin")).c_str(),
+                         float_type, 64, 32, 5, 5);
+  void *conv2_bias =
+      readTrainedWeights((dir_prefix + std::string("/conv2d_2_b.bin")).c_str(),
+                         float_type, 1, 64, 1, 1);
+  void *fc1_weights =
+      readTrainedWeights((dir_prefix + std::string("/dense_1_w.bin")).c_str(),
+                         float_type, 1, 1, 7 * 7 * 64, 1024);
+  void *fc1_bias =
+      readTrainedWeights((dir_prefix + std::string("/dense_1_b.bin")).c_str(),
+                         float_type, 1, 1024, 1, 1);
+  void *fc2_weights =
+      readTrainedWeights((dir_prefix + std::string("/dense_2_w.bin")).c_str(),
+                         float_type, 1, 1, 1024, 10);
+  void *fc2_bias =
+      readTrainedWeights((dir_prefix + std::string("/dense_2_b.bin")).c_str(),
+                         float_type, 1, 10, 1, 1);
   for (int i = 0; i < total_runs; i++) {
     // Start power and performnce profiling
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
index 618b418ec99418a3e02f446729d5dc2e244081d5..85849126cf164693d12fb08aba8326033ca61b82 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
@@ -7,7 +7,8 @@ int main() {
-  std::string dir_prefix = model_params_path + std::string("/mobilenet_cifar10/");
+  std::string dir_prefix =
+      model_params_path + std::string("/mobilenet_cifar10/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
index dcd96119630d1bed0214966c127c83a6a29ac656..bd3dd7dc1ea23f3cb8ad91e8632b347dd51a848b 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
@@ -7,12 +7,13 @@ int main() {
-  std::string dir_prefix =  model_params_path + std::string("/resnet18_cifar10/");
+  std::string dir_prefix =
+      model_params_path + std::string("/resnet18_cifar10/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
index e19c0b21070807162c791a1a6389ccda87c23c8b..0cccb124b0dca81d45887df50c4a9bcaf2a21db5 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
@@ -7,7 +7,8 @@ int main() {
-  std::string dir_prefix = model_params_path + std::string("/resnet50_imagenet/");
+  std::string dir_prefix =
+      model_params_path + std::string("/resnet50_imagenet/");
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
index 2a2e1c291c420bf95abf26b1168a049bd6441d11..746f62bce19b25c3b74bec4908cdc3c87bee034a 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
@@ -630,7 +630,7 @@ void testSampleFilter() {
 void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
                           int stride_h, int stride_w, int row, int col,
-			  UnitTestResults &unitTestResults) {
+                          UnitTestResults &unitTestResults) {
   float interpolation_rate = 1.0;
   for (int offset = 0; offset < 2; offset++) {
@@ -670,19 +670,15 @@ void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
     printf("\nConvApprox Result :");
     hpvm_request_tensor(input, HOST);
     hpvm_request_tensor(filter, HOST);
-    void *res_cpu = tensorConvApproxCPU(input, filter,
-					pad_h, pad_w,
-					stride_h, stride_w,
-					1, 1, row, col, 1, offset);
+    void *res_cpu = tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, row, col, 1, offset);
     printf("\nConvApproxCPU Result :");
     void *res_half =
         tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
                               1, 1, row, col, 1, offset);
@@ -692,17 +688,15 @@ void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
     printf("\nConvApproxHalf2 Result :");
     std::string suffix =
         std::string(" pad_h = ") + std::to_string(pad_h) +
         std::string(" pad_w = ") + std::to_string(pad_w) +
         std::string(" stride_h = ") + std::to_string(stride_h) +
         std::string(" stride_w = ") + std::to_string(stride_w) +
-        std::string(" row = ") + std::to_string(row) +
-        std::string(" col = ") + std::to_string(col) +
-        std::string(" offset = ") + std::to_string(offset);
+        std::string(" row = ") + std::to_string(row) + std::string(" col = ") +
+        std::to_string(col) + std::string(" offset = ") +
+        std::to_string(offset);
     std::string test_name = std::string("PERF_FP32 ") + suffix;
     unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.05,
@@ -713,12 +707,10 @@ void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
     std::string cpu_test_name = std::string("PERF_CPU ") + suffix;
-    unitTestResults.compareTensors((Tensor *) res_cpu, (Tensor *)res_sim, 0.05,
+    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
   printf("\n\n\n--- End of Test \n\n\n");
@@ -763,7 +755,6 @@ void testPerforation(UnitTestResults &unitTestResults) {
   testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4, unitTestResults);
   testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1, unitTestResults);
 void testSampling() {
@@ -825,7 +816,7 @@ void testSampling() {
 void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
                        int stride_h, int stride_w, int skip_every,
-		       std::string filter_string, 
+                       std::string filter_string,
                        UnitTestResults &unitTestResults) {
   float interpolation_rate = 1.0;
@@ -868,17 +859,16 @@ void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
     printf("\nConvApprox Result :");
     hpvm_request_tensor(input, HOST);
     hpvm_request_tensor(filter, HOST);
-    void *res_cpu = tensorConvApproxCPU(input, filter, pad_h, pad_w,
-					stride_h, stride_w, 1, 1, 1, 1, skip_every, offset); 
+    void *res_cpu =
+        tensorConvApproxCPU(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
+                            1, 1, 1, skip_every, offset);
     printf("\nConvApproxCPU Result :");
     void *res_half =
         tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
                               1, 1, 1, 1, skip_every, offset);
@@ -889,13 +879,13 @@ void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
     std::string suffix =
-      "filter = " + std::string(filter_string) +
-        std::string(" pad_h = ") + std::to_string(pad_h) +
-        std::string(" pad_w = ") + std::to_string(pad_w) +
-        std::string(" stride_h = ") + std::to_string(stride_h) +
-        std::string(" stride_w = ") + std::to_string(stride_w) +
-        std::string(" skip_every = ") + std::to_string(skip_every) +
-        std::string(" offset = ") + std::to_string(offset);
+        "filter = " + std::string(filter_string) + std::string(" pad_h = ") +
+        std::to_string(pad_h) + std::string(" pad_w = ") +
+        std::to_string(pad_w) + std::string(" stride_h = ") +
+        std::to_string(stride_h) + std::string(" stride_w = ") +
+        std::to_string(stride_w) + std::string(" skip_every = ") +
+        std::to_string(skip_every) + std::string(" offset = ") +
+        std::to_string(offset);
     std::string test_name = std::string("SAMP_FP32 ") + suffix;
@@ -907,7 +897,7 @@ void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
     std::string cpu_test_name = std::string("SAMP_CPU ") + suffix;
-    unitTestResults.compareTensors((Tensor *) res_cpu, (Tensor *)res_sim, 0.05,
+    unitTestResults.compareTensors((Tensor *)res_cpu, (Tensor *)res_sim, 0.05,
@@ -928,7 +918,7 @@ void testSampling_3_3(UnitTestResults &unitTestResults) {
   fillTensorWithVal(filter, 1);
   float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
-  host_ptr[0] = 10; 
+  host_ptr[0] = 10;
   host_ptr[2] = 2;
   host_ptr[4] = 2;
   host_ptr[6] = 2;
diff --git a/hpvm/projects/gpu_profiler/src/profiler.cpp b/hpvm/projects/hpvm-tensor-rt/gpu_profiler/profiler.cpp
similarity index 100%
rename from hpvm/projects/gpu_profiler/src/profiler.cpp
rename to hpvm/projects/hpvm-tensor-rt/gpu_profiler/profiler.cpp
diff --git a/hpvm/projects/gpu_profiler/include/profiler.h b/hpvm/projects/hpvm-tensor-rt/gpu_profiler/profiler.h
similarity index 100%
rename from hpvm/projects/gpu_profiler/include/profiler.h
rename to hpvm/projects/hpvm-tensor-rt/gpu_profiler/profiler.h
diff --git a/hpvm/projects/soc_simulator/src/promise_timing_model.cpp b/hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.cpp
similarity index 100%
rename from hpvm/projects/soc_simulator/src/promise_timing_model.cpp
rename to hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.cpp
diff --git a/hpvm/projects/soc_simulator/include/promise_timing_model.h b/hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.h
similarity index 100%
rename from hpvm/projects/soc_simulator/include/promise_timing_model.h
rename to hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.h
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
index cbd44313873a5ed79a94a17c54ca4d8e57cf09d4..03b46ca5a3b30fc2287307f306b0ca2d8e450828 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
@@ -128,7 +128,7 @@ public:
   unsigned long getIterationFrequency();
-  void set_out_file_name(std::string &str);
+  void set_out_file_name(const std::string &str);
   void printToFile();
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 77ef5a396be425816ad093afdb13efe8f42cbd75..bea66370ba073490fe7970014f1005f123e58988 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -265,7 +265,9 @@ double ProfileInfo::getCurrentIterationComputeEnergy() {
   return energy_compute_current_iteration;
-void ProfileInfo::set_out_file_name(std::string &str) { out_file_name = str; }
+void ProfileInfo::set_out_file_name(const std::string &str) {
+  out_file_name = str;
 void ProfileInfo::printToFile() {
   INFO("Writing Runtime Profile Info File...\n");
@@ -430,11 +432,11 @@ NodeConfiguration *RuntimeController::getNodeConfiguration(const char *data) {
 void RuntimeController::init(const char *Cstr) {
-   INFO("INIT RUNTIME CONTROLLER ==================\n");
-   printf("INIT RUNTIME CONTROLLER ==================\n");
+  INFO("INIT RUNTIME CONTROLLER ==================\n");
+  printf("INIT RUNTIME CONTROLLER ==================\n");
   // We initialize the path to the profile info output file,
   // based on the path given for the configuration file
-  setProfileInfoFilename(Cstr);
+  PI->set_out_file_name("profile_info.txt");
   // NOTE: Configurations is pareto-configs. InitialConfigurations is the full
@@ -636,27 +638,6 @@ RuntimeController::~RuntimeController() {
   // are stored in different containers, but share the node setup
-void RuntimeController::setProfileInfoFilename(const char *str) {
-  if (PI) {
-    std::string file_path = std::string(str);
-    size_t idx = file_path.find_last_of("/");
-    file_path.erase(idx + 1);
-    file_path.append("profile_info_");
-    bool found = false;
-    std::string profile_filename;
-    for (unsigned i = 0; !found; i++) {
-      profile_filename = file_path;
-      profile_filename.append(std::to_string(i));
-      profile_filename.append(".txt");
-      found = !fileExists(profile_filename);
-    }
-    PI->set_out_file_name(profile_filename);
-  }
 void RuntimeController::readConfigurationFile(const char *str) {
   INFO("Reading Configuration File...\n");
@@ -868,8 +849,8 @@ void RuntimeController::readConfigurationFile(const char *str) {
           std::make_pair(tokens[0], NodeConf));
-                std::make_pair(firstTensorID, NodeConf));
-        INFO("*** firstTensorID = %d \n\n", firstTensorID);
+          std::make_pair(firstTensorID, NodeConf));
+      INFO("*** firstTensorID = %d \n\n", firstTensorID);
       unsigned idx = 2;
       while (idx < tokens.size()) {
diff --git a/hpvm/projects/predtuner b/hpvm/projects/predtuner
index 9c2482aeb8db796b9f5578d0c342b5e5d0e8b376..2fbd6f876c34bfdbcbddc71cd73646e71bde5748 160000
--- a/hpvm/projects/predtuner
+++ b/hpvm/projects/predtuner
@@ -1 +1 @@
-Subproject commit 9c2482aeb8db796b9f5578d0c342b5e5d0e8b376
+Subproject commit 2fbd6f876c34bfdbcbddc71cd73646e71bde5748
diff --git a/hpvm/projects/soc_simulator/CMakeLists.txt b/hpvm/projects/soc_simulator/CMakeLists.txt
deleted file mode 100644
index 6a78ce1ce5ea16a119838c71b195b4c1e0209010..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-cmake_minimum_required(VERSION 3.5)
-set(libsrc src/promise_timing_model.cpp)
-add_library(promise_profiler STATIC ${libsrc})
-target_include_directories(promise_profiler PRIVATE include)
diff --git a/hpvm/projects/soc_simulator/README.md b/hpvm/projects/soc_simulator/README.md
deleted file mode 100644
index 218c16985b64069e154f16ce1916208b93aa6b66..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Tegra TX2 Soc Simulator 
-## Build
-mkdir lib
-cmake ../
diff --git a/hpvm/projects/soc_simulator/SOCSimulatorCommands.md b/hpvm/projects/soc_simulator/SOCSimulatorCommands.md
deleted file mode 100644
index f9b0adea5507639b8a35f770efb5455e1bf6fe4a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/SOCSimulatorCommands.md
+++ /dev/null
@@ -1,127 +0,0 @@
-# SOC Simulator Commands 
-## Setup
-Log into underworld and cd into Gitlab/hpvm/llvm/projects/soc_simulator/src. 
-## Image Benchmarks
-### Canny
-python driver.py canny_dummy_layers.txt canny_table_gen/canny_results/canny_tensors.txt /home/nvidia/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/data/autotuner_data/tuner_confs_25_ported.txt /home/nvidia/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/data/soc_data/tuner_confs_25_ported.txt
-python driver.py canny_dummy_layers.txt canny_table_gen/canny_results/canny_tensors.txt /home/nvidia/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/data/autotuner_data/tuner_confs_30_ported.txt /home/nvidia/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/data/soc_data/tuner_confs_30_ported.txt
-### Blending
-python driver.py blending_dummy_layers.txt ~/sd_card/HPVMApprox/tensor_tables/blend_results/blend_tensors.txt ~/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/blending/data/autotuner_data/tuner_confs_25.txt ~/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/blending/data/soc_data/tuner_confs_25.txt
-python driver.py blending_dummy_layers.txt ~/sd_card/HPVMApprox/tensor_tables/blend_results/blend_tensors.txt ~/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/blending/data/autotuner_data/tuner_confs_30.txt ~/sd_card/hpvm_img/llvm/test/VISC/DNN_Benchmarks/benchmarks/blending/data/soc_data/tuner_confs_30.txt
-## DNN Benchmarks
-### alexnet2
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/soc_data/tuner_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/autotuner_data/tuner_pareto_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/soc_data/tuner_pareto_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/autotuner_data/tuner_promise_confs_batch220_multi.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/soc_data/tuner_promise_confs_batch220_multi.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/autotuner_data/tuner_promise_confs_batch220_single.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/soc_data/tuner_promise_confs_batch220_single.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/alexnet2_3.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/alexnet2/alexnet2_3.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/alexnet2_20.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/alexnet2/alexnet2_20.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/alexnet2_single2.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/alexnet2/alexnet2_single2.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/alexnet2_single2.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/alexnet2/alexnet2_single2.txt
-### alexnet
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet_cifar10/alexnet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/alexnet_results/alexnet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/soc_data/tuner_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet_cifar10/alexnet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/alexnet_results/alexnet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/autotuner_data/tuner_pareto_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/soc_data/tuner_pareto_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet_cifar10/alexnet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/alexnet_results/alexnet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/soc_data/tuner_promise_confs_batch220_multi.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet_cifar10/alexnet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/alexnet_results/alexnet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/autotuner_data/tuner_promise_confs_batch220_single.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/soc_data/tuner_promise_confs_batch220_single.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet_cifar10/alexnet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/alexnet_results/alexnet_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/alexnet3.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/alexnet/alexnet3.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet_cifar10/alexnet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/alexnet_results/alexnet_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/alexnet_20.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/alexnet/alexnet_20.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet_cifar10/alexnet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/alexnet_results/alexnet_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/alexnet_single2.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/alexnet/alexnet_single2.txt
-### resnet
-python driver_new_config_fp16_repl.py  ~/soc_simulator/resnet18_cifar10/resnet18_layers.txt ~/sd_card/HPVMApprox/tensor_tables/resnet18_results/resnet18_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/soc_data/tuner_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/resnet18_cifar10/resnet18_layers.txt ~/sd_card/HPVMApprox/tensor_tables/resnet18_results/resnet18_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/soc_data/tuner_pareto_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/resnet18_cifar10/resnet18_layers.txt ~/sd_card/HPVMApprox/tensor_tables/resnet18_results/resnet18_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/soc_data/tuner_promise_confs_batch220_multi.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/resnet18_cifar10/resnet18_layers.txt ~/sd_card/HPVMApprox/tensor_tables/resnet18_results/resnet18_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/soc_data/tuner_promise_confs_batch220_single.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/resnet18_cifar10/resnet18_layers.txt ~/sd_card/HPVMApprox/tensor_tables/resnet18_results/resnet18_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/resnet3.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/resnet18/resnet3.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/resnet18_cifar10/resnet18_layers.txt ~/sd_card/HPVMApprox/tensor_tables/resnet18_results/resnet18_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/resnet_20.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/resnet18/resnet_20.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/resnet18_cifar10/resnet18_layers.txt ~/sd_card/HPVMApprox/tensor_tables/resnet18_results/resnet18_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/resnet_single2.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/resnet18/resnet_single2.txt
-### mobilenet
-python driver_new_config_fp16_repl.py  ~/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/mobilenet_results/mobilenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/soc_data/tuner_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/mobilenet_results/mobilenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/soc_data/tuner_pareto_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/mobilenet_results/mobilenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/soc_data/tuner_promise_confs_batch220_multi.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/mobilenet_results/mobilenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/soc_data/tuner_promise_confs_batch220_single.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/mobilenet_results/mobilenet_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/mobilenet3.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/mobilenet/mobilenet3.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/mobilenet_results/mobilenet_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/mobilenet_20.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/mobilenet/mobilenet_20.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt  ~/sd_card/HPVMApprox/tensor_tables/mobilenet_results/mobilenet_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/mobilenet_single2.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/mobilenet/mobilenet_single2.txt
-### lenet
-python driver_new_config_fp16_repl.py  ~/soc_simulator/lenet_mnist/lenet_layers.txt ~/sd_card/HPVMApprox/tensor_tables/lenet_results/lenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/soc_data/tuner_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/lenet_mnist/lenet_layers.txt ~/sd_card/HPVMApprox/tensor_tables/lenet_results/lenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/soc_data/tuner_pareto_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/lenet_mnist/lenet_layers.txt ~/sd_card/HPVMApprox/tensor_tables/lenet_results/lenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_promise_confs_batch220_multi.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/soc_data/tuner_promise_confs_batch220_multi.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/lenet_mnist/lenet_layers.txt ~/sd_card/HPVMApprox/tensor_tables/lenet_results/lenet_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_promise_confs_batch220_single.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/soc_data/tuner_promise_confs_batch220_single.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/lenet_mnist/lenet_layers.txt ~/sd_card/HPVMApprox/tensor_tables/lenet_results/lenet_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/lenet3.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/lenet_mnist/lenet3.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/lenet_mnist/lenet_layers.txt ~/sd_card/HPVMApprox/tensor_tables/lenet_results/lenet_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/lenet_20.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/lenet_mnist/lenet_20.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/lenet_mnist/lenet_layers.txt ~/sd_card/HPVMApprox/tensor_tables/lenet_results/lenet_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/lenet_single2.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/lenet_mnist/lenet_single2.txt
-### vgg16_10
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar10/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar10_results/vgg16_cifar10_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/soc_data/tuner_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar10/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar10_results/vgg16_cifar10_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/autotuner_data/tuner_pareto_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/soc_data/tuner_pareto_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar10/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar10_results/vgg16_cifar10_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/autotuner_data/tuner_promise_confs_batch220_multi.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/soc_data/tuner_promise_confs_batch220_multi.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar10/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar10_results/vgg16_cifar10_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/autotuner_data/tuner_promise_confs_batch220_single.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/soc_data/tuner_promise_confs_batch220_single.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar10/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar10_results/vgg16_cifar10_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/vgg16_10_3.txt  /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/vgg16_cifar10/vgg16_10_3.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar10/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar10_results/vgg16_cifar10_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/vgg16_10_20.txt  /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/vgg16_cifar10/vgg16_10_20.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar10/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar10_results/vgg16_cifar10_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/vgg16_10_single2.txt  /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/vgg16_cifar10/vgg16_10_single2.txt
-### vgg16_100
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar100/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar100_results/vgg16_cifar100_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/soc_data/tuner_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar100/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar100_results/vgg16_cifar100_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/autotuner_data/tuner_pareto_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/soc_data/tuner_pareto_confs_batch220.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar100/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar100_results/vgg16_cifar100_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/autotuner_data/tuner_promise_confs_batch220_multi.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/soc_data/tuner_promise_confs_batch220_multi.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar100/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar100_results/vgg16_cifar100_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/autotuner_data/tuner_promise_confs_batch220_single.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/soc_data/tuner_promise_confs_batch220_single.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar100/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar100_results/vgg16_cifar100_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/vgg16_100_3.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/vgg16_cifar100/vgg16_100_3.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar100/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar100_results/vgg16_cifar100_tensors.txt ~/sd_card/HPVMApprox/results/federated_tuning/vgg16_100_20.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/vgg16_cifar100/vgg16_100_20.txt
-python driver_new_config_fp16_repl.py  ~/soc_simulator/vgg16_cifar100/vgg16_layers.txt ~/sd_card/HPVMApprox/tensor_tables/vgg16_cifar100_results/vgg16_cifar100_tensors.txt ~/sd_card/HPVMApprox/results/standard_tuning/vgg16_100_single2.txt /home/nvidia/sd_card/HPVMApprox/generated_tuner_confs/vgg16_cifar100/vgg16_100_single2.txt
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_confs1.txt b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_confs1.txt
deleted file mode 100644
index f8add8f3feabc93963e6f75fbce1ace36412ad07..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_confs1.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9
-9 9 9,7,7,7,9 9 9,9 9 9 9,9 9
-9 9 9,7,7,7,9 9 9,8 8 8 8,9 9
-9 9 9,7,7,7,8 8 8,9 9 9 9,9 9
-9 9 9,7,7,7,9 9 9,7,9 9
-9 9 9,7,7,7,9 9 9,7,9 9
-9 9 9,7,9 9 9,7,9 9 9,7,9 9
-9 9 9,7,7,7,9 9 9,9 9 9 9,9 9
-9 9 9,7,7,7,8 8 8,9 9 9 9,9 9
-9 9 9,7,7,7,9 9 9,7,9 9
-9 9 9,7,7,7,9 9 9,7,9 9
-9 9 9,7,7,7,9 9 9,7,9 9
-9 9 9,7,7,7,9 9 9,9 9 9 9,9 9
-9 9 9,7,7,7,9 9 9,8 8 8 8,9 9
-9 9 9,7,7,7,9 9 9,7,9 9
-9 9 9,7,7,7,8 8 8,9 9 9 9,9 9
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_confs2.txt b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_confs2.txt
deleted file mode 100644
index da9dce2be8e9649ece5376e363ede5b3a84629fe..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_confs2.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,6,8 8 8,6,6,7,9 9
-9 9 9,6,8 8 8,9 9 9 9,6,6,9 9
-9 9 9,6,8 8 8,6,8 8 8,6,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,6,8 8 8,6,6,7,9 9
-9 9 9,6,8 8 8,9 9 9 9,6,6,9 9
-9 9 9,6,8 8 8,6,8 8 8,6,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,6,8 8 8,6,6,7,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
-9 9 9,7,7,6,8 8 8,6,9 9
-9 9 9,6,8 8 8,9 9 9 9,6,6,9 9
-9 9 9,6,8 8 8,6,6,6,9 9
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_fp16.csv b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_fp16.csv
deleted file mode 100644
index 2414bed90d2339bb232800ceccf1711302aa4174..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_fp16.csv
+++ /dev/null
@@ -1,70 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_fp32.csv b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_fp32.csv
deleted file mode 100644
index e2cbba7f6782a0d77ab4c7279a3a88f5acfb532d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_fp32.csv
+++ /dev/null
@@ -1,24 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt
deleted file mode 100644
index 98dfa6fa380a34ee7ff5ce0615656deab585ac5b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt
+++ /dev/null
@@ -1,7 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_ops.txt b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_ops.txt
deleted file mode 100644
index 7a26ba6faa8bf17a0a2067c8a80f69a514ad07c0..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_ops.txt
+++ /dev/null
@@ -1,30 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_confs1.txt b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_confs1.txt
deleted file mode 100644
index c9e13831df88fd375fd8439e20106a483a8342bc..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_confs1.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9
-9 9 9,7,8 8 8,6,8 8 8,4,7
-9 9 9,7,8 8 8,6,8 8 8,5,7
-9 9 9,7,4,5,8 8 8,5,7
-9 9 9,7,4,6,8 8 8,6,7
-9 9 9,5,7,6,8 8 8,8 8 8 8,7
-9 9 9,7,7,6,8 8 8,6,7
-9 9 9,6,8 8 8,7,8 8 8,7,5
-9 9 9,9 9 9 9,7,6,8 8 8,5,6
-9 9 9,5,8 8 8,4,8 8 8,7,6
-9 9 9,8 8 8 8,7,6,8 8 8,5,7
-9 9 9,7,7,4,8 8 8,6,7
-8 8 8,8 8 8 8,8 8 8,6,8 8 8,5,7
-9 9 9,7,7,6,8 8 8,8 8 8 8,7
-8 8 8,6,4,6,8 8 8,8 8 8 8,7
-8 8 8,9 9 9 9,5,5,8 8 8,5,7
-9 9 9,7,5,6,8 8 8,8 8 8 8,7
-9 9 9,6,8 8 8,8 8 8 8,8 8 8,7,5
-8 8 8,9 9 9 9,4,4,8 8 8,8 8 8 8,4
-8 8 8,8 8 8 8,6,6,8 8 8,8 8 8 8,7
-8 8 8,7,8 8 8,5,8 8 8,8 8 8 8,7
-9 9 9,8 8 8 8,7,6,8 8 8,8 8 8 8,7
-9 9 9,8 8 8 8,5,4,8 8 8,8 8 8 8,8 8
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_confs2.txt b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_confs2.txt
deleted file mode 100644
index 9188c561894dd811fa5adc5606c61afaf3972a1f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_confs2.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9
-9 9 9,7,7,8 8 8 8,8 8 8,5,8 8
-8 8 8,7,7,8 8 8 8,8 8 8,5,4
-8 8 8,6,7,8 8 8 8,8 8 8,7,7
-8 8 8,6,5,6,8 8 8,5,6
-8 8 8,7,7,6,8 8 8,4,6
-8 8 8,7,7,7,8 8 8,5,7
-8 8 8,7,6,6,8 8 8,5,4
-8 8 8,7,5,7,8 8 8,5,7
-8 8 8,7,8 8 8,6,8 8 8,4,7
-9 9 9,7,8 8 8,6,8 8 8,5,7
-8 8 8,7,7,5,8 8 8,5,7
-9 9 9,7,4,5,8 8 8,5,7
-8 8 8,7,4,6,8 8 8,6,7
-8 8 8,7,7,6,8 8 8,5,7
-8 8 8,5,7,6,8 8 8,8 8 8 8,7
-8 8 8,7,7,6,8 8 8,6,7
-8 8 8,9 9 9 9,7,6,8 8 8,4,8 8
-9 9 9,6,8 8 8,7,8 8 8,7,5
-8 8 8,7,7,6,8 8 8,8 8 8 8,8 8
-9 9 9,9 9 9 9,7,6,8 8 8,5,6
-8 8 8,8 8 8 8,8 8 8,6,8 8 8,5,8 8
-8 8 8,8 8 8 8,7,6,8 8 8,5,8 8
-8 8 8,5,8 8 8,4,8 8 8,7,6
-9 9 9,8 8 8 8,7,6,8 8 8,5,7
-8 8 8,7,7,4,8 8 8,6,7
-8 8 8,8 8 8 8,8 8 8,6,8 8 8,5,7
-9 9 9,7,7,6,8 8 8,8 8 8 8,7
-8 8 8,6,4,6,8 8 8,8 8 8 8,7
-8 8 8,9 9 9 9,5,5,8 8 8,5,7
-9 9 9,7,5,6,8 8 8,8 8 8 8,7
-9 9 9,6,8 8 8,8 8 8 8,8 8 8,7,5
-8 8 8,9 9 9 9,4,4,8 8 8,8 8 8 8,4
-8 8 8,8 8 8 8,6,6,8 8 8,8 8 8 8,7
-8 8 8,7,8 8 8,5,8 8 8,8 8 8 8,7
-8 8 8,8 8 8 8,7,6,8 8 8,8 8 8 8,7
-9 9 9,8 8 8 8,5,4,8 8 8,8 8 8 8,8 8
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_results1.csv b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_results1.csv
deleted file mode 100644
index 038318438fafe7dffdde74d6066e2bcb67cc686f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_results1.csv
+++ /dev/null
@@ -1,297 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_results2.csv b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_results2.csv
deleted file mode 100644
index a46a2216abbef609554dea8b8df6f572e60dff2e..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_promise_results2.csv
+++ /dev/null
@@ -1,451 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_results1.csv b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_results1.csv
deleted file mode 100644
index 80534cd08b0ee64c04faf405926188256a6da2a4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_results1.csv
+++ /dev/null
@@ -1,220 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_results2.csv b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_results2.csv
deleted file mode 100644
index 0fb57641e8484f4b2d7403c55ac35414dbf58bba..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_results2.csv
+++ /dev/null
@@ -1,253 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_tensors.txt b/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_tensors.txt
deleted file mode 100644
index 55ad19b1c8cd2123e86ce7cf7dc43dad7516e413..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet2_cifar10/alexnet2_tensors.txt
+++ /dev/null
@@ -1,30 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_confs1.txt b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_confs1.txt
deleted file mode 100644
index ff963b918fa017a3635a326bbbd7111246cf9384..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_confs1.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-9 9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,7,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,6,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,7,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,6,8 8 8,7,7
-8 8 8 8,6,7,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,6,8 8 8,8 8 8,7,7
-8 8 8 8,7,7,8 8 8,8 8 8 8,7
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_confs2.txt b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_confs2.txt
deleted file mode 100644
index 68b20e726f79648fa9c91169eb1f0a1111b6a889..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_confs2.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-9 9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9
-8 8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,7
-8 8 8 8,7,4,6,8 8 8 8,7
-8 8 8 8,8 8 8 8,7,4,6,7
-8 8 8 8,4,4,6,4,7
-8 8 8 8,4,4,7,7,7
-8 8 8 8,4,4,8 8 8,5,7
-8 8 8 8,7,7,7,8 8 8 8,7
-8 8 8 8,7,5,7,4,7
-8 8 8 8,8 8 8 8,8 8 8,6,4,7
-8 8 8 8,8 8 8 8,4,6,5,7
-8 8 8 8,7,4,6,8 8 8 8,7
-8 8 8 8,8 8 8 8,7,4,6,7
-8 8 8 8,4,4,6,4,7
-8 8 8 8,4,4,7,7,7
-8 8 8 8,4,4,8 8 8,5,7
-8 8 8 8,7,7,7,8 8 8 8,7
-8 8 8 8,7,5,7,4,7
-8 8 8 8,8 8 8 8,8 8 8,6,4,7
-8 8 8 8,4,8 8 8,8 8 8,7,7
-8 8 8 8,8 8 8 8,4,6,5,7
-8 8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,7
-8 8 8 8,7,4,6,8 8 8 8,7
-8 8 8 8,8 8 8 8,4,6,5,7
-8 8 8 8,4,4,6,4,7
-8 8 8 8,7,5,7,4,7
-8 8 8 8,7,7,7,8 8 8 8,7
-8 8 8 8,8 8 8 8,8 8 8,6,4,7
-8 8 8 8,8 8 8 8,7,4,6,7
-8 8 8 8,4,4,8 8 8,5,7
-8 8 8 8,4,4,7,7,7
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_fp16.csv b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_fp16.csv
deleted file mode 100644
index d6a6ef63a3128bc63e9fb6b5d07ce78c7d520338..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_fp16.csv
+++ /dev/null
@@ -1,61 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_fp32.csv b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_fp32.csv
deleted file mode 100644
index fe62cc77eb9744a531345697df044276e4bb26e2..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_fp32.csv
+++ /dev/null
@@ -1,21 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_layers.txt b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_layers.txt
deleted file mode 100644
index bc8c3f5668a2fdb5eb8a568f34b334fe02016954..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_layers.txt
+++ /dev/null
@@ -1,6 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_ops.txt b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_ops.txt
deleted file mode 100644
index 9d047b9e469f980534fa95b39fd43e2984bf9d43..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_ops.txt
+++ /dev/null
@@ -1,26 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_confs1.txt b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_confs1.txt
deleted file mode 100644
index f9a86825a6429e3145247f1680c64999ecfb918a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_confs1.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-9 9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9
-8 8 8 8,4,6,6,7,7
-8 8 8 8,6,9 9 9,6,9 9 9 9,3
-8 8 8 8,4,3,8 8 8,8 8 8 8,4
-9 9 9 9,4,3,9 9 9,7,9 9
-8 8 8 8,8 8 8 8,9 9 9,7,3,4
-8 8 8 8,4,6,8 8 8,9 9 9 9,7
-8 8 8 8,9 9 9 9,6,6,9 9 9 9,4
-9 9 9 9,7,9 9 9,8 8 8,9 9 9 9,5
-8 8 8 8,9 9 9 9,7,8 8 8,5,5
-8 8 8 8,8 8 8 8,7,7,8 8 8 8,5
-9 9 9 9,6,6,8 8 8,9 9 9 9,7
-8 8 8 8,4,4,8 8 8,8 8 8 8,8 8
-8 8 8 8,9 9 9 9,6,6,9 9 9 9,8 8
-8 8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,9 9
-9 9 9 9,8 8 8 8,9 9 9,9 9 9,7,7
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_confs2.txt b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_confs2.txt
deleted file mode 100644
index c73097f5494c9545b8d6a2bd7f737a7a9ad2dcc8..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_confs2.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-9 9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9
-8 8 8 8,3,1,4,7,3
-8 8 8 8,3,1,6,7,3
-8 8 8 8,4,1,3,7,4
-9 9 9 9,4,1,9 9 9,3,3
-8 8 8 8,3,1,7,7,4
-9 9 9 9,7,1,7,4,9 9
-8 8 8 8,3,1,6,5,9 9
-9 9 9 9,1,5,7,2,9 9
-8 8 8 8,3,2,2,7,3
-8 8 8 8,9 9 9 9,1,7,3,9 9
-8 8 8 8,4,1,6,7,8 8
-8 8 8 8,4,1,6,7,5
-8 8 8 8,1,8 8 8,6,7,3
-8 8 8 8,5,1,6,7,7
-9 9 9 9,7,1,6,7,8 8
-8 8 8 8,3,1,9 9 9,7,8 8
-8 8 8 8,3,7,2,7,3
-8 8 8 8,3,8 8 8,2,4,8 8
-9 9 9 9,1,8 8 8,9 9 9,4,8 8
-8 8 8 8,5,4,3,2,9 9
-8 8 8 8,5,1,6,8 8 8 8,3
-8 8 8 8,9 9 9 9,1,8 8 8,7,3
-9 9 9 9,4,2,4,9 9 9 9,2
-9 9 9 9,2,3,8 8 8,7,2
-9 9 9 9,3,2,6,7,3
-8 8 8 8,6,4,6,2,3
-8 8 8 8,6,4,9 9 9,2,3
-8 8 8 8,7,4,4,3,4
-9 9 9 9,4,7,6,2,8 8
-8 8 8 8,7,3,3,5,5
-8 8 8 8,3,2,6,7,7
-9 9 9 9,7,4,6,3,4
-9 9 9 9,7,9 9 9,3,7,3
-8 8 8 8,3,9 9 9,5,6,3
-9 9 9 9,6,7,3,4,5
-8 8 8 8,3,9 9 9,6,3,3
-8 8 8 8,2,5,7,9 9 9 9,3
-8 8 8 8,3,5,6,7,3
-8 8 8 8,4,7,6,5,3
-8 8 8 8,3,3,6,7,4
-8 8 8 8,3,4,7,5,6
-8 8 8 8,7,7,6,7,3
-8 8 8 8,5,4,7,4,7
-8 8 8 8,7,3,4,7,7
-8 8 8 8,4,3,7,4,5
-9 9 9 9,3,9 9 9,5,7,3
-8 8 8 8,7,2,7,9 9 9 9,7
-8 8 8 8,3,4,4,8 8 8 8,3
-8 8 8 8,3,5,6,7,5
-9 9 9 9,3,4,9 9 9,7,3
-8 8 8 8,7,4,7,7,3
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_results1.csv b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_results1.csv
deleted file mode 100644
index 4854a2efa2b2e321decb3ef95ab6bda3b1e22f42..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_results1.csv
+++ /dev/null
@@ -1,220 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_results2.csv b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_results2.csv
deleted file mode 100644
index b3853553e5d5a2c768c71ee52ff3748070d0a4b4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_promise_results2.csv
+++ /dev/null
@@ -1,616 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results1.csv b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results1.csv
deleted file mode 100644
index ca505f7f4aef79e5d466ac7e797f2040a8af1225..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results1.csv
+++ /dev/null
@@ -1,385 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results2.csv b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results2.csv
deleted file mode 100644
index a2baf6209cb3a101a9d8f3b423c209e73f1d89cf..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results2.csv
+++ /dev/null
@@ -1,385 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results2_nodma.csv b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results2_nodma.csv
deleted file mode 100644
index bdc6580f750a3f05adaa9913814b04efb45265bb..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_results2_nodma.csv
+++ /dev/null
@@ -1,385 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_tensors.txt b/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_tensors.txt
deleted file mode 100644
index e7f0a6e270ba81fa11ec07b261192e5b50a4d93d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_cifar10/alexnet_tensors.txt
+++ /dev/null
@@ -1,26 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_cost.txt b/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_cost.txt
deleted file mode 100644
index 194ff1abc9fed79d834c684d17294a325d8edbaf..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_cost.txt
+++ /dev/null
@@ -1,8 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_layers.txt b/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_layers.txt
deleted file mode 100644
index 63057b7e8d383cf2d515c08faf7a1c4b82899142..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_layers.txt
+++ /dev/null
@@ -1,8 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_ops.txt b/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_ops.txt
deleted file mode 100644
index 71b2fff58b49eeed8ffccea5ef99d65d06e44d31..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_imagenet/alexnet_imagenet_ops.txt
+++ /dev/null
@@ -1,34 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/alexnet_imagenet/layer_composition.txt b/hpvm/projects/soc_simulator/alexnet_imagenet/layer_composition.txt
deleted file mode 100644
index b2bf962cd60722978b3205adca9c5822e59fc603..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/alexnet_imagenet/layer_composition.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-conv  add  activation  pool  
-conv  add  activation  pool  
-conv  add  activation  
-conv  add  activation  
-conv  add  activation  pool  
-dense  add  activation  
-dense  add  activation  
-dense  add  
diff --git a/hpvm/projects/soc_simulator/ddr_test.cpp b/hpvm/projects/soc_simulator/ddr_test.cpp
deleted file mode 100644
index 53c0e3b6d94f6fd2e1a9ea2a177767b4c46c9b9c..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/ddr_test.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cstdlib>
-#include <cstring>
-#include <chrono>
-// Goes through a 16 MB array to clear out the cache
-char clearCache() {
-    static const unsigned num_bytes = 16 * 1024 * 1024;
-    static char *temp = (char *) std::malloc(num_bytes);
-    for (unsigned i = 0; i < num_bytes; i++)
-        temp[i] = rand();
-    return temp[rand() % num_bytes];
-int main() {
-    srand(1);
-    std::chrono::time_point<std::chrono::high_resolution_clock> start;
-    std::chrono::time_point<std::chrono::high_resolution_clock> end;
-    std::ofstream ofs;
-    ofs.open("profile_data.txt");
-    // 1 MB to 64 MB
-    for (unsigned i = 20; i < 26; i++) {
-        const unsigned num_bytes = 1 << i;
-        char *src = (char *) std::malloc(num_bytes);
-        char *dst = (char *) std::malloc(num_bytes);
-        // Test writes
-        clearCache();
-        start = std::chrono::high_resolution_clock::now();
-        for (unsigned j = 0; j < num_bytes; j++)
-            src[j] = rand();
-        end = std::chrono::high_resolution_clock::now();
-        std::cout << num_bytes << "w = " << (static_cast<double>(num_bytes) / 1e9) / std::chrono::duration<double>(end - start).count() << "\n";
-        ofs << num_bytes << "w\t" << std::to_string(std::chrono::duration<double>(start.time_since_epoch()).count()) << "\n";
-        ofs << num_bytes << "w\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-        // Test reads
-        clearCache();
-        char sum;
-        start = std::chrono::high_resolution_clock::now();
-        for (unsigned j = 0; j < num_bytes; j++)
-            sum += src[j];
-        end = std::chrono::high_resolution_clock::now();
-        std::cout << num_bytes << "r = " << (static_cast<double>(num_bytes) / 1e9) / std::chrono::duration<double>(end - start).count() << "\n";
-        ofs << num_bytes << "r\t" << std::to_string(std::chrono::duration<double>(start.time_since_epoch()).count()) << "\n";
-        ofs << num_bytes << "r\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-        // Test read-then-write
-        clearCache();
-        start = std::chrono::high_resolution_clock::now();
-        std::memcpy(dst, src, num_bytes);
-        end = std::chrono::high_resolution_clock::now();
-        std::cout << num_bytes << "rw = " << (static_cast<double>(num_bytes) / 1e9) / std::chrono::duration<double>(end - start).count() << "\n";
-        ofs << num_bytes << "rw\t" << std::to_string(std::chrono::duration<double>(start.time_since_epoch()).count()) << "\n";
-        ofs << num_bytes << "rw\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-        // So the compiler doesn't optimize out everything
-        std::cout << num_bytes << ": " << sum << "\n";
-        std::cout << num_bytes << ": " << dst[rand() % num_bytes] << "\n";
-        free(src);
-        free(dst);
-    }
-    return 0;
diff --git a/hpvm/projects/soc_simulator/docs/.gitkeep b/hpvm/projects/soc_simulator/docs/.gitkeep
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/hpvm/projects/soc_simulator/docs/howToUse.md b/hpvm/projects/soc_simulator/docs/howToUse.md
deleted file mode 100644
index 12f946e36c0a5053d5b5e07f85c2b9909db349bc..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/docs/howToUse.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Workflow
-## Source Code Generation
-Generate new benchmarks with new parameter calls from pre-existing benchmarks. Each new benchmark generated corresponds to an inputted knob ID. Each new benchmark contains calls to the online profiler and prints the total time and energy usage at the end of the benchmark. 
-Source code: llvm/projects/hpvm-tensor-rt/code_autogenerators/source_code_autogenerator.py
-## Usage:
-python source_code_autogenerator.py <table file> <original filenames file> [per_tensor]
-* table file: File containing table containing parameters to be changed (see "Table" section for more info)
-* original filenames file: File containing newline separated file names to generate code from. A simple example:
-* per_tensor is an optional parameter. If "per_tensor" is included, the code autogenerator inserts profile calls around each tensor operation, which is what's desired, and outputs a list of all tensor calls and their corresponding times/energies. If "per_tensor" is not included, the code autogenerator inserts profile calls at the beginning and end of the entire benchmark. 
-python source_code_autogenerator.py clean
-* Deletes all autogenerated files not including the autogenerated CMakeLists.txt file 
-### Table
-approx_type,knob_id additional_param1,additional_param2,... 0 old_function_name new_function_name
-* Note that the parameters in the table file are ADDITIONAL parameters to be added to the function calls.
-* The current approx_types supported are: fp32 (copies the source code over and doesn't modify it), fp16 (converts all fp32 calls to fp16 calls by replacing tensor<Operation> with tensorHalf<Operation>), perf (knob ids 20 - 29), and samp (knob ids 31 - 36). Adding additional approximation types requires changing the source code.
-samp,31 1,1,2,0     1.88    tensorHalfConvolution   tensorConvApproxHalf
-samp,32 1,1,2,1     1.88    tensorHalfConvolution   tensorConvApproxHalf
-samp,33 1,1,4,0     1.88    tensorHalfConvolution   tensorConvApproxHalf
-samp,34 1,1,4,1     1.88    tensorHalfConvolution   tensorConvApproxHalf
-samp,35 1,1,4,2     1.88    tensorHalfConvolution   tensorConvApproxHalf
-samp,36 1,1,4,3     1.88    tensorHalfConvolution   tensorConvApproxHalf
-## Output
-For each file inputted (in the original filenames file), the code autogenerator creates a directory called <original_source_name>_different_knobs in the same directory as source_code_autogenerator.py. This directory contains files named <original_source_name>_<id>.txt, where the id corresponds to the knob id.
-Note: The code autogenerator handles local include paths by converting them to global paths. 
-Example usage:
-python source_code_autogenerator.py knob_config_fp16_knobs_31_36.txt filenames_fp16_remainder.txt per_tensor
-## CMakeLists.txt File Generation
-Generates a CMakeLists.txt file for all generated files in a specific directory based off a hardcoded CMakeLists.txt file path.
-Source code: llvm/projects/hpvm-tensor-rt/code_autogenerators/cmakelists_generator.py
-### Input
-We have the choice of inputting an arbitrarily long list containing names of all generated files directories
-* Ex: alexnet_cifar10_autogenerated_knobs mobilenet_cifar10_autogenerated_knobs
-If 0 parameters were inputted, this code generator generates the CMakeLists.txt file for all generated files (all directories ending with "autogenerated_knobs") in the current directory. This second approach works for generating a CMakeLists.txt file for the generated sources described in the previous section.
-## Running Benchmarks
-After generating all files required (see previous steps) and building the generated benchmarks, we can either run the binaries manually or we can use an automater I created for convenience.
-Source code: llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automater.py
-Usage: python online_benchmark_testing_automator.py <builds dir> <outputs_file_name> [per_tensor]
-* builds_dir refers to the directory the binaries are in
-* outputs_file_name refers to the name of the file containing all the profiling info (the output file)
-* per_tensor MUST be set if the benchmarks were generated using the per_tensor parameter --> this parameter is needed to correctly parse the raw output of the binaries.
-## Generating the Table
-After running all the benchmarks and getting the raw profiling data (which the runtime generates), we generate a table that behaves like a massive cache and stores all the profiling data. We do this to avoid having to run every benchmark over again; instead, we can just simulate the benchmarks using the table data.
-Source code: llvm/projects/soc_simulator/src/table_generator.py
-### Input
-Usage: python table_generator.py <network name> <binary dir path> <soc_ops file> <num itrs> <profiler bin path>
-* <network name> refers to the name of the benchmark (ex: alexnet2) and is used to generate the output (see "output" section)
-* <binary dir path> refers to the directory containing the binaries that were run. IMPORTANT: This path must be the same as the path to the dir containing all the binaries we generated and ran at an earlier step; the table generator reads all profiling files generated by the runtime and organizes them into a table.
-* soc_ops file: ~/soc_simulator/<network name>_cifar10/<network name>_ops.txt
-* num_itrs: Number of itrs we want to run the binaries for. This doesn't matter anymore because we're not running the binaries in this step.
-* profiler_bin_path: Path to the offline profiler. This doesn't matter anymore because we're not running the binaries in this step.
-### Output
-The table generator creates a directory called <network name>_results and a file within that directory called <network name>_tensors.txt, which contains the table.
-The table is outputted in the following format:
-** LayerName NumOpsInLayer <cols>
-OpName Col1Val Col2Val ...
-** Conv1 1 h2f_time h2f_energy fp32_time fp32_energy f2h_time f2h_energy fp16_perf_time fp16_perf_energy fp16_time fp16_energy
-Conv1 51.8808 97.2844 319.582 601.966 12.81 18.758 388.092 650.649 340.037 590.664
-### Example usage:
-python ../../soc_simulator/src/table_generator.py lenet_keras/ 10 ~/awesome_profiler/pp
-## SOC Simulator
-Instead of rerunning each benchmark, we simulate the benchmark runs on the gpu/promise depending on our inputted autotuner file.
-Source code: llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py
-### Input
-python driver.py <layer info> <tensor info> <configurations> <results file>
-* layer_info: contains info on benchmark's layers (should be in ~/soc_simulator)
-* tensors_info: the table file generated in the previous step
-* configurations: the file outputted from the autotuner
-* results file: the name of the results file
-### Output
-A copy of the inputted autotuner config file is created. For each configuration, the simulator computes the relative speedup and energy reduction compared to the baseline (fp32 baseline or fp16 baseline depending on the config). Then, we replace the autotuner's estimated speedup and energy reduction (first line of each configuration) with the real speedup and energy reduction. IMPORTANT NOTE: THE FIRST CONFIGURATION OF THE INPUTTED CONFIGURATIONS FILE MUST BE FOR THE FP32 BASELINE -- the soc simulator assumes that the first configuration refers to the fp32 baseline version so all speedups/energy reductions will be off if the first configuration is an actual approx config.  
-Example usage:
-python driver_new_config_fp16_repl.py  ~/soc_simulator/alexnet2_cifar10/alexnet2_layers.txt ~/sd_card/HPVMApprox/tensor_tables/alexnet2_results/alexnet2_tensors.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/autotuner_data/tuner_confs_batch220.txt ~/Gitlab/hpvm/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/soc_data/tuner_confs_batch220.txt
diff --git a/hpvm/projects/soc_simulator/lenet/lenet_layers.txt b/hpvm/projects/soc_simulator/lenet/lenet_layers.txt
deleted file mode 100644
index a252f19b682e7d564fb7025d6f5f8ae36fad7a25..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/lenet/lenet_layers.txt
+++ /dev/null
@@ -1,4 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/lenet/lenet_ops.txt b/hpvm/projects/soc_simulator/lenet/lenet_ops.txt
deleted file mode 100644
index c1bdab18ff10fb2a89534fce6d352d47722b2433..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/lenet/lenet_ops.txt
+++ /dev/null
@@ -1,18 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt b/hpvm/projects/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt
deleted file mode 100644
index ec202b5be38d401551b82746655d45847567307c..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_cifar10/mobilenet_layers.txt
+++ /dev/null
@@ -1,83 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_loss1.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/HA_loss1.txt
deleted file mode 100644
index 98ec2de71c8abc518c52ee3089721b0d96a06fdb..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_loss1.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_loss2.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/HA_loss2.txt
deleted file mode 100644
index 9fd37f52c3a00843b7ffaca9b3d8ecbc5f1139ec..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_loss2.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,6,8,8,8,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,8,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,5,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,6,8,8,8,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,8,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,5,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,5,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,6,8,8,8,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,7,8,8,9,8,8,7,8,8,9,8,8,7,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,8,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,6,8,8,9,8,8,5,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,7,8,8,8,9 9
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_results_loss1.out b/hpvm/projects/soc_simulator/mobilenet_shallow/HA_results_loss1.out
deleted file mode 100644
index ddc3ead56dc7ccf7afe7b065617716f129141aa9..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_results_loss1.out
+++ /dev/null
@@ -1,319 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_results_loss2.out b/hpvm/projects/soc_simulator/mobilenet_shallow/HA_results_loss2.out
deleted file mode 100644
index 25c65eae81141ec16294f4c133b32879d039277a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/HA_results_loss2.out
+++ /dev/null
@@ -1,374 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/HS_loss1.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/HS_loss1.txt
deleted file mode 100644
index 4d905b811accaa167503d0c189e3bb475c3ad9e6..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/HS_loss1.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-9,8,8,9,8,8,9,8,8,9,8,8,9,8,8,9,8,8,9,8,8,9,8,8,7,8,8,9,8,8,3,8,8,9,8,8,4,8,8,8,9 9
-9,8,8,9,8,8,9,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,7,8,8,9,8,8,3,8,8,9,8,8,4,8,8,8,9 9
-8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,6,8,8,9,8,8,6,8,8,9,8,8,3,8,8,8,8 8
-8,8,8,9,8,8,9,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,4,8,8,9,8,8,6,8,8,9,8,8,4,8,8,8,9 9
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/HS_loss2.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/HS_loss2.txt
deleted file mode 100644
index b8677344889c875db8f5485832c7c2dce383cff7..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/HS_loss2.txt
+++ /dev/null
@@ -1,76 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-9,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,5,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,3,8,8,8,8 8
-8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,9,8,8,9,8,8,4,8,8,9,8,8,6,8,8,8,8 8
-8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,3,8,8,9,8,8,7,8,8,9,8,8,4,8,8,9,8,8,7,8,8,8,8 8
-8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,3,8,8,9,8,8,7,8,8,9,8,8,6,8,8,9,8,8,5,8,8,8,8 8
-8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,6,8,8,9,8,8,9,8,8,9,8,8,5,8,8,9,8,8,6,8,8,8,8 8
-9,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,4,8,8,9,8,8,6,8,8,9,8,8,7,8,8,9,8,8,3,8,8,8,8 8
-8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,5,8,8,9,8,8,9,8,8,9,8,8,4,8,8,9,8,8,9,8,8,8,9 9
-8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,3,8,8,9,8,8,7,8,8,9,8,8,6,8,8,9,8,8,7,8,8,8,9 9
-8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,3,8,8,9,8,8,4,8,8,9,8,8,9,8,8,9,8,8,8,8,8,8,8 8
-8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,4,8,8,9,8,8,7,8,8,9,8,8,9,8,8,9,8,8,8,8,8,8,8 8
-9,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,3,8,8,9,8,8,7,8,8,9,8,8,6,8,8,9,8,8,7,8,8,8,9 9
-8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,3,8,8,9,8,8,7,8,8,9,8,8,9,8,8,9,8,8,7,8,8,8,9 9
-8,8,8,9,8,8,9,8,8,9,8,8,8,8,8,9,8,8,3,8,8,9,8,8,7,8,8,9,8,8,9,8,8,9,8,8,8,8,8,8,9 9
-9,8,8,9,8,8,7,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,6,8,8,9,8,8,8,8,8,9,8,8,3,8,8,8,9 9
-8,8,8,9,8,8,8,8,8,9,8,8,8,8,8,9,8,8,9,8,8,9,8,8,3,8,8,9,8,8,4,8,8,9,8,8,3,8,8,8,9 9
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/HS_results_loss1.out b/hpvm/projects/soc_simulator/mobilenet_shallow/HS_results_loss1.out
deleted file mode 100644
index 4b0b840cee7d4daa6bac3bf282a7ff61dcde2cb1..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/HS_results_loss1.out
+++ /dev/null
@@ -1,748 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HA_loss1.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HA_loss1.txt
deleted file mode 100644
index 6e5e15bc0bd2bc835293154ef3c5911662a51062..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HA_loss1.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HA_loss2.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HA_loss2.txt
deleted file mode 100644
index 36bb418742d308ee60d0d997a3a436ca8410870a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HA_loss2.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,6,8,8,8,9 9
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,5,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,6,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,5,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,6,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,5,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HS_loss1.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HS_loss1.txt
deleted file mode 100644
index 474eb6a9cc983edec9a18cc27a2ed3875d992bc7..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HS_loss1.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,3,8,8,8,8,8,4,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8,8,3,8,8,8,8,8,4,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,6,8,8,8,8,8,3,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,4,8,8,8,8,8,6,8,8,8,8,8,4,8,8,8,8 8
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HS_loss2.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HS_loss2.txt
deleted file mode 100644
index 574b3850329006a5bd6a1d0b4cb56e72521363ff..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/confs/HS_loss2.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,8,8,8,8,8,8,4,8,8,8,8,8,6,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,7,8,8,8,8,8,4,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,7,8,8,8,8,8,6,8,8,8,8,8,5,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,6,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,4,8,8,8,8,8,6,8,8,8,8,8,7,8,8,8,8,8,3,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,5,8,8,8,8,8,8,8,8,8,8,8,4,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,7,8,8,8,8,8,6,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,4,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,4,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,7,8,8,8,8,8,6,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,7,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,6,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8 8
-8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,3,8,8,8,8,8,4,8,8,8,8,8,3,8,8,8,8 8
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/info/mobilenet_shallow_layers.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/info/mobilenet_shallow_layers.txt
deleted file mode 100644
index 4ab1093cab28b2f3dfa284a4669c6a2885ff667d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/info/mobilenet_shallow_layers.txt
+++ /dev/null
@@ -1,41 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/info/mobilenet_shallow_tensors.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/info/mobilenet_shallow_tensors.txt
deleted file mode 100644
index 76a5b9f8f5db9a4e924d492d32bf213977b9fe84..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/info/mobilenet_shallow_tensors.txt
+++ /dev/null
@@ -1,83 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/mobilenet_shallow_layers.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/mobilenet_shallow_layers.txt
deleted file mode 100644
index ba85aa142542d34722b19b9a16314100ecdd62da..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/mobilenet_shallow_layers.txt
+++ /dev/null
@@ -1,41 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/mobilenet_shallow_tensors.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/mobilenet_shallow_tensors.txt
deleted file mode 100644
index 80f75053287bfc0751dccd29ae41a62138b59419..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/mobilenet_shallow_tensors.txt
+++ /dev/null
@@ -1,83 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/result_test_conf.out b/hpvm/projects/soc_simulator/mobilenet_shallow/result_test_conf.out
deleted file mode 100644
index 7907f1be58e22b0ac7dd4873471aac1b5205bf31..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/result_test_conf.out
+++ /dev/null
@@ -1,66 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HA_loss1_results.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/results/HA_loss1_results.txt
deleted file mode 100644
index d1d1c5e872107045ebcbe8e4f248ddb20d584f2a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HA_loss1_results.txt
+++ /dev/null
@@ -1,319 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HA_loss2_results.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/results/HA_loss2_results.txt
deleted file mode 100644
index 1127c71bb854e69baaaf74a21f82b84266251ef4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HA_loss2_results.txt
+++ /dev/null
@@ -1,385 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HS_loss1_results.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/results/HS_loss1_results.txt
deleted file mode 100644
index 85b21baa2464f296262fc489ac6162bbd7d6810a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HS_loss1_results.txt
+++ /dev/null
@@ -1,759 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HS_loss2_results.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/results/HS_loss2_results.txt
deleted file mode 100644
index 965e1ff061ce5ca3247947e16033a1aef2a56db0..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/results/HS_loss2_results.txt
+++ /dev/null
@@ -1,880 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/mobilenet_shallow/test_conf.txt b/hpvm/projects/soc_simulator/mobilenet_shallow/test_conf.txt
deleted file mode 100644
index 0142d9b480bac8f9f4fe69826d9afecf6ff22f30..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/mobilenet_shallow/test_conf.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9 9
-9,9,9,9,9,9,7,9,9,9,9,9,9,9,9,9,9,9,7,9,9,9,9,9,7,9,9,9,9,9,7,9,9,9,9,9,6,9,9,9,9 9
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_confs1.txt
deleted file mode 100644
index 8ea283e0a2031578fd063967089b6f6cba54a549..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_confs1.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-9 9,9 9 9,9 9,9 9
-8 8,8 8 8,8 8,8 8 
-8 8,8 8 8,8 8,8 8
-8 8,9 9 9,8 8,8 8 
-9 9,9 9 9,9 9,9 9
-9 9,9 9 9,9 9,8 8
-8 8,9 9 9,9 9,9 9
-8 8,8 8 8,8 8,9 9
-8 8,8 8 8,9 9,9 9
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_confs2.txt
deleted file mode 100644
index 834a35f35200d56d029fda983afeb6aac188c6ec..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_confs2.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-9 9,9 9 9,9 9,9 9
-7,8 8 8,8 8,8 8
-7,8 8 8,8 8,8 8 
-7,8 8 8,9 9,9 9
-7,9 9 9,9 9,9 9
-7,8 8 8,8 8,8 8
-7,8 8 8,8 8,8 8
-7,9 9 9,9 9,9 9
-7,9 9 9,9 9,9 9
-7,8 8 8,8 8,9 9 
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_fp16.csv b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_fp16.csv
deleted file mode 100644
index 2beaa67647d3eac8f2d103e02829ffe3687264ec..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_fp16.csv
+++ /dev/null
@@ -1,27 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_fp32.csv b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_fp32.csv
deleted file mode 100644
index e790a0bb8ea6443dcfca9f588a5f502aa4aff1f0..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_fp32.csv
+++ /dev/null
@@ -1,9 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_layers.txt b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_layers.txt
deleted file mode 100644
index 10199705ce8b6061351a9018c085549a2d330230..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_layers.txt
+++ /dev/null
@@ -1,4 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_ops.txt b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_ops.txt
deleted file mode 100644
index 0807a77c985bd73c6c538a1259b2ffc44eeda73a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_ops.txt
+++ /dev/null
@@ -1,13 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_confs1.txt
deleted file mode 100644
index 39ea99038c7ed190c73ff101597f7b46ca3ecf46..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_confs1.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-9 9,9 9 9,9 9,9 9
-8 8,8 8 8,8 8,9 9
-9 9,8 8 8,9 9,8 8
-9 9,8 8 8,8 8,8 8
-8 8,9 9 9,9 9,9 9
-8 8,8 8 8,9 9,9 9
-8 8,9 9 9,8 8,9 9
-9 9,8 8 8,9 9,9 9
-9 9,8 8 8,8 8,9 9
-9 9,8 8 8,8 8,8 8
-8 8,8 8 8,8 8,8 8
-8 8,9 9 9,9 9,9 9
-8 8,9 9 9,8 8,8 8
-9 9,9 9 9,8 8,9 9
-9 9,9 9 9,9 9,9 9
-9 9,9 9 9,9 9,8 8
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_confs2.txt
deleted file mode 100644
index 4a23019eb2d256d728fcd7c6d66a76d5a25c6206..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_confs2.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-9 9,9 9 9,9 9,9 9
-5,9 9 9,9 9,8 8
-5,8 8 8,9 9,9 9
-5,9 9 9,9 9,9 9
-5,8 8 8,9 9,8 8
-7,9 9 9,7,9 9
-7,8 8 8,7,8 8
-7,9 9 9,7,8 8
-7,8 8 8,7,9 9
-7,9 9 9,9 9,9 9
-7,8 8 8,9 9,8 8
-7,8 8 8,9 9,9 9
-9 9,9 9 9,9 9,9 9
-6,9 9 9,9 9,9 9
-7,9 9 9,8 8,9 9
-7,9 9 9,9 9,8 8
-8 8,8 8 8,9 9,8 8
-8 8,8 8 8,7,9 9
-7,8 8 8,8 8,9 9
-6,8 8 8,8 8,9 9
-8 8,8 8 8,9 9,9 9
-8 8,8 8 8,8 8,9 9
-8 8,9 9 9,8 8,9 9
-6,9 9 9,8 8,8 8
-8 8,9 9 9,9 9,9 9
-9 9,9 9 9,7,8 8
-7,8 8 8,8 8,8 8
-8 8,9 9 9,7,9 9
-6,8 8 8,9 9,9 9
-6,8 8 8,8 8,8 8
-8 8,9 9 9,9 9,8 8
-8 8,8 8 8,7,8 8
-9 9,8 8 8,9 9,8 8
-9 9,9 9 9,8 8,8 8
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_results1.csv b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_results1.csv
deleted file mode 100644
index 40a0db161373ec37f4f22800a62a9e942b25d337..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_results1.csv
+++ /dev/null
@@ -1,220 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_results2.csv b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_results2.csv
deleted file mode 100644
index 3762ea7ee951b3a472bd4148816e772b22b8dd67..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_promise_results2.csv
+++ /dev/null
@@ -1,418 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_results1.csv b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_results1.csv
deleted file mode 100644
index 1e8d11387a221361bf025c7ff0716709f5cd82b0..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_results1.csv
+++ /dev/null
@@ -1,143 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_results2.csv b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_results2.csv
deleted file mode 100644
index 870bb0106874567d837a62a781422af940cd1631..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_results2.csv
+++ /dev/null
@@ -1,154 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_tensors.txt b/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_tensors.txt
deleted file mode 100644
index 71dce21c4710dee6c0180112e14825189281bce6..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEMO/pipeline_GEMO_tensors.txt
+++ /dev/null
@@ -1,13 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_confs1.txt
deleted file mode 100644
index 3c5d575879b4b851938894d007277b2e65fcacc4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_confs1.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-9 9,9 9 9,9 9
-8 8,8 8 8,8 8
-8 8,8 8 8,9 9
-9 9,8 8 8,8 8
-8 8,9 9 9,9 9
-8 8,8 8 8,8 8
-9 9,9 9 9,8 8
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_confs2.txt
deleted file mode 100644
index 6ebcf19aec794b6fd5bcb2db1348a9bda7df8ee9..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_confs2.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-9 9,9 9 9,9 9
-8 8,8 8 8,9 9
-8 8,8 8 8,8 8
-9 9,9 9 9,9 9
-9 9,9 9 9,8 8
-8 8,9 9 9,8 8
-9 9,8 8 8,8 8
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_fp16.csv b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_fp16.csv
deleted file mode 100644
index a506b2186159108b62d813ff72c9f5e080517b35..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_fp16.csv
+++ /dev/null
@@ -1,21 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_fp32.csv b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_fp32.csv
deleted file mode 100644
index 4bd84e896cea5e41ca36173932788c4f289d729e..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_fp32.csv
+++ /dev/null
@@ -1,7 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_layers.txt b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_layers.txt
deleted file mode 100644
index b7d5c3cdcc00c398df81022401291a084d3f8ab8..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_layers.txt
+++ /dev/null
@@ -1,3 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_ops.txt b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_ops.txt
deleted file mode 100644
index 7c5a3a1c50a80a638bf569b41f5d3ef2add5f224..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_ops.txt
+++ /dev/null
@@ -1,10 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_confs1.txt
deleted file mode 100644
index 0d24c059750ac51d2e5a76925dfc5de8f4e3f059..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_confs1.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-9 9,9 9 9,9 9
-9 9,9 9 9,8 8
-9 9,8 8 8,9 9
-9 9,9 9 9,9 9
-9 9,8 8 8,8 8
-9 9,9 9 9,8 8
-8 8,8 8 8,9 9
-8 8,8 8 8,8 8
-8 8,9 9 9,8 8
-8 8,9 9 9,9 9
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_confs2.txt
deleted file mode 100644
index ba2839b706b3718fd6bcf065409164737a878b08..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_confs2.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-9 9,9 9 9,9 9
-8 8,9 9 9,8 8
-9 9,8 8 8,9 9
-9 9,9 9 9,9 9
-9 9,8 8 8,8 8
-8 8,9 9 9,9 9
-9 9,9 9 9,8 8
-8 8,8 8 8,9 9
-8 8,8 8 8,8 8
-8 8,9 9 9,8 8
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_results1.csv b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_results1.csv
deleted file mode 100644
index d74e5b5acea78b727e36599c2cecb813971fb82c..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_results1.csv
+++ /dev/null
@@ -1,154 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_results2.csv b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_results2.csv
deleted file mode 100644
index 2f278d260729ab866e55bc325c2b7db70ab87d4b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_promise_results2.csv
+++ /dev/null
@@ -1,154 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_results1.csv b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_results1.csv
deleted file mode 100644
index cdda603e1802f9a53b8b54138515783f87671e43..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_results1.csv
+++ /dev/null
@@ -1,121 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_results2.csv b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_results2.csv
deleted file mode 100644
index 3772196ed94a052d40cee85ef52b9566646aa63d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_results2.csv
+++ /dev/null
@@ -1,121 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_tensors.txt b/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_tensors.txt
deleted file mode 100644
index 1b0a2e35250a19f38f2c4eb020d2d79038f3b038..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEO/pipeline_GEO_tensors.txt
+++ /dev/null
@@ -1,10 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_confs1.txt
deleted file mode 100644
index da21e938c3aab5ec3f870ef5bdce0fb0e6457808..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_confs1.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-9 9,9 9 9,9 9,9 9
-8 8,8 8 8,8 8,7
-8 8,8 8 8,8 8,7
-8 8,8 8 8,9 9,7
-9 9,9 9 9,9 9,7
-8 8,8 8 8,8 8,7
-8 8,8 8 8,8 8,6
-8 8,8 8 8,9 9,6
-9 9,9 9 9,9 9,6
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_confs2.txt
deleted file mode 100644
index e004da0e84fe4a8f72bf37eccf5c2ce353263551..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_confs2.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-9 9,9 9 9,9 9,9 9
-8 8,8 8 8,8 8,6
-8 8,8 8 8,9 9,6
-9 9,9 9 9,9 9,6
-8 8,8 8 8,8 8,6
-8 8,8 8 8,8 8,5
-9 9,9 9 9,9 9,5
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_fp16.csv b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_fp16.csv
deleted file mode 100644
index 8f1731ca860ca20f23d2c879d4a19031ac96772d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_fp16.csv
+++ /dev/null
@@ -1,27 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_fp32.csv b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_fp32.csv
deleted file mode 100644
index 32c71387d272af1306da6391e268ab0d1e7690c9..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_fp32.csv
+++ /dev/null
@@ -1,9 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_layers.txt b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_layers.txt
deleted file mode 100644
index aa19e9d56f9b6f6d681a08bf5f51a8e6aecd36d8..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_layers.txt
+++ /dev/null
@@ -1,4 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_ops.txt b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_ops.txt
deleted file mode 100644
index 0807a77c985bd73c6c538a1259b2ffc44eeda73a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_ops.txt
+++ /dev/null
@@ -1,13 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_confs1.txt
deleted file mode 100644
index 680259f083a6ec52920bb2ce22dbd0f1a50200b3..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_confs1.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-9 9,9 9 9,9 9,9 9
-8 8,8 8 8,9 9,3
-8 8,8 8 8,8 8,3
-8 8,9 9 9,9 9,3
-8 8,8 8 8,8 8,3
-8 8,9 9 9,8 8,3
-9 9,9 9 9,8 8,3
-9 9,9 9 9,9 9,3
-9 9,8 8 8,8 8,3
-9 9,8 8 8,9 9,3
-8 8,9 9 9,9 9,8 8
-8 8,9 9 9,9 9,6
-8 8,9 9 9,8 8,6
-8 8,9 9 9,9 9,7
-8 8,9 9 9,9 9,4
-8 8,8 8 8,9 9,6
-8 8,9 9 9,8 8,5
-8 8,8 8 8,8 8,8 8
-8 8,9 9 9,8 8,8 8
-9 9,9 9 9,8 8,6
-8 8,8 8 8,8 8,5
-8 8,8 8 8,8 8,4
-9 9,8 8 8,8 8,5
-8 8,8 8 8,8 8,6
-8 8,8 8 8,9 9,5
-8 8,9 9 9,9 9,9 9
-8 8,9 9 9,8 8,9 9
-8 8,8 8 8,8 8,7
-8 8,8 8 8,9 9,4
-8 8,8 8 8,8 8,9 9
-9 9,8 8 8,8 8,6
-9 9,9 9 9,8 8,5
-8 8,9 9 9,8 8,7
-9 9,9 9 9,9 9,9 9
-9 9,8 8 8,8 8,7
-8 8,8 8 8,9 9,8 8
-8 8,9 9 9,9 9,8 8
-8 8,9 9 9,8 8,4
-8 8,8 8 8,9 9,7
-9 9,9 9 9,9 9,6
-8 8,8 8 8,9 9,9 9
-8 8,9 9 9,9 9,5
-9 9,8 8 8,9 9,9 9
-9 9,8 8 8,9 9,7
-9 9,8 8 8,8 8,4
-9 9,9 9 9,8 8,4
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_confs2.txt
deleted file mode 100644
index 1e0ff4012e24dd4b11fa809d00f55bf49a2d0656..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_confs2.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-9 9,9 9 9,9 9,9 9
-8 8,8 8 8,8 8,5
-9 9,9 9 9,9 9,9 9
-9 9,8 8 8,8 8,5
-8 8,8 8 8,8 8,4
-8 8,8 8 8,8 8,7
-8 8,8 8 8,8 8,6
-8 8,9 9 9,8 8,9 9
-8 8,9 9 9,8 8,8 8
-9 9,8 8 8,8 8,8 8
-8 8,8 8 8,9 9,4
-8 8,9 9 9,8 8,4
-9 9,9 9 9,8 8,4
-8 8,9 9 9,9 9,4
-8 8,8 8 8,9 9,9 9
-8 8,8 8 8,9 9,2
-9 9,8 8 8,9 9,9 9
-8 8,8 8 8,8 8,9 9
-8 8,8 8 8,8 8,5
-8 8,8 8 8,9 9,5
-8 8,8 8 8,8 8,8 8
-8 8,8 8 8,8 8,3
-9 9,8 8 8,8 8,2
-8 8,9 9 9,9 9,6
-9 9,8 8 8,8 8,4
-8 8,9 9 9,8 8,3
-9 9,8 8 8,9 9,7
-9 9,8 8 8,9 9,4
-9 9,9 9 9,8 8,8 8
-8 8,8 8 8,8 8,2
-8 8,8 8 8,9 9,6
-8 8,8 8 8,9 9,3
-9 9,9 9 9,9 9,5
-9 9,9 9 9,9 9,2
-8 8,8 8 8,9 9,7
-8 8,9 9 9,9 9,3
-8 8,9 9 9,9 9,2
-9 9,9 9 9,9 9,7
-8 8,9 9 9,8 8,2
-8 8,9 9 9,9 9,5
-9 9,8 8 8,9 9,6
-8 8,9 9 9,8 8,5
-8 8,9 9 9,8 8,6
-8 8,9 9 9,8 8,7
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_results1.csv b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_results1.csv
deleted file mode 100644
index 282361a378d03d391107aeedc3c70df4d54b21fc..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_results1.csv
+++ /dev/null
@@ -1,550 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_results2.csv b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_results2.csv
deleted file mode 100644
index 76a1c5a4797a1520feee8c0c017c483778f867c7..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_promise_results2.csv
+++ /dev/null
@@ -1,528 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_results1.csv b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_results1.csv
deleted file mode 100644
index 8a52300d986765ae762a169faa4b5ebc99509663..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_results1.csv
+++ /dev/null
@@ -1,143 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_results2.csv b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_results2.csv
deleted file mode 100644
index 070c3ca75d9394b53a9d5d2779218edec113762e..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_results2.csv
+++ /dev/null
@@ -1,121 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_tensors.txt b/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_tensors.txt
deleted file mode 100644
index 3fa0edb09a67a17457cfd7a9ed1d5557586b15cb..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GEOM/pipeline_GEOM_tensors.txt
+++ /dev/null
@@ -1,13 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_confs1.txt
deleted file mode 100644
index 24c151f89f008f3d34a44852bf82df6fa6943607..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_confs1.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-9 9,9 9,9 9
-8 8,8 8,7
-8 8,9 9,7
-8 8,9 9,7
-8 8,8 8,7
-9 9,8 8,7
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_confs2.txt
deleted file mode 100644
index 66478e91d8eccacfd04be6fe0b4a9051b95b8cd3..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_confs2.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-9 9,9 9,9 9
-7,8 8,6
-7,9 9,7
-7,8 8,7
-7,9 9,7
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_fp16.csv b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_fp16.csv
deleted file mode 100644
index 119769d485e8f3635444942e1577dfb1f30ffb63..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_fp16.csv
+++ /dev/null
@@ -1,18 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_fp32.csv b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_fp32.csv
deleted file mode 100644
index 3b793e99df3c860b148a514488630229c6122385..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_fp32.csv
+++ /dev/null
@@ -1,6 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_layers.txt b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_layers.txt
deleted file mode 100644
index deefedb4efd481a87ecb2d58d8e5d503ff1daa7e..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_layers.txt
+++ /dev/null
@@ -1,3 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_ops.txt b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_ops.txt
deleted file mode 100644
index 6dbd74c42edb6ae286efef7aaad8239709c29748..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_ops.txt
+++ /dev/null
@@ -1,9 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_confs1.txt
deleted file mode 100644
index e061e9d839507ea88117b1ebacf5a705d477171f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_confs1.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-9 9,9 9,9 9
-8 8,9 9,9 9
-8 8,8 8,8 8
-8 8,8 8,9 9
-8 8,9 9,8 8
-8 8,8 8,7
-9 9,8 8,8 8
-9 9,9 9,8 8
-9 9,8 8,9 9
-8 8,9 9,9 9
-8 8,9 9,7
-9 9,9 9,9 9
-9 9,8 8,7
-9 9,9 9,7
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_confs2.txt
deleted file mode 100644
index 4f35063f25b6b4b6f63252258c2545953e6b2a75..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_confs2.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-9 9,9 9,9 9
-7,9 9,3
-5,9 9,4
-4,9 9,5
-4,8 8,6
-4,9 9,6
-4,9 9,6
-5,8 8,5
-6,8 8,4
-6,9 9,4
-6,9 9,4
-5,9 9,5
-5,9 9,5
-4,9 9,7
-7,8 8,4
-5,8 8,6
-7,8 8,4
-5,9 9,6
-7,9 9,4
-6,8 8,5
-7,9 9,4
-6,9 9,5
-6,9 9,5
-5,8 8,7
-5,8 8,7
-5,9 9,7
-5,9 9,7
-7,8 8,5
-6,9 9,6
-6,9 9,6
-6,8 8,6
-9 9,8 8,3
-8 8,9 9,4
-7,9 9,9 9
-5,8 8,8 8
-3,9 9,8 8
-7,9 9,7
-7,8 8,7
-9 9,9 9,5
-8 8,8 8,4
-8 8,9 9,2
-8 8,9 9,8 8
-9 9,8 8,4
-8 8,9 9,7
-4,9 9,9 9
-8 8,9 9,6
-8 8,9 9,5
-8 8,8 8,3
-8 8,8 8,8 8
-8 8,8 8,5
-7,8 8,8 8
-9 9,9 9,2
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_results1.csv b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_results1.csv
deleted file mode 100644
index 408ee686dfca8f052cbf009b197115c6e2755081..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_results1.csv
+++ /dev/null
@@ -1,198 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_results2.csv b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_results2.csv
deleted file mode 100644
index 4bbd96279b68828be34786e140f3ba91321ce530..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_promise_results2.csv
+++ /dev/null
@@ -1,616 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_results1.csv b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_results1.csv
deleted file mode 100644
index 8be42209fe3813932175240022e7550b115fb013..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_results1.csv
+++ /dev/null
@@ -1,110 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_results2.csv b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_results2.csv
deleted file mode 100644
index 3962cd8a7becc7eb4f032529b362789713a3e8d4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_results2.csv
+++ /dev/null
@@ -1,99 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_tensors.txt b/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_tensors.txt
deleted file mode 100644
index 9f9f7622734cbfa5fe8a93b9ba57d47c4a474881..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSM/pipeline_GSM_tensors.txt
+++ /dev/null
@@ -1,9 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_confs1.txt
deleted file mode 100644
index 8e8d9f3412cb2b8697b1255f266a911b0de7fb5b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_confs1.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-9 9,9 9,9 9,9 9 9
-8 8,8 8,8 8,8 8 8
-8 8,8 8,8 8,8 8 8
-8 8,8 8,8 8,8 8 8
-8 8,9 9,8 8,8 8 8
-8 8,8 8,8 8,8 8 8
-8 8,8 8,8 8,8 8 8
-8 8,8 8,8 8,8 8 8
-9 9,9 9,9 9,9 9 9
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_confs2.txt
deleted file mode 100644
index f959aacca24ebf7e053b7942c15670ba9f4a6250..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_confs2.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-9 9,9 9,9 9,9 9 9
-7,8 8,8 8,8 8 8
-7,8 8,8 8,8 8 8
-7,8 8,8 8,8 8 8
-7,8 8,8 8,8 8 8
-7,9 9,8 8,8 8 8
-7,8 8,8 8,8 8 8
-7,8 8,8 8,8 8 8
-7,9 9,9 9,9 9 9
-7,9 9,9 9,8 8 8
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_fp16.csv b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_fp16.csv
deleted file mode 100644
index adf98595058033c6e719a7ab8ea4e0e0863b70fa..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_fp16.csv
+++ /dev/null
@@ -1,27 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_fp32.csv b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_fp32.csv
deleted file mode 100644
index 07f6d29eb8878988dd7385f39fc7dfc58c2ca13b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_fp32.csv
+++ /dev/null
@@ -1,9 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_layers.txt b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_layers.txt
deleted file mode 100644
index cb7b918f537fcbb77d6c71844162161ca21ca01a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_layers.txt
+++ /dev/null
@@ -1,4 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_ops.txt b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_ops.txt
deleted file mode 100644
index d80034c986694bf89a332c0a382f4f3281728537..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_ops.txt
+++ /dev/null
@@ -1,13 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_confs1.txt b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_confs1.txt
deleted file mode 100644
index 8e7edaca0182fd1a9b2f6e510e33db9c51b923f5..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_confs1.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-9 9,9 9,9 9,9 9 9
-8 8,8 8,9 9,9 9 9
-8 8,9 9,8 8,8 8 8
-8 8,9 9,9 9,8 8 8
-8 8,8 8,8 8,8 8 8
-8 8,8 8,9 9,9 9 9
-8 8,9 9,8 8,9 9 9
-8 8,9 9,9 9,9 9 9
-9 9,8 8,8 8,8 8 8
-8 8,8 8,8 8,9 9 9
-9 9,9 9,8 8,8 8 8
-9 9,9 9,8 8,9 9 9
-9 9,9 9,9 9,9 9 9
-8 8,9 9,9 9,9 9 9
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_confs2.txt b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_confs2.txt
deleted file mode 100644
index 4c0531b59728b06aa987b947d9ed062aa5f80b72..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_confs2.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-9 9,9 9,9 9,9 9 9
-7,9 9,6,9 9 9
-7,8 8,6,9 9 9
-7,9 9,5,8 8 8
-5,9 9,7,9 9 9
-7,8 8,5,8 8 8
-5,9 9,7,8 8 8
-5,8 8,7,8 8 8
-6,9 9,6,8 8 8
-6,9 9,6,9 9 9
-6,8 8,6,8 8 8
-6,8 8,6,9 9 9
-9 9,8 8,4,8 8 8
-8 8,8 8,4,9 9 9
-9 9,8 8,4,9 9 9
-5,9 9,9 9,8 8 8
-5,9 9,8 8,8 8 8
-5,8 8,8 8,8 8 8
-5,8 8,8 8,9 9 9
-5,9 9,8 8,9 9 9
-5,9 9,9 9,8 8 8
-5,8 8,9 9,9 9 9
-5,8 8,9 9,8 8 8
-5,8 8,8 8,9 9 9
-5,9 9,9 9,9 9 9
-7,9 9,6,9 9 9
-8 8,9 9,6,8 8 8
-7,8 8,6,8 8 8
-7,8 8,8 8,9 9 9
-6,8 8,8 8,8 8 8
-7,9 9,6,8 8 8
-9 9,8 8,6,8 8 8
-9 9,9 9,6,8 8 8
-7,9 9,7,9 9 9
-6,8 8,8 8,9 9 9
-8 8,8 8,8 8,8 8 8
-8 8,8 8,8 8,9 9 9
-7,8 8,8 8,8 8 8
-8 8,8 8,6,8 8 8
-7,8 8,9 9,9 9 9
-7,8 8,9 9,8 8 8
-7,8 8,6,9 9 9
-6,8 8,9 9,9 9 9
-8 8,8 8,5,9 9 9
-8 8,8 8,5,8 8 8
-8 8,9 9,6,8 8 8
-7,8 8,7,8 8 8
-7,9 9,8 8,9 9 9
-6,9 9,8 8,9 9 9
-8 8,9 9,8 8,9 9 9
-7,9 9,7,8 8 8
-6,8 8,7,9 9 9
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_results1.csv b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_results1.csv
deleted file mode 100644
index 62918ed77910a43edb399dfc9a5e48e8fc956b37..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_results1.csv
+++ /dev/null
@@ -1,198 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_results2.csv b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_results2.csv
deleted file mode 100644
index 2923a28122195bfcd79bed384207073372a8eab9..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_promise_results2.csv
+++ /dev/null
@@ -1,616 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_results1.csv b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_results1.csv
deleted file mode 100644
index b45df9c4a9dbeb4c276364c549f61ea91945386f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_results1.csv
+++ /dev/null
@@ -1,143 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_results2.csv b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_results2.csv
deleted file mode 100644
index 6d54eb0f35f60d2549ef8b12a44a85e28ec638e2..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_results2.csv
+++ /dev/null
@@ -1,154 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_tensors.txt b/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_tensors.txt
deleted file mode 100644
index 38f513ebc98833ddfdd95ef72d04b05aa60ed8a4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/pipeline_GSME/pipeline_GSME_tensors.txt
+++ /dev/null
@@ -1,13 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_confs1.txt b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_confs1.txt
deleted file mode 100644
index 7361a062e7d4cc86d5c955dd4c7662d2e88175a6..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_confs1.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-9 9 9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9,9 9
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_confs2.txt b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_confs2.txt
deleted file mode 100644
index 3d0ad476552644463569166477024d81f4fc4dd0..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_confs2.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-9 9 9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9,9 9
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,7,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,7,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,7,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,7,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8 8,8,8,7,7,8,8,8 8 8,8 8,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,8 8 8,8 8,8,8,8,8 8
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_fp16.csv b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_fp16.csv
deleted file mode 100644
index 8aa9f4f5f1ab4418335e9f3eb0a9ed81035e44c2..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_fp16.csv
+++ /dev/null
@@ -1,220 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_fp32.csv b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_fp32.csv
deleted file mode 100644
index 1c825c847259023cf9ab4d78b89dda08710ec175..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_fp32.csv
+++ /dev/null
@@ -1,74 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_layers.txt b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_layers.txt
deleted file mode 100644
index 6837e87207b24eec8c1913275aa742824a67f74f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_layers.txt
+++ /dev/null
@@ -1,41 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_ops.txt b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_ops.txt
deleted file mode 100644
index 86795a48547725b624c69f8768a4f7e53103d623..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_ops.txt
+++ /dev/null
@@ -1,114 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_confs1.txt b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_confs1.txt
deleted file mode 100644
index 0f382344773bc67d7a63545f5441e02e17c8025d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_confs1.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-9 9 9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9,9 9
-9 9 9,9 9 9,9 9,8,8,9 9 9,7,8,8,9 9 9,7,8,8,7,8 8,9 9,8,8,8 8 8,8 8,8,8,9 9 9,8 8,8,8,7,9 9,8 8,8,8,8 8 8,8 8,8,8,9 9 9,8 8,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,8 8 8,7,8,8,7,9 9,8,8,9 9 9,7,9 9,8,8,7,8 8,8,8,9 9 9,9 9,8,8,9 9 9,7,8 8,8,8,9 9 9,9 9,8,8,9 9 9,8 8,8,8,8,7
-8 8 8,8 8 8,7,8,8,8 8 8,8 8,8,8,9 9 9,7,8,8,9 9 9,9 9,9 9,8,8,8 8 8,8 8,8,8,9 9 9,9 9,8,8,7,7,9 9,8,8,9 9 9,8 8,8,8,7,9 9,8,8,8,9 9
-8 8 8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8 8 8,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,8 8,8,8,8 8 8,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,9 9,8,8,8 8 8,7,8,8,8,9 9
-9 9 9,8 8 8,7,8,8,7,8 8,8,8,8 8 8,9 9,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,9 9 9,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,9 9,8,8,9 9 9,9 9,8,8,8,7
-8 8 8,8 8 8,7,8,8,7,8 8,8,8,7,8 8,8,8,9 9 9,9 9,8 8,8,8,9 9 9,8 8,8,8,8 8 8,7,8,8,8 8 8,9 9,7,8,8,9 9 9,9 9,8,8,9 9 9,9 9,8,8,8,8 8
-8 8 8,8 8 8,7,8,8,9 9 9,7,8,8,7,8 8,8,8,9 9 9,7,7,8,8,7,8 8,8,8,9 9 9,8 8,8,8,9 9 9,9 9,8 8,8,8,9 9 9,9 9,8,8,9 9 9,8 8,8,8,8,7
-8 8 8,8 8 8,7,8,8,9 9 9,8 8,8,8,8 8 8,9 9,8,8,9 9 9,8 8,7,8,8,9 9 9,8 8,8,8,9 9 9,9 9,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,9 9 9,7,8,8,8 8 8,8 8,8,8,8 8 8,9 9,8,8,9 9 9,8 8,8 8,8,8,8 8 8,9 9,8,8,7,9 9,8,8,9 9 9,8 8,7,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-9 9 9,8 8 8,9 9,8,8,7,7,8,8,7,9 9,8,8,8 8 8,7,9 9,8,8,8 8 8,9 9,8,8,8 8 8,9 9,8,8,8 8 8,9 9,7,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,8 8 8,7,8,8,7,9 9,8,8,9 9 9,7,9 9,8,8,7,9 9,8,8,9 9 9,8 8,8,8,8 8 8,9 9,8 8,8,8,9 9 9,9 9,8,8,9 9 9,8 8,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,7,7,8,8,7,7,8,8,8 8 8,9 9,8 8,8,8,9 9 9,9 9,8,8,8 8 8,8 8,8,8,9 9 9,9 9,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,7
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_confs2.txt b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_confs2.txt
deleted file mode 100644
index 25db61b421ddaf79a95c0b645fbcd381a7f07dc0..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_confs2.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-9 9 9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9 9,9,9,9 9 9,9 9,9,9,9 9 9,9 9,9,9,9,9 9
-9 9 9,9 9 9,9 9,8,8,9 9 9,7,8,8,9 9 9,7,8,8,7,8 8,9 9,8,8,8 8 8,8 8,8,8,9 9 9,8 8,8,8,7,9 9,8 8,8,8,8 8 8,8 8,8,8,9 9 9,8 8,8,8,8,9 9
-9 9 9,9 9 9,7,8,8,9 9 9,7,8,8,9 9 9,9 9,8,8,7,8 8,9 9,8,8,7,7,8,8,9 9 9,8 8,8,8,9 9 9,9 9,8 8,8,8,8 8 8,9 9,8,8,8 8 8,8 8,8,8,8,7
-8 8 8,9 9 9,7,8,8,7,9 9,8,8,7,7,8,8,9 9 9,9 9,8 8,8,8,9 9 9,7,8,8,8 8 8,7,8,8,8 8 8,7,9 9,8,8,9 9 9,9 9,8,8,9 9 9,9 9,8,8,8,8 8
-8 8 8,8 8 8,8 8,8,8,7,9 9,8,8,7,7,8,8,7,9 9,8 8,8,8,9 9 9,8 8,8,8,8 8 8,8 8,8,8,8 8 8,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,8 8,8,8,7,7,8,8,7,7,8,8,8 8 8,9 9,9 9,8,8,9 9 9,8 8,8,8,9 9 9,8 8,8,8,7,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,8 8 8,7,8,8,7,9 9,8,8,9 9 9,7,9 9,8,8,7,8 8,8,8,9 9 9,9 9,8,8,9 9 9,7,8 8,8,8,9 9 9,9 9,8,8,9 9 9,8 8,8,8,8,7
-8 8 8,8 8 8,8 8,8,8,7,7,8,8,7,7,8,8,8 8 8,9 9,8 8,8,8,9 9 9,7,8,8,9 9 9,8 8,8,8,8 8 8,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,7,7,8,8,7,7,8,8,8 8 8,9 9,8 8,8,8,9 9 9,7,8,8,8 8 8,8 8,8,8,7,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,8 8
-8 8 8,8 8 8,7,8,8,8 8 8,8 8,8,8,9 9 9,7,8,8,9 9 9,9 9,9 9,8,8,8 8 8,8 8,8,8,9 9 9,9 9,8,8,7,7,9 9,8,8,9 9 9,8 8,8,8,7,9 9,8,8,8,9 9
-8 8 8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8 8 8,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,8 8,8,8,8 8 8,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,9 9,8,8,8 8 8,7,8,8,8,9 9
-9 9 9,9 9 9,7,8,8,7,8 8,8,8,9 9 9,9 9,8,8,8 8 8,8 8,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,8 8,8,8,9 9 9,7,8,8,8,7
-8 8 8,9 9 9,8 8,8,8,7,9 9,8,8,8 8 8,9 9,8,8,9 9 9,7,7,8,8,8 8 8,7,8,8,9 9 9,8 8,8,8,9 9 9,7,9 9,8,8,8 8 8,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,9 9 9,7,8,8,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,7,7,8,8,9 9 9,7,8,8,9 9 9,8 8,8,8,7,8 8,9 9,8,8,8 8 8,9 9,8,8,8 8 8,8 8,8,8,8,8 8
-9 9 9,8 8 8,7,8,8,7,8 8,8,8,8 8 8,9 9,8,8,7,8 8,8 8,8,8,8 8 8,8 8,8,8,9 9 9,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,9 9,8,8,9 9 9,9 9,8,8,8,7
-8 8 8,8 8 8,7,8,8,7,8 8,8,8,7,8 8,8,8,9 9 9,9 9,8 8,8,8,9 9 9,8 8,8,8,8 8 8,7,8,8,8 8 8,9 9,7,8,8,9 9 9,9 9,8,8,9 9 9,9 9,8,8,8,8 8
-8 8 8,9 9 9,8 8,8,8,7,7,8,8,7,7,8,8,8 8 8,9 9,8 8,8,8,9 9 9,7,8,8,8 8 8,8 8,8,8,9 9 9,8 8,7,8,8,8 8 8,9 9,8,8,8 8 8,8 8,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,9 9 9,7,8,8,7,8 8,8,8,9 9 9,7,7,8,8,7,8 8,8,8,9 9 9,8 8,8,8,9 9 9,9 9,8 8,8,8,9 9 9,9 9,8,8,9 9 9,8 8,8,8,8,7
-8 8 8,8 8 8,7,8,8,9 9 9,7,8,8,9 9 9,7,8,8,8 8 8,9 9,8 8,8,8,9 9 9,7,8,8,8 8 8,8 8,8,8,7,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,9 9 9,8 8,8,8,8 8 8,9 9,8,8,9 9 9,8 8,7,8,8,9 9 9,8 8,8,8,9 9 9,9 9,8,8,8 8 8,8 8,7,8,8,8 8 8,7,8,8,8 8 8,9 9,8,8,8,9 9
-9 9 9,9 9 9,7,8,8,8 8 8,7,8,8,8 8 8,9 9,8,8,8 8 8,8 8,9 9,8,8,8 8 8,7,8,8,9 9 9,9 9,8,8,7,8 8,8 8,8,8,8 8 8,9 9,8,8,8 8 8,9 9,8,8,8,7
-9 9 9,9 9 9,7,8,8,8 8 8,9 9,8,8,9 9 9,9 9,8,8,9 9 9,9 9,8 8,8,8,8 8 8,7,8,8,8 8 8,8 8,8,8,7,8 8,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,7,7,8,8,7,7,8,8,7,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,8 8,8,8,8 8 8,8 8,9 9,8,8,9 9 9,9 9,8,8,9 9 9,9 9,8,8,8,7
-8 8 8,8 8 8,8 8,8,8,9 9 9,7,8,8,7,7,8,8,8 8 8,9 9,8 8,8,8,9 9 9,7,8,8,8 8 8,8 8,8,8,8 8 8,8 8,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,7,7,8,8,8 8 8,7,8,8,9 9 9,9 9,9 9,8,8,9 9 9,7,8,8,8 8 8,9 9,8,8,8 8 8,7,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,9 9 9,7,8,8,8 8 8,8 8,8,8,8 8 8,9 9,8,8,9 9 9,8 8,8 8,8,8,8 8 8,9 9,8,8,7,9 9,8,8,9 9 9,8 8,7,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-9 9 9,8 8 8,9 9,8,8,7,7,8,8,7,9 9,8,8,8 8 8,7,9 9,8,8,8 8 8,9 9,8,8,8 8 8,9 9,8,8,8 8 8,9 9,7,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-9 9 9,8 8 8,7,8,8,7,7,8,8,7,7,8,8,7,7,8 8,8,8,9 9 9,9 9,8,8,9 9 9,8 8,8,8,9 9 9,8 8,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,8 8 8,7,8,8,7,9 9,8,8,9 9 9,7,9 9,8,8,7,9 9,8,8,9 9 9,8 8,8,8,8 8 8,9 9,8 8,8,8,9 9 9,9 9,8,8,9 9 9,8 8,8,8,8,9 9
-8 8 8,8 8 8,8 8,8,8,9 9 9,7,8,8,9 9 9,7,8,8,9 9 9,8 8,7,8,8,7,8 8,8,8,8 8 8,8 8,8,8,8 8 8,9 9,8 8,8,8,8 8 8,9 9,8,8,9 9 9,9 9,8,8,8,9 9
-8 8 8,8 8 8,7,8,8,7,7,8,8,7,7,8,8,8 8 8,9 9,8 8,8,8,9 9 9,9 9,8,8,8 8 8,8 8,8,8,9 9 9,9 9,9 9,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,7
-9 9 9,8 8 8,9 9,8,8,8 8 8,7,8,8,8 8 8,7,8,8,9 9 9,7,7,8,8,8 8 8,9 9,8,8,8 8 8,9 9,8,8,8 8 8,9 9,7,8,8,9 9 9,9 9,8,8,8 8 8,9 9,8,8,8,9 9
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_results1.csv b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_results1.csv
deleted file mode 100644
index 43efb1d48e791b041d332cb015d671f111dae293..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_results1.csv
+++ /dev/null
@@ -1,187 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_results2.csv b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_results2.csv
deleted file mode 100644
index 79f9be2d437ef747e412badb8800b6261fbc6f41..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_promise_results2.csv
+++ /dev/null
@@ -1,396 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_results1.csv b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_results1.csv
deleted file mode 100644
index 0631464ab20d0003a9923616f97d3d2fde356614..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_results1.csv
+++ /dev/null
@@ -1,418 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_results2.csv b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_results2.csv
deleted file mode 100644
index abf09ee84725703f377f9ee103c0c119469018aa..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_results2.csv
+++ /dev/null
@@ -1,550 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_tensors.txt b/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_tensors.txt
deleted file mode 100644
index a68f03802ad3eb3236f0eaf1a2dbe4eb524d712a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet18_cifar10/resnet18_tensors.txt
+++ /dev/null
@@ -1,114 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet50_imagenet/construct_ops.py b/hpvm/projects/soc_simulator/resnet50_imagenet/construct_ops.py
deleted file mode 100644
index 3b655f2f5fb7ccb3eb4ac8db2e105cb74d71f986..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet50_imagenet/construct_ops.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import sys
-op_map = {}
-op_map["conv"] = "Conv"
-op_map["add"] = "Add"
-op_map["dense"] = "Mul"
-op_map["pool"] = "Pool"
-op_map["relu"] = "Relu"
-op_map["activation"] = "Relu"
-op_map["tanh"] = "Tanh"
-op_map["batchnorm"] = "NML"
-unique_op_map = {}
-def getLayerStr(layer_toks):
-    layer_str = ""
-    for tok in layer_toks:
-        op_id = 1
-        if tok not in unique_op_map:
-          op_id = 1
-          unique_op_map[tok] = 1
-        else:
-          op_id = unique_op_map[tok]
-          op_id += 1
-          unique_op_map[tok] = op_id
-        layer_str += op_map[tok] + str(op_id) + "\n"
-    return layer_str
-if __name__ == "__main__":
-  f_path = sys.argv[1]
-  out_path = sys.argv[2]
-  f = open(f_path)
-  f2 = open(out_path, "w+")
-  nml_id = 1
-  conv_id = 1
-  fc_id = 1
-  for x in f:
-      toks = x.split()
-      layer_len = len(toks)
-      if layer_len == 1 and "conv" not in toks and "dense" not in toks:
-          f2.write("#NML" + str(nml_id) + ",1\n")
-          nml_id += 1
-      if "conv" in toks:
-          f2.write("#Conv" + str(conv_id) + "," + str(layer_len) + "\n")
-          layer_str = getLayerStr(toks)
-          f2.write(layer_str)
-          conv_id += 1
-      if "dense" in toks:
-          f2.write("#FC" + str(fc_id) + "," + str(layer_len) + "\n")
-          layer_str = getLayerStr(toks)
-          f2.write(layer_str)
-          fc_id += 1
-  f2.close()          
diff --git a/hpvm/projects/soc_simulator/resnet50_imagenet/layer_composition.txt b/hpvm/projects/soc_simulator/resnet50_imagenet/layer_composition.txt
deleted file mode 100644
index f0b6ebedc9beccfc639b70433e30b172e2d44fea..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet50_imagenet/layer_composition.txt
+++ /dev/null
@@ -1,172 +0,0 @@
-conv  add  activation  pool  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-conv  add  
-dense  add  
diff --git a/hpvm/projects/soc_simulator/resnet50_imagenet/layers.txt b/hpvm/projects/soc_simulator/resnet50_imagenet/layers.txt
deleted file mode 100644
index fae2dc1e91044d2ab3128274a11646cd7e8c8697..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet50_imagenet/layers.txt
+++ /dev/null
@@ -1,172 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet50_imagenet/replace_nml.py b/hpvm/projects/soc_simulator/resnet50_imagenet/replace_nml.py
deleted file mode 100644
index 92e58f629b0419d3f2f9d3539507ae4baaf10026..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet50_imagenet/replace_nml.py
+++ /dev/null
@@ -1,27 +0,0 @@
-if __name__ == "__main__":
-  f = open("layers.txt")
-  f2 = open("resnet50_imagenet_layers.txt", "w+")
-  nml_id = 1
-  for x in f:
-      if "#" in x:
-          f2.write("NML" + str(nml_id) + "\n")
-          nml_id += 1
-      elif "Conv" in x:
-          x = x.replace("\n", "")
-          x = x + ",1,1\n"
-          f2.write(x)
-      else:
-          f2.write(x)
-  f2.close()
diff --git a/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_layers.txt b/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_layers.txt
deleted file mode 100644
index 066c41cf6e62e57556b0cbace0eb61bca2accbc3..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_layers.txt
+++ /dev/null
@@ -1,172 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_layers.txt2 b/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_layers.txt2
deleted file mode 100644
index 885d81b8901089e995074bb6ba3f7cd3988e9011..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_layers.txt2
+++ /dev/null
@@ -1,172 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt b/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt
deleted file mode 100644
index da42b2ad85397c72f2385724f4af52f3da6c0c78..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt
+++ /dev/null
@@ -1,282 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/scripts/construct_ops.py b/hpvm/projects/soc_simulator/scripts/construct_ops.py
deleted file mode 100644
index 3bcb2af9c345b19d86acaf92e3771c60370b4678..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/scripts/construct_ops.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import sys
-op_map = {}
-op_map["conv"] = "Conv"
-op_map["add"] = "Add"
-op_map["dense"] = "Mul"
-op_map["pool"] = "Pool"
-op_map["relu"] = "Relu"
-op_map["activation"] = "Relu"
-op_map["tanh"] = "Tanh"
-op_map["batchnorm"] = "BatchNorm"
-unique_op_map = {}
-def getLayerStr(layer_toks):
-    layer_str = ""
-    for tok in layer_toks:
-        op_id = 1
-        if tok not in unique_op_map:
-          op_id = 1
-          unique_op_map[tok] = 1
-        else:
-          op_id = unique_op_map[tok]
-          op_id += 1
-          unique_op_map[tok] = op_id
-        layer_str += op_map[tok] + str(op_id) + "\n"
-    return layer_str
-if __name__ == "__main__":
-  f_path = sys.argv[1]
-  out_path = sys.argv[2]
-  f = open(f_path)
-  f2 = open(out_path, "w+")
-  nml_id = 1
-  conv_id = 1
-  fc_id = 1
-  batchnorm_id = 1
-  for x in f:
-      toks = x.split()
-      layer_len = len(toks)
-      #if "batchnorm" in toks:
-      #    f2.write("BatchNorm" + str(batchnorm_id) + "\n")
-      #    batchnorm_id += 1
-      #    continue
-      if layer_len == 1 and "conv" not in toks and "dense" not in toks:
-          f2.write("#NML" + str(nml_id) + ",1\n")
-          nml_id += 1
-          layer_str = getLayerStr(toks)
-          f2.write(layer_str)
-      if "conv" in toks:
-          f2.write("#Conv" + str(conv_id) + "," + str(layer_len) + "\n")
-          layer_str = getLayerStr(toks)
-          f2.write(layer_str)
-          conv_id += 1
-      if "dense" in toks:
-          f2.write("#FC" + str(fc_id) + "," + str(layer_len) + "\n")
-          layer_str = getLayerStr(toks)
-          f2.write(layer_str)
-          fc_id += 1
-  f2.close()          
diff --git a/hpvm/projects/soc_simulator/scripts/replace_nml.py b/hpvm/projects/soc_simulator/scripts/replace_nml.py
deleted file mode 100644
index 92e58f629b0419d3f2f9d3539507ae4baaf10026..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/scripts/replace_nml.py
+++ /dev/null
@@ -1,27 +0,0 @@
-if __name__ == "__main__":
-  f = open("layers.txt")
-  f2 = open("resnet50_imagenet_layers.txt", "w+")
-  nml_id = 1
-  for x in f:
-      if "#" in x:
-          f2.write("NML" + str(nml_id) + "\n")
-          nml_id += 1
-      elif "Conv" in x:
-          x = x.replace("\n", "")
-          x = x + ",1,1\n"
-          f2.write(x)
-      else:
-          f2.write(x)
-  f2.close()
diff --git a/hpvm/projects/soc_simulator/src/Makefile b/hpvm/projects/soc_simulator/src/Makefile
deleted file mode 100644
index c31b4d295f0bffb35da63e965fc9cb1d8163af99..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-all: clean timing quant patch gemm
-	g++ -std=c++11 -O3 promise_timing_model.cpp -o ptm
-	nvcc -std=c++11 -O3 -arch=sm_62 quantization.cu -o quantize
-	nvcc -std=c++11 -O3 -arch=sm_62 patch.cu -o patch
-	nvcc -std=c++11 -O3 -arch=sm_62 gemm.cu fp16_emu.cpp -o gemm -lcublas
-	rm -rf ptm quantize patch gemm
diff --git a/hpvm/projects/soc_simulator/src/copy_tensor_data.pl b/hpvm/projects/soc_simulator/src/copy_tensor_data.pl
deleted file mode 100755
index 98729936071d49bf95017c429e202f107babdae9..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/copy_tensor_data.pl
+++ /dev/null
@@ -1,130 +0,0 @@
-use strict;
-use warnings;
-my %tensors;
-if (($#ARGV + 1) != 4) {
-    print "Usage: copy_tensor_data.pl <tensor info> <FP16 data> <FP32 data> <output tensor file>\n";
-    exit;
-# Input tensor file
-my $tensor_filename = $ARGV[0];
-# FP16 data file
-my $fp16_data_filename = $ARGV[1];
-# FP32 data file
-my $fp32_data_filename = $ARGV[2];
-# Output tensor file
-my $output_filename = $ARGV[3];
-print "Reading FP16 data\n";
-open(my $fp16_data_file, '<', $fp16_data_filename) or die "Couldn't open FP16 data file $fp16_data_filename: $!";
-while (my $line = <$fp16_data_file>) {
-    chomp $line;
-    my @tokens = split /,/, $line;
-    # The format of each line is:
-    # 0        1     2
-    # Op name, time, energy
-    my $op_name = $tokens[0];
-    my $op_time = $tokens[1];
-    my $op_energy = $tokens[2];
-    if (is_f2h($op_name)) {
-        # Remove _f2h
-        $op_name = substr($op_name, 0, -4);
-        $tensors{$op_name}{"f2h_time"} = $op_time;
-        $tensors{$op_name}{"f2h_energy"} = $op_energy;
-    } elsif (is_h2f($op_name)) {
-        # Remove _h2f
-        $op_name = substr($op_name, 0, -4);
-        $tensors{$op_name}{"h2f_time"} = $op_time;
-        $tensors{$op_name}{"h2f_energy"} = $op_energy;
-    } else {
-        $tensors{$op_name}{"fp16_time"} = $op_time;
-        $tensors{$op_name}{"fp16_energy"} = $op_energy;
-    }
-print "Reading FP32 data\n";
-open(my $fp32_data_file, '<', $fp32_data_filename) or die "Couldn't open FP32 data file $fp32_data_filename: $!";
-while (my $line = <$fp32_data_file>) {
-    chomp $line;
-    my @tokens = split /,/, $line;
-    # The format of each line is:
-    # 0        1     2
-    # Op name, time, energy
-    my $op_name = $tokens[0];
-    my $op_time = $tokens[1];
-    my $op_energy = $tokens[2];
-    $tensors{$op_name}{"Name"} = $op_name;
-    $tensors{$op_name}{"fp32_time"} = $op_time;
-    $tensors{$op_name}{"fp32_energy"} = $op_energy;
-print "Generating output tensor file\n";
-open(my $tensor_file, '<', $tensor_filename) or die "Couldn't open tensor info file $tensor_filename: $!";
-open(my $output_file, '>', $output_filename) or die "Couldn't open results file $output_filename: $!";
-while (my $line = <$tensor_file>) {
-    print $output_file $line;
-    chomp $line;
-    # Layer lines look like this: #layer_name, num_tensor_ops_in_layer
-    my @tokens = split /,/, $line;
-    my $layer_name = substr($tokens[0], 1);
-    my $num_ops = $tokens[1];
-    # Tensor lines look like this: name
-    for (my $i = 0; $i < $num_ops; $i++) {
-        my $op_name = <$tensor_file>;
-        chomp $op_name;
-        # Format of each output line:
-        # Name, FP32 time, FP32 energy, FP16 time, FP16 energy, f2h time, f2h energy, h2f time, h2f energy
-        print $output_file $tensors{$op_name}{"Name"} . ",";
-        print $output_file $tensors{$op_name}{"fp32_time"} . ",";
-        print $output_file $tensors{$op_name}{"fp32_energy"} . ",";
-        print $output_file $tensors{$op_name}{"fp16_time"} . ",";
-        print $output_file $tensors{$op_name}{"fp16_energy"} . ",";
-        print $output_file $tensors{$op_name}{"f2h_time"} . ",";
-        print $output_file $tensors{$op_name}{"f2h_energy"} . ",";
-        print $output_file $tensors{$op_name}{"h2f_time"} . ",";
-        print $output_file $tensors{$op_name}{"h2f_energy"} . "\n";
-    }
-print "Done!\n";
-sub is_f2h {
-    my ($name) = @_;
-    if ($name =~ /f2h/i) {
-        return 1;
-    } else {
-        return;
-    }
-sub is_h2f {
-    my ($name) = @_;
-    if ($name =~ /h2f/i) {
-        return 1;
-    } else {
-        return;
-    }
diff --git a/hpvm/projects/soc_simulator/src/driver.pl b/hpvm/projects/soc_simulator/src/driver.pl
deleted file mode 100755
index fe53ca9d850e58e72a1e8ebe8d7e048d24ddc017..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/driver.pl
+++ /dev/null
@@ -1,506 +0,0 @@
-use strict;
-use warnings;
-my $fp16_swing = 8;
-my $iterations = 10;
-my $fp16 = "FP16";
-my $fp32 = "FP32";
-my @layers;
-my %tensors;
-my %results;
-if (($#ARGV + 1) != 6) {
-    print "Usage: driver.pl <layer info> <tensor info> <configurations> <results file> <smart DMA> <detailed results?>\n";
-    exit;
-my $layer_filename = $ARGV[0];
-my $tensor_filename = $ARGV[1];
-my $config_filename = $ARGV[2];
-my $results_filename = $ARGV[3];
-my $smart_dma = $ARGV[4];
-my $detailed_results = $ARGV[5];
-print "Reading layer info\n";
-open(my $layer_file, '<', $layer_filename) or die "Couldn't open layer info file $layer_filename: $!";
-while (my $line = <$layer_file>) {
-    chomp $line;
-    add_layer($line);
-print "Reading tensor info\n";
-open(my $tensor_file, '<', $tensor_filename) or die "Couldn't open tensor info file $tensor_filename: $!";
-while (my $line = <$tensor_file>) {
-    chomp $line;
-    # Layer lines look like this: #layer_name, num_tensor_ops_in_layer
-    my @tokens = split /,/, $line;
-    my $layer_name = substr($tokens[0], 1);
-    my $num_ops = $tokens[1];
-    # Tensor lines look like this: name, FP32 time, FP32 energy, FP16 time, FP16 energy, f2h time, f2h energy, h2f time, h2f energy
-    for (my $i = 0; $i < $num_ops; $i++) {
-        my $op = <$tensor_file>;
-        chomp $op;
-        my @values = split /,/, $op;
-        $tensors{$layer_name}{$i} = [@values];
-    }
-print "Running simulations\n";
-my $conf_count = 0;
-open(my $config_file, '<', $config_filename) or die "Couldn't open config file $config_filename: $!";
-open(my $results_file, '>', $results_filename) or die "Couldn't open results file $results_filename: $!";
-while (my $line = <$config_file>) {
-    chomp $line;
-    # Each line has a bunch of comma separated voltage swing levels
-    my @levels = split /,/, $line;
-    my $layer_count = 0;
-    my $prev = $fp32;
-    my $curr;
-    foreach my $level (@levels) {
-        my %layer = %{$layers[$layer_count]};
-        if (is_promise($level)) {
-            # The voltage level corresponds to PROMISE
-            print "Running layer $layer{\"Name\"} on PROMISE\n";
-            $curr = $PROMISE;
-            # Quantization
-            my ($qtime, $qenergy) = quantize($curr, $prev, 0, %layer);
-            # Patching
-            my ($ptime, $penergy) = patch(%layer);
-            # Compute
-            my ($ttime, $tenergy, $ctime, $cenergy, $mtime, $menergy, $lenergy) = promise($level, %layer);
-            # Unpatching
-            my ($utime, $uenergy) = unpatch(%layer);
-            # Layer info
-            $results{"Time"}{$conf_count}{$layer{"Name"}} = $mtime + $ctime;
-            $results{"Quantization Time"}{$conf_count}{$layer{"Name"}} = 0.0;
-            $results{"Memory Time"}{$conf_count}{$layer{"Name"}} = $mtime;
-            $results{"Compute Time"}{$conf_count}{$layer{"Name"}} = $ctime;
-            $results{"Energy"}{$conf_count}{$layer{"Name"}} = $qenergy + $penergy + $menergy + $cenergy + $uenergy + $lenergy;
-            $results{"Quantization Energy"}{$conf_count}{$layer{"Name"}} = $qenergy;
-            $results{"Patch Energy"}{$conf_count}{$layer{"Name"}} =   $penergy;
-            $results{"Memory Energy"}{$conf_count}{$layer{"Name"}} =  $menergy;
-            $results{"Compute Energy"}{$conf_count}{$layer{"Name"}} = $cenergy;
-            $results{"Unpatch Energy"}{$conf_count}{$layer{"Name"}} = $uenergy;
-            $results{"Leakage Energy"}{$conf_count}{$layer{"Name"}} = $lenergy;
-            # Aggregate info
-            $results{"Time"}{$conf_count}{"Total"} += ($mtime + $ctime);
-            $results{"Quantization Time"}{$conf_count}{"Total"} += 0.0;
-            $results{"Memory Time"}{$conf_count}{"Total"} += $mtime;
-            $results{"Compute Time"}{$conf_count}{"Total"} += $ctime;
-            $results{"Energy"}{$conf_count}{"Total"} += ($qenergy + $penergy + $menergy + $cenergy + $uenergy + $lenergy);
-            $results{"Quantization Energy"}{$conf_count}{"Total"} += $qenergy;
-            $results{"Patch Energy"}{$conf_count}{"Total"} +=   $penergy;
-            $results{"Memory Energy"}{$conf_count}{"Total"} +=  $menergy;
-            $results{"Compute Energy"}{$conf_count}{"Total"} += $cenergy;
-            $results{"Unpatch Energy"}{$conf_count}{"Total"} += $uenergy;
-            $results{"Leakage Energy"}{$conf_count}{"Total"} += $lenergy;
-        } else {
-            # The voltage level corresponds to GPU (FP16 or FP32)
-            print "Running layer $layer{\"Name\"} on the GPU\n";
-            my @sublevels = split / /, $level;
-            my $tensor_count = 0;
-            my $total_qtime;
-            my $total_ctime;
-            my $total_qenergy;
-            my $total_cenergy;
-            foreach my $sublevel (@sublevels) {
-                if ($sublevel == $fp16_swing) {
-                    $curr = $fp16;
-                } else {
-                    $curr = $fp32;
-                }
-                # Quantization
-                my ($qtime, $qenergy) = quantize($curr, $prev, $tensor_count, %layer);
-                # Compute
-                my ($ctime, $cenergy) = gpu($curr, $layer{"Name"}, $tensor_count);
-                # Update total
-                $total_qtime += $qtime;
-                $total_ctime += $ctime;
-                $total_qenergy += $qenergy;
-                $total_cenergy += $cenergy;
-                $prev = $curr;
-                $tensor_count++;
-            }
-            # Layer info
-            $results{"Time"}{$conf_count}{$layer{"Name"}} = $total_qtime + $total_ctime;
-            $results{"Quantization Time"}{$conf_count}{$layer{"Name"}} = $total_qtime;
-            $results{"Memory Time"}{$conf_count}{$layer{"Name"}} = 0.0;
-            $results{"Compute Time"}{$conf_count}{$layer{"Name"}} = $total_ctime;
-            $results{"Energy"}{$conf_count}{$layer{"Name"}} = $total_qenergy + $total_cenergy;
-            $results{"Quantization Energy"}{$conf_count}{$layer{"Name"}} = $total_qenergy;
-            $results{"Patch Energy"}{$conf_count}{$layer{"Name"}} =   0.0;
-            $results{"Memory Energy"}{$conf_count}{$layer{"Name"}} =  0.0;
-            $results{"Compute Energy"}{$conf_count}{$layer{"Name"}} = $total_cenergy;
-            $results{"Unpatch Energy"}{$conf_count}{$layer{"Name"}} = 0.0;
-            $results{"Leakage Energy"}{$conf_count}{$layer{"Name"}} = 0.0;
-            # Aggregate info
-            $results{"Time"}{$conf_count}{"Total"} += ($total_qtime + $total_ctime);
-            $results{"Quantization Time"}{$conf_count}{"Total"} += $total_qtime;
-            $results{"Memory Time"}{$conf_count}{"Total"} += 0.0;
-            $results{"Compute Time"}{$conf_count}{"Total"} += $total_ctime;
-            $results{"Energy"}{$conf_count}{"Total"} += ($total_qenergy + $total_cenergy);
-            $results{"Quantization Energy"}{$conf_count}{"Total"} += $total_qenergy;
-            $results{"Patch Energy"}{$conf_count}{"Total"} +=   0.0;
-            $results{"Memory Energy"}{$conf_count}{"Total"} +=  0.0;
-            $results{"Compute Energy"}{$conf_count}{"Total"} += $total_cenergy;
-            $results{"Unpatch Energy"}{$conf_count}{"Total"} += 0.0;
-            $results{"Leakage Energy"}{$conf_count}{"Total"} += 0.0;
-        }
-        $prev = $curr;
-        $layer_count++;
-    }
-    print "\n";
-    $conf_count++;
-#foreach my $config (sort keys %{$results{$attribute}}) {
-#foreach my $layer (sort keys %{$results{$attribute}{$config}}) {
-print "Printing results\n";
-if ($detailed_results) {
-    my @attributes_to_print = ("Time", "Energy");
-   #foreach my $attribute (@attributes_to_print) {
-    foreach my $attribute (sort keys %results) {
-        print $results_file $attribute . "\n";
-        # Print header
-        print $results_file "Configuration,";
-        foreach my $layer (@layers) {
-            print $results_file ${$layer}{"Name"} . ",";
-        }
-        print $results_file "Total,Improvement\n";
-        my $baseline = $results{$attribute}{0}{"Total"};
-        my $best_config = undef;
-        my $best_result = undef;
-        for (my $config = 0; $config < $conf_count; $config++) {
-            print $results_file "c$config";
-            # This is *really* important. It ensures that each configuration's layer
-            # data is printed in the right order.
-            foreach my $layer (@layers) {
-                my $layer_name = ${$layer}{"Name"};
-                print $results_file "," . $results{$attribute}{$config}{$layer_name};
-            }
-            my $val = $results{$attribute}{$config}{"Total"};
-            print $results_file "," . $val;
-            print $results_file "," . ($baseline / ($val + 0.0001));
-            print $results_file "\n";
-            if ((!defined $best_result) or ($val < $best_result)) {
-                $best_result = $val;
-                $best_config = $config;
-            }
-            #$best_result = $val if !defined $best_result or $val < $best_result;
-            #$best_config = $config if !defined $best_config or $val < $best_result;
-        }
-        print $results_file "\n";
-        print $results_file "c$best_config";
-        print $results_file "," . $results{$attribute}{$best_config}{"Total"};
-        print $results_file "\n\n";
-    }
-} else {
-    my @attributes_to_print = ("Time", "Energy");
-    foreach my $attribute (@attributes_to_print) {
-        print $results_file $attribute . "\n";
-        # Print header
-        print $results_file "Configuration,";
-        print $results_file "Total,Improvement\n";
-        my $baseline = $results{$attribute}{0}{"Total"};
-        my $best_config = undef;
-        my $best_result = undef;
-        for (my $config = 0; $config < $conf_count; $config++) {
-            print $results_file "c$config";
-            my $val = $results{$attribute}{$config}{"Total"};
-            print $results_file "," . $val;
-            print $results_file "," . ($baseline / ($val + 0.0001));
-            print $results_file "\n";
-            if ((!defined $best_result) or ($val < $best_result)) {
-                $best_result = $val;
-                $best_config = $config;
-            }
-        }
-        print $results_file "\n";
-        print $results_file "c$best_config";
-        print $results_file "," . $results{$attribute}{$best_config}{"Total"};
-        print $results_file "\n\n";
-    }
-# Cleanup
-`rm -f blah profile_data.txt`;
-print "Done!\n";
-sub add_layer {
-    my ($line) = @_;
-    my @tokens = split /,/, $line;
-    # The format of each line is:
-    # 0     1  2  3  4  5   6  7   8   9   10
-    # Name, N, C, H, W, Co, C, Kh, Kw, Sh, Sw
-    #       OR
-    # 0     1   2   3   4
-    # Name, RA, CA, RB, CB
-    my %layer;
-    my $layer_name = $tokens[0];
-    $layer{"Name"} = $layer_name;
-    if (is_conv($layer_name)) {
-        $layer{"N"} = $tokens[1];
-        $layer{"C"} = $tokens[2];
-        $layer{"H"} = $tokens[3];
-        $layer{"W"} = $tokens[4];
-        $layer{"Co"} = $tokens[5];
-        $layer{"Kh"} = $tokens[7];
-        $layer{"Kw"} = $tokens[8];
-        $layer{"Sh"} = $tokens[9];
-        $layer{"Sw"} = $tokens[10];
-    } elsif (is_fc($layer_name)) {
-        $layer{"RA"} = $tokens[1];
-        $layer{"CA"} = $tokens[2];
-        $layer{"RB"} = $tokens[3];
-        $layer{"CB"} = $tokens[4];
-    } elsif (not is_nml($layer_name)) {
-        die "Illegal layer name\n";
-    }
-    push @layers, \%layer;
-sub quantize {
-    my ($curr, $prev, $tensor, %layer) = @_;
-    my $size;
-    my $te;
-    # No quantization needed if on same device/knob
-    if ($curr eq $prev) {
-        return (0.0, 0.0);
-    }
-    # No quantization needed with smart DMA
-    if ($smart_dma and (($curr eq $PROMISE) or ($prev eq $PROMISE))) {
-        return (0.0, 0.0);
-    }
-    my $layer_name = $layer{"Name"};
-    if (is_conv($layer_name)) {
-        # Input + Kernel
-        $size = ($layer{"N"} * $layer{"C"} * $layer{"H"} * $layer{"W"}) + ($layer{"Co"} * $layer{"C"} * $layer{"Kh"} * $layer{"Kw"});
-    } elsif (is_fc($layer_name)) {
-        # Matrix A + matrix B
-        $size = ($layer{"RA"} * $layer{"CA"}) + ($layer{"RB"} * $layer{"CB"});
-    } elsif (not is_nml($layer_name)) {
-        die "This should never, ever happen\n";
-    }
-    if ($curr eq $PROMISE) {
-        # We are offloading to PROMISE
-        my $type;
-        if ($prev eq $fp32) {
-            $type = "f2c";
-        } else {
-            $type = "h2c";
-        }
-        $te = `~/awesome_profiler/pp "./quantize $size $type" $iterations blah`;
-        chomp $te;
-    } elsif ($prev eq $PROMISE) {
-        # We are coming back from PROMISE
-        my $type;
-        if ($curr eq $fp32) {
-            $type = "c2f";
-        } else {
-            $type = "c2h";
-        }
-        $te = `~/awesome_profiler/pp "./quantize $size $type" $iterations blah`;
-        chomp $te;
-    } else {
-        # We are converting between FP16 and FP32
-        # Tensor lines look like this: name, FP32 time, FP32 energy, FP16 time, FP16 energy, f2h time, f2h energy, h2f time, h2f energy
-        my @info = @{$tensors{$layer{"Name"}}{$tensor}};
-        if ($curr eq $fp32) {
-            $te = $info[7] . "," . $info[8]; # h2f
-        } else {
-            $te = $info[5] . "," . $info[6]; # f2h
-        }
-    }
-    my @temp = split /,/, $te;
-    print "Quantization: ($temp[0], $temp[1])\n";
-    return ($temp[0], $temp[1]);
-sub patch {
-    # No patching needed with smart DMA
-    if ($smart_dma) {
-        return (0.0, 0.0);
-    }
-    my (%layer) = @_;
-    if (is_conv($layer{"Name"})) {
-        my $te_input = `~/awesome_profiler/pp "./patch $layer{"N"} $layer{"C"} $layer{"H"} $layer{"W"} $layer{"Co"} $layer{"Kh"} $layer{"Kw"} patch" $iterations blah`;
-        my $te_kernel = `~/awesome_profiler/pp "./patch $layer{"N"} $layer{"C"} $layer{"H"} $layer{"W"} $layer{"Co"} $layer{"Kh"} $layer{"Kw"} kernel" $iterations blah`;
-        chomp $te_input;
-        chomp $te_kernel;
-        my @input = split /,/, $te_input;
-        my @kernel = split /,/, $te_kernel;
-        print "Patch: ($input[0] + $kernel[0], $input[1] + $kernel[1])\n";
-        return ($input[0] + $kernel[0], $input[1] + $kernel[1]);
-    } else {
-        return (0.0, 0.0);
-    }
-sub unpatch {
-    # No unpatching needed with smart DMA
-    if ($smart_dma) {
-        return (0.0, 0.0);
-    }
-    my (%layer) = @_;
-    if (is_conv($layer{"Name"})) {
-        my $te = `~/awesome_profiler/pp "./patch $layer{"N"} $layer{"C"} $layer{"H"} $layer{"W"} $layer{"Co"} $layer{"Kh"} $layer{"Kw"} unpatch" $iterations blah`;
-        chomp $te;
-        my @temp = split /,/, $te;
-        print "Unpatch: ($temp[0], $temp[1])\n";
-        return ($temp[0], $temp[1]);
-    } else {
-        return (0.0, 0.0);
-    }
-sub promise {
-    my ($swing, %layer) = @_;
-    my $rows_a;
-    my $cols_a;
-    my $rows_b;
-    my $cols_b;
-    my $patch_factor = 1;
-    my $layer_name = $layer{"Name"};
-    if (is_conv($layer_name)) {
-        $rows_a = ($layer{"N"} * $layer{"H"} * $layer{"W"}) / ($layer{"Sh"} * $layer{"Sw"});
-        $cols_a = $layer{"C"} * $layer{"Kh"} * $layer{"Kw"};
-        $rows_b = $cols_a;
-        $cols_b = $layer{"Co"};
-        if ($smart_dma) {
-            $patch_factor = $layer{"Kh"} * $layer{"Kw"};
-        }
-    } elsif (is_fc($layer_name)) {
-        $rows_a = $layer{"RA"};
-        $cols_a = $layer{"CA"};
-        $rows_b = $cols_a;
-        $cols_b = $layer{"CB"};
-    } else {
-        # It's either an NML or something else, either way, this is bad
-        die "PROMISE can't run whatever this is!\n";
-    }
-    #print "[$rows_a x $cols_a] x [$rows_b x $cols_b] : $swing\n";
-    my $te = `./ptm $rows_a $cols_a $rows_b $cols_b $patch_factor $swing`;
-    chomp $te;
-    my @temp = split /,/, $te;
-    print "PROMISE: ($temp[0], $temp[1])\n";
-    return ($temp[0], $temp[1], $temp[2], $temp[3], $temp[4], $temp[5], $temp[6]);
-sub gpu {
-    my ($curr, $layer_name, $tensor) = @_;
-    my @info = @{$tensors{$layer_name}{$tensor}};
-    my $time;
-    my $energy;
-    # Tensor lines look like this: name, FP32 time, FP32 energy, FP16 time, FP16 energy, f2h time, f2h energy, h2f time, h2f energy
-    if ($curr eq $fp32) {
-        $time = $info[1];
-        $energy = $info[2];
-    } else {
-        $time = $info[3];
-        $energy = $info[4];
-    }
-    print "GPU: ($time, $energy)\n";
-    return ($time, $energy);
-sub is_promise {
-    my ($level) = @_;
-    my @sublevels = split / /, $level;
-    if ($sublevels[0] < $fp16_swing) {
-        return 1;
-    } else {
-        return;
-    }
-sub is_conv {
-    my ($name) = @_;
-    if ($name =~ /conv/i) {
-        return 1;
-    } else {
-        return;
-    }
-sub is_fc {
-    my ($name) = @_;
-    if ($name =~ /fc/i) {
-        return 1;
-    } else {
-        return;
-    }
-sub is_nml {
-    my ($name) = @_;
-    if ($name =~ /nml/i) {
-        return 1;
-    } else {
-        return;
-    }
diff --git a/hpvm/projects/soc_simulator/src/driver.py b/hpvm/projects/soc_simulator/src/driver.py
deleted file mode 100644
index dbf2651bd3a9512c46d9e0a549c61290ad913ab0..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/driver.py
+++ /dev/null
@@ -1,306 +0,0 @@
-from collections import defaultdict
-import os
-import subprocess
-import sys
-class Driver:
-    fp16_swing = 8
-    class ApproxTypes:
-        FP16 = 0
-        FP32 = 1
-        PROMISE = 2
-    results_time_key = "Time"
-    results_energy_key = "Energy"
-    def __get_str(self, appr):
-        if appr == Driver.ApproxTypes.FP16:
-            return "FP16"
-        elif appr == Driver.ApproxTypes.FP32:
-            return "FP32"
-        elif appr == Driver.ApproxTypes.PROMISE:
-            return "PROMISE"
-    def driver(self):
-        self.__parse_tensor_layer_file()
-        self.__parse_tensor_table()
-        self.__run_simulations()
-        self.__display_results()
-    def __init__(self, layer_filename, table_filename, config_filename, results_filename):
-        self.__layer_filename = layer_filename
-        self.__table_filename = table_filename
-        self.__config_filename = config_filename
-        self.__results_filename = results_filename
-        # NOTE: Use an OrderedDict if we want to search by operation name 
-        # Using a list bc we care about the order the data is read in
-        # since it corresponds to the data in the configuration file
-        self.__tensor_layers = []
-        # [layer_name][operation_name][cols] 
-        # Operation names need to be stored in order of insertion 
-        self.__tensor_table = defaultdict(lambda: list(defaultdict(str)))
-        # [Time/Energy][number corresponding to order the layer config was read in] = time/energy
-        self.__aggregate_results = defaultdict(lambda: defaultdict(float))
-        self.__config_count = 0
-    @staticmethod
-    def is_conv(operation_name):
-        return operation_name.startswith("Conv")
-    @staticmethod
-    def is_nml(operation_name):
-        return operation_name.startswith("NML")
-    @staticmethod
-    def is_fc(operation_name):
-        return operation_name.startswith("FC")
-    def __parse_tensor_layer_file(self): 
-        if not os.path.isfile(self.__layer_filename):
-            print("ERROR: %s was not found." % self.__layer_filename)
-            exit(1)
-        layer_file = open(self.__layer_filename, "r")
-        for line in layer_file:
-            layer_data = line.strip().split(',')
-            layer_name = layer_data[0]
-            tensor_layer = defaultdict(str)
-            tensor_layer["Name"] = layer_name
-            if Driver.is_conv(layer_name):
-                tensor_layer["N"] = float(layer_data[1])
-                tensor_layer["Cin"] = float(layer_data[2])
-                tensor_layer["H"] = float(layer_data[3])
-                tensor_layer["W"] = float(layer_data[4])
-                tensor_layer["Cout"] = float(layer_data[5])
-                tensor_layer["Kh"] = float(layer_data[7])
-                tensor_layer["Kw"] = float(layer_data[8])
-                tensor_layer["Sh"] = float(layer_data[9])
-                tensor_layer["Sw"] = float(layer_data[10])
-            elif Driver.is_fc(layer_name):
-                tensor_layer["RA"] = float(layer_data[1])
-                tensor_layer["CA"] = float(layer_data[2])
-                tensor_layer["RB"] = float(layer_data[3])
-                tensor_layer["CB"] = float(layer_data[4])
-            elif not Driver.is_nml(layer_name): # TODO should we store data for NMLs?
-                print("ERROR: Invalid layer name %s" % layer_name)
-                exit(1)
-            self.__tensor_layers.append(tensor_layer)
-        layer_file.close()
-    def __parse_tensor_table(self): 
-        if not os.path.isfile(self.__table_filename):
-            print("ERROR: %s was not found." % self.__table_filename)
-            exit(1)
-        table_file = open(self.__table_filename, "r")
-        line = table_file.readline().strip()
-        while line:
-            # Line here MUST be a header or there's a bug 
-            # Get the description of the layer 
-            assert(line.startswith("**"))
-            header_contents = line.split(' ')[1:] 
-            layer_name = header_contents[0]
-            num_ops = int(header_contents[1])
-            col_names = header_contents[2:]
-            layer_operations = []
-            # Go through all operations in the layer
-            for op_count in range(num_ops):
-                operation_data = defaultdict(str)
-                line = table_file.readline().strip()
-                op_data = line.split(' ')
-                op_name = op_data[0]
-                operation_data["Name"] = op_name
-                # Number of data items (#s) needs to match up with the # of cols 
-                assert(len(op_data) - 1 == len(col_names)) 
-                # Go through all data items (each col element) per operation 
-                for i in range(len(col_names)):
-                    operation_data[col_names[i]] = float(op_data[i + 1])
-                layer_operations.append(operation_data)
-            self.__tensor_table[layer_name] = layer_operations
-            line = table_file.readline().strip()
-        table_file.close()
-    @staticmethod
-    def is_promise(config_layer):
-        return float(config_layer.split(' ')[0]) < Driver.fp16_swing
-    def __quantize(self, curr_layer, prev_layer, h2f_f2h_operation_ind, layer_data):
-        print(self.__get_str(curr_layer), self.__get_str(prev_layer), h2f_f2h_operation_ind)
-        if curr_layer == prev_layer or curr_layer == Driver.ApproxTypes.PROMISE \
-                    or prev_layer == Driver.ApproxTypes.PROMISE: # No quantization needed
-            return 0.0, 0.0
-        layer_name = layer_data["Name"]
-        # NOTE: Ignoring logic where curr == promise or prev == promise bc 
-        # smartDMA is always true so we'd return near the beginning of the method
-        # Get h2f/f2h data using the first tensor operation in the layer
-        # (which is why order matters in the tensor table)
-        tensor_op_row = self.__tensor_table[layer_name][h2f_f2h_operation_ind]  
-        if curr_layer == Driver.ApproxTypes.FP32:
-            time = tensor_op_row["h2f_time"]
-            energy = tensor_op_row["h2f_energy"]
-        elif curr_layer == Driver.ApproxTypes.FP16:
-            time = tensor_op_row["f2h_time"]
-            energy = tensor_op_row["f2h_energy"]
-        print("Quantization: (%f, %f)" % (time, energy))
-        return (time, energy)
-    def __run_promise_simulation(self, swing, layer_data):
-        layer_name = layer_data["Name"] 
-        patch_factor = 1 
-        if Driver.is_conv(layer_name): 
-            rows_a = layer_data["N"] * layer_data["H"] * layer_data["W"] \
-                    / (layer_data["Sh"] * layer_data["Sw"])
-            cols_a = layer_data["Cin"] * layer_data["Kh"] * layer_data["Kw"]
-            rows_b = cols_a
-            cols_b = layer_data["Cout"]
-            patch_factor = layer_data["Kh"] * layer_data["Kw"]
-        elif Driver.is_fc(layer_name):
-            rows_a = layer_data["RA"] 
-            cols_a = layer_data["CA"]
-            rows_b = cols_a
-            cols_b = layer_data["CB"]
-        else:
-            print("PROMISE can't run whatever this layer is.")
-            exit(1)
-        # Run promise simulator
-        # TODO need to print time and energy in the ptm runner so we can pipe it
-        output = subprocess.Popen(["./ptm_new", str(rows_a), str(cols_a), str(rows_b), \
-                    str(cols_b), str(patch_factor), str(swing)], \
-                    stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0]
-        total_time_energy = output.strip().split(',')
-        assert(len(total_time_energy) == 2)
-        print("PROMISE: (%s, %s)" % (total_time_energy[0], total_time_energy[1]))
-        return float(total_time_energy[0]), float(total_time_energy[1])
-    def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind):
-        tensor_info = self.__tensor_table[layer_name][tensor_ind]
-        if curr_layer == Driver.ApproxTypes.FP32:
-            conversion_time = tensor_info["fp32_time"]
-            conversion_energy = tensor_info["fp32_energy"]
-        else:
-            conversion_time = tensor_info["fp16_time"]
-            conversion_energy = tensor_info["fp16_energy"]
-        print("GPU: (%f, %f)" % (conversion_time, conversion_energy))
-        return (conversion_time, conversion_energy)
-    def __run_simulations(self):
-        if not os.path.isfile(self.__config_filename):
-            print("ERROR: %s was not found" % self.__config_filename)
-            exit(1)
-        config_file = open(self.__config_filename, "r")
-        # each line = indepedent configuration
-        # layers are separated by commas
-        # tensor ops are separated by spaces
-        for config in config_file:
-            config_layers = config.strip().split(',')
-            prev_layer = Driver.ApproxTypes.FP32
-            curr_layer = None
-            for layer_ind, config_layer in enumerate(config_layers): # level
-                layer_data = self.__tensor_layers[layer_ind]  # layer
-                layer_name = layer_data["Name"]
-                if Driver.is_promise(config_layer):
-                    print("Running layer %s on PROMISE" % layer_name)
-                    curr_layer = Driver.ApproxTypes.PROMISE
-                    quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, 0, layer_data)
-                    # Compute 
-                    time, energy = self.__run_promise_simulation(config_layer, layer_data)
-                    self.__aggregate_results[Driver.results_time_key][self.__config_count] += time
-                    self.__aggregate_results[Driver.results_energy_key][self.__config_count] += energy 
-                else:
-                    print("Running layer %s on the GPU" % layer_name)
-                    tensor_ops = config_layer.split(' ')
-                    total_time = 0
-                    total_energy = 0
-                    for tensor_ind, tensor_op in enumerate(tensor_ops): # sublevle
-                        tensor_op = int(tensor_op)
-                        if tensor_op == Driver.fp16_swing:
-                            curr_layer = Driver.ApproxTypes.FP16
-                        else:
-                            curr_layer = Driver.ApproxTypes.FP32
-                        quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, tensor_ind, layer_data)
-                        conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, tensor_ind)
-                        total_time += quant_time + conv_time 
-                        total_energy += quant_energy + conv_energy
-                        prev_layer = curr_layer
-                    self.__aggregate_results[Driver.results_time_key][self.__config_count] += total_time
-                    self.__aggregate_results[Driver.results_energy_key][self.__config_count] += total_energy 
-                prev_layer = curr_layer
-            self.__config_count += 1
-            print("\n")
-        config_file.close()
-    def __display_results(self):
-        results_file = open(self.__results_filename, "w")
-        attributes_to_print = [Driver.results_time_key, Driver.results_energy_key]
-        for attribute in attributes_to_print:
-            results_file.write("%s\n" % attribute)
-            results_file.write("Configuration,Total,Improvement\n") 
-            baseline_val = self.__aggregate_results[attribute][0]
-            print(baseline_val)
-            best_config = None
-            best_result = None
-            for config_ind in range(self.__config_count):
-                results_file.write("c%d" % config_ind)
-                time_or_energy_val = self.__aggregate_results[attribute][config_ind]
-                # Using repr to keep all decimal digits when writing to file
-                results_file.write(",%s" % repr(time_or_energy_val))
-                results_file.write(",%s\n" % repr(baseline_val / (time_or_energy_val + 0.0001)))
-                if not best_result or time_or_energy_val < best_result:
-                    best_result = time_or_energy_val
-                    best_config = config_ind
-            results_file.write("\nc%d,%s\n\n" % (best_config, repr(self.__aggregate_results[attribute][best_config])))
-        results_file.close()
-if __name__ == "__main__":
-    if len(sys.argv) != 5:
-        print("Usage: python driver.py <layer info> <tensor info> <configurations> <results file>")
-        exit(1)
-    Driver(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]).driver()
diff --git a/hpvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py b/hpvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py
deleted file mode 100644
index 9d7f20eecc7dd5ddddabbd261f28a05a7c1530b4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py
+++ /dev/null
@@ -1,523 +0,0 @@
-from collections import defaultdict
-import os
-import subprocess
-import sys
-class Driver:
-    class PrecisionTypes:
-        FP16 = 0
-        FP32 = 1
-        PROMISE = 2
-    class ApproxTypes:
-        PERF = 3 
-        SAMP = 4
-        REDUCE = 5
-    results_time_key = "Time"
-    results_energy_key = "Energy"
-    def __init__(self, layer_filename, table_filename, config_filename, results_filename):
-        self.__layer_filename = layer_filename
-        self.__table_filename = table_filename
-        self.__config_filename = config_filename
-        self.__results_filename = results_filename
-        # NOTE: Use an OrderedDict if we want to search by operation name 
-        # Using a list bc we care about the order the data is read in
-        # since it corresponds to the data in the configuration file
-        self.__tensor_layers = []
-        # [layer_name][operation_name][cols] 
-        # Operation names need to be stored in order of insertion 
-        self.__tensor_table = defaultdict(lambda: list(defaultdict(str)))
-        self.__conf_results = [] # indexed 
-        #self.__conf_results = {} # {conf name: (first line, [[layer value if promise], [tensor vals if gpu]])}
-    @staticmethod
-    def is_conv(operation_name):
-        return operation_name.startswith("Conv")
-    @staticmethod
-    def is_nml(operation_name):
-        return operation_name.startswith("NML")
-    @staticmethod
-    def is_fc(operation_name):
-        return operation_name.startswith("FC")
-    def __get_str(self, appr):
-        if appr == Driver.PrecisionTypes.FP16:
-            return "FP16"
-        elif appr == Driver.PrecisionTypes.FP32:
-            return "FP32"
-        elif appr == Driver.PrecisionTypes.PROMISE:
-            return "PROMISE"
-        elif appr == Driver.ApproxTypes.PERF:
-            return "PERF"
-        elif appr == Driver.ApproxTypes.SAMP:
-            return "SAMP"
-        elif appr == Driver.ApproxTypes.REDUCE:
-            return "REDUCE"
-    def driver(self):
-        self.__parse_tensor_layer_file()
-        self.__parse_tensor_table()
-        self.__run_simulations()
-        self.__write_output()
-    def __parse_tensor_layer_file(self): 
-        if not os.path.isfile(self.__layer_filename):
-            print("ERROR: %s was not found." % self.__layer_filename)
-            exit(1)
-        layer_file = open(self.__layer_filename, "r")
-        for line in layer_file:
-            layer_data = line.strip().split(',')
-            layer_name = layer_data[0]
-            tensor_layer = defaultdict(str)
-            tensor_layer["Name"] = layer_name
-            if Driver.is_conv(layer_name):
-                tensor_layer["N"] = float(layer_data[1])
-                tensor_layer["Cin"] = float(layer_data[2])
-                tensor_layer["H"] = float(layer_data[3])
-                tensor_layer["W"] = float(layer_data[4])
-                tensor_layer["Cout"] = float(layer_data[5])
-                tensor_layer["Kh"] = float(layer_data[7])
-                tensor_layer["Kw"] = float(layer_data[8])
-                tensor_layer["Sh"] = float(layer_data[9])
-                tensor_layer["Sw"] = float(layer_data[10])
-            elif Driver.is_fc(layer_name):
-                tensor_layer["RA"] = float(layer_data[1])
-                tensor_layer["CA"] = float(layer_data[2])
-                tensor_layer["RB"] = float(layer_data[3])
-                tensor_layer["CB"] = float(layer_data[4])
-            elif not Driver.is_nml(layer_name): # TODO should we store data for NMLs?
-                print("ERROR: Invalid layer name %s" % layer_name)
-                exit(1)
-            self.__tensor_layers.append(tensor_layer)
-        layer_file.close()
-    def __parse_tensor_table(self): 
-        if not os.path.isfile(self.__table_filename):
-            print("ERROR: %s was not found." % self.__table_filename)
-            exit(1)
-        print ("table_file_name = ", self.__table_filename, " ")
-        table_file = open(self.__table_filename, "r")
-        line = table_file.readline().strip()
-        while line:
-            # Line here MUST be a header or there's a bug 
-            # Get the description of the layer 
-            assert(line.startswith("**"))
-            header_contents = line.split(' ')[1:] 
-            layer_name = header_contents[0]
-            num_ops = int(header_contents[1])
-            col_names = header_contents[2:]
-            layer_operations = []
-            # Go through all operations in the layer
-            for op_count in range(num_ops):
-                operation_data = defaultdict(str)
-                line = table_file.readline().strip()
-                op_data = line.split(' ')
-                op_name = op_data[0]
-                operation_data["Name"] = op_name
-                # Number of data items (#s) needs to match up with the # of cols 
-                assert(len(op_data) - 1 == len(col_names))
-                # Go through all data items (each col element) per operation 
-                for i in range(len(col_names)):
-                    operation_data[col_names[i]] = float(op_data[i + 1])
-                layer_operations.append(operation_data)
-            self.__tensor_table[layer_name] = layer_operations
-            line = table_file.readline().strip()
-        table_file.close()
-    @staticmethod
-    def is_promise(layer_hardware):
-        return layer_hardware == "promise"
-    @staticmethod
-    def is_gpu(layer_hardware):
-        return layer_hardware == "gpu"
-    def __run_simulations(self):
-        config_file = open(self.__config_filename, "r")
-        line = config_file.readline().strip()
-        while line: 
-            assert(line == "+++++")
-            print("CONFIGURATION")
-            curr_conf_results = []
-            prev_layer = Driver.PrecisionTypes.FP32
-            curr_layer = None
-            line = config_file.readline().strip()
-            first_line = line
-            conf_name = line.split(' ')[0]
-            print("CONF NAME: %s" % conf_name)
-            assert(conf_name.startswith("conf"))
-            line = config_file.readline().strip()
-            while line != "-----":
-                layer_as_lst = line.split(' ')
-                layer_results = []
-                # Skip softmax
-                if line.find("softmax") != -1:
-                    layer_results.append((0, 0, ' '.join(layer_as_lst[2:])))
-                    curr_conf_results.append((layer_as_lst[1], layer_results))
-                    line = config_file.readline().strip()
-                    continue
-                layer_ind = int(layer_as_lst[0]) - 1
-                layer_table_data = self.__tensor_layers[layer_ind]
-                layer_name = layer_table_data["Name"]
-                if Driver.is_promise(layer_as_lst[1]):
-                    print("Running layer %s on PROMISE" % layer_name)
-                    curr_layer = Driver.PrecisionTypes.PROMISE
-                    total_time = 0
-                    total_energy = 0
-                    # To support multiple sets of <param> <number> in the future
-                    for i in range(2, len(layer_as_lst), 2):
-                        param_name = layer_as_lst[i] # Use when there's more than 1 type of param 
-                        param_val = int(layer_as_lst[i + 1])
-                        time, energy = self.__run_promise_simulation(param_val, layer_table_data)
-                        total_time += time
-                        total_energy += energy
-                        print("Curr promise: ", time, energy)
-                    print("Total promise: ", total_time, total_energy)
-                    layer_results.append((total_time, total_energy, ' '.join(layer_as_lst[2:])))
-                elif Driver.is_gpu(layer_as_lst[1]):
-                    print("Running layer %s on the GPU" % layer_name)
-                    tensor_count = 0 
-                    # 3 elements per tensor operation 
-                    for i in range(2, len(layer_as_lst), 3):
-                        op_type = layer_as_lst[i]
-                        precision_type = layer_as_lst[i + 1]
-                        op_number = layer_as_lst[i + 2]
-                        #print(' '.join(layer_as_lst[i : i + 3]))
-                        approx_type = None
-                        if line.find("fp16") != -1:
-                            curr_layer = Driver.PrecisionTypes.FP16
-                        elif line.find("fp32") != -1:
-                            curr_layer = Driver.PrecisionTypes.FP32
-                        if precision_type == "perf" or precision_type == "samp" or precision_type == "reduce": # Handle approx type
-                            if precision_type == "perf": 
-                                approx_type = Driver.ApproxTypes.PERF
-                            elif precision_type == "samp": 
-                                approx_type = Driver.ApproxTypes.SAMP
-                            elif precision_type == "reduce":
-                                approx_type = Driver.ApproxTypes.REDUCE
-                            curr_layer = Driver.PrecisionTypes.FP16
-                        quant_time, quant_energy = self.__quantize(precision_type, op_number, curr_layer, prev_layer, tensor_count, layer_table_data)
-                        if quant_time != 0:
-                            assert i == 2 #and layer_ind == 0
-                        conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, \
-                                    tensor_count, approx_type, op_number) 
-                        print(quant_time, conv_time)
-                        layer_results.append((quant_time + conv_time, quant_energy + conv_energy, ' '.join(layer_as_lst[i : i + 3])))
-                        prev_layer = curr_layer
-                        tensor_count += 1
-                line = config_file.readline().strip()
-                prev_layer = curr_layer
-                curr_conf_results.append((layer_as_lst[1], layer_results))
-            if not self.__conf_results: # we're appending the baseline
-                # need to find the fp16 baseline
-                self.fp16_baseline = []
-                prev_layer = Driver.PrecisionTypes.FP32
-                curr_layer = None
-                has_quantized = False
-                for layer_ind, (hardware, layer) in enumerate(curr_conf_results):
-                    if layer[0][2].find("softmax") != -1: continue
-                    fp16_layer = []
-                    layer_table_data = self.__tensor_layers[layer_ind]
-                    layer_name = layer_table_data["Name"]
-                    for tensor_ind, (op_time, op_energy, tensor_op) in enumerate(layer): 
-                        curr_layer = Driver.PrecisionTypes.FP16 # always
-                        quant_time, quant_energy = self.__quantize("fp16", "1", curr_layer, prev_layer, tensor_ind, layer_table_data)
-                        if quant_time != 0:
-                            assert not has_quantized
-                            has_quantized = True
-                        tensor_info = self.__tensor_table[layer_name][tensor_ind]
-                        if "fp16_time" not in tensor_info or "fp16_energy" not in tensor_info:
-                          print("\n\n ERROR: NO FP16_time and/or enegy in Profile data table .. \n\n Aborting....\n")
-                          sys.exit(1)
-                        fp16_time = tensor_info["fp16_time"] + quant_time
-                        fp16_energy = tensor_info["fp16_energy"] + quant_energy
-                        fp16_layer.append((fp16_time, fp16_energy, tensor_op.replace("fp32", "fp16")))  
-                        prev_layer = curr_layer
-                    prev_layer = curr_layer
-                    self.fp16_baseline.append((hardware, fp16_layer))
-            self.__conf_results.append( (first_line, curr_conf_results) )
-            line = config_file.readline().strip()
-        config_file.close()
-    def __quantize(self, precision_type, op_number, curr_layer, prev_layer, h2f_f2h_operation_ind, layer_data):
-        if curr_layer == prev_layer or curr_layer == Driver.PrecisionTypes.PROMISE \
-                    or prev_layer == Driver.PrecisionTypes.PROMISE:
-            return 0.0, 0.0
-        layer_name = layer_data["Name"]
-        print("QUANTIZATION")
-        print(precision_type, op_number, self.__get_str(curr_layer), self.__get_str(prev_layer), h2f_f2h_operation_ind, layer_data)
-        # NOTE: Ignoring logic where curr == promise or prev == promise bc 
-        # smartDMA is always true so we'd return near the beginning of the method
-        # Get h2f/f2h data using the first tensor operation in the layer
-        # (which is why order matters in the tensor table)
-        tensor_op_row = self.__tensor_table[layer_name][h2f_f2h_operation_ind]  
-        time_key = None
-        energy_key = None
-        if op_number == "1":
-            lookup_key = "_" #lookup_key = precision_type
-        else:
-            lookup_key = "_" + precision_type + str(op_number) + "_"
-        if curr_layer == Driver.PrecisionTypes.FP32:
-            time_key = "h2f%stime" % lookup_key
-            energy_key = "h2f%senergy" % lookup_key
-        elif curr_layer == Driver.PrecisionTypes.FP16:
-            time_key = "f2h%stime" % lookup_key
-            energy_key = "f2h%senergy" % lookup_key
-        print(tensor_op_row)
-        time = tensor_op_row[time_key]
-        energy = tensor_op_row[energy_key]
-        print(time_key, energy_key)
-        return (time, energy)
-    def __run_promise_simulation(self, swing, layer_data):
-        layer_name = layer_data["Name"] 
-        patch_factor = 1 
-        if Driver.is_conv(layer_name): 
-            rows_a = layer_data["N"] * layer_data["H"] * layer_data["W"] \
-                    / (layer_data["Sh"] * layer_data["Sw"])
-            cols_a = layer_data["Cin"] * layer_data["Kh"] * layer_data["Kw"]
-            rows_b = cols_a
-            cols_b = layer_data["Cout"]
-            patch_factor = layer_data["Kh"] * layer_data["Kw"]
-        elif Driver.is_fc(layer_name):
-            rows_a = layer_data["RA"] 
-            cols_a = layer_data["CA"]
-            rows_b = layer_data["RB"] 
-            cols_b = layer_data["CB"]
-        else:
-            print("PROMISE can't run whatever this layer is.")
-            exit(1)
-        # Run promise simulator
-        # TODO need to print time and energy in the ptm runner so we can pipe it
-        output = subprocess.Popen(["./ptm_new", str(rows_a), str(cols_a), str(rows_b), \
-                    str(cols_b), str(patch_factor), str(swing)], \
-                    stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0]
-        total_time_energy = output.strip().split(',')
-        assert(len(total_time_energy) == 2)
-        return float(total_time_energy[0]), float(total_time_energy[1])
-    def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, \
-                    approx_type = None, knob_number = None):
-        tensor_info = self.__tensor_table[layer_name][tensor_ind]
-        time_key = None
-        energy_key = None
-        if approx_type == Driver.ApproxTypes.PERF or approx_type == Driver.ApproxTypes.SAMP or approx_type == Driver.ApproxTypes.REDUCE: # fp16_perf2_energy
-            approx_type_str = None
-            if approx_type == Driver.ApproxTypes.PERF:
-                approx_type_str = "perf"
-            elif approx_type == Driver.ApproxTypes.SAMP: 
-                approx_type_str = "samp"
-            elif approx_type == Driver.ApproxTypes.REDUCE:
-                approx_type_str = "reduce"
-            if curr_layer == Driver.PrecisionTypes.FP32:
-                time_key = "fp32_%s%s_time" % (approx_type_str, knob_number)
-                energy_key = "fp32_%s%s_energy" % (approx_type_str, knob_number)
-            elif curr_layer == Driver.PrecisionTypes.FP16:
-                time_key = "fp16_%s%s_time" % (approx_type_str, knob_number)
-                energy_key = "fp16_%s%s_energy" % (approx_type_str, knob_number)
-        else: # None for now
-            if curr_layer == Driver.PrecisionTypes.FP32:
-                time_key = "fp32_time"
-                energy_key = "fp32_energy"
-            elif curr_layer == Driver.PrecisionTypes.FP16:
-                time_key = "fp16_time"
-                energy_key = "fp16_energy"
-        #print(time_key, energy_key)
-        conversion_time = tensor_info[time_key]
-        conversion_energy = tensor_info[energy_key]
-        #print("GPU: (%f, %f)\n" % (conversion_time, conversion_energy))
-        return conversion_time, conversion_energy
-    def __write_output(self):
-        config_file = open(self.__config_filename, "r")
-        results_file = open(self.__results_filename, "w")
-        def write_conf_to_file(conf_name, final_conf, time_speedup, energy_speedup):
-            # conf = [layer value if promise], [tensor vals if gpu]]
-            conf_str = ["+++++"]
-            # process the first line
-            first_line, layers = final_conf
-            first_line_lst = first_line.split(' ')
-            assert first_line_lst[0] == conf_name
-            new_header = [conf_name]
-            new_header.append(repr(time_speedup))
-            new_header.append(repr(energy_speedup))
-            new_header.append(repr(abs(float(first_line_lst[-2]))))
-            new_header.append(repr(abs(float(first_line_lst[-1]))))
-            conf_str.append(' '.join(new_header))
-            for ind, (hardware, layer) in enumerate(layers):
-                layer_lst = [str(ind + 1)]
-                layer_lst.append(hardware)
-                for op_time, op_energy, tensor_op in layer:
-                    layer_lst.append(tensor_op) 
-                conf_str.append(' '.join(layer_lst))
-            conf_str.append("-----\n")
-            results_file.write('\n'.join(conf_str))
-        fp32_baseline_conf = None
-        baseline_total_time = baseline_total_energy = 0 
-        def get_baseline_times_energies(conf):
-            curr_time = curr_energy = 0
-            for hardware, layer in conf[1]:
-                for op_time, op_energy, tensor_op in layer:
-                    curr_time += op_time
-                    curr_energy += op_energy
-            return curr_time, curr_energy
-        def get_final_times_energies_conf(curr_conf, curr_conf_name):
-            final_time = final_energy = 0
-            final_conf = [] # List (conf) of lists (layers) of tuples (operation data)
-            #for hardware, layer in self.fp16_baseline:
-                #print(hardware, layer)
-            for layer_ind, (hardware, layer) in enumerate(curr_conf[1]):
-                final_conf_layer = []
-                for tensor_ind, (op_time, op_energy, tensor_op) in enumerate(layer):
-                    if tensor_op.find("softmax") != -1:
-                        final_conf_layer.append((None, None, tensor_op))
-                        continue
-                    # layer name, operation name, val name
-                    if tensor_op.find("promise") != -1: # compute sum of entire fp16 baseline layer
-                        baseline_time = 0
-                        baseline_energy = 0
-                        baseline_op = []
-                        if tensor_op.find("fp32") != -1:
-                            assert False
-                            baseline_layer = fp32_baseline_conf[layer_ind][1]
-                        else:
-                            baseline_layer = self.fp16_baseline[layer_ind][1]
-                        for op_time, op_energy, tensor_op in baseline_layer:
-                            baseline_time += op_time
-                            baseline_energy += op_energy
-                            baseline_op.append(tensor_op)
-                    else: # look at the individual tensor operation as before
-                        if tensor_op.find("fp32") != -1:
-                            assert False
-                            baseline_layer = fp32_baseline_conf[1][layer_ind]
-                        else:
-                            baseline_layer = self.fp16_baseline[layer_ind][1]
-                        baseline_time = baseline_layer[tensor_ind][0]
-                        baseline_energy = baseline_layer[tensor_ind][1]
-                        baseline_op = baseline_layer[tensor_ind][2]
-                    final_tensor_op = tensor_op
-                    if op_time > baseline_time:
-                        print("**************** BIGGER ******************")
-                        print(curr_conf_name)
-                        print(baseline_time, baseline_energy, baseline_op, layer_ind)
-                        print(op_time, tensor_op, layer_ind)
-                        final_time += baseline_time
-                        final_energy += baseline_energy
-                        final_tensor_op = baseline_op
-                    else:
-                        print("**************** SMALLER ******************")
-                        print(curr_conf_name)
-                        print(baseline_time, baseline_energy, baseline_op, layer_ind)
-                        print(op_time, tensor_op, layer_ind)
-                        final_time += op_time
-                        final_energy += op_energy
-                    final_conf_layer.append((None, None, final_tensor_op)) # Don't care about the times and energies when writing
-                final_conf.append((hardware, final_conf_layer))
-            #print("\n")
-            return final_time, final_energy, (curr_conf[0], final_conf) 
-        conf_index = 0
-        print("RESULTS")
-        for line in config_file:
-            if line.startswith("conf"):
-                orig_line_lst = line.split(' ')
-                conf_name = orig_line_lst[0]
-                if not fp32_baseline_conf:
-                    fp32_baseline_conf = self.__conf_results[conf_index] #conf_name]
-                    baseline_total_time, baseline_total_energy = get_baseline_times_energies(fp32_baseline_conf)
-                    results_file.write("%s\n" % repr(baseline_total_time))
-                    write_conf_to_file(conf_name, fp32_baseline_conf, 1, 1)
-                else:
-                    curr_conf = self.__conf_results[conf_index] #conf_name]
-                    final_time, final_energy, curr_conf = get_final_times_energies_conf(curr_conf, conf_name)
-                    print("Baseline time: %f, final time: %f, baseline energy: %f, final energy: %f, rations: %f %f " % (baseline_total_time, final_time, baseline_total_energy, final_energy, baseline_total_time / final_time, baseline_total_energy / final_energy))
-                    write_conf_to_file(conf_name, curr_conf, baseline_total_time / final_time, baseline_total_energy / final_energy) 
-                conf_index += 1
-        results_file.close()
-        config_file.close()
-if __name__ == "__main__":
-    if len(sys.argv) != 5:
-        print("Usage: python driver.py <layer info> <tensor info> <configurations> <results file>")
-        exit(1)
-    Driver(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]).driver()
diff --git a/hpvm/projects/soc_simulator/src/driver_new_config_no_fp16_repl.py b/hpvm/projects/soc_simulator/src/driver_new_config_no_fp16_repl.py
deleted file mode 100644
index d12477fd77533f94ff067e05771459ff4c830bb8..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/driver_new_config_no_fp16_repl.py
+++ /dev/null
@@ -1,464 +0,0 @@
-from collections import defaultdict
-import os
-import subprocess
-import sys
-class Driver:
-    fp16_swing = 8
-    class PrecisionTypes:
-        FP16 = 0
-        FP32 = 1
-        PROMISE = 2
-    class ApproxTypes:
-        PERF = 3 
-        SAMP = 4
-    results_time_key = "Time"
-    results_energy_key = "Energy"
-    def __init__(self, layer_filename, table_filename, config_filename, results_filename):
-        self.__layer_filename = layer_filename
-        self.__table_filename = table_filename
-        self.__config_filename = config_filename
-        self.__results_filename = results_filename
-        # NOTE: Use an OrderedDict if we want to search by operation name 
-        # Using a list bc we care about the order the data is read in
-        # since it corresponds to the data in the configuration file
-        self.__tensor_layers = []
-        # [layer_name][operation_name][cols] 
-        # Operation names need to be stored in order of insertion 
-        self.__tensor_table = defaultdict(lambda: list(defaultdict(str)))
-        self.__conf_results = [] # indexed 
-        #self.__conf_results = {} # {conf name: (first line, [[layer value if promise], [tensor vals if gpu]])}
-    @staticmethod
-    def is_conv(operation_name):
-        return operation_name.startswith("Conv")
-    @staticmethod
-    def is_nml(operation_name):
-        return operation_name.startswith("NML")
-    @staticmethod
-    def is_fc(operation_name):
-        return operation_name.startswith("FC")
-    def __get_str(self, appr):
-        if appr == Driver.PrecisionTypes.FP16:
-            return "FP16"
-        elif appr == Driver.PrecisionTypes.FP32:
-            return "FP32"
-        elif appr == Driver.PrecisionTypes.PROMISE:
-            return "PROMISE"
-        elif appr == Driver.ApproxTypes.PERF:
-            return "PERF"
-        elif appr == Driver.ApproxTypes.SAMP:
-            return "SAMP"
-    def driver(self):
-        self.__parse_tensor_layer_file()
-        self.__parse_tensor_table()
-        self.__run_simulations()
-        self.__write_output()
-    def __parse_tensor_layer_file(self): 
-        if not os.path.isfile(self.__layer_filename):
-            print("ERROR: %s was not found." % self.__layer_filename)
-            exit(1)
-        layer_file = open(self.__layer_filename, "r")
-        for line in layer_file:
-            layer_data = line.strip().split(',')
-            layer_name = layer_data[0]
-            tensor_layer = defaultdict(str)
-            tensor_layer["Name"] = layer_name
-            if Driver.is_conv(layer_name):
-                tensor_layer["N"] = float(layer_data[1])
-                tensor_layer["Cin"] = float(layer_data[2])
-                tensor_layer["H"] = float(layer_data[3])
-                tensor_layer["W"] = float(layer_data[4])
-                tensor_layer["Cout"] = float(layer_data[5])
-                tensor_layer["Kh"] = float(layer_data[7])
-                tensor_layer["Kw"] = float(layer_data[8])
-                tensor_layer["Sh"] = float(layer_data[9])
-                tensor_layer["Sw"] = float(layer_data[10])
-            elif Driver.is_fc(layer_name):
-                tensor_layer["RA"] = float(layer_data[1])
-                tensor_layer["CA"] = float(layer_data[2])
-                tensor_layer["RB"] = float(layer_data[3])
-                tensor_layer["CB"] = float(layer_data[4])
-            elif not Driver.is_nml(layer_name): # TODO should we store data for NMLs?
-                print("ERROR: Invalid layer name %s" % layer_name)
-                exit(1)
-            self.__tensor_layers.append(tensor_layer)
-        layer_file.close()
-    def __parse_tensor_table(self): 
-        if not os.path.isfile(self.__table_filename):
-            print("ERROR: %s was not found." % self.__table_filename)
-            exit(1)
-        table_file = open(self.__table_filename, "r")
-        line = table_file.readline().strip()
-        while line:
-            # Line here MUST be a header or there's a bug 
-            # Get the description of the layer 
-            assert(line.startswith("**"))
-            header_contents = line.split(' ')[1:] 
-            layer_name = header_contents[0]
-            num_ops = int(header_contents[1])
-            col_names = header_contents[2:]
-            layer_operations = []
-            # Go through all operations in the layer
-            for op_count in range(num_ops):
-                operation_data = defaultdict(str)
-                line = table_file.readline().strip()
-                op_data = line.split(' ')
-                op_name = op_data[0]
-                operation_data["Name"] = op_name
-                # Number of data items (#s) needs to match up with the # of cols 
-                assert(len(op_data) - 1 == len(col_names))
-                # Go through all data items (each col element) per operation 
-                for i in range(len(col_names)):
-                    operation_data[col_names[i]] = float(op_data[i + 1])
-                layer_operations.append(operation_data)
-            self.__tensor_table[layer_name] = layer_operations
-            line = table_file.readline().strip()
-        table_file.close()
-    @staticmethod
-    def is_promise(layer_hardware):
-        return layer_hardware == "promise"
-    @staticmethod
-    def is_gpu(layer_hardware):
-        return layer_hardware == "gpu"
-    def __run_simulations(self):
-        config_file = open(self.__config_filename, "r")
-        line = config_file.readline().strip()
-        while line: 
-            assert(line == "+++++")
-            print("CONFIGURATION")
-            curr_conf_results = []
-            prev_layer = Driver.PrecisionTypes.FP32
-            curr_layer = None
-            line = config_file.readline().strip()
-            first_line = line
-            conf_name = line.split(' ')[0]
-            print("CONF NAME: %s" % conf_name)
-            assert(conf_name.startswith("conf"))
-            line = config_file.readline().strip()
-            while line != "-----":
-                layer_as_lst = line.split(' ')
-                layer_results = []
-                # Skip softmax
-                if line.find("softmax") != -1:
-                    layer_results.append((0, 0, ' '.join(layer_as_lst[2:])))
-                    curr_conf_results.append((layer_as_lst[1], layer_results))
-                    line = config_file.readline().strip()
-                    continue
-                layer_ind = int(layer_as_lst[0]) - 1
-                layer_table_data = self.__tensor_layers[layer_ind]
-                layer_name = layer_table_data["Name"]
-                if Driver.is_promise(layer_as_lst[1]):
-                    print("Running layer %s on PROMISE" % layer_name)
-                    curr_layer = Driver.PrecisionTypes.PROMISE
-                    total_time = 0
-                    total_energy = 0
-                    # To support multiple sets of <param> <number> in the future
-                    for i in range(2, len(layer_as_lst), 2):
-                        param_name = layer_as_lst[i] # Use when there's more than 1 type of param 
-                        param_val = int(layer_as_lst[i + 1])
-                        time, energy = self.__run_promise_simulation(param_val, layer_table_data)
-                        total_time += time
-                        total_energy += energy
-                    layer_results.append((total_time, total_energy, ' '.join(layer_as_lst[2:])))
-                elif Driver.is_gpu(layer_as_lst[1]):
-                    print("Running layer %s on the GPU" % layer_name)
-                    tensor_count = 0 
-                    # 3 elements per tensor operation 
-                    for i in range(2, len(layer_as_lst), 3):
-                        op_type = layer_as_lst[i]
-                        precision_type = layer_as_lst[i + 1]
-                        op_number = layer_as_lst[i + 2]
-                        approx_type = None
-                        if line.find("fp16") != -1:
-                            curr_layer = Driver.PrecisionTypes.FP16
-                        elif line.find("fp32") != -1:
-                            curr_layer = Driver.PrecisionTypes.FP32
-                        if precision_type == "perf" or precision_type == "samp": # Handle approx type
-                            if precision_type == "perf": 
-                                approx_type = Driver.ApproxTypes.PERF
-                            elif precision_type == "samp": 
-                                approx_type = Driver.ApproxTypes.SAMP
-                            if line.find("fp16") != -1:
-                                curr_layer = Driver.PrecisionTypes.FP16
-                            elif line.find("fp32") != -1:
-                                curr_layer = Driver.PrecisionTypes.FP32
-                        quant_time, quant_energy = self.__quantize(op_type, precision_type, op_number, curr_layer, prev_layer, \
-                                    tensor_count, layer_table_data)
-                        if quant_time != 0:
-                            assert i == 2
-                        conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, \
-                                    tensor_count, approx_type, op_number) 
-                        print(quant_time, quant_energy, conv_time, conv_energy)
-                        layer_results.append((quant_time + conv_time, quant_energy + conv_energy, ' '.join(layer_as_lst[i : i + 3])))
-                        prev_layer = curr_layer
-                        tensor_count += 1
-                line = config_file.readline().strip()
-                prev_layer = curr_layer
-                curr_conf_results.append((layer_as_lst[1], layer_results))
-            self.__conf_results.append( (first_line, curr_conf_results) )
-            line = config_file.readline().strip()
-        config_file.close()
-    def __quantize(self, op_type, precision_type, op_number, curr_layer, prev_layer, h2f_f2h_operation_ind, layer_data):
-        if curr_layer == prev_layer or curr_layer == Driver.PrecisionTypes.PROMISE \
-                    or prev_layer == Driver.PrecisionTypes.PROMISE:
-            return 0.0, 0.0
-        print("IN QUANTIZE") 
-        layer_name = layer_data["Name"]
-        # NOTE: Ignoring logic where curr == promise or prev == promise bc 
-        # smartDMA is always true so we'd return near the beginning of the method
-        # Get h2f/f2h data using the first tensor operation in the layer
-        # (which is why order matters in the tensor table)
-        tensor_op_row = self.__tensor_table[layer_name][h2f_f2h_operation_ind]  
-        time_key = None
-        energy_key = None
-        print(precision_type, op_number)
-        if op_number == "1":
-            lookup_key = "_" #lookup_key = precision_type
-        else:
-            lookup_key = "_" + precision_type + str(op_number) + "_"
-        print("QUANT LOOKUP KEY", lookup_key)
-        if curr_layer == Driver.PrecisionTypes.FP32:
-            time_key = "h2f%stime" % lookup_key
-            energy_key = "h2f%senergy" % lookup_key
-        elif curr_layer == Driver.PrecisionTypes.FP16:
-            time_key = "f2h%stime" % lookup_key
-            energy_key = "f2h%senergy" % lookup_key
-        print(time_key, energy_key)
-        time = tensor_op_row[time_key]
-        energy = tensor_op_row[energy_key]
-        print("Quantization: (%f, %f)" % (time, energy))
-        return (time, energy)
-    def __run_promise_simulation(self, swing, layer_data):
-        layer_name = layer_data["Name"] 
-        patch_factor = 1 
-        if Driver.is_conv(layer_name): 
-            rows_a = layer_data["N"] * layer_data["H"] * layer_data["W"] \
-                    / (layer_data["Sh"] * layer_data["Sw"])
-            cols_a = layer_data["Cin"] * layer_data["Kh"] * layer_data["Kw"]
-            rows_b = cols_a
-            cols_b = layer_data["Cout"]
-            patch_factor = layer_data["Kh"] * layer_data["Kw"]
-        elif Driver.is_fc(layer_name):
-            rows_a = layer_data["RA"] 
-            cols_a = layer_data["CA"]
-            rows_b = cols_a
-            cols_b = layer_data["CB"]
-        else:
-            print("PROMISE can't run whatever this layer is.")
-            exit(1)
-        # Run promise simulator
-        # TODO need to print time and energy in the ptm runner so we can pipe it
-        output = subprocess.Popen(["./ptm_new", str(rows_a), str(cols_a), str(rows_b), \
-                    str(cols_b), str(patch_factor), str(swing)], \
-                    stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0]
-        total_time_energy = output.strip().split(',')
-        assert(len(total_time_energy) == 2)
-        return float(total_time_energy[0]), float(total_time_energy[1])
-    def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, \
-                    approx_type = None, knob_number = None):
-        tensor_info = self.__tensor_table[layer_name][tensor_ind]
-        #print(tensor_info)
-        #print(layer_name)
-        #print(tensor_ind)
-        time_key = None
-        energy_key = None
-        if approx_type == Driver.ApproxTypes.PERF or approx_type == Driver.ApproxTypes.SAMP: # fp16_perf2_energy
-            approx_type_str = None
-            if approx_type == Driver.ApproxTypes.PERF:
-                approx_type_str = "perf"
-            elif approx_type == Driver.ApproxTypes.SAMP: 
-                approx_type_str = "samp"
-            if curr_layer == Driver.PrecisionTypes.FP32:
-                time_key = "fp32_%s%s_time" % (approx_type_str, knob_number)
-                energy_key = "fp32_%s%s_energy" % (approx_type_str, knob_number)
-            elif curr_layer == Driver.PrecisionTypes.FP16:
-                time_key = "fp16_%s%s_time" % (approx_type_str, knob_number)
-                energy_key = "fp16_%s%s_energy" % (approx_type_str, knob_number)
-        else: # None for now
-            if curr_layer == Driver.PrecisionTypes.FP32:
-                time_key = "fp32_time"
-                energy_key = "fp32_energy"
-            elif curr_layer == Driver.PrecisionTypes.FP16:
-                time_key = "fp16_time"
-                energy_key = "fp16_energy"
-        print(time_key, energy_key)
-        conversion_time = tensor_info[time_key]
-        conversion_energy = tensor_info[energy_key]
-        #print("GPU: (%f, %f)\n" % (conversion_time, conversion_energy))
-        return conversion_time, conversion_energy
-    def __write_output(self):
-        config_file = open(self.__config_filename, "r")
-        results_file = open(self.__results_filename, "w")
-        def write_conf_to_file(conf_name, final_conf, time_speedup, energy_speedup):
-            # conf = [layer value if promise], [tensor vals if gpu]]
-            conf_str = ["+++++"]
-            # process the first line
-            first_line, layers = final_conf
-            first_line_lst = first_line.split(' ')
-            assert first_line_lst[0] == conf_name
-            new_header = [conf_name]
-            new_header.append(repr(time_speedup))
-            new_header.append(repr(energy_speedup))
-            new_header.append(repr(abs(float(first_line_lst[-2]))))
-            new_header.append(repr(abs(float(first_line_lst[-1]))))
-            conf_str.append(' '.join(new_header))
-            for ind, (hardware, layer) in enumerate(layers):
-                print(layer)
-                layer_lst = [str(ind + 1)]
-                layer_lst.append(hardware)
-                print(layer_lst)
-                for op_time, op_energy, tensor_op in layer:
-                    layer_lst.append(tensor_op) 
-                conf_str.append(' '.join(layer_lst))
-            conf_str.append("-----\n")
-            results_file.write('\n'.join(conf_str))
-        baseline_conf = None
-        baseline_total_time = baseline_total_energy = 0 
-        def get_baseline_times_energies(conf):
-            curr_time = curr_energy = 0
-            print("RESULTS: ", conf[1])
-            for hardware, layer in conf[1]:
-                for op_time, op_energy, tensor_op in layer:
-                    curr_time += op_time
-                    curr_energy += op_energy
-            return curr_time, curr_energy
-        def get_final_times_energies_conf(curr_conf):
-            final_time = final_energy = 0
-            final_conf = [] # List (conf) of lists (layers) of tuples (operation data)
-            for layer_ind, (hardware, layer) in enumerate(curr_conf[1]):
-                final_conf_layer = []
-                for tensor_ind, (op_time, op_energy, tensor_op) in enumerate(layer):
-                    baseline_time, baseline_energy, baseline_op = baseline_conf[1][layer_ind][tensor_ind]
-                    final_tensor_op = tensor_op
-                    if op_time > baseline_time:
-                        print("**************** BIGGER ******************")
-                        final_time += baseline_time
-                        final_energy += baseline_energy
-                        final_tensor_op = baseline_op
-                    else:
-                        final_time += op_time
-                        final_energy += op_energy
-                    '''
-                    # Ignoring bigger energies for now  
-                    if op_energy > baseline_energy:
-                        final_time += baseline_energy
-                        final_energy += baseline_energy
-                        final_tensor_op = baseline_op
-                    else:
-                        final_time += op_time
-                        final_energy += op_energy
-                    '''
-                    final_conf_layer.append((None, None, final_tensor_op)) # Don't care about the times and energies when writing
-                final_conf.append(final_conf_layer)
-            return final_time, final_energy, (curr_conf[0], final_conf) 
-        conf_index = 0
-        print("RESULTS")
-        for line in config_file:
-            if line.startswith("conf"):
-                orig_line_lst = line.split(' ')
-                conf_name = orig_line_lst[0]
-                if not baseline_conf:
-                    baseline_conf = self.__conf_results[conf_index] #conf_name]
-                    baseline_total_time, baseline_total_energy = get_baseline_times_energies(baseline_conf)
-                    results_file.write("%s\n" % repr(baseline_total_time))
-                    write_conf_to_file(conf_name, baseline_conf, 1, 1)
-                else:
-                    curr_conf = self.__conf_results[conf_index] #conf_name]
-                    final_time, final_energy = get_baseline_times_energies(curr_conf)
-                    write_conf_to_file(conf_name, curr_conf, baseline_total_time / final_time, baseline_total_energy / final_energy) 
-                conf_index += 1
-        results_file.close()
-        config_file.close()
-if __name__ == "__main__":
-    if len(sys.argv) != 5:
-        print("Usage: python driver.py <layer info> <tensor info> <configurations> <results file>")
-        exit(1)
-    Driver(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]).driver()
diff --git a/hpvm/projects/soc_simulator/src/ext_test b/hpvm/projects/soc_simulator/src/ext_test
deleted file mode 100755
index 5c120650ec8efd65d57d15ea93092c80463b6e28..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/soc_simulator/src/ext_test and /dev/null differ
diff --git a/hpvm/projects/soc_simulator/src/fp16_emu.cpp b/hpvm/projects/soc_simulator/src/fp16_emu.cpp
deleted file mode 100644
index 81a541f31f7ce97d992181448c0174fbf032f7cb..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/fp16_emu.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
- * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
- *
- *
- * This source code and/or documentation ("Licensed Deliverables") are
- * subject to NVIDIA intellectual property rights under U.S. and
- * international Copyright laws.
- *
- * These Licensed Deliverables contained herein is PROPRIETARY and
- * CONFIDENTIAL to NVIDIA and is being provided under the terms and
- * conditions of a form of NVIDIA software license agreement by and
- * between NVIDIA and Licensee ("License Agreement") or electronically
- * accepted by Licensee.  Notwithstanding any terms or conditions to
- * the contrary in the License Agreement, reproduction or disclosure
- * of the Licensed Deliverables to any third party without the express
- * written consent of NVIDIA is prohibited.
- *
- *
- * U.S. Government End Users.  These Licensed Deliverables are a
- * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
- * 1995), consisting of "commercial computer software" and "commercial
- * computer software documentation" as such terms are used in 48
- * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
- * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
- * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
- * U.S. Government End Users acquire the Licensed Deliverables with
- * only those rights set forth herein.
- *
- * Any use of the Licensed Deliverables in individual and commercial
- * software must include, in the user documentation and internal
- * comments to the code, the above Disclaimer and U.S. Government End
- * Users Notice.
- */
-#include "fp16_emu.h" 
-#define STATIC_ASSERT(cond) do { typedef char compile_time_assert[(cond) ? 1 : -1]; } while (0)
-// Host functions for converting between FP32 and FP16 formats
-// Paulius Micikevicius (pauliusm@nvidia.com)
-half1 cpu_float2half_rn(float f)
-    unsigned x = *((int*)(void*)(&f));
-    unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
-    unsigned sign, exponent, mantissa;
-    __half_raw hr;
-    // Get rid of +NaN/-NaN case first.
-    if (u > 0x7f800000) {
-        hr.x = 0x7fffU;
-        return reinterpret_cast<half1&>(hr);
-    }
-    sign = ((x >> 16) & 0x8000);
-    // Get rid of +Inf/-Inf, +0/-0.
-    if (u > 0x477fefff) {
-        hr.x = sign | 0x7c00U;
-        return reinterpret_cast<half1&>(hr);
-    }
-    if (u < 0x33000001) {
-        hr.x = sign | 0x0000U;
-        return reinterpret_cast<half1&>(hr);
-    }
-    exponent = ((u >> 23) & 0xff);
-    mantissa = (u & 0x7fffff);
-    if (exponent > 0x70) {
-        shift = 13;
-        exponent -= 0x70;
-    } else {
-        shift = 0x7e - exponent;
-        exponent = 0;
-        mantissa |= 0x800000;
-    }
-    lsb = (1 << shift);
-    lsb_s1 = (lsb >> 1);
-    lsb_m1 = (lsb - 1);
-    // Round to nearest even.
-    remainder = (mantissa & lsb_m1);
-    mantissa >>= shift;
-    if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
-        ++mantissa;
-        if (!(mantissa & 0x3ff)) {
-            ++exponent;
-            mantissa = 0;
-        }
-    }  
-    hr.x = (sign | (exponent << 10) | mantissa);  
-    return reinterpret_cast<half1&>(hr);
-float cpu_half2float(half1 h)
-    STATIC_ASSERT(sizeof(int) == sizeof(float));
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    unsigned sign     = ((hr.x >> 15) & 1);
-    unsigned exponent = ((hr.x >> 10) & 0x1f);
-    unsigned mantissa = ((hr.x & 0x3ff) << 13);
-    if (exponent == 0x1f) {  /* NaN or Inf */
-        mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
-        exponent = 0xff;
-    } else if (!exponent) {  /* Denorm or Zero */
-        if (mantissa) {
-            unsigned int msb;
-            exponent = 0x71;
-            do {
-                msb = (mantissa & 0x400000);
-                mantissa <<= 1;  /* normalize */
-                --exponent;
-            } while (!msb);
-            mantissa &= 0x7fffff;  /* 1.mantissa is implicit */
-        }
-    } else {
-        exponent += 0x70;
-    }
-    int temp = ((sign << 31) | (exponent << 23) | mantissa);
-    return reinterpret_cast<float&>(temp);
diff --git a/hpvm/projects/soc_simulator/src/fp16_emu.h b/hpvm/projects/soc_simulator/src/fp16_emu.h
deleted file mode 100644
index 8aef176b92e9d598e117a61098532e2190cf2554..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/fp16_emu.h
+++ /dev/null
@@ -1,157 +0,0 @@
- * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
- *
- *
- * This source code and/or documentation ("Licensed Deliverables") are
- * subject to NVIDIA intellectual property rights under U.S. and
- * international Copyright laws.
- *
- * These Licensed Deliverables contained herein is PROPRIETARY and
- * CONFIDENTIAL to NVIDIA and is being provided under the terms and
- * conditions of a form of NVIDIA software license agreement by and
- * between NVIDIA and Licensee ("License Agreement") or electronically
- * accepted by Licensee.  Notwithstanding any terms or conditions to
- * the contrary in the License Agreement, reproduction or disclosure
- * of the Licensed Deliverables to any third party without the express
- * written consent of NVIDIA is prohibited.
- *
- *
- * U.S. Government End Users.  These Licensed Deliverables are a
- * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
- * 1995), consisting of "commercial computer software" and "commercial
- * computer software documentation" as such terms are used in 48
- * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
- * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
- * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
- * U.S. Government End Users acquire the Licensed Deliverables with
- * only those rights set forth herein.
- *
- * Any use of the Licensed Deliverables in individual and commercial
- * software must include, in the user documentation and internal
- * comments to the code, the above Disclaimer and U.S. Government End
- * Users Notice.
- */
-// Conversion from/to 16-bit floating point (half-precision).
-#if !defined(_FP16_EMU_H_)
-#define _FP16_EMU_H_
-#include <driver_types.h>
-#include <cuda_fp16.h>
-// Necessary to ensure visibility of CUDART_VERSION macro
-#include <cuda_runtime_api.h>
-// Definition of '__half_raw' was not provided before CUDA 9.0.
-// '__half_raw' is our type where the unsigned 16-bit integer 
-// data member 'x' can be accessed in both CUDA 9.0 and 8.0.
-#if CUDART_VERSION < 9000 
-typedef __half __half_raw;
-// Internally, in CUDNN we use half1 struct as the FP16 type.
-typedef __half half1;
-#define HLF_EPSILON 4.887581E-04
-#define HLF_MIN     6.103516E-05
-#define HLF_MAX     6.550400E+04
-half1 cpu_float2half_rn(float f);
-float cpu_half2float(half1 h);
-static __inline__ __device__ __host__ half1 habs(half1 h)
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    hr.x &= 0x7fffU;
-    return reinterpret_cast<half1&>(hr);
-static __inline__ __device__ __host__ half1 hneg(half1 h)
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    hr.x ^= 0x8000U;
-    return reinterpret_cast<half1&>(hr);
-static __inline__ __device__ __host__ int ishnan(half1 h)
-    // When input is NaN, exponent is all ones and mantissa is non-zero.
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) != 0;
-static __inline__ __device__ __host__ int ishinf(half1 h)
-    // When input is +/- inf, exponent is all ones and mantissa is zero.
-    __half_raw hr = reinterpret_cast<__half_raw&>(h);
-    return (hr.x & 0x7c00U) == 0x7c00U && (hr.x & 0x03ffU) == 0;
-static __inline__ __device__ __host__ int ishequ(half1 x, half1 y)
-    __half_raw xr = reinterpret_cast<__half_raw&>(x);
-    __half_raw yr = reinterpret_cast<__half_raw&>(y);
-    return ishnan(x) == 0 && ishnan(y) == 0 && xr.x == yr.x;
-// Returns 0.0000 in FP16 binary form
-static __inline__ __device__ __host__ half1 hzero()
-    __half_raw hr;
-    hr.x = 0x0000U;
-    return reinterpret_cast<half1&>(hr);
-// Returns 1.0000 in FP16 binary form
-static __inline__ __device__ __host__ half1 hone()
-    __half_raw hr;
-    hr.x = 0x3c00U;
-    return reinterpret_cast<half1&>(hr);
-// Returns quiet NaN, the most significant fraction bit #9 is set
-static __inline__ __device__ __host__ half1 hnan()
-    __half_raw hr;
-    hr.x = 0x7e00U;
-    return reinterpret_cast<half1&>(hr);
-// Largest positive FP16 value, corresponds to 6.5504e+04
-static __inline__ __device__ __host__ half1 hmax()
-    // Exponent all ones except LSB (0x1e), mantissa is all ones (0x3ff)
-    __half_raw hr;
-    hr.x = 0x7bffU;
-    return reinterpret_cast<half1&>(hr);
-// Smallest positive (normalized) FP16 value, corresponds to 6.1035e-05
-static __inline__ __device__ __host__ half1 hmin()
-    // Exponent is 0x01 (5 bits), mantissa is all zeros (10 bits)
-    __half_raw hr;
-    hr.x = 0x0400U;
-    return reinterpret_cast<half1&>(hr);
-#endif  // _FP16_EMU_H_
diff --git a/hpvm/projects/soc_simulator/src/gemm b/hpvm/projects/soc_simulator/src/gemm
deleted file mode 100755
index 8730e084e099f740ad03cb3457862363579f76fe..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/soc_simulator/src/gemm and /dev/null differ
diff --git a/hpvm/projects/soc_simulator/src/gemm.cu b/hpvm/projects/soc_simulator/src/gemm.cu
deleted file mode 100644
index 040a8dbba973d89f2a6ac30b9f38864f1ba9d42e..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/gemm.cu
+++ /dev/null
@@ -1,255 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <chrono>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-#include "fp16_emu.h"
-#define NUM_ARGS (5)
-inline cudaError_t checkCuda(cudaError_t result) {
-    if (result != cudaSuccess)
-        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n";
-    return result;
-inline cublasStatus_t checkCublas(cublasStatus_t result) {
-    if (result != CUBLAS_STATUS_SUCCESS)
-        std::cerr << "cuBLAS Error: " << result << "\n";
-    return result;
-template <typename T>
-inline void printArray(const T * const __restrict__ array,
-                       const unsigned elements) {
-    for (unsigned i = 0; i < elements; i++)
-        std::cout << std::to_string(array[i]) << "\n";
-// initialization
-template <typename T>
-__global__ void initKernel(T * const __restrict__ array,
-                           const unsigned elements) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        array[idx] = 1.2;
-template <typename T>
-void init(T * const __restrict__ array,
-          const unsigned elements) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    initKernel<<<num_blocks, block_size>>>(array, elements);
-    checkCuda(cudaDeviceSynchronize());
-// float to half
-__global__ void f2hKernel(const float * const __restrict__ input,
-                          const unsigned elements,
-                          half * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __float2half_rn(input[idx]);
-void f2h(const float * const __restrict__ input,
-         const unsigned elements,
-         half * const __restrict__ output) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    f2hKernel<<<num_blocks, block_size>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-// half to float
-__global__ void h2fKernel(const half * const __restrict__ input,
-                          const unsigned elements,
-                          float * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __half2float(input[idx]);
-void h2f(const half * const __restrict__ input,
-         const unsigned elements,
-         float * const __restrict__ output) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    h2fKernel<<<num_blocks, block_size>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-void sgemm(const float * const __restrict__ a,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ b,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ c) {
-    std::chrono::time_point<std::chrono::high_resolution_clock> begin;
-    std::chrono::time_point<std::chrono::high_resolution_clock> end;
-    std::ofstream ofs("profile_data.txt", std::ios::out);
-    cublasHandle_t handle;
-    checkCublas(cublasCreate(&handle));
-    // Enable Tensor Cores
-    checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-    const float alpha_ = 1.0;
-    const float beta_  = 0.0;
-    const float *alpha = &alpha_;
-    const float *beta  = &beta_;
-    begin = std::chrono::high_resolution_clock::now();
-    checkCublas(cublasGemmEx(handle,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             // Dimensions
-                             num_rows_a,
-                             num_cols_b,
-                             num_cols_a,
-                             alpha,
-                             // A
-                             a,
-                             CUDA_R_32F,
-                             num_rows_a,
-                             // B
-                             b,
-                             CUDA_R_32F,
-                             num_rows_b,
-                             beta,
-                             // C
-                             c,
-                             CUDA_R_32F,
-                             num_rows_a,
-                             // Compute precision and algorithm
-                             CUDA_R_32F,
-                             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    checkCuda(cudaDeviceSynchronize());
-    end = std::chrono::high_resolution_clock::now();
-    ofs << "FP32_GEMM" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-    ofs << "FP32_GEMM" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-    ofs.flush();
-    ofs.close();
-void hgemm(const float * const __restrict__ af,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ bf,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ cf) {
-    std::chrono::time_point<std::chrono::high_resolution_clock> begin;
-    std::chrono::time_point<std::chrono::high_resolution_clock> end;
-    std::ofstream ofs("profile_data.txt", std::ios::out);
-    const unsigned num_elements_a = num_rows_a * num_cols_a;
-    const unsigned num_elements_b = num_rows_b * num_cols_b;
-    const unsigned num_elements_c = num_rows_a * num_cols_b;
-    half *a;
-    half *b;
-    half *c;
-    checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a));
-    checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b));
-    checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c));
-    init(a, num_elements_a);
-    init(b, num_elements_b);
-    init(c, num_elements_c);
-    // Convert floats to halfs
-    f2h(af, num_elements_a, a);
-    f2h(bf, num_elements_b, b);
-    cublasHandle_t handle;
-    checkCublas(cublasCreate(&handle));
-    checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-    const half alpha_ = cpu_float2half_rn(1.0);
-    const half beta_  = cpu_float2half_rn(0.0);
-    const half *alpha = &alpha_;
-    const half *beta  = &beta_;
-    begin = std::chrono::high_resolution_clock::now();
-    checkCublas(cublasGemmEx(handle,
-                             CUBLAS_OP_N,
-                             CUBLAS_OP_N,
-                             // Dimensions
-                             num_rows_a,
-                             num_cols_b,
-                             num_cols_a,
-                             alpha,
-                             // A
-                             a,
-                             CUDA_R_16F,
-                             num_rows_a,
-                             // B
-                             b,
-                             CUDA_R_16F,
-                             num_rows_b,
-                             beta,
-                             // C
-                             c,
-                             CUDA_R_16F,
-                             num_rows_a,
-                             // Compute precision and algorithm
-                             CUDA_R_16F,
-                             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    checkCuda(cudaDeviceSynchronize());
-    end = std::chrono::high_resolution_clock::now();
-    h2f(c, num_elements_c, cf);
-    ofs << "FP16_GEMM" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-    ofs << "FP16_GEMM" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-    ofs.flush();
-    ofs.close();
-int main(int argc, char *argv[]) {
-    if (argc != NUM_ARGS) {
-        std::cerr << "Usage: " << argv[0] << " <RA> <CA> <CB> <precision>\n";
-        exit(1);
-    }
-    // Inputs
-    const unsigned num_rows_a = std::atoi(argv[1]);
-    const unsigned num_cols_a = std::atoi(argv[2]);
-    const unsigned num_rows_b = num_cols_a;
-    const unsigned num_cols_b = std::atoi(argv[3]);
-    const std::string precision(argv[4]);
-    const unsigned num_elements_a = num_rows_a * num_cols_a;
-    const unsigned num_elements_b = num_rows_b * num_cols_b;
-    const unsigned num_elements_c = num_rows_a * num_cols_b;
-    float *a;
-    float *b;
-    float *c;
-    checkCuda(cudaMallocManaged(&a, sizeof(float) * num_elements_a));
-    checkCuda(cudaMallocManaged(&b, sizeof(float) * num_elements_b));
-    checkCuda(cudaMallocManaged(&c, sizeof(float) * num_elements_c));
-    init(a, num_elements_a);
-    init(b, num_elements_b);
-    init(c, num_elements_c);
-    if (precision == "fp32")
-        sgemm(a, num_rows_a, num_cols_a, b, num_rows_b, num_cols_b, c);
-    else
-        hgemm(a, num_rows_a, num_cols_a, b, num_rows_b, num_cols_b, c);
-    checkCuda(cudaFree(a));
-    checkCuda(cudaFree(b));
-    checkCuda(cudaFree(c));
-    return 0;
diff --git a/hpvm/projects/soc_simulator/src/hardware_knobs.pl b/hpvm/projects/soc_simulator/src/hardware_knobs.pl
deleted file mode 100755
index fac6b6c748a706cd397040546fa551b72ac14f3c..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/hardware_knobs.pl
+++ /dev/null
@@ -1,104 +0,0 @@
-use strict;
-use warnings;
-# Do a GEMM: [5000 x K] x [K x 256], where K = {128, 256, ..., 16384}
-my $rows_a = 5000;
-my $cols_b = 1024;
-my @K;
-for (my $i = 8; $i <= 15; $i++) {
-    push @K, 2**$i;
-# Header
-foreach my $element (@K) {
-    print $element . ",";
-print "\n\n";
-print "########## PROMISE 1 #########\n";
-# Time
-foreach my $element (@K) {
-    my ($time, $energy) = promise($rows_a, $element, $element, $cols_b, 1);
-    print $time . ",";
-print "\n\n";
-# Energy
-foreach my $element (@K) {
-    my ($time, $energy) = promise($rows_a, $element, $element, $cols_b, 1);
-    print $energy . ",";
-print "\n\n";
-print "########## PROMISE 7 #########\n";
-# Time
-foreach my $element (@K) {
-    my ($time, $energy) = promise($rows_a, $element, $element, $cols_b, 7);
-    print $time . ",";
-print "\n\n";
-# Energy
-foreach my $element (@K) {
-    my ($time, $energy) = promise($rows_a, $element, $element, $cols_b, 7);
-    print $energy . ",";
-print "\n\n";
-print "########## FP32 #########\n";
-foreach my $element (@K) {
-    my ($time, $energy) = gpu($rows_a, $element, $element, $cols_b, "fp32");
-    print $time . ",";
-print "\n\n";
-# Energy
-foreach my $element (@K) {
-    my ($time, $energy) = gpu($rows_a, $element, $element, $cols_b, "fp32");
-    print $energy . ",";
-print "\n\n";
-print "########## FP16 #########\n";
-foreach my $element (@K) {
-    my ($time, $energy) = gpu($rows_a, $element, $element, $cols_b, "fp16");
-    print $time . ",";
-print "\n\n";
-# Energy
-foreach my $element (@K) {
-    my ($time, $energy) = gpu($rows_a, $element, $element, $cols_b, "fp16");
-    print $energy . ",";
-print "\n\n";
-# Cleanup
-`rm -f blah profile_data.txt`;
-sub promise {
-    my ($rows_a, $cols_a, $rows_b, $cols_b, $swing) = @_;
-    my $patch_factor = 1;
-    my $te = `./ptm $rows_a $cols_a $rows_b $cols_b $patch_factor $swing`;
-    chomp $te;
-    my @temp = split /,/, $te;
-    return ($temp[0], $temp[1]);
-# GPU
-sub gpu {
-    my ($rows_a, $cols_a, $rows_b, $cols_b, $precision) = @_;
-    my $iterations = 10;
-    my $te = `~/awesome_profiler/pp "./gemm $rows_a $cols_a $cols_b $precision" $iterations blah`;
-    chomp $te;
-    my @temp = split /,/, $te;
-    return ($temp[0], $temp[1]);
diff --git a/hpvm/projects/soc_simulator/src/patch b/hpvm/projects/soc_simulator/src/patch
deleted file mode 100755
index 94d04d5ba8a88bdd4fb1cbb907f18c419ba248bd..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/soc_simulator/src/patch and /dev/null differ
diff --git a/hpvm/projects/soc_simulator/src/patch.cu b/hpvm/projects/soc_simulator/src/patch.cu
deleted file mode 100644
index 6d489978a24f828cc370cdb6e2add64877c514a7..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/patch.cu
+++ /dev/null
@@ -1,292 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <chrono>
-// NOTE: This benchmark was originally meant to measure the performance of
-// patch matrix generation on the GPU. However, it is now used for measuring
-// both performance and energy of the actual patch matrix generation that needs
-// to happen before offloading the computation to PROMISE.
-// In order to do so, we assume that initially only that tile of the patch
-// matrix is generated that PROMISE will actually work on, and the generation
-// of the remaining tiles can be pipelined with PROMISE's computation.  This
-// has two implications:
-// 1. We only add the time for the first tile's generation. This means that
-// this program has to be profiled with the appropriate smaller batch size to
-// obtain the *time* overhead.
-// 2. Since pipelining doesn't affect energy, the program has to be profiled
-// with the full batch size to obtain the *energy* overhead.
-#define NUM_ARGS (9)
-#define BLOCK_SIZE (512)
-struct image_dim {
-    unsigned n;
-    unsigned cin;
-    unsigned h;
-    unsigned w;
-struct kernel_dim {
-    unsigned cout;
-    unsigned cin;
-    unsigned h;
-    unsigned w;
-inline cudaError_t checkCuda(cudaError_t result) {
-    if (result != cudaSuccess)
-        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n";
-    return result;
-// init kernel to bring all the pages to the GPU
-template <typename T>
-__global__ void initKernel(T * const __restrict__ array,
-                           const unsigned elements) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        array[idx] = 1;
-template <typename T>
-void init(T * const __restrict__ array,
-          const unsigned elements) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    initKernel<<<num_blocks, BLOCK_SIZE>>>(array, elements);
-    checkCuda(cudaDeviceSynchronize());
-// Patches input matrix
-template <typename T>
-__global__ void patchInputKernel(const T * const __restrict__ input,
-                                 const image_dim idim,
-                                 const kernel_dim kdim,
-                                 T * const __restrict__ patch_input,
-                                 const unsigned patch_rows,
-                                 const unsigned patch_cols,
-                                 const unsigned patch_n) {
-    // Coalesced writes ftw
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < patch_n) {
-        // Index of output
-        const unsigned patch_col = idx % patch_cols;
-        const unsigned patch_row = idx / patch_cols;
-        // Index of the source point in the input image batch
-        const unsigned col_idx = patch_row % idim.w;
-        const unsigned row_idx = patch_row / idim.w;
-        // Index of my point in the (kh * kw * cin) kernel block. kw and kh are
-        // flipped because I've assumed the source point lies at the bottom
-        // right of the kernel cube and not the top left.
-        const int kw_idx = kdim.w - (patch_col % kdim.w) - 1;
-        const int kh_idx = kdim.h - ((patch_col / kdim.w) % kdim.h) - 1;
-        const int kc_idx = patch_col / (kdim.w * kdim.h);
-        // Index of the "kernel point" in the input image batch
-        const int w_idx = col_idx - kw_idx;
-        const int h_idx = row_idx - kh_idx;
-        const int c_idx = kc_idx;
-        // Are we inside the input cube?
-        const bool inside = (w_idx >= 0 && h_idx >= 0);
-        //printf("thread %d: dst (%u, %u); src point (%u, %u); kernel (%d, %d, %d); input (%d, %d, %d); inside %d\n", idx, patch_row, patch_col, row_idx, col_idx, kc_idx, kh_idx, kw_idx, c_idx, h_idx, w_idx, inside);
-        if (inside)
-            patch_input[idx] = input[(c_idx * idim.w * idim.h) + (h_idx * idim.w) + w_idx];
-        else
-            patch_input[idx] = 0;
-    }
-template <typename T>
-void patchInput(const T * const __restrict__ input,
-                const image_dim idim,
-                const kernel_dim kdim,
-                T * const __restrict__ patch_input) {
-    const auto patch_rows = (idim.n * idim.h * idim.w) / sizeof(unsigned);
-    const auto patch_cols = kdim.cin * kdim.h * kdim.w;
-    const auto patch_n = patch_rows * patch_cols;
-    const unsigned num_blocks = (patch_n + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    patchInputKernel<<<num_blocks, BLOCK_SIZE>>>(input, idim, kdim, patch_input, patch_rows, patch_cols, patch_n);
-    checkCuda(cudaDeviceSynchronize());
-// Unpatches output matrix
-template <typename T>
-__global__ void unpatchOutputKernel(const T * const __restrict__ patch_output,
-                                    const unsigned patch_rows,
-                                    const unsigned patch_cols,
-                                    const unsigned patch_n,
-                                    T * const __restrict__ output,
-                                    const image_dim odim) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < patch_n) {
-        // Read index in the patch matrix
-        const unsigned image_size = odim.w * odim.h * odim.cin;
-        const unsigned image_id  = idx / image_size;
-        const unsigned patch_row = (image_id * odim.w * odim.h) + (idx % (odim.w * odim.h));
-        const unsigned patch_col = (idx / (odim.w * odim.h)) % odim.cin;
-        //printf("thread %d: src (%u, %u)\n", idx, patch_row, patch_col);
-        // Coalesced writes ftw
-        output[idx] = patch_output[(patch_row * patch_cols) + patch_col];
-    }
-template <typename T>
-void unpatchOutput(const T * const __restrict__ patch_output,
-                   T * const __restrict__ output,
-                   const image_dim odim) {
-    const auto patch_rows = (odim.n * odim.h * odim.w) / sizeof(unsigned);
-    const auto patch_cols = odim.cin;
-    const auto patch_n = patch_rows * patch_cols;
-    const unsigned num_blocks = (patch_n + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    unpatchOutputKernel<<<num_blocks, BLOCK_SIZE>>>(patch_output, patch_rows, patch_cols, patch_n, output, odim);
-    checkCuda(cudaDeviceSynchronize());
-// Patches kernel matrix
-template <typename T>
-__global__ void patchKernelKernel(const T * const __restrict__ kernel,
-                                  T * const __restrict__ patch_kernel,
-                                  const unsigned patch_rows,
-                                  const unsigned patch_cols,
-                                  const unsigned patch_n) {
-    // There are two ways to implement this: coalesced reads or coalesced
-    // writes. Empirically, coalesced writes is about 3x faster and that's
-    // what's used here. This is most likely because the coalesced reads
-    // version results in different thread blocks writing to the same line
-    // while presumably being on different cores. This results in ping-ponging
-    // of the line, which hurts performance. The strided reads in the coalesced
-    // writes version are just reads - they can be shared as many times as
-    // needed without penalty.
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < patch_n) {
-        const unsigned col_idx = idx % patch_cols;
-        const unsigned row_idx = idx / patch_cols;
-        patch_kernel[idx] = kernel[(col_idx * patch_rows) + row_idx];
-    }
-template <typename T>
-void patchKernel(const T * const __restrict__ kernel,
-                 const kernel_dim kdim,
-                 T * const __restrict__ patch_kernel) {
-    const auto patch_rows = kdim.cin * kdim.h * kdim.w;
-    const auto patch_cols = kdim.cout;
-    const auto patch_n = patch_rows * patch_cols;
-    const unsigned num_blocks = (patch_n + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    patchKernelKernel<<<num_blocks, BLOCK_SIZE>>>(kernel, patch_kernel, patch_rows, patch_cols, patch_n);
-    checkCuda(cudaDeviceSynchronize());
-int main(int argc, char *argv[]) {
-    if (argc != NUM_ARGS) {
-        std::cerr << "Usage: " << argv[0] << " <N> <C> <H> <W> <Cout> <Kh> <Kw> <patch/unpatch>\n";
-        exit(1);
-    }
-    // Inputs
-    const unsigned n = std::atoi(argv[1]);
-    const unsigned c = std::atoi(argv[2]);
-    const unsigned h = std::atoi(argv[3]);
-    const unsigned w = std::atoi(argv[4]);
-    const unsigned cout = std::atoi(argv[5]);
-    const unsigned kh = std::atoi(argv[6]);
-    const unsigned kw = std::atoi(argv[7]);
-    const std::string type(argv[8]);
-    const image_dim idim = {n, c, h, w};
-    const kernel_dim kdim = {cout, c, kh, kw};
-    const image_dim odim = {n, cout, h, w};
-    std::chrono::time_point<std::chrono::high_resolution_clock> begin;
-    std::chrono::time_point<std::chrono::high_resolution_clock> end;
-    std::ofstream ofs("profile_data.txt", std::ios::out);
-    if (type == "patch") {
-        const auto patch_rows = (idim.n * idim.h * idim.w) / sizeof(unsigned);
-        const auto patch_cols = kdim.cin * kdim.h * kdim.w;
-        const auto patch_n = patch_rows * patch_cols;
-        unsigned *input;
-        unsigned *output;
-        checkCuda(cudaMallocManaged(&input, sizeof(unsigned) * patch_n));
-        checkCuda(cudaMallocManaged(&output, sizeof(unsigned) * patch_n));
-        init(input, patch_n);
-        init(output, patch_n);
-        begin = std::chrono::high_resolution_clock::now();
-        patchInput(input, idim, kdim, output);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(input));
-        checkCuda(cudaFree(output));
-        ofs << "Patch_Input" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "Patch_Input" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else if (type == "unpatch") {
-        const auto patch_rows = (odim.n * odim.h * odim.w) / sizeof(unsigned);
-        const auto patch_cols = odim.cin;
-        const auto patch_n = patch_rows * patch_cols;
-        unsigned *input;
-        unsigned *output;
-        checkCuda(cudaMallocManaged(&input, sizeof(unsigned) * patch_n));
-        checkCuda(cudaMallocManaged(&output, sizeof(unsigned) * patch_n));
-        init(input, patch_n);
-        init(output, patch_n);
-        begin = std::chrono::high_resolution_clock::now();
-        unpatchOutput(input, output, odim);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(input));
-        checkCuda(cudaFree(output));
-        ofs << "Unpatch_Output" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "Unpatch_Output" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else if (type == "kernel") {
-        const auto patch_rows = kdim.cin * kdim.h * kdim.w;
-        const auto patch_cols = kdim.cout;
-        const auto patch_n = patch_rows * patch_cols;
-        char *input;
-        char *output;
-        checkCuda(cudaMallocManaged(&input, sizeof(char) * patch_n));
-        checkCuda(cudaMallocManaged(&output, sizeof(char) * patch_n));
-        init(input, patch_n);
-        init(output, patch_n);
-        begin = std::chrono::high_resolution_clock::now();
-        patchKernel(input, kdim, output);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(input));
-        checkCuda(cudaFree(output));
-        ofs << "Patch_Kernel" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "Patch_Kernel" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else {
-        std::cerr << "Patch type not supported!\n";
-        exit(1);
-    }
-    ofs.close();
-    return 0;
diff --git a/hpvm/projects/soc_simulator/src/promise_timing_model_ext b/hpvm/projects/soc_simulator/src/promise_timing_model_ext
deleted file mode 100755
index 960ac445a104c93fd8a5f71c9323f921cf2bc9e4..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/soc_simulator/src/promise_timing_model_ext and /dev/null differ
diff --git a/hpvm/projects/soc_simulator/src/ptm b/hpvm/projects/soc_simulator/src/ptm
deleted file mode 100755
index 0cdb86b8481a64e096766092c375e8ee682e7dac..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/soc_simulator/src/ptm and /dev/null differ
diff --git a/hpvm/projects/soc_simulator/src/quantization.cu b/hpvm/projects/soc_simulator/src/quantization.cu
deleted file mode 100644
index bb1f34b2bdfe869ebd79b67bf05b0776bfbd3d4a..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/quantization.cu
+++ /dev/null
@@ -1,275 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <chrono>
-#include "cuda_fp16.h"
-// NOTE: This benchmark was originally meant to measure the performance of
-// various conversion routines. However, it is now used for measuring both
-// performance and energy of the actual conversion that needs to happen before
-// offloading the computation to PROMISE.
-// In order to do so, we assume that the initial conversion is only performed
-// for the tile that PROMISE will actually work on, and the conversions of the
-// remaining tiles can be pipelined with PROMISE's computation.  This has two
-// implications:
-// 1. We only add the time for the conversion of the very first tile. This
-// means that this program has to be profiled with the appropriate smaller
-// batch size to obtain the *time* overhead.
-// 2. Since pipelining doesn't affect energy, the program has to be profiled
-// with the full batch size to obtain the *energy* overhead.
-#define NUM_ARGS (3)
-#define BLOCK_SIZE (512)
-inline cudaError_t checkCuda(cudaError_t result) {
-    if (result != cudaSuccess)
-        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n";
-    return result;
-// init kernel to bring all the pages to the GPU
-template <typename T>
-__global__ void initKernel(T * const __restrict__ array,
-                           const unsigned elements) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        array[idx] = 1;
-template <typename T>
-void init(T * const __restrict__ array,
-          const unsigned elements) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    initKernel<<<num_blocks, BLOCK_SIZE>>>(array, elements);
-    checkCuda(cudaDeviceSynchronize());
-// float to half
-__global__ void f2hKernel(const float * const __restrict__ input,
-                          const unsigned elements,
-                          half * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __float2half_rn(input[idx]);
-void f2h(const float * const __restrict__ input,
-         const unsigned elements,
-         half * const __restrict__ output) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    f2hKernel<<<num_blocks, BLOCK_SIZE>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-// half to float
-__global__ void h2fKernel(const half * const __restrict__ input,
-                          const unsigned elements,
-                          float * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __half2float(input[idx]);
-void h2f(const half * const __restrict__ input,
-         const unsigned elements,
-         float * const __restrict__ output) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    h2fKernel<<<num_blocks, BLOCK_SIZE>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-// float to char ([-1.0, 1.0] to [-128, 127])
-__global__ void f2cKernel(const float * const __restrict__ input,
-                          const unsigned elements,
-                          char * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = (char) ((127.5 * input[idx]) - 0.5);
-void f2c(const float * const __restrict__ input,
-         const unsigned elements,
-         char * const __restrict__ output) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    f2cKernel<<<num_blocks, BLOCK_SIZE>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-// char to float ([-128, 127] to [-1.0, 1.0])
-__global__ void c2fKernel(const char * const __restrict__ input,
-                          const unsigned elements,
-                          float * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = (((float) input[idx]) + 0.5) / 127.5;
-void c2f(const char * const __restrict__ input,
-         const unsigned elements,
-         float * const __restrict__ output) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    c2fKernel<<<num_blocks, BLOCK_SIZE>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-// half to char ([-1.0, 1.0] to [-128, 127])
-__global__ void h2cKernel(const half * const __restrict__ input,
-                          const unsigned elements,
-                          char * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = (char) ((127.5 * __half2float(input[idx])) - 0.5);
-void h2c(const half * const __restrict__ input,
-         const unsigned elements,
-         char * const __restrict__ output) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    h2cKernel<<<num_blocks, BLOCK_SIZE>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-// char to half ([-128, 127] to [-1.0, 1.0])
-__global__ void c2hKernel(const char * const __restrict__ input,
-                          const unsigned elements,
-                          half * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __float2half_rn((((float) input[idx]) + 0.5) / 127.5);
-void c2h(const char * const __restrict__ input,
-         const unsigned elements,
-         half * const __restrict__ output) {
-    const unsigned num_blocks = (elements + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    c2hKernel<<<num_blocks, BLOCK_SIZE>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
-int main(int argc, char *argv[]) {
-    if (argc != NUM_ARGS) {
-        std::cerr << "Usage: " << argv[0] << " <#elements> <conversion type>\n";
-        exit(1);
-    }
-    float *floats;
-    half *halfs;
-    char *chars;
-    // Inputs
-    const unsigned n = std::atoi(argv[1]);
-    const std::string type(argv[2]);
-    std::chrono::time_point<std::chrono::high_resolution_clock> begin;
-    std::chrono::time_point<std::chrono::high_resolution_clock> end;
-    std::ofstream ofs("profile_data.txt", std::ios::out);
-    if (type == "f2h") {
-        checkCuda(cudaMallocManaged(&floats, sizeof(float) * n));
-        checkCuda(cudaMallocManaged(&halfs, sizeof(half) * n));
-        init(floats, n);
-        init(halfs, n);
-        begin = std::chrono::high_resolution_clock::now();
-        f2h(floats, n, halfs);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(floats));
-        checkCuda(cudaFree(halfs));
-        ofs << "f2h" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "f2h" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else if (type == "h2f") {
-        checkCuda(cudaMallocManaged(&floats, sizeof(float) * n));
-        checkCuda(cudaMallocManaged(&halfs, sizeof(half) * n));
-        init(floats, n);
-        init(halfs, n);
-        begin = std::chrono::high_resolution_clock::now();
-        h2f(halfs, n, floats);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(floats));
-        checkCuda(cudaFree(halfs));
-        ofs << "h2f" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "h2f" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else if (type == "f2c") {
-        checkCuda(cudaMallocManaged(&floats, sizeof(float) * n));
-        checkCuda(cudaMallocManaged(&chars, sizeof(char) * n));
-        init(floats, n);
-        init(chars, n);
-        begin = std::chrono::high_resolution_clock::now();
-        f2c(floats, n, chars);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(floats));
-        checkCuda(cudaFree(chars));
-        ofs << "f2c" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "f2c" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else if (type == "c2f") {
-        checkCuda(cudaMallocManaged(&floats, sizeof(float) * n));
-        checkCuda(cudaMallocManaged(&chars, sizeof(char) * n));
-        init(floats, n);
-        init(chars, n);
-        begin = std::chrono::high_resolution_clock::now();
-        c2f(chars, n, floats);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(floats));
-        checkCuda(cudaFree(chars));
-        ofs << "c2f" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "c2f" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else if (type == "h2c") {
-        checkCuda(cudaMallocManaged(&halfs, sizeof(half) * n));
-        checkCuda(cudaMallocManaged(&chars, sizeof(char) * n));
-        init(halfs, n);
-        init(chars, n);
-        begin = std::chrono::high_resolution_clock::now();
-        h2c(halfs, n, chars);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(halfs));
-        checkCuda(cudaFree(chars));
-        ofs << "h2c" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "h2c" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else if (type == "c2h") {
-        checkCuda(cudaMallocManaged(&halfs, sizeof(half) * n));
-        checkCuda(cudaMallocManaged(&chars, sizeof(char) * n));
-        init(halfs, n);
-        init(chars, n);
-        begin = std::chrono::high_resolution_clock::now();
-        c2h(chars, n, halfs);
-        end = std::chrono::high_resolution_clock::now();
-        checkCuda(cudaFree(halfs));
-        checkCuda(cudaFree(chars));
-        ofs << "c2h" << "\t" << std::to_string(std::chrono::duration<double>(begin.time_since_epoch()).count()) << "\n";
-        ofs << "c2h" << "\t" << std::to_string(std::chrono::duration<double>(end.time_since_epoch()).count()) << "\n";
-        ofs.flush();
-    } else {
-        std::cerr << "Conversion type not supported!\n";
-        exit(1);
-    }
-    ofs.close();
-    return 0;
diff --git a/hpvm/projects/soc_simulator/src/quantize b/hpvm/projects/soc_simulator/src/quantize
deleted file mode 100755
index d977c364e97cae9352e968a2469303cfdba8a650..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/soc_simulator/src/quantize and /dev/null differ
diff --git a/hpvm/projects/soc_simulator/src/run_configs.pl b/hpvm/projects/soc_simulator/src/run_configs.pl
deleted file mode 100755
index ad67ea75ee652f6e504d27135a5898a4503cae26..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/run_configs.pl
+++ /dev/null
@@ -1,126 +0,0 @@
-use strict;
-use warnings;
-my $smart_dma = 1;
-my $detailed_results = 0;
-# Full experiments
-my @mnist_networks = ("lenet", "fc4");
-my @cifar10_networks = ("alexnet", "alexnet2", "resnet18", "vgg16");
-my @cifar100_networks = ("vgg16");
-my @pipelines = ("pipeline_GEMO", "pipeline_GEO", "pipeline_GEOM", "pipeline_GSM", "pipeline_GSME");
-# Naive experiments
-my @mnist_networks_naive = ("lenet", "fc4");
-my @cifar10_networks_naive = ("vgg16");
-#print "############### NAIVE 1% ##############\n";
-# MNIST networks
-foreach my $network (@mnist_networks_naive) {
-    print "Running $network\n";
-    print "Naive Loss1\n";
-    `./driver.pl ../${network}_mnist/${network}_layers.txt ../${network}_mnist/${network}_tensors.txt ../${network}_mnist/${network}_naive1.txt ../${network}_mnist/${network}_naive_results1.csv $smart_dma $detailed_results`;
-# CIFAR10 networks
-foreach my $network (@cifar10_networks_naive) {
-    print "Running $network\n";
-    print "Naive Loss1\n";
-    `./driver.pl ../${network}_cifar10/${network}_layers.txt ../${network}_cifar10/${network}_tensors.txt ../${network}_cifar10/${network}_naive1.txt ../${network}_cifar10/${network}_naive_results1.csv $smart_dma $detailed_results`;
-#print "############### 1% ##############\n";
-## MNIST networks
-#foreach my $network (@mnist_networks) {
-#    print "Running $network\n";
-#    print "HA Loss1\n";
-#    `./driver.pl ../${network}_mnist/${network}_layers.txt ../${network}_mnist/${network}_tensors.txt ../${network}_mnist/${network}_confs1.txt ../${network}_mnist/${network}_results1.csv $smart_dma $detailed_results`;
-#    print "HS Loss1\n";
-#    `./driver.pl ../${network}_mnist/${network}_layers.txt ../${network}_mnist/${network}_tensors.txt ../${network}_mnist/${network}_promise_confs1.txt ../${network}_mnist/${network}_promise_results1.csv $smart_dma $detailed_results`;
-## CIFAR10 networks
-#foreach my $network (@cifar10_networks) {
-#    print "Running $network\n";
-#    print "HA Loss1\n";
-#    `./driver.pl ../${network}_cifar10/${network}_layers.txt ../${network}_cifar10/${network}_tensors.txt ../${network}_cifar10/${network}_confs1.txt ../${network}_cifar10/${network}_results1.csv $smart_dma $detailed_results`;
-#    print "HS Loss1\n";
-#    `./driver.pl ../${network}_cifar10/${network}_layers.txt ../${network}_cifar10/${network}_tensors.txt ../${network}_cifar10/${network}_promise_confs1.txt ../${network}_cifar10/${network}_promise_results1.csv $smart_dma $detailed_results`;
-## CIFAR100 networks
-#foreach my $network (@cifar100_networks) {
-#    print "Running $network\n";
-#    print "HA Loss1\n";
-#    `./driver.pl ../${network}_cifar100/${network}_layers.txt ../${network}_cifar100/${network}_tensors.txt ../${network}_cifar100/${network}_confs1.txt ../${network}_cifar100/${network}_results1.csv $smart_dma $detailed_results`;
-#    print "HS Loss1\n";
-#    `./driver.pl ../${network}_cifar100/${network}_layers.txt ../${network}_cifar100/${network}_tensors.txt ../${network}_cifar100/${network}_promise_confs1.txt ../${network}_cifar100/${network}_promise_results1.csv $smart_dma $detailed_results`;
-## Image pipelines
-#foreach my $pipeline (@pipelines) {
-#    print "Running $pipeline\n";
-#    print "HA Loss1\n";
-#    `./driver.pl ../${pipeline}/${pipeline}_layers.txt ../${pipeline}/${pipeline}_tensors.txt ../${pipeline}/${pipeline}_confs1.txt ../${pipeline}/${pipeline}_results1.csv $smart_dma $detailed_results`;
-#    print "HS Loss1\n";
-#    `./driver.pl ../${pipeline}/${pipeline}_layers.txt ../${pipeline}/${pipeline}_tensors.txt ../${pipeline}/${pipeline}_promise_confs1.txt ../${pipeline}/${pipeline}_promise_results1.csv $smart_dma $detailed_results`;
-## vgg16_cifar100_top5
-##print "Running vgg16_cifar100_top5\n";
-##`./driver.pl ../vgg16_cifar100_top5/vgg16_layers.txt ../vgg16_cifar100_top5/vgg16_tensors.txt ../vgg16_cifar100_top5/vgg16_confs1.txt ../vgg16_cifar100_top5/vgg16_results1.csv $smart_dma $detailed_results`;
-##`./driver.pl ../vgg16_cifar100_top5/vgg16_layers.txt ../vgg16_cifar100_top5/vgg16_tensors.txt ../vgg16_cifar100_top5/vgg16_promise_confs1.txt ../vgg16_cifar100_top5/vgg16_promise_results1.csv $smart_dma $detailed_results`;
-#print "############### 2% ##############\n";
-## MNIST networks
-#foreach my $network (@mnist_networks) {
-#    print "Running $network\n";
-#    print "HA Loss2\n";
-#    `./driver.pl ../${network}_mnist/${network}_layers.txt ../${network}_mnist/${network}_tensors.txt ../${network}_mnist/${network}_confs2.txt ../${network}_mnist/${network}_results2.csv $smart_dma $detailed_results`;
-#    print "HS Loss2\n";
-#    `./driver.pl ../${network}_mnist/${network}_layers.txt ../${network}_mnist/${network}_tensors.txt ../${network}_mnist/${network}_promise_confs2.txt ../${network}_mnist/${network}_promise_results2.csv $smart_dma $detailed_results`;
-## CIFAR10 networks
-#foreach my $network (@cifar10_networks) {
-#    print "Running $network\n";
-#    print "HA Loss2\n";
-#    `./driver.pl ../${network}_cifar10/${network}_layers.txt ../${network}_cifar10/${network}_tensors.txt ../${network}_cifar10/${network}_confs2.txt ../${network}_cifar10/${network}_results2.csv $smart_dma $detailed_results`;
-#    print "HS Loss2\n";
-#    `./driver.pl ../${network}_cifar10/${network}_layers.txt ../${network}_cifar10/${network}_tensors.txt ../${network}_cifar10/${network}_promise_confs2.txt ../${network}_cifar10/${network}_promise_results2.csv $smart_dma $detailed_results`;
-## CIFAR100 networks
-#foreach my $network (@cifar100_networks) {
-#    print "Running $network\n";
-#    print "HA Loss2\n";
-#    `./driver.pl ../${network}_cifar100/${network}_layers.txt ../${network}_cifar100/${network}_tensors.txt ../${network}_cifar100/${network}_confs2.txt ../${network}_cifar100/${network}_results2.csv $smart_dma $detailed_results`;
-#    print "HS Loss2\n";
-#    `./driver.pl ../${network}_cifar100/${network}_layers.txt ../${network}_cifar100/${network}_tensors.txt ../${network}_cifar100/${network}_promise_confs2.txt ../${network}_cifar100/${network}_promise_results2.csv $smart_dma $detailed_results`;
-## Image pipelines
-#foreach my $pipeline (@pipelines) {
-#    print "Running $pipeline\n";
-#    print "HA Loss2\n";
-#    `./driver.pl ../${pipeline}/${pipeline}_layers.txt ../${pipeline}/${pipeline}_tensors.txt ../${pipeline}/${pipeline}_confs2.txt ../${pipeline}/${pipeline}_results2.csv $smart_dma $detailed_results`;
-#    print "HS Loss2\n";
-#    `./driver.pl ../${pipeline}/${pipeline}_layers.txt ../${pipeline}/${pipeline}_tensors.txt ../${pipeline}/${pipeline}_promise_confs2.txt ../${pipeline}/${pipeline}_promise_results2.csv $smart_dma $detailed_results`;
-## vgg16_cifar100_top5
-##print "Running vgg16_cifar100_top5\n";
-##`./driver.pl ../vgg16_cifar100_top5/vgg16_layers.txt ../vgg16_cifar100_top5/vgg16_tensors.txt ../vgg16_cifar100_top5/vgg16_confs2.txt ../vgg16_cifar100_top5/vgg16_results2.csv $smart_dma $detailed_results`;
-##`./driver.pl ../vgg16_cifar100_top5/vgg16_layers.txt ../vgg16_cifar100_top5/vgg16_tensors.txt ../vgg16_cifar100_top5/vgg16_promise_confs2.txt ../vgg16_cifar100_top5/vgg16_promise_results2.csv $smart_dma $detailed_results`;
-# Alexnet HA Loss2 without pDMA
-#print "Running Alexnet without pDMA\n";
-#$smart_dma = 0;
-#$detailed_results = 1;
-#`./driver.pl ../alexnet_cifar10/alexnet_layers.txt ../alexnet_cifar10/alexnet_tensors.txt ../alexnet_cifar10/alexnet_confs2.txt ../alexnet_cifar10/alexnet_results2_nodma.csv $smart_dma $detailed_results`;
diff --git a/hpvm/projects/soc_simulator/src/table_generator.py b/hpvm/projects/soc_simulator/src/table_generator.py
deleted file mode 100644
index 528b8e0ef5677cec9ccdba37abfde696544029cc..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/src/table_generator.py
+++ /dev/null
@@ -1,308 +0,0 @@
-import glob
-import os 
-import subprocess
-import shutil 
-import sys
-from collections import defaultdict
-** LayerName NumOpsInLayer <cols>
-OpName Col1Val Col2Val ...
-** Conv1 1 h2f_time h2f_energy fp32_time fp32_energy f2h_time f2h_energy fp16_perf_time fp16_perf_energy fp16_time fp16_energy
-Conv1 51.8808 97.2844 319.582 601.966 12.81 18.758 388.092 650.649 340.037 590.664
-class TableGenerator: 
-    __ops_header_delimiter = "#"
-    __table_header_delimter = "**" 
-    __time_col_name = "time" 
-    __energy_col_name = "energy"
-    '''
-    Stores all precision conversions used. 
-    '''
-    precision_conversions = frozenset(["h2f", "f2h"]) 
-    def __init__(self, network_name, dir_path, soc_ops_file, iters, profiler_binary_name):
-        '''
-        Args:
-            dir_path:               Path of directory containing network binaries
-            iters:                  Number of iterations to run each binary for
-            profiler_binary_name:   Name of offline profiler binary to run 
-        '''
-        self.__network_name = network_name
-        self.__dir_path = dir_path
-        # Name of the actual directory 
-        self.__soc_ops_filename = soc_ops_file
-        self.__iters = iters 
-        self.__profiler_binary_name = profiler_binary_name
-        # Path to results directory 
-        self.__results_dir_path = "%s_results" % self.__network_name
-        # Outputted table file
-        self.__table_filename = "%s_tensors.txt" % self.__network_name
-		# Nested default dictionary of default dicts
-        self.__table = self.__build_nested_default_dict()
-    def generate_table(self):
-        '''
-        Generates a table file called <network_name>_tensors.txt in the following 
-        steps:
-        1. Runs the offline profiler against the inputted binaries to generate
-        results files
-        2. Builds an internal table storing all data from the parsed results files
-        the offline profiler generated
-        3. Writes the internal table to <network_name>_tensors.txt file and uses the 
-        <network_name>_ops.txt file as a guideline in terms of row order 
-        '''
-        #self.__run_inputted_binaries()
-        self.__build_internal_table()
-        self.__output_table_to_file()
-    def __run_inputted_binaries(self):
-        '''
-        Invokes the profiler to run all appropriate binaries (must start with the network 
-        name) in the inputted directory. Result files generated by the profiler are 
-        stored in the results file directory and are named <binary_name>.txt. These results
-        files are then parsed in a later step to generate the table
-        '''
-        if not os.path.isdir(self.__dir_path):
-            print("ERROR: Directory %s not found" % self.__dir_path)
-            exit(1)
-        try:
-            os.mkdir(self.__results_dir_path)
-        except OSError:
-            if os.path.isdir(self.__results_dir_path):
-                print("Directory already exists. Clearing directory.")
-                for old_file in glob.glob(os.path.join(self.__results_dir_path, "*")):
-                    os.remove(old_file)
-            else:
-                print("ERROR: Directory doesn't exist but failed to create dir")
-        for binary_name in os.listdir(self.__dir_path):
-            binary_path = os.path.join(self.__dir_path, binary_name)
-            if not self.__should_execute_file(binary_path):
-                continue
-            output_file = os.path.join(self.__results_dir_path, binary_name + ".txt")
-            # No stdout/stderr piping needed for now
-            subprocess.Popen([profiler_binary_name, binary_path, str(self.__iters), \
-                        output_file]).communicate()
-    def __build_internal_table(self):
-        '''
-        Iterates through each results file generated by the runs of the offline
-        profiler and stores the data in a dictionary in the following format:
-            [operation name][approximation type OR conversion type][time/energy]
-        '''
-        for results_file_name in os.listdir(self.__results_dir_path):
-            # Ignore if it's not a results file
-            if results_file_name == self.__table_filename or \
-                        not results_file_name.startswith(self.__network_name):
-                continue
-            approx_type = self.__get_approximation_type(results_file_name)
-            results_file = open(os.path.join(self.__results_dir_path, results_file_name), "r")
-            for line in results_file:
-                line = line.strip()
-                op_name, total_time, total_energy = self.__parse_tensor_operation_line(line)
-                # If the current operation is f2h or h2f  
-                if any(op_name.endswith(prec_conv) for prec_conv in TableGenerator.precision_conversions):
-                    # Get the original operation name (without the f2h/h2f) and the conversion type 
-                    orig_op_name, conversion_type = self.__get_original_operation_name(op_name)
-                    if orig_op_name not in self.__table:
-                        print("ERROR: Conversion found but original %s is not in the table" % orig_op_name)
-                        exit(1)
-                    # Store f2h and h2f as columns in the row belonging to the original operation
-                    approx_type_no_fp_prefix = approx_type[5 : ]
-                    self.__table[orig_op_name][conversion_type + "_" + approx_type_no_fp_prefix][TableGenerator.__time_col_name] = total_time
-                    self.__table[orig_op_name][conversion_type + "_" + approx_type_no_fp_prefix][TableGenerator.__energy_col_name] = total_energy
-                # Create a new row in the dictionary
-                else:
-                    self.__table[op_name][approx_type][TableGenerator.__time_col_name] = total_time
-                    self.__table[op_name][approx_type][TableGenerator.__energy_col_name] = total_energy
-            results_file.close()
-    def __output_table_to_file(self):
-        '''
-        Outputs the internally stored table to a file using the <network_name>_ops.txt file as
-        a guideline in the following steps:
-        1. Opens the ops file and the file to output the table to
-        2. Reads a line from the ops file (guaranteed to be the layers/NML header)
-        3. For each operation in the layer (or 1 operation if the "layer" is a NML), we store the
-        time and the energy
-        '''
-        table_file_path = os.path.join(self.__results_dir_path, self.__table_filename)
-        soc_operations_file = open(self.__soc_ops_filename, "r")
-        table_file = open(table_file_path, "w")
-        curr_line = soc_operations_file.readline().strip()
-        while curr_line:
-            # First line is always the layers line (#layer_name,num_ops)
-            layer_name, num_ops = self.__parse_layer_info_line(curr_line)
-            # List of strings, where each string is a row corresponding to an operation
-            # in the layer
-            ops_in_layer = []
-            # Stores a list of elements in the header, which will be joined into a string
-            # The header is only generated for the first operation in the layer
-            # CRITICAL ASSUMPTION: All operations within a layer have the same # columns
-            # or everything breaks bc the header is per layer, not per operation
-            header = [TableGenerator.__table_header_delimter, layer_name, str(num_ops)]
-            # Iterate through all operations within the layer 
-            for op_in_layer_count in range(num_ops):
-                # Contains the operation name 
-                curr_line = soc_operations_file.readline().strip()
-                # Stores a list of elements that will be joined to make up a row 
-                curr_op = [curr_line]
-                operation_data = self.__table[curr_line]
-                # Iterate through time/energy data for each approximation type corresponding
-                # to the current operation
-                for approx_type in operation_data:
-                    op_time = operation_data[approx_type][TableGenerator.__time_col_name]
-                    op_energy = operation_data[approx_type][TableGenerator.__energy_col_name]
-                    curr_op.append(op_time)
-                    curr_op.append(op_energy)
-                    if op_in_layer_count == 0:
-                        if approx_type == "fp32_perf20":
-                            header.append("fp32_time")
-                            header.append("fp32_energy")
-                        elif approx_type == "fp16_perf20":
-                            header.append("fp16_time")
-                            header.append("fp16_energy")
-                        elif approx_type.find("f2h_perf20") != -1:
-                            header.append("f2h_time")
-                            header.append("f2h_energy")
-                        else:
-                            header.append("%s_time" % approx_type)
-                            header.append("%s_energy" % approx_type)
-                ops_in_layer.append(' '.join(curr_op))
-            # Getting all operation rows and then writing everything because
-            # calls to write() are slow (memory vs time tradeoff)
-            table_file.write("%s\n%s\n" % (' '.join(header), '\n'.join(ops_in_layer)))
-            curr_line = soc_operations_file.readline().strip()
-    def __should_execute_file(self, file_path):
-        '''
-        Checks if the file at the given file path is a binary that should be run
-        by the profiler. Must exist, be a binary, and must start with the network
-        name as per our naming standards.
-        Args:
-            file_path:          Path of the file to check 
-        '''
-        return os.path.isfile(file_path) and os.access(file_path, os.X_OK) and \
-                file_path.find(self.__network_name) != -1
-    def __get_approximation_type(self, results_filename):
-        '''
-        Parses a given results filename for the approximation type. 
-        Format assumption: <network_name>_<approx_type>.txt
-        Args:
-            results_filename:      Name of results file
-        Returns:
-            the approximation technique (ex: fp16) 
-        '''
-        approx_type_start_ind = results_filename.find("_", results_filename.find("_") + 1) + 1 
-        approx_type_end_ind = results_filename.find(".txt")
-        return results_filename[approx_type_start_ind : approx_type_end_ind] 
-    def __parse_tensor_operation_line(self, tensor_op_line):
-        '''
-        Parses a tensor operation line (within a output file from the offline
-        profiler for the operation name, the total time used, and the total
-        energy used
-        Args:
-            tensor_op_line:        Tensor operation line from output file
-        Returns:
-            operation name
-            total time used
-            total energy used
-        '''
-        line_as_list = tensor_op_line.split(",")
-        return line_as_list[0], line_as_list[1], line_as_list[2] 
-    def __build_nested_default_dict(self):
-        '''
-        Builds a nested default dictionary with an arbitrary number of levels
-        '''
-        return defaultdict(self.__build_nested_default_dict)
-    def __get_original_operation_name(self, op_name):
-        '''
-        Parses an operation name containing _<conversion type> for the original
-        operation name.
-        Format assumption: <original_op_name>_<conversion type>
-        Args:
-            op_name:        Name of the operation
-        Returns:
-            the original operation name 
-        '''
-        underscore_ind = op_name.find("_")
-        return op_name[ : underscore_ind], op_name[underscore_ind + 1 : ]
-    def __parse_layer_info_line(self, layer_info_line): #layer_name,num_ops
-        '''
-        Parses a layer header (from the original ops.txt file) into the layer name
-        and the number of operations
-        Assumed format: #layer_name,num_ops
-        Args:
-            layer_info_line:    Line at the beginning of each layer in the ops file
-        Returns:
-            layer name
-            number of ops in the layer
-        '''
-        comma_ind = layer_info_line.find(",")
-        return layer_info_line[layer_info_line.find(TableGenerator.__ops_header_delimiter) + 1 : comma_ind], \
-                    int(layer_info_line[comma_ind + 1 : ])
-if __name__ == "__main__":
-    if len(sys.argv) != 6:
-        print("python table_generator.py <network name> <binary dir path> <soc_ops file> <num itrs> <profiler bin path>")
-        print("soc ops file: ~/soc_simular/%s_cifar10/%s_ops.txt")
-        exit(1)
-    network_name = sys.argv[1]
-    binary_dir_path = sys.argv[2]
-    soc_ops_file = sys.argv[3]
-    num_iters = int(sys.argv[4]) 
-    profiler_binary_name = sys.argv[5]
-    table_gen = TableGenerator(network_name, binary_dir_path, soc_ops_file, num_iters, profiler_binary_name)
-    table_gen.generate_table()
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_confs1.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_confs1.txt
deleted file mode 100644
index 69a3b7cc2fbbad37d571bf1009d4efbd982ce0f7..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_confs1.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-9 9 9,7,7,9 9 9 9,9 9 9,7,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,8 8 8 8,8 8 8,9 9 9,7,7,8 8 8,7,7,7,8 8 8 8,9 9 9,9 9
-9 9 9,7,7,7,9 9 9,8 8 8,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,7,8 8 8,8 8 8,7,7,9 9 9,7,7,7,9 9 9 9,9 9 9,9 9
-9 9 9,7,7,9 9 9 9,8 8 8,7,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,7,9 9 9,8 8 8,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,8 8 8 8,8 8 8,9 9 9,7,7,8 8 8,7,7,7,8 8 8 8,9 9 9,9 9
-9 9 9,7,7,9 9 9 9,9 9 9,7,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,7,8 8 8,8 8 8,7,7,9 9 9,7,7,7,9 9 9 9,9 9 9,9 9
-9 9 9,7,7,9 9 9 9,8 8 8,7,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,9 9 9 9,9 9 9,7,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,7,9 9 9,8 8 8,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,9 9 9 9,8 8 8,7,7,8 8 8,7,7,8 8 8,8 8 8,7,9 9 9,9 9
-9 9 9,7,7,8 8 8 8,8 8 8,9 9 9,7,7,8 8 8,7,7,7,8 8 8 8,9 9 9,9 9
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_confs2.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_confs2.txt
deleted file mode 100644
index 73bef415abe86b2cfb918aa56ab7a2ba13b9021d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_confs2.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,7,8 8 8,4,7,7,4,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,5,8 8 8,8 8 8 8,7,6,7,7,8 8 8,4,4,4,8 8 8 8,8 8 8,8 8
-9 9 9,7,4,8 8 8 8,7,8 8 8,4,7,8 8 8,4,4,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,7,5,4,7,7,7,4,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,7,7,7,6,7,7,8 8 8,4,4,4,7,8 8 8,8 8
-9 9 9,8 8 8 8,5,7,4,6,8 8 8 8,7,4,7,4,4,7,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,8 8 8,4,4,7,7,8 8 8 8,8 8 8,5,7,8 8 8,8 8
-9 9 9,7,8 8 8,8 8 8 8,7,8 8 8,4,7,7,4,4,8 8 8,6,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,7,4,7,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,7,4,8 8 8 8,7,6,4,7,8 8 8,4,7,4,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,7,4,7,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,7,5,4,7,7,7,4,8 8 8,7,8 8 8,8 8
-9 9 9,7,4,8 8 8 8,7,8 8 8,4,7,8 8 8,4,4,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,8 8 8,4,4,7,7,8 8 8 8,8 8 8,5,7,8 8 8,8 8
-9 9 9,5,8 8 8,8 8 8 8,7,6,7,7,8 8 8,4,4,4,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,5,7,4,6,8 8 8 8,7,4,7,4,4,7,8 8 8,8 8
-9 9 9,7,8 8 8,8 8 8 8,7,8 8 8,4,7,7,4,4,8 8 8,6,8 8 8,8 8
-9 9 9,7,4,8 8 8 8,7,6,4,7,8 8 8,4,7,4,8 8 8 8,8 8 8,8 8
-9 9 9,5,5,8 8 8 8,4,6,4,7,8 8 8,4,4,4,7,8 8 8,8 8
-9 9 9,8 8 8 8,7,7,7,6,7,7,8 8 8,4,4,4,7,8 8 8,8 8
-9 9 9,7,4,8 8 8 8,7,8 8 8,8 8 8 8,7,7,4,4,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,8 8 8,4,4,7,7,8 8 8 8,8 8 8,5,7,8 8 8,8 8
-9 9 9,8 8 8 8,7,7,7,6,7,7,8 8 8,4,4,4,7,8 8 8,8 8
-9 9 9,8 8 8 8,5,8 8 8 8,8 8 8,8 8 8,7,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,5,8 8 8,8 8 8 8,7,6,7,7,8 8 8,4,4,4,8 8 8 8,8 8 8,8 8
-9 9 9,7,8 8 8,8 8 8 8,7,8 8 8,4,7,7,4,4,8 8 8,6,8 8 8,8 8
-9 9 9,8 8 8 8,5,7,4,6,8 8 8 8,7,4,7,4,4,7,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,7,4,7,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,4,8 8 8 8,7,5,4,7,7,7,4,8 8 8,7,8 8 8,8 8
-9 9 9,7,4,8 8 8 8,7,6,4,7,8 8 8,4,7,4,8 8 8 8,8 8 8,8 8
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_fp16.csv b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_fp16.csv
deleted file mode 100644
index 04c4cfc4efb2b0fe6f94ddc332d356ba2966da72..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_fp16.csv
+++ /dev/null
@@ -1,148 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_fp32.csv b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_fp32.csv
deleted file mode 100644
index 2e203cf73d4f5220f9f3217398c952496028fb62..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_fp32.csv
+++ /dev/null
@@ -1,50 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_layers.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_layers.txt
deleted file mode 100644
index af6469192145b246beaec42cf42a6629e5ed1a93..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_layers.txt
+++ /dev/null
@@ -1,15 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_naive1.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_naive1.txt
deleted file mode 100644
index 90a09b7a87bfde672e576b124225a9efbebe069c..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_naive1.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-8 8 8,7,7,7,7,7,7,7,7,7,7,7,7,7,7
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_naive_results1.csv b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_naive_results1.csv
deleted file mode 100644
index 3036f34f281fb2142622187df7d369b1a9dbcd3e..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_naive_results1.csv
+++ /dev/null
@@ -1,14 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_ops.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_ops.txt
deleted file mode 100644
index 2075774fde3e66afd1a1946cac46b87038a6486f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_ops.txt
+++ /dev/null
@@ -1,64 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_confs1.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_confs1.txt
deleted file mode 100644
index e7b8720b064ac873815b3222371f587b6d3eace9..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_confs1.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-8 8 8,8 8 8 8,8 8 8,7,7,5,7,6,8 8 8,7,7,6,7,7,7
-8 8 8,5,8 8 8,7,7,6,8 8 8 8,8 8 8,7,5,8 8 8,6,7,9 9 9,9 9
-9 9 9,5,9 9 9,9 9 9 9,6,6,9 9 9 9,6,6,8 8 8 8,9 9 9,6,9 9 9 9,6,8 8
-8 8 8,8 8 8 8,8 8 8,7,6,5,7,7,5,5,5,8 8 8,7,7,5
-8 8 8,8 8 8 8,8 8 8,7,7,5,9 9 9 9,6,5,5,7,6,7,7,5
-8 8 8,8 8 8 8,8 8 8,5,7,5,7,8 8 8,6,5,7,6,7,9 9 9,7
-8 8 8,6,8 8 8,8 8 8 8,7,8 8 8,7,6,9 9 9,7,8 8 8,8 8 8,9 9 9 9,9 9 9,6
-9 9 9,8 8 8 8,8 8 8,7,7,6,7,6,5,9 9 9 9,7,7,9 9 9 9,7,7
-8 8 8,8 8 8 8,8 8 8,7,7,5,7,7,5,5,7,6,8 8 8 8,8 8 8,8 8
-9 9 9,9 9 9 9,8 8 8,7,7,5,7,6,7,5,7,9 9 9,7,7,5
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,9 9 9 9,7,6,9 9 9 9,7,5,8 8 8 8,6,8 8
-8 8 8,8 8 8 8,8 8 8,7,5,5,7,6,9 9 9,6,7,6,9 9 9 9,7,7
-8 8 8,8 8 8 8,8 8 8,7,7,8 8 8,8 8 8 8,6,5,5,5,6,7,9 9 9,7
-9 9 9,8 8 8 8,8 8 8,5,5,5,9 9 9 9,6,9 9 9,6,7,6,7,7,6
-9 9 9,8 8 8 8,8 8 8,7,7,9 9 9,9 9 9 9,6,5,5,7,6,7,7,6
-8 8 8,8 8 8 8,8 8 8,7,5,5,7,6,9 9 9,9 9 9 9,7,6,9 9 9 9,7,7
-9 9 9,8 8 8 8,8 8 8,7,7,8 8 8,8 8 8 8,6,7,5,7,6,7,7,6
-8 8 8,8 8 8 8,8 8 8,7,7,6,7,6,7,5,8 8 8,6,7,7,7
-8 8 8,8 8 8 8,8 8 8,7,7,5,7,6,5,5,9 9 9,6,7,7,7
-8 8 8,8 8 8 8,8 8 8,7,5,5,7,9 9 9,5,5,7,6,8 8 8 8,7,5
-9 9 9,8 8 8 8,8 8 8,5,6,6,8 8 8 8,6,6,7,7,5,8 8 8 8,7,8 8
-9 9 9,9 9 9 9,8 8 8,5,6,6,8 8 8 8,7,6,7,7,5,8 8 8 8,7,8 8
-8 8 8,8 8 8 8,8 8 8,7,7,5,7,9 9 9,7,5,7,9 9 9,7,9 9 9,7
-8 8 8,8 8 8 8,8 8 8,8 8 8 8,7,5,8 8 8 8,9 9 9,5,5,7,8 8 8,7,7,7
-8 8 8,8 8 8 8,8 8 8,7,5,8 8 8,6,8 8 8,7,5,6,9 9 9,7,5,7
-8 8 8,8 8 8 8,8 8 8,7,7,8 8 8,9 9 9 9,9 9 9,5,5,7,6,7,7,7
-9 9 9,8 8 8 8,8 8 8,6,7,5,9 9 9 9,6,7,5,7,9 9 9,7,7,8 8
-8 8 8,8 8 8 8,9 9 9,8 8 8 8,8 8 8,8 8 8,9 9 9 9,5,8 8 8,5,7,6,8 8 8 8,5,6
-9 9 9,8 8 8 8,8 8 8,7,7,7,9 9 9 9,6,7,5,7,8 8 8,7,7,7
-8 8 8,9 9 9 9,8 8 8,7,5,8 8 8,9 9 9 9,7,8 8 8,8 8 8 8,7,7,9 9 9 9,6,7
-8 8 8,8 8 8 8,9 9 9,8 8 8 8,8 8 8,8 8 8,9 9 9 9,5,8 8 8,5,7,6,8 8 8 8,5,7
-9 9 9,8 8 8 8,8 8 8,5,9 9 9,9 9 9,6,6,9 9 9,6,9 9 9,6,5,7,7
-8 8 8,8 8 8 8,8 8 8,7,7,5,8 8 8 8,5,5,7,9 9 9,6,7,7,8 8
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,5,5,6,6,5,7,8 8 8,8 8 8,9 9 9 9,5,6
-8 8 8,9 9 9 9,9 9 9,7,6,5,6,9 9 9,6,8 8 8 8,8 8 8,6,7,7,8 8
-8 8 8,8 8 8 8,9 9 9,5,5,7,7,8 8 8,9 9 9,5,6,9 9 9,9 9 9 9,7,7
-8 8 8,8 8 8 8,8 8 8,7,7,5,7,9 9 9,5,5,7,8 8 8,8 8 8 8,9 9 9,7
-8 8 8,8 8 8 8,8 8 8,7,7,9 9 9,8 8 8 8,9 9 9,5,8 8 8 8,8 8 8,6,7,7,7
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,8 8 8,9 9 9,6,6,7,7,8 8 8,8 8 8,9 9 9 9,5,6
-8 8 8,8 8 8 8,8 8 8,7,5,9 9 9,8 8 8 8,9 9 9,5,6,9 9 9,8 8 8,5,7,7
-8 8 8,9 9 9 9,8 8 8,6,7,6,9 9 9 9,8 8 8,6,9 9 9 9,9 9 9,5,5,6,5
-8 8 8,8 8 8 8,9 9 9,8 8 8 8,8 8 8,8 8 8,8 8 8 8,6,8 8 8,9 9 9 9,9 9 9,6,8 8 8 8,7,6
-8 8 8,8 8 8 8,8 8 8,6,7,7,6,8 8 8,6,8 8 8 8,8 8 8,9 9 9,6,6,8 8
-8 8 8,9 9 9 9,8 8 8,6,7,6,9 9 9 9,8 8 8,6,9 9 9 9,9 9 9,8 8 8,9 9 9 9,6,5
-8 8 8,9 9 9 9,8 8 8,9 9 9 9,7,5,9 9 9 9,8 8 8,6,9 9 9 9,9 9 9,8 8 8,9 9 9 9,9 9 9,5
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,7,6,9 9 9 9,8 8 8,7,9 9 9 9,9 9 9,8 8 8,9 9 9 9,9 9 9,5
-9 9 9,8 8 8 8,8 8 8,9 9 9 9,7,7,9 9 9 9,8 8 8,6,9 9 9 9,9 9 9,8 8 8,9 9 9 9,6,5
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_confs2.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_confs2.txt
deleted file mode 100644
index f846e81c4b2ab1c7681debe4ec84de99be49fade..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_confs2.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-8 8 8,7,8 8 8,6,5,5,6,6,7,7,6,5,5,9 9 9,5
-8 8 8,5,5,6,5,9 9 9,6,6,7,7,6,5,5,6,5
-8 8 8,7,8 8 8,8 8 8 8,5,5,8 8 8 8,6,7,7,6,5,7,6,5
-8 8 8,8 8 8 8,5,6,5,5,9 9 9 9,6,7,7,6,5,9 9 9 9,6,5
-9 9 9,7,5,6,5,5,9 9 9 9,6,7,6,6,5,5,6,5
-8 8 8,8 8 8 8,5,6,7,5,6,6,7,7,9 9 9,5,5,6,5
-9 9 9,7,7,5,6,7,6,7,9 9 9,7,5,7,5,6,8 8
-8 8 8,7,5,6,5,5,8 8 8 8,6,7,7,6,5,5,9 9 9,5
-8 8 8,5,8 8 8,7,9 9 9,8 8 8,7,7,5,6,6,7,6,6,6
-8 8 8,8 8 8 8,6,6,6,7,6,6,7,5,9 9 9,5,9 9 9 9,9 9 9,6
-8 8 8,9 9 9 9,7,7,8 8 8,6,9 9 9 9,9 9 9,7,5,6,5,5,7,9 9
-8 8 8,7,7,6,5,5,6,7,7,7,8 8 8,5,5,6,5
-8 8 8,7,5,6,5,5,7,8 8 8,7,7,6,9 9 9,5,6,9 9
-8 8 8,8 8 8 8,9 9 9,9 9 9 9,6,6,7,6,7,6,5,5,5,6,7
-9 9 9,7,6,6,5,5,6,7,6,5,5,9 9 9,8 8 8 8,5,6
-8 8 8,6,6,6,5,8 8 8,7,8 8 8,8 8 8,7,5,9 9 9,5,9 9 9,6
-8 8 8,6,5,6,5,9 9 9,8 8 8 8,6,7,7,6,5,9 9 9 9,5,5
-8 8 8,7,5,6,5,5,6,9 9 9,7,5,6,9 9 9,5,6,6
-9 9 9,7,5,7,5,5,6,6,7,6,9 9 9,5,9 9 9 9,6,5
-9 9 9,6,5,7,9 9 9,9 9 9,5,7,8 8 8,5,7,5,5,6,7
-8 8 8,7,5,6,5,5,6,6,7,5,6,9 9 9,9 9 9 9,6,5
-9 9 9,6,8 8 8,8 8 8 8,5,7,5,7,7,5,9 9 9,6,9 9 9 9,5,7
-9 9 9,7,6,6,5,5,6,7,6,8 8 8 8,5,9 9 9,8 8 8 8,5,6
-9 9 9,5,5,8 8 8 8,5,8 8 8,7,5,9 9 9,5,9 9 9,7,8 8 8 8,8 8 8,5
-8 8 8,7,5,6,5,6,9 9 9 9,9 9 9,7,7,6,9 9 9,6,6,5
-8 8 8,6,5,6,5,8 8 8,9 9 9 9,7,7,7,9 9 9,5,5,6,5
-8 8 8,6,6,6,7,8 8 8,8 8 8 8,9 9 9,7,7,8 8 8,7,6,5,5
-8 8 8,6,6,7,7,8 8 8,8 8 8 8,9 9 9,6,7,8 8 8,7,6,5,5
-8 8 8,7,6,6,5,7,6,7,6,7,8 8 8,5,9 9 9 9,6,9 9
-8 8 8,7,7,7,6,9 9 9,9 9 9 9,7,9 9 9,7,6,9 9 9,6,6,6
-8 8 8,7,6,7,5,7,9 9 9 9,5,6,7,8 8 8,7,7,8 8 8,5
-9 9 9,6,5,7,7,6,5,5,6,8 8 8 8,6,8 8 8,8 8 8 8,5,6
-8 8 8,6,5,5,8 8 8,5,8 8 8 8,7,9 9 9,6,9 9 9,9 9 9,5,7,6
-8 8 8,7,5,6,5,5,6,6,7,8 8 8 8,6,9 9 9,8 8 8 8,6,6
-8 8 8,7,5,7,7,6,5,6,6,8 8 8 8,9 9 9,5,7,5,6
-8 8 8,7,6,7,5,7,5,8 8 8,5,6,8 8 8,8 8 8,5,6,7
-9 9 9,8 8 8 8,5,8 8 8 8,9 9 9,8 8 8,7,5,9 9 9,7,8 8 8,7,6,8 8 8,5
-8 8 8,7,6,7,5,7,5,8 8 8,5,6,8 8 8,8 8 8,5,8 8 8,7
-9 9 9,6,9 9 9,5,6,9 9 9,7,8 8 8,6,5,7,7,5,7,7
-8 8 8,6,6,6,8 8 8,9 9 9,8 8 8 8,6,6,8 8 8 8,7,6,6,6,5
-8 8 8,7,9 9 9,6,5,8 8 8,9 9 9 9,6,7,5,6,5,5,6,5
-8 8 8,5,8 8 8,6,5,5,6,6,9 9 9,7,9 9 9,5,5,6,5
-8 8 8,6,9 9 9,6,5,8 8 8,6,6,7,7,7,8 8 8,5,6,8 8
-8 8 8,6,5,9 9 9 9,9 9 9,6,7,7,7,9 9 9 9,8 8 8,5,8 8 8 8,6,8 8
-8 8 8,7,8 8 8,6,5,5,8 8 8 8,6,7,7,6,8 8 8,5,6,5
-8 8 8,6,8 8 8,6,6,6,7,7,6,5,9 9 9,5,9 9 9 9,7,8 8
-9 9 9,7,8 8 8,6,5,5,9 9 9 9,6,5,9 9 9 9,7,6,8 8 8 8,6,9 9
-8 8 8,7,9 9 9,7,5,5,9 9 9 9,7,7,7,6,8 8 8,5,8 8 8,8 8
-8 8 8,6,8 8 8,7,9 9 9,8 8 8,7,9 9 9,9 9 9,6,6,7,8 8 8 8,6,5
-9 9 9,6,9 9 9,7,9 9 9,8 8 8,5,7,8 8 8,9 9 9 9,8 8 8,5,5,6,8 8
-8 8 8,6,5,8 8 8 8,8 8 8,8 8 8,5,8 8 8,6,7,8 8 8,7,6,5,9 9
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_results1.csv b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_results1.csv
deleted file mode 100644
index 3cc252f7f64c7b9fecac6fd7a7793d5560c18285..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_results1.csv
+++ /dev/null
@@ -1,572 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_results2.csv b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_results2.csv
deleted file mode 100644
index 3371feb04844bd486683dbd9d19495a0aa255099..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_promise_results2.csv
+++ /dev/null
@@ -1,616 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_results1.csv b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_results1.csv
deleted file mode 100644
index fa38a02ff27be3de0da8d5ce5598fcea3f304bb2..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_results1.csv
+++ /dev/null
@@ -1,209 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_results2.csv b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_results2.csv
deleted file mode 100644
index 03e64efef1e92da5898265ce6982aebb2ff9a7c7..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_results2.csv
+++ /dev/null
@@ -1,385 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_tensors.txt b/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_tensors.txt
deleted file mode 100644
index 6c6b42b93c0446c298489429261592fe99e2f81b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar10/vgg16_tensors.txt
+++ /dev/null
@@ -1,64 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_confs1.txt b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_confs1.txt
deleted file mode 100644
index 2c88b81aaa9620b01f75897e5e082dd78c1d3d57..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_confs1.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,7,7,8 8 8,8 8 8 8,7,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,7,8 8 8,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,7,8 8 8,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,7,7,8 8 8,8 8 8 8,7,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,7,8 8 8,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,7,8 8 8,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,7,8 8 8,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,7,7,8 8 8,8 8 8 8,7,7,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,7,8 8 8,7,8 8 8 8,8 8 8,8 8
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_confs2.txt b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_confs2.txt
deleted file mode 100644
index 8dacc6e3fe910098503d504feece0b5ecd1753dc..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_confs2.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,7,8 8 8,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,7,8 8 8,8 8 8,8 8 8 8,6,6,7,8 8 8,7,6,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,7,7,8 8 8,7,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,7,8 8 8,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,7,8 8 8,8 8 8,8 8 8 8,6,6,7,8 8 8,7,6,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,7,7,8 8 8,7,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,7,8 8 8,7,8 8 8,8 8 8,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,7,8 8 8,8 8 8,8 8 8 8,6,6,7,8 8 8,7,6,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,7,7,8 8 8,7,7,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,8 8
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_fp16.csv b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_fp16.csv
deleted file mode 100644
index 04c4cfc4efb2b0fe6f94ddc332d356ba2966da72..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_fp16.csv
+++ /dev/null
@@ -1,148 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_fp32.csv b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_fp32.csv
deleted file mode 100644
index 2e203cf73d4f5220f9f3217398c952496028fb62..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_fp32.csv
+++ /dev/null
@@ -1,50 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_layers.txt b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_layers.txt
deleted file mode 100644
index af6469192145b246beaec42cf42a6629e5ed1a93..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_layers.txt
+++ /dev/null
@@ -1,15 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_ops.txt b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_ops.txt
deleted file mode 100644
index 2075774fde3e66afd1a1946cac46b87038a6486f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_ops.txt
+++ /dev/null
@@ -1,64 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_confs1.txt b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_confs1.txt
deleted file mode 100644
index e6989e16ee3869ecb13ecc9f73af7f9a66c24dee..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_confs1.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-9 9 9,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,5,7,8 8 8,5,8 8 8 8,6,7
-9 9 9,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,5,7,8 8 8,8 8 8,6,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,5,7,8 8 8,6,8 8 8 8,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,5,7,8 8 8,5,8 8 8 8,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,5,7,8 8 8,5,6,6,5
-9 9 9,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,5,8 8 8 8,8 8 8,8 8 8,6,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,8 8 8 8,8 8 8,5,5,8 8 8,5,6,8 8 8,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,7,7,8 8 8,8 8 8,7,8 8 8,5,6,8 8 8,8 8
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,8 8 8,7,8 8 8,5,6,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,5,5,8 8 8 8,8 8 8,8 8 8,8 8 8 8,5,5,7,7,6
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,5,5,8 8 8,5,6,5,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,6,8 8 8 8,8 8 8,8 8 8,8 8 8 8,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,8 8 8 8,8 8 8,5,7,8 8 8,5,7,6,6
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,5,7,8 8 8,5,6,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,5,8 8 8 8,8 8 8,5,5,6,7
-8 8 8,8 8 8 8,8 8 8,8 8 8 8,6,6,8 8 8 8,8 8 8,6,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,6
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,5,8 8 8 8,8 8 8,5,6,5,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,8 8 8,7,8 8 8,8 8 8,6,6,7
-9 9 9,9 9 9 9,8 8 8,8 8 8 8,7,7,6,8 8 8,8 8 8,8 8 8 8,8 8 8,7,5,8 8 8,8 8
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,6,8 8 8 8,8 8 8,5,5,6,7
-8 8 8,8 8 8 8,8 8 8,8 8 8 8,6,8 8 8,8 8 8 8,8 8 8,6,8 8 8 8,8 8 8,8 8 8,5,6,6
-8 8 8,8 8 8 8,8 8 8,8 8 8 8,7,8 8 8,8 8 8 8,8 8 8,6,8 8 8 8,5,8 8 8,8 8 8 8,6,6
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,6,8 8 8,8 8 8 8,8 8 8,8 8 8,7,7,5,7,7,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,5,8 8 8,8 8 8 8,8 8 8,8 8 8,6,8 8 8,7,6,8 8 8,5
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,7,5,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8
-8 8 8,8 8 8 8,8 8 8,8 8 8 8,8 8 8,5,5,8 8 8,8 8 8,8 8 8 8,7,8 8 8,8 8 8 8,5,8 8
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,7,5,8 8 8,8 8 8,8 8 8 8,8 8 8,7,7,7,8 8
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,7,7,5,6,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,8 8 8,8 8 8 8,8 8 8,5,5,6,5
-9 9 9,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,7,8 8 8,8 8 8,8 8 8 8,8 8 8,6,6,6,7
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,8 8 8,6,8 8 8 8,8 8 8,8 8 8,7,8 8 8,5,8 8 8 8,6,7
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,7,6,8 8 8,5
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_confs2.txt b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_confs2.txt
deleted file mode 100644
index 4732b0c143c403151aa411ec9b97007a2fc4898b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_confs2.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-9 9 9,9 9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9 9,9 9 9 9,9 9 9,9 9
-9 9 9,7,8 8 8,9 9 9 9,9 9 9,5,9 9 9 9,8 8 8,8 8 8,8 8 8 8,8 8 8,5,6,9 9 9,8 8
-9 9 9,7,8 8 8,9 9 9 9,9 9 9,6,8 8 8 8,8 8 8,7,5,8 8 8,9 9 9,8 8 8 8,7,8 8
-9 9 9,7,8 8 8,9 9 9 9,9 9 9,5,8 8 8 8,8 8 8,9 9 9,8 8 8 8,8 8 8,9 9 9,7,7,8 8
-8 8 8,7,8 8 8,9 9 9 9,7,7,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,7,6,7,8 8
-8 8 8,8 8 8 8,9 9 9,7,7,7,9 9 9 9,8 8 8,5,5,6,8 8 8,9 9 9 9,5,7
-9 9 9,9 9 9 9,8 8 8,9 9 9 9,8 8 8,8 8 8,8 8 8 8,7,5,6,7,7,7,8 8 8,8 8
-9 9 9,7,8 8 8,9 9 9 9,6,5,9 9 9 9,8 8 8,7,8 8 8 8,8 8 8,9 9 9,6,7,7
-9 9 9,9 9 9 9,8 8 8,7,7,9 9 9,9 9 9 9,9 9 9,7,7,8 8 8,6,5,9 9 9,7
-9 9 9,7,8 8 8,9 9 9 9,7,6,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,9 9 9,6,5,8 8
-9 9 9,7,8 8 8,9 9 9 9,7,6,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,9 9 9,6,7,8 8
-9 9 9,7,8 8 8,9 9 9 9,6,9 9 9,8 8 8 8,8 8 8,8 8 8,5,8 8 8,9 9 9,6,7,8 8
-9 9 9,9 9 9 9,8 8 8,9 9 9 9,5,5,8 8 8 8,7,7,8 8 8 8,9 9 9,9 9 9,9 9 9 9,7,8 8
-9 9 9,9 9 9 9,8 8 8,7,6,9 9 9,8 8 8 8,9 9 9,7,8 8 8 8,5,9 9 9,6,7,8 8
-9 9 9,8 8 8 8,9 9 9,8 8 8 8,8 8 8,9 9 9,9 9 9 9,5,6,6,8 8 8,6,5,6,6
-8 8 8,8 8 8 8,9 9 9,8 8 8 8,5,9 9 9,7,9 9 9,7,8 8 8 8,7,6,5,7,6
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,5,6,7,8 8 8,7,5,8 8 8,6,5,5,9 9
-8 8 8,8 8 8 8,9 9 9,8 8 8 8,9 9 9,8 8 8,5,5,8 8 8,9 9 9 9,6,8 8 8,6,5,8 8
-9 9 9,7,8 8 8,9 9 9 9,7,5,8 8 8 8,8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,6,7,6
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,9 9 9,8 8 8,7,9 9 9,5,8 8 8 8,8 8 8,6,6,8 8 8,5
-9 9 9,8 8 8 8,8 8 8,9 9 9 9,7,9 9 9,7,9 9 9,7,9 9 9 9,6,7,6,5,8 8
-9 9 9,8 8 8 8,9 9 9,8 8 8 8,9 9 9,8 8 8,5,7,7,9 9 9 9,6,8 8 8,9 9 9 9,6,6
-9 9 9,9 9 9 9,8 8 8,5,6,5,8 8 8 8,8 8 8,7,8 8 8 8,8 8 8,9 9 9,6,5,8 8
-8 8 8,8 8 8 8,8 8 8,9 9 9 9,8 8 8,8 8 8,7,7,8 8 8,8 8 8 8,8 8 8,5,6,9 9 9,8 8
-9 9 9,8 8 8 8,8 8 8,9 9 9 9,8 8 8,8 8 8,7,7,5,8 8 8 8,8 8 8,6,6,9 9 9,5
-8 8 8,8 8 8 8,8 8 8,8 8 8 8,6,5,7,8 8 8,7,5,8 8 8,8 8 8,8 8 8 8,8 8 8,9 9
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,9 9 9,5,6,8 8 8,8 8 8 8,8 8 8,9 9 9,6,6,8 8
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,9 9 9,5,6,8 8 8,8 8 8 8,8 8 8,9 9 9,7,9 9 9,8 8
-9 9 9,8 8 8 8,8 8 8,9 9 9 9,9 9 9,8 8 8,7,7,8 8 8,8 8 8 8,8 8 8,6,6,9 9 9,7
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,8 8 8,9 9 9,5,7,8 8 8,8 8 8 8,8 8 8,9 9 9,6,6,8 8
-9 9 9,9 9 9 9,9 9 9,8 8 8 8,7,7,8 8 8 8,8 8 8,7,7,7,9 9 9,6,7,5
-8 8 8,9 9 9 9,9 9 9,7,9 9 9,8 8 8,9 9 9 9,9 9 9,9 9 9,6,9 9 9,7,8 8 8 8,7,6
-8 8 8,9 9 9 9,9 9 9,7,8 8 8,5,8 8 8 8,9 9 9,8 8 8,5,8 8 8,6,6,7,7
-8 8 8,9 9 9 9,8 8 8,7,9 9 9,8 8 8,9 9 9 9,9 9 9,9 9 9,6,9 9 9,7,8 8 8 8,7,6
-9 9 9,8 8 8 8,8 8 8,8 8 8 8,6,5,9 9 9 9,9 9 9,7,7,8 8 8,7,7,6,9 9
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,5,9 9 9,5,6,9 9 9,9 9 9 9,8 8 8,6,8 8 8 8,5,9 9
-8 8 8,8 8 8 8,8 8 8,8 8 8 8,8 8 8,8 8 8,9 9 9 9,5,8 8 8,9 9 9 9,6,8 8 8,7,6,8 8
-8 8 8,9 9 9 9,8 8 8,9 9 9 9,9 9 9,6,6,8 8 8,7,5,8 8 8,9 9 9,6,7,5
-9 9 9,9 9 9 9,8 8 8,9 9 9 9,8 8 8,7,7,8 8 8,6,7,8 8 8,9 9 9,7,5,9 9
-9 9 9,9 9 9 9,9 9 9,8 8 8 8,6,5,9 9 9 9,9 9 9,6,8 8 8 8,6,9 9 9,6,9 9 9,9 9
-8 8 8,9 9 9 9,9 9 9,8 8 8 8,8 8 8,6,6,9 9 9,7,7,8 8 8,5,8 8 8 8,6,7
-9 9 9,9 9 9 9,8 8 8,9 9 9 9,9 9 9,8 8 8,8 8 8 8,7,7,8 8 8 8,8 8 8,8 8 8,5,7,8 8
-8 8 8,9 9 9 9,8 8 8,9 9 9 9,8 8 8,6,6,8 8 8,7,7,8 8 8,9 9 9,8 8 8 8,7,9 9
-9 9 9,9 9 9 9,8 8 8,9 9 9 9,7,5,8 8 8 8,8 8 8,7,9 9 9 9,8 8 8,9 9 9,6,9 9 9,8 8
-9 9 9,9 9 9 9,8 8 8,9 9 9 9,8 8 8,7,8 8 8 8,8 8 8,6,7,7,7,7,7,8 8
-9 9 9,8 8 8 8,8 8 8,9 9 9 9,8 8 8,7,6,8 8 8,7,8 8 8 8,7,6,6,7,8 8
-9 9 9,9 9 9 9,8 8 8,6,9 9 9,5,8 8 8 8,9 9 9,8 8 8,8 8 8 8,8 8 8,9 9 9,6,7,8 8
-8 8 8,9 9 9 9,8 8 8,8 8 8 8,9 9 9,7,5,9 9 9,7,8 8 8 8,9 9 9,8 8 8,8 8 8 8,6,6
-9 9 9,9 9 9 9,8 8 8,9 9 9 9,9 9 9,5,8 8 8 8,8 8 8,7,6,8 8 8,7,6,7,6
-8 8 8,9 9 9 9,8 8 8,9 9 9 9,8 8 8,6,6,9 9 9,9 9 9,7,8 8 8,5,8 8 8 8,8 8 8,8 8
-9 9 9,8 8 8 8,8 8 8,9 9 9 9,6,9 9 9,8 8 8 8,8 8 8,7,7,8 8 8,9 9 9,6,7,8 8
-9 9 9,8 8 8 8,8 8 8,9 9 9 9,9 9 9,5,8 8 8 8,8 8 8,7,7,8 8 8,9 9 9,6,8 8 8,8 8
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_results1.csv b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_results1.csv
deleted file mode 100644
index f5825603b85a2a5a4313c427ce35c6c0f97996c4..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_results1.csv
+++ /dev/null
@@ -1,407 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_results2.csv b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_results2.csv
deleted file mode 100644
index d02d164d68835e458f2902e02bbd6c3acff2e1c7..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_promise_results2.csv
+++ /dev/null
@@ -1,616 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_results1.csv b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_results1.csv
deleted file mode 100644
index a03d6b7ac070f7afab0e1cd4a25ba2bd3913415d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_results1.csv
+++ /dev/null
@@ -1,187 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_results2.csv b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_results2.csv
deleted file mode 100644
index ef1faa1dec83dc5120ac7f2c1810f0cad3edbb1b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_results2.csv
+++ /dev/null
@@ -1,187 +0,0 @@
-Compute Energy
-Compute Time
-Leakage Energy
-Memory Energy
-Memory Time
-Patch Energy
-Quantization Energy
-Quantization Time
-Unpatch Energy
diff --git a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_tensors.txt b/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_tensors.txt
deleted file mode 100644
index 6c6b42b93c0446c298489429261592fe99e2f81b..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_cifar100/vgg16_tensors.txt
+++ /dev/null
@@ -1,64 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_imagenet/layer_composition.txt b/hpvm/projects/soc_simulator/vgg16_imagenet/layer_composition.txt
deleted file mode 100644
index 13c3c9e2cf89e0226186ed60786fec35dce84c42..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_imagenet/layer_composition.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-conv  add  activation  
-conv  add  activation  pool  
-conv  add  activation  
-conv  add  activation  pool  
-conv  add  activation  
-conv  add  activation  
-conv  add  activation  pool  
-conv  add  activation  
-conv  add  activation  
-conv  add  activation  pool  
-conv  add  activation  
-conv  add  activation  
-conv  add  activation  pool  
-dense  add  activation  
-dense  add  activation  
-dense  add  
diff --git a/hpvm/projects/soc_simulator/vgg16_imagenet/layers.txt b/hpvm/projects/soc_simulator/vgg16_imagenet/layers.txt
deleted file mode 100644
index cb5f0702ddef804cf3def37b4cca013b566bfd7f..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_imagenet/layers.txt
+++ /dev/null
@@ -1,16 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_imagenet/vgg16_imagenet_layers.txt b/hpvm/projects/soc_simulator/vgg16_imagenet/vgg16_imagenet_layers.txt
deleted file mode 100644
index 227a16d6dc95423b9606732c57ae513e9d37d15c..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_imagenet/vgg16_imagenet_layers.txt
+++ /dev/null
@@ -1,16 +0,0 @@
diff --git a/hpvm/projects/soc_simulator/vgg16_imagenet/vgg16_imagenet_ops.txt b/hpvm/projects/soc_simulator/vgg16_imagenet/vgg16_imagenet_ops.txt
deleted file mode 100644
index 95c6f0c5878344f5c2f4e911d75026e2a3b7bc4d..0000000000000000000000000000000000000000
--- a/hpvm/projects/soc_simulator/vgg16_imagenet/vgg16_imagenet_ops.txt
+++ /dev/null
@@ -1,68 +0,0 @@
diff --git a/hpvm/projects/torch2hpvm/README.md b/hpvm/projects/torch2hpvm/README.md
index 719abc5e3c2346fd7bc68972ee4f9035af307db1..1f06142f524b34760f1fffc84a5b2a2f07bf23a3 100644
--- a/hpvm/projects/torch2hpvm/README.md
+++ b/hpvm/projects/torch2hpvm/README.md
@@ -1,25 +1,111 @@
-## Importing Conda Environment:
+# PyTorch Frontend for HPVM
-conda env create -f onnx\_environment.yml
+`torch2hpvm` is a PyTorch frontend for HPVM. It provides an API that
-## Activate/deactivate Conda Environment
+- Generates a PyTorch `module` into HPVM-C code;
+- Exports a PyTorch dataset to ApproxHPVM dataset format;
+- Compiles the generated code into binary by invoking HPVM automatically.
-conda activate onnx\_frontend
+## Installation
-## Building and Installing Frontend for ONNX:
+`pip` is the recommended package manager (also available within `conda`).
+Using `pip`:
-python setup.py build
+pip install -e ./
+## Getting Started
+Let's look at an example that uses DNNs and weights pre-shipped with HPVM.
+This is found at `hpvm/test/dnn_benchmarks/pytorch/test_frontend.py`.
+*Note* that below we'll be working under directory `hpvm/test/dnn_benchmarks/pytorch`.
-python setup.py install
+We'll be generating ResNet-18 into an HPVM-compiled binary.
+First, prepare 2 datasets for autotuning and testing.
-### How to Run
+from torch2hpvm import BinDataset
+from pathlib import Path
+data_dir = Path(__file__).parent / "../model_params/resnet18_cifar10"
+dataset_shape = 5000, 3, 32, 32
+tuneset = BinDataset(data_dir / "tune_input.bin", data_dir / "tune_labels.bin", dataset_shape)
+testset = BinDataset(data_dir / "test_input.bin", data_dir / "test_labels.bin", dataset_shape)
-python main.py
+`BinDataset` is a dataset created over files of ApproxHPVM dataset format.
+Any instance of `torch.utils.data.Dataset` can be used here.
+*Note* that each `module` is bound to 2 datasets: a "tune" and a "test" set.
+The generated binary accepts an argument to be either the string "tune" or "test",
+and performs inference over a dataset accordingly.
+This is because the dataset can contain arbitrary Python code which cannot yet be exported into HPVM-C;
+instead the frontend has to export some predefined datasets for the model to use.
+See TODOs (1).
+Create a DNN `module` and load the checkpoint:
+import torch
+from torch.nn import Module
+import dnn  # Defined at `hpvm/test/dnn_benchmarks/pytorch`
+model: Module = dnn.ResNet18()
+checkpoint = Path(__file__).parent / "../model_params/resnet18_cifar10.pth.tar"
+Any `torch.nn.Module` can be similarly used,
+as long as it only contains the tensor operators supported in HPVM
+(see "Supported Operators" and TODOs (2)).
+Now we are ready to export the model. The main functioning class of `torch2hpvm` is `ModelExporter`:
+from torch2hpvm import ModelExporter
+output_dir = Path("./resnet18_hpvm")
+build_dir = output_dir / "build"
+target_binary = build_dir / "resnet18"
+batch_size = 500
+conf_file = "" # TODO: point this to your configuration file.
+exporter = ModelExporter(model, tuneset, testset, output_dir, config_file=conf_file)
+exporter.generate(batch_size=batch_size).compile(target_binary, build_dir)
-Set all your config, e.g. onnx model location, input size and emit directory for generated source code, in **config.py**.
-### Resources
-1. [ONNX overview](https://github.com/onnx/onnx/blob/master/docs/IR.md)
-2. [ONNX operator specs](https://github.com/onnx/onnx/blob/master/docs/Operators.md)
-3. [Conversion between models - available adapters](https://github.com/onnx/onnx/blob/master/onnx/version_converter.py#L21)
\ No newline at end of file
+`output_dir`, `build_dir`, and `target_binary` define the folder for code generation,
+the folder for compilation, and the path to the compiled binary, respectively.
+`batch_size` is the batch size the binary uses during inference.
+*Note* that `conf_file` is the path to an HPVM approximation configuration file.
+This file decides what approximation the binary will use during inference.
+This path is hardcoded into the binary and is only read when the binary starts,
+so it's fine to have `conf_file` point to a non-existing path.
+An example can be found at `test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/data/tuner_confs.txt`.
+## Supported Operators
+Both builtin and custom PyTorch `Module`s are supported
+*as long as* the generated ONNX model consists of only the following operators
+when the Module is exported into ONNX:
+| Convolution | Linear | Pooling           | Pointwise          | Other    |
+| Conv        | MatMul | GlobalAveragePool | BatchNormalization | Flatten  |
+|             | Gemm   | AveragePool       | Relu               | Softmax  |
+|             |        | MaxPool           | Tanh               | Identity |
+|             |        |                   |                    | Pad      |
+|             |        |                   |                    | Add      |
+This choice of operators is largely constrained by what the backend (tensor_runtime) supports.
+## TODOs
+1. Optionally insert a Python-C interface in the generated binary to
+   call back into a Dataset class and read the data.
+   - Needs pybind11, hardcoding of Python environment, and some fiddling with import mechanism.
+2. Expand the list of operators supported in the frontend.
+   - Most ideally, create a high-level description of operators that can tie
+     HPVM-C intrinsics and the frontend list of operators together.
diff --git a/hpvm/projects/torch2hpvm/TODO.md b/hpvm/projects/torch2hpvm/TODO.md
deleted file mode 100644
index 1ebb883f79b65b177baa5459b426bb3610f64ac0..0000000000000000000000000000000000000000
--- a/hpvm/projects/torch2hpvm/TODO.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# What kind of models we should use
-ResNet-50 -> as a start?
-Mask R-CNN
-# Questions from Sudipta
-Graph optimization: operator fusions, data layout transformations
-Automatic scheduling
-Handling dynamic control flow in model graph
-Automatic differentiation
-# Operators used in the BERT model available in the ONNX model zoo, organized by type and number of occurences
-(Opset 10 versions)
-Unsqueeze 191
-Reshape 71
-Cast 70
-Transpose 62
-Concat 56
-Identity 28
-Squeeze 7
-Shape 5
-ConstantOfShape 1
-Gather 1
-OneHot 1
-Split 1
-Mul 186
-Add 185
-MatMul 98
-Sub 62
-ReduceMean 50
-Sqrt 25
-Reciprocal 25
-Softmax 12
-Pow 12
-Tanh 12
diff --git a/hpvm/projects/torch2hpvm/setup.py b/hpvm/projects/torch2hpvm/setup.py
index f0cd851e586cf4d35c856ead11915f97c7654901..6d66372c71accf32d74032cbe4088dd700ec36aa 100644
--- a/hpvm/projects/torch2hpvm/setup.py
+++ b/hpvm/projects/torch2hpvm/setup.py
@@ -4,11 +4,16 @@ setup(
     description="PyTorch frontend for HPVM",
-    author="Yuanjing Shi, Yifan Zhao",
-    author_email="ys26@illinois.edu, yifanz16@illinois.edu",
+    author="Yifan Zhao, Yuanjing Shi",
+    author_email="yifanz16@illinois.edu, ys26@illinois.edu",
+    package_data={"torch2hpvm": ["*.json", "*.cpp.in"]},
-        "jinja2>=2.11", "networkx>=2.5", "onnx>=1.8.0", "torch", "onnx-simplifier>=0.2.27"
+        "jinja2>=2.11",
+        "networkx>=2.5",
+        "onnx>=1.8.0",
+        # Starting from 1.7.0 PyTorch starts to do some weird optimizations.
+        "torch>=1.4,<=1.6",
+        "onnx-simplifier>=0.2.27",
-    entry_points={"console_scripts": ["torch2hpvm=torch2hpvm:main"]},
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
index f469c871f990c0b0abe38465e36a67597c71d5f8..922b6795ade457ba4c961af4d2e70ce150e22e92 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
@@ -55,7 +55,7 @@ class ModelExporter:
         self.dfg = DFG(onnx_model.graph)
         output_dir = Path(output_dir).absolute()
-        os.makedirs(output_dir, exist_ok=True)
+        os.makedirs(output_dir, exist_ok=False)  # Will throw if already exists
         self.weight_dir = output_dir / self.weight_dir_name
         self.codefile = output_dir / self.source_file_name
@@ -69,7 +69,7 @@ class ModelExporter:
                 raise ValueError(
                     f"Config file must be given and exist under hpvm_tensor mode"
-            self.path_params = {"config_file": Path(config_file)}
+            self.path_params = {"config_file": str(config_file)}
             self.compile_args = ["-t", "tensor", "--conf-file", str(config_file)]
             self.codegen = HpvmCodeGen(*args3, "tensor", None)
         elif target == "hpvm_tensor_inspect":
@@ -81,7 +81,7 @@ class ModelExporter:
                 "tune_labels_path": (self.weight_dir / self.tuneset_name[1]).as_posix(),
                 "conf_path": config_file.as_posix(),
                 "fifo_path_r": (output_dir / self.fifo_file_name_r).as_posix(),
-                "fifo_path_w": (output_dir / self.fifo_file_name_w).as_posix()
+                "fifo_path_w": (output_dir / self.fifo_file_name_w).as_posix(),
             self.compile_args = ["-t", "tensor", "--conf-file", str(config_file)]
             self.codegen = HpvmCodeGen(*args3, "tensor", self.path_params)
@@ -161,7 +161,7 @@ class ModelExporter:
                     "knob_speedup": knob_speedup,
                     "op_knobs": op_knobs,
                     "baseline_knob": baseline_knob,
-                    **self.path_params
+                    **self.path_params,
@@ -290,7 +290,9 @@ class ModelExporter:
             raise ValueError(f"Cannot accept model of type {type(model)}")
         if opset is not None:
             onnx_model = check_onnx_version(onnx_model, opset)
-        onnx_model, check = simplify(onnx_model)
+        onnx_model, check = simplify(
+            onnx_model, skip_fuse_bn=True, skipped_optimizers=["fuse_bn_into_conv"]
+        )
         assert check, "Simplified ONNX model could not be validated"
         return onnx.shape_inference.infer_shapes(onnx_model)
@@ -318,17 +320,18 @@ def torch_to_onnx(
     output_obj: Union[IO, PathLike],
     opset_version: int = 10,
+    from torch.onnx import export
     # Export the model (must be on CPU, some model only supports this)
-    torch.onnx.export(
+    export(
         export_params=True,  # store the trained parameter weights inside the model file
+        do_constant_folding=False,
         opset_version=opset_version,  # the ONNX version to export the model to
-        do_constant_folding=True,  # whether to execute constant folding for optimization
         input_names=["input"],  # the model's input names
         output_names=["output"],  # the model's output names
-        strip_doc_string=False,
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
index 0c1db9b1ff9d71cb9a8c8bbf3a2c64cec8331476..1c4a386ce2fd9e50953a49377df20c9d3ebf75da 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
@@ -90,6 +90,7 @@ int main(int argc, char *argv[]){
+  #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++){
     int start = i * batch_size, end = start + batch_size;
     copyInputBatch(input_path.c_str(), start, end, {{input_shape|join(', ')}}, {{input_name}});
diff --git a/hpvm/scripts/automated_tests.sh b/hpvm/scripts/automated_tests.sh
deleted file mode 100644
index aece74eaa0a59e2e91807e9444c5c10054126cdd..0000000000000000000000000000000000000000
--- a/hpvm/scripts/automated_tests.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-if [ -f $BUILD_DIR/tools/hpvm/projects/$HPVM_RT ]; then
-    true
-    echo $BUILD_DIR/tools/hpvm/projects/$HPVM_RT
-    echo HPVM not installed! Exiting without running tests!.
-    exit 0
-echo Running tests ...
-# Run regression tests
-# Run unit tests
diff --git a/hpvm/scripts/hpvm_installer.py b/hpvm/scripts/hpvm_installer.py
index 11ad3045528543d8398828394cc7236fc849bb30..e83d5f3e727044dcba942e2533597bc779c9b816 100755
--- a/hpvm/scripts/hpvm_installer.py
+++ b/hpvm/scripts/hpvm_installer.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 from pathlib import Path
 from argparse import ArgumentParser, Namespace
-from subprocess import check_call
+from subprocess import CalledProcessError, check_call
 from os import makedirs, chdir, environ
 VERSION = "9.0.0"
@@ -12,11 +12,15 @@ CLANG_TARBALL = f"{CLANG_DIR}.tar.xz"
 LLVM_DIR = f"llvm-{VERSION}.src"
 LLVM_TARBALL = f"{LLVM_DIR}.tar.xz"
-ROOT_DIR = Path.cwd()
+ROOT_DIR = (Path(__file__).parent / "..").absolute()
 BUILD_DIR = ROOT_DIR / "build"
 TEST_DIR = ROOT_DIR / "test"
 LLVM_LIT = BUILD_DIR / "bin/llvm-lit"
+MODEL_PARAMS_DIR = TEST_DIR / "dnn_benchmarks/model_params"
+MODEL_PARAMS_TAR = ROOT_DIR / "model_params.tar.gz"
+MODEL_PARAMS_LINK = "https://databank.illinois.edu/datafiles/o3izd/download"
 LINKS = [
@@ -27,8 +31,15 @@ LINKS = [
 MAKE_TARGETS = ["approxhpvm.py"]
-MAKE_TEST_TARGETS = ["hpvm-check"]
+MAKE_TEST_TARGETS = ["check-hpvm-dnn", "check-hpvm-pass"]
+# Relative to project root which is __file__.parent.parent
+    "projects/hpvm-profiler",
+    "projects/predtuner",
+    "projects/torch2hpvm",
+    "projects/keras",
 def parse_args():
     parser = ArgumentParser(
@@ -63,6 +74,9 @@ def parse_args():
         "-r", "--run-tests", action="store_true", help="Build and run test cases"
+    parser.add_argument(
+        "--no-params", action="store_true", help="Don't download DNN model parameters"
+    )
     return parser.parse_args()
@@ -74,11 +88,11 @@ def prompt_args():
     def parse_int(s: str):
             v = int(s)
-            return v
         except ValueError:
             return None
         if v <= 0:
             return None
+        return v
     def parse_targets(s: str):
         if " " in s:
@@ -124,7 +138,7 @@ def print_args(args):
 def check_download_llvm_clang():
     if Path("llvm/").is_dir():
-        print("Found LLVM, not extracting it again.")
+        print("Found LLVM directory, not extracting it again.")
         if Path(LLVM_TARBALL).is_file():
             print(f"Found {LLVM_TARBALL}, not downloading it again.")
@@ -141,7 +155,7 @@ def check_download_llvm_clang():
     environ["LLVM_SRC_ROOT"] = str(ROOT_DIR / "llvm")
     if (tools / "clang/").is_dir():
-        print("Found clang, not downloading it again.")
+        print("Found clang directory, not extracting it again.")
     print(f"Downloading {CLANG_TARBALL}...")
@@ -155,6 +169,30 @@ def check_download_llvm_clang():
+def check_download_model_params():
+    if MODEL_PARAMS_DIR.is_dir():
+        print("Found model parameters, not extracting it again.")
+        return
+    if MODEL_PARAMS_TAR.is_file():
+        print(f"Found {MODEL_PARAMS_TAR}, not downloading it again.")
+    else:
+        print(f"Downloading DNN model parameters: {MODEL_PARAMS_TAR}...")
+        print(f"=============================")
+        check_call([WGET, MODEL_PARAMS_LINK, "-O", MODEL_PARAMS_TAR])
+    print(f"Extracting DNN model parameters {MODEL_PARAMS_TAR} => {MODEL_PARAMS_DIR}...")
+    # Decompression is pretty time-consuming so we try to show a progress bar:
+    try:
+        check_call(f"pv {MODEL_PARAMS_TAR} | tar xz", shell=True)
+    except CalledProcessError:
+        # Maybe `pv` is not installed. Fine, we'll run without progress bar.
+        print(">> 'pv' is not installed, no progress bar will be shown during decompression.")
+        print(">> Decompression ongoing...")
+        check_call(["tar", "xzf", MODEL_PARAMS_TAR])
+    check_call(["mv", "model_params", MODEL_PARAMS_DIR])
+    if MODEL_PARAMS_TAR.is_file():
+        MODEL_PARAMS_TAR.unlink()
 def link_and_patch():
     from os import symlink
@@ -210,6 +248,16 @@ For more details refer to README.md.
+def install_py_packages():
+    import sys
+    project_root = Path(__file__).parent.parent
+    for package in PY_PACKAGES:
+        package_home = project_root / package
+        print(f"Installing python package {package_home}")
+        check_call([sys.executable, "-m", "pip", "install", str(package_home)])
 def run_tests():
     # Run regression tests
@@ -236,7 +284,10 @@ def main():
+    if not args.no_params:
+        check_download_model_params()
     maybe_build(not args.no_build, args.parallel, args.targets, args.run_tests)
+    install_py_packages()
     if args.run_tests:
diff --git a/hpvm/test/CMakeLists.txt b/hpvm/test/CMakeLists.txt
index 4c96ee124f066bbe35c2f8117ea29078f38df7ae..660003538fe72d45e3dbfc1178fd296cdc7156b5 100644
--- a/hpvm/test/CMakeLists.txt
+++ b/hpvm/test/CMakeLists.txt
@@ -1,55 +1,4 @@
-  )
-  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
-  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
-  )
-# Set the depends list as a variable so that it can grow conditionally.
-# NOTE: Sync the substitutions in test/lit.cfg when adding to this list.
-  opt hpvm-rt.bc
-  # Passes:
-  # Test utils:
-  FileCheck count not
-add_lit_testsuite(check-hpvm "Running the HPVM regression tests"
-set_target_properties(check-hpvm PROPERTIES FOLDER "Tests")
-# Setup a legacy alias for 'check-llvm'. This will likely change to be an
-# alias for 'check-all' at some point in the future.
-add_dependencies(hpvm-check check-hpvm)
-set_target_properties(hpvm-check PROPERTIES FOLDER "Tests")
-# HPVM-C benchmarks uses ctest instead of LLVM's test mechanism
-# because they are compiled in a custom way
+include(../cmake/TestFile.cmake)  # Generation of `.test` files in CMake
+add_subdirectory(hpvm_pass)  # Passes test suite
+add_subdirectory(dnn_benchmarks/hpvm-c)  # DNN accuracy test suite
+add_subdirectory(dnn_benchmarks/profiling)  # hpvm-profiler test suite
diff --git a/hpvm/test/README.md b/hpvm/test/README.md
index 7e8b408a0c127bf2365eaf7c7b8498178c7c11b1..18cb05b833434fcffc7e4c50b5f38150c924fb19 100644
--- a/hpvm/test/README.md
+++ b/hpvm/test/README.md
@@ -14,6 +14,12 @@ This directory is organized as follows:
   * `dnn_benchmarks/hpvm-c` contains the HPVM-C version of these DNNs.
     Their organization and usage are similar to the benchmarks under `benchmarks/`.
+    Each subfolder contains a DNN with 2 versions (2 `.cpp` files):
+    the `tensor`-targeted version which compiles to `tensor_runtime`,
+    and the `cudnn`-targeted version which compiles to operators in `cuDNN`
+    (has `_cudnn` in name).
   * `dnn_benchmarks/keras` contains these DNNs implemented in Keras,
     and code for generating them down to HPVM-C (testing Keras frontend).
   * `dnn_benchmarks/pytorch` contains these DNNs in PyTorch
@@ -23,16 +29,40 @@ This directory is organized as follows:
 ## Running Test Cases and Benchmarks
-The easiest way to run `unitTests/` and `regressionTests/` is
-to build the target `hpvm-check` in the global build directory: `make -j hpvm-check`.
-`hpvm-check` doesn't automatically run `benchmarks/` and `dnn_benchmarks` as they are extremely time-consuming.
+The easiest way to run tests is to use `make` targets,
+which will also take care of all compilation of test cases and test fixtures.
+The following targets run these tests, respectively:
+* `make -j check-hpvm-pass` runs tests in `hpvm_pass`: `hpvm_pass/**/*.ll`.
+  These are regression and unit tests for HPVM passes.
+* `make -j check-hpvm-dnn` runs all 20 DNN benchmarks under `dnn_benchmarks/hpvm-c`
+  (10 DNNs x 2 versions) and validates their accuracy.
+  *Note* that this can take quite long due to the size of DNNs and datasets.
+  Depending on your hardware capability, this test can take 5-30 minutes.
+  Also, this is set to run sequentially out of GPU memory concerns.
+* `make -j check-hpvm-profiler` runs `hpvm-profiler` on some smaller networks
+  (as it is extremely time-consuming) and presents the tradeoff curve with profiled speedup.
+  *Note* that if you're on an NVIDIA Jetson TX2, you may want to run
+  `bash dnn_benchmarks/profiling/jetson_clocks.sh`
+  to ensure that the clocks are running at the maximum frequency.
+Underneath, `llvm-lit` is used to discover and run the tests.
 `benchmarks/` can only be compiled in-source with `make`.
 We are working to migrate it into the `cmake` system.
+## Compiling Benchmarks
+This section explains how to compile the benchmarks without running them as tests.
 ### HPVM-C DNN Benchmarks
-To build all `dnn_benchmarks/hpvm-c`, use `make -j dnn_benchmarks`.
+To build (not run) all `dnn_benchmarks/hpvm-c`, use `make -j dnn_benchmarks`.
+For each benchmark `${bench_name}`, the binary is generated at
 Alternatively, it's possible to build just 1 DNN benchmark.
 The output of CMake shows a list of these benchmarks as target names, starting with
@@ -56,10 +86,6 @@ Currently, there are 20 of them. These are:
 `_cudnn` suffix indicates the code is generated onto cuDNN functions.
 Otherwise they are generated to `tensor_runtime` DNN functions which are hand-written in CUDA.
-### DNN Frontends
-TODO: figure out how to
+### TODO: figure out how to
-1. Auto run all hpvm-c DNN benchmarks
-2. Compare the output accuracy to groundtruth
-3. Auto run Keras and PyTorch tests (generating, compiling and running all DNNs)
+1. Auto run Keras and PyTorch tests (generating, compiling and running all DNNs)
diff --git a/hpvm/test/dnn_benchmarks/.gitignore b/hpvm/test/dnn_benchmarks/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6363621cf146eb33ded322d27ce19cf0ae6374c4
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/.gitignore
@@ -0,0 +1 @@
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
index 37a856123d1ea9ee074a5ac2844b223a78c56e16..3a0c6534e02ce82fdfd02f483f71c6be1a9ab433 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
@@ -13,8 +13,32 @@ configure_file(
 # and we'll give this to approxhpvm.py
+# --[ llvm-lit test setup
+# lit.cfg.py looks for tests in CMAKE_CURRENT_BINARY_DIR (see lit.cfg.py)
+# as most of the tests require some kind of compilation / generation
+# which is best done over there.
+  ../../lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+add_lit_testsuite(check-hpvm-dnn "Running HPVM DNNs"
+  DEPENDS dnn_benchmarks  # Compile all dnn benchmarks to run them
+  ARGS "-j1"  # Run DNN benchmarks sequentially
+# Install an accuracy comparator under build/bin
+  OUTPUT ${BIN_DIR}/check_dnn_acc.py
+  COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/check_dnn_acc.py ${BIN_DIR}
+  COMMAND chmod +x ${BIN_DIR}/check_dnn_acc.py
 set(test_compile_targets "")
-function(approxhpvm_py_codegen bin_filename src_filepath codegen_target)
+function(compile_hpvm_c bin_filename src_filepath codegen_target)
     OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bin_filename}
     DEPENDS ${src_filepath} approxhpvm.py
@@ -23,36 +47,38 @@ function(approxhpvm_py_codegen bin_filename src_filepath codegen_target)
       -t ${codegen_target} -I ${CONFIG_INCLUDE_DIR} ${ARGV}
   add_custom_target(${bin_filename} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bin_filename})
-  add_test(NAME test_${bin_filename} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${bin_filename})
-  set_tests_properties(test_${bin_filename} PROPERTIES RUN_SERIAL TRUE)
   set(test_compile_targets ${test_compile_targets} ${bin_filename} PARENT_SCOPE)
-set(test_run_targets "")
-function(run_single_benchmark run_target benchmark)
-  add_custom_target(
-    ${run_target} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${benchmark}
-  )
-  add_dependencies(${run_target} ${benchmark})
-  set(test_run_targets ${test_run_targets} ${run_target} PARENT_SCOPE)
+function(hpvm_add_dnn_test benchmark_target)
+  # llvm_test_run* composes a .test file with the RUN line needed by llvm-lit
+  # No need to give binary path yet;
+  # llvm_add_test_for_target knows the binary path from target name
+  # Runs the binary with no argument
+  llvm_test_run()
+  # Requests comparison of accuracy
+  llvm_test_run(EXECUTABLE check_dnn_acc.py final_accuracy ${benchmark_target})
+  # Removes the final_accuracy file
+  llvm_test_run(EXECUTABLE rm final_accuracy)
+  # llvm_add_test creates .test file to given output path for given binary.
+  llvm_add_test(${benchmark_target}.test ${CMAKE_CURRENT_BINARY_DIR}/${benchmark_target})
+  # TODO: add_dependencies
 file(GLOB entries ./benchmarks/*)
 foreach(dir ${entries})
   get_filename_component(dirname "${dir}" NAME)
   # Generate "tensor"-targeted code
-  approxhpvm_py_codegen(
+  compile_hpvm_c(
     ${dirname} ${dir}/${dirname}.cpp tensor 
     --conf-file ${dir}/data/tuner_confs.txt
-  # Run tensor binary
-  run_single_benchmark(run_${dirname} ${dirname})
+  hpvm_add_dnn_test(${dirname})
   # Generate "cudnn"-targeted code
-  approxhpvm_py_codegen(${dirname}_cudnn ${dir}/${dirname}_cudnn.cpp cudnn)
-  # Run cudnn binary
-  run_single_benchmark(run_${dirname}_cudnn ${dirname}_cudnn)
+  compile_hpvm_c(${dirname}_cudnn ${dir}/${dirname}_cudnn.cpp cudnn)
+  hpvm_add_dnn_test(${dirname}_cudnn)
-message(STATUS "List of test dnn benchmarks: ${test_compile_targets}")
-add_custom_target(dnn_benchmarks DEPENDS ${test_compile_targets})
-message(STATUS "Target name for compiling all dnn benchmarks: dnn_benchmarks")
+message(STATUS "List of HPVM-C DNN benchmarks: ${test_compile_targets}")
+add_custom_target(dnn_benchmarks DEPENDS ${test_compile_targets} ${BIN_DIR}/check_dnn_acc.py)
+message(STATUS "Target name for compiling all DNN benchmarks: dnn_benchmarks")
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
index dd689d202a91755ecad116a3d1277f59c740d0b1..35f8188f785d023264c31a20480f661f066fb9f5 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -411,10 +405,12 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
@@ -458,12 +454,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32);
-  // uint8_t* labels = readLabels(labels_path.c_str(),10000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -493,45 +487,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-  std::string input_path = dir_prefix + std::string("test_input.bin");
-  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
+  __hpvm__init();
-  startProfiling();
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
index 9c0c980d977138a628f1c0b76354d626066d77f9..5bcc5b627b546d714404c89d9a775856c647e7bc 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -411,11 +405,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
@@ -458,9 +459,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -490,41 +492,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  __hpvm__init();
+  float total_accuracy = 0;
+#pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
-    int start = i * batch_size;
-    int end = (i + 1) * batch_size;
+    int start = i * batch_size, end = start + batch_size;
     copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
-    args->input = input;
-    args->input_bytes = 0;
     void *dfg = __hpvm__launch(0, root, (void *)args);
     void *result = static_cast<RootIn *>(args)->r.tensor;
     hpvm_request_tensor(result, 0);
     uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
-    computeAccuracy3(labels, result);
-    // llvm_hpvm_invokeRtControl2(result, labels);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+  write_accuracy(total_accuracy / input_size);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
index ae5f31b7dcca3ec59920e0dcc0ba34ca5ea28cbc..51e0dd137db1cd835412bc6ee5694795718e739d 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -362,12 +356,12 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
-  // void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32);
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint8_t *labels = readLabels(labels_path.c_str(), 5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -406,11 +400,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -436,43 +429,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
+  __hpvm__init();
-  startProfiling();
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-      // Replaced create4DTensor and copyInputBatch with readInputBatch
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
index 7ce160881372f9b09e20f079ba5b065f724fe34f..74c5420fd9b77aa2deab656204e43b164a241304 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -363,13 +357,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32);
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
@@ -407,9 +406,8 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -436,14 +434,26 @@ int main() {
   args->dense_1_w_bytes = 0;
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
index d49c0d2d06b1ea04ad78ee72dc2776bd000dacfd..16bcecf939051ef7490d58a196a12786b0d4f465 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
@@ -1,14 +1,8 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <config.h>
 #include <hpvm.h>
+#include <string>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
-#include <config.h>
 void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
@@ -460,11 +454,11 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
-  std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
+const int batch_size = 100, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -515,16 +509,11 @@ int main() {
   std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0, 1000,3,224,224);
-  // uint32_t* labels = readLabels2(labels_path.c_str(),6000);
-  // uint32_t* labels = readLabels3(labels_path.c_str(), 1000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -558,40 +547,21 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
-  int batch_size = 200;
-  int test_input_size = 4000;
-  int batch_count = test_input_size / batch_size;
+  __hpvm__init();
-  startProfiling();
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
index 73175982ab98c19efdf1e77b6e2db504af4d6d93..5ddd9694328db6d892c8c23b44b2e165afe77953 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -460,12 +454,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
-  std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+const int batch_size = 100, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11);
@@ -514,14 +514,9 @@ int main() {
   std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 1000, 3, 224, 224);
-  // uint32_t* labels = readLabels2(labels_path.c_str(),6000);
-  uint32_t *labels = readLabels3(labels_path.c_str(), 1000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -557,14 +552,25 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
index b67d585d01b4809d4107d95ab4476e741f13dd7c..ee81665ec94a4c9cc634c16bdda5bbea96e120df 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -264,13 +258,13 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+const int batch_size = 1000, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 1, 5, 5);
@@ -294,15 +288,11 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  //  void* input = readTrainedWeights(input_path.c_str(), 0, 5000,1,28,28);
-  //  uint32_t* labels = readLabels3(labels_path.c_str(), 5000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-  //  args->input = input;
-  //  args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 1, 28, 28);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -320,37 +310,21 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
+  __hpvm__init();
-  startProfiling();
+#pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
-    int start = i * batch_size;
-    int end = (i + 1) * batch_size;
-    void *input = readInputBatch(input_path.c_str(), 0, start, end, 1, 28, 28);
-    args->input = input;
-    args->input_bytes = 0;
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 1, 28, 28, input);
     void *dfg = __hpvm__launch(0, root, (void *)args);
     void *result = static_cast<RootIn *>(args)->r.tensor;
     hpvm_request_tensor(result, 0);
     llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
index 4e0adc7bbe15356955a178d8db30466c8b872258..eecc7f5d60cf63b10ea5af098156a0dfa2890f80 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -264,13 +258,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+const int batch_size = 1000, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 1, 5, 5);
@@ -294,13 +293,9 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 1, 28, 28);
-  uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 1, 28, 28);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -320,15 +315,25 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 1, 28, 28, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
index a4de2826216d9bf6b3843e466097abae35ca8b72..58051e0993e8b8893a398eee3a0358556a18c2f4 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1965,8 +1959,12 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 int main() {
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
+  std::string dir_prefix =
+      std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -2501,14 +2499,11 @@ int main() {
       readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0, 5000,3,32,32);
-  // uint8_t* labels = readLabels(labels_path.c_str(), 5000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->batch_normalization_1_gamma = batch_normalization_1_gamma;
@@ -2784,39 +2779,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
+  __hpvm__init();
-  startProfiling();
+#pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
-    int start = i * batch_size;
-    int end = (i + 1) * batch_size;
-    // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-    args->input = input;
-    args->input_bytes = 0;
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
     void *dfg = __hpvm__launch(0, root, (void *)args);
     void *result = static_cast<RootIn *>(args)->r.tensor;
     hpvm_request_tensor(result, 0);
     llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
index b7e0a714590418414a2647474526a1fb0c09e390..482a37d4c4be22eda1079b0a900c762fdb4d1001 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1965,12 +1959,19 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+int main() {
+  std::string dir_prefix =
+      std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
@@ -2502,12 +2503,10 @@ int main() {
       readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32);
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -2785,14 +2784,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
index 66ab37cd33e502df35f73ca2b3addb1c4be53808..a254a625709f13ec08b403c26eac126a09df6daa 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
@@ -1,11 +1,5 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1300,14 +1294,13 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
-  // void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32);
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
-  // uint32_t* labels = readLabels3(labels_path.c_str(),5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
@@ -1439,11 +1432,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -1533,47 +1525,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-  // NOTE-HASHIM: commented out
-  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
+  __hpvm__init();
-  startProfiling();
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-      // NOTE-HASHIM: Commented out above line and line that does create4DTensor
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      printf("RUNNING BATCH = %d \n", i);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
index e1429ada17629aa7d889b882f23817943a36dabf..da1ce91ba3fdb4dc7d74e6b854dad7fc1c2d412e 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
@@ -1,11 +1,5 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1226,13 +1220,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32);
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
@@ -1365,9 +1364,8 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -1459,16 +1457,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
index db6b64daa0d214017ebcf968067fe44f40aa9c06..a3ece5fedec57a73537d870199b6b4270b541b42 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -5132,8 +5126,10 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+const int batch_size = 25, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
@@ -6311,12 +6307,10 @@ int main() {
   void *dense_1_b =
       readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0,100,3,224,224);
-  // uint32_t* labels = readLabelsBatch3(labels_path.c_str(),0,100);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -6958,39 +6952,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
+  __hpvm__init();
-  startProfiling();
-  unsigned int batch_size = 50;
-  unsigned int test_input_size = 1000;
-  unsigned int batch_count = test_input_size / batch_size;
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
index ab613983a0a57673a2575378b6a9a2a3fc04f941..03674b50a5b6b9dcde87fd1e32b0520362ca8ca3 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -4903,12 +4897,19 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+const int batch_size = 50, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 7, 7);
@@ -6081,12 +6082,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b =
       readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224);
   uint32_t *labels = readLabels3(labels_path.c_str(), 100);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -6730,14 +6729,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
index 39c2ffc8769c8b8f13b359e56f4e138dff0fed98..cad22649fdfe4fd6271f5202aa524cea2f3f1383 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,8 +821,10 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -920,14 +916,11 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32);
-  // uint32_t* labels = readLabels3(labels_path.c_str(),2000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -989,41 +982,21 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
+  __hpvm__init();
-  startProfiling();
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
index c1cb38327dc94938934486f3022e4e9cb360f517..662520282892f852fd8f634061cc0f6f72e465f9 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,11 +821,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
@@ -920,12 +921,10 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32);
   uint32_t *labels = readLabels3(labels_path.c_str(), 2000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -989,28 +988,25 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
+  __hpvm__init();
+  float total_accuracy = 0;
-  startProfiling();
-  input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32);
-  args->input = input;
-  args->input_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-  computeAccuracy3(labels, result);
-  freeBatchMemory();
-  stopProfiling();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
index ce899cd0a24776bd5a7c8b51f13e0dac698b3495..54417171fbcda003e27d7662a11f35499f7c0cc8 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,10 +821,11 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -922,14 +917,10 @@ int main() {
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
-  //  void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32);
-  //  uint32_t* labels = readLabels3(labels_path.c_str(),2000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-  //  args->input = input;
-  //  args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -991,40 +982,21 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
+  __hpvm__init();
-  startProfiling();
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
index 326542a03852d97dbce2dacf4da913005f9ef28a..9f989e361051a8623657d11224cbb898f061032e 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,12 +821,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
@@ -922,12 +922,8 @@ int main() {
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32);
-  uint32_t *labels = readLabels3(labels_path.c_str(), 2000);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -991,14 +987,25 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
index 91af01fe8eb7deacb47cc42f3fe6cbb620adc000..12f7870a152d8f42fa01b90429bc1102059861ae 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -875,10 +869,11 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+const int batch_size = 10, input_size = 5000,
+          batch_count = input_size / batch_size;
-  std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -978,9 +973,10 @@ int main() {
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -1046,40 +1042,21 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
+  __hpvm__init();
-  startProfiling();
-  unsigned int batch_size = 50;
-  unsigned int test_input_size = 1000;
-  unsigned int batch_count = test_input_size / batch_size;
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
-      args->input = input;
-      args->input_bytes = 0;
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-      __hpvm__wait(dfg);
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
-  stopProfiling();
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
index 16f145efad6a783cd78557c871ff1348bb6689f5..189460c928d65ed989201dc715df5cbe0ccd5bde 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
@@ -1,10 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -875,11 +869,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+const int batch_size = 25, input_size = 5000,
+          batch_count = input_size / batch_size;
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
-  std::string input_path = dir_prefix + std::string("tune_input.bin");
-  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
+  std::string input_path = dir_prefix + std::string("test_input.bin");
+  std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
@@ -976,12 +977,10 @@ int main() {
   std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224);
   uint32_t *labels = readLabelsBatch3(labels_path.c_str(), 0, 100);
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -1049,14 +1048,25 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
-  computeAccuracy3(labels, result);
   return 0;
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/check_dnn_acc.py b/hpvm/test/dnn_benchmarks/hpvm-c/check_dnn_acc.py
new file mode 100644
index 0000000000000000000000000000000000000000..745836a3286d77c9054ac37356e8f6d585d8c748
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/check_dnn_acc.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+from sys import argv
+network_accuracies = {
+    "alexnet2_cifar10": 84.98,
+    "alexnet_cifar10": 79.28,
+    "alexnet_imagenet": 56.30,
+    "lenet_mnist": 98.70,
+    "mobilenet_cifar10": 84.42,
+    "resnet18_cifar10": 89.56,
+    "resnet50_imagenet": 75.10,
+    "vgg16_cifar10": 89.96,
+    "vgg16_cifar100": 66.50,
+    "vgg16_imagenet": 69.46,
+def almost_equal(x1, x2):
+    return abs(x1 - x2) < 1e-4
+_, acc_file, network_name = argv
+# cudnn version should have the same accuracy as non-cudnn version.
+network_name = network_name.replace("_cudnn", "")
+with open(acc_file) as f:
+    obtained_acc = float(f.read().strip())
+target_acc = network_accuracies[network_name]
+if not almost_equal(target_acc, obtained_acc):
+    raise ValueError(
+        f"Accuracy mismatch. Obtained: {obtained_acc}, target: {target_acc}"
+    )
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h b/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
index 71e1c268726e1fb77b0713599928262b95bd64f5..05d9157a6473fb74061e6edefc4455080368f706 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
@@ -3,7 +3,6 @@
 #include <sstream>
 #include <vector>
 #include <bits/stdc++.h>
@@ -11,15 +10,13 @@
 #include <tensor.h>
 #include <cmath>
 std::vector<float> run_accuracies;
+void printTensorInfo(void *tensor_ptr) {
-void printTensorInfo(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-  if(tensor->gpu_data != NULL){
+  if (tensor->gpu_data != NULL) {
     printf("Successful cudaMalloc \n");
@@ -29,376 +26,354 @@ void printTensorInfo(void* tensor_ptr){
   printf("num_elems = %lu \n", tensor->num_elems);
 // FIXIT: Move this to debug.h and include in all files
-void dumpWeightsToFile(char* file_name, void* weights_ptr){
+void dumpWeightsToFile(char *file_name, void *weights_ptr) {
-  struct Tensor* weights = (Tensor*) weights_ptr;
+  struct Tensor *weights = (Tensor *)weights_ptr;
   // Move data back to host
   hpvm_request_tensor(weights, 0);
-  FILE* fp = fopen(file_name, "wb");
-  if(fp == NULL){
-    printf("File %s could not be created. Check if directory exists \n", file_name);
+  FILE *fp = fopen(file_name, "wb");
+  if (fp == NULL) {
+    printf("File %s could not be created. Check if directory exists \n",
+           file_name);
-  //printf("size_in_bytes = %lu \n", weights->size_in_bytes);
-  size_t bytes_written = fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
-  //printf("bytes_written = %lu \n", bytes_written);
+  // printf("size_in_bytes = %lu \n", weights->size_in_bytes);
+  size_t bytes_written =
+      fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
+  // printf("bytes_written = %lu \n", bytes_written);
+void fillTensorWithOnes(void *tensor_ptr) {
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
-void fillTensorWithOnes(void* tensor_ptr){
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
   hpvm_request_tensor(tensor, 0);
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = 1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = 1.0;
+void fillWithOnesAndTwos(void *tensor_ptr) {
-void fillWithOnesAndTwos(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
   hpvm_request_tensor(tensor, 0);
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems/2; i++){
-      data_arr[i] = 1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems / 2; i++) {
+      data_arr[i] = 1.0;
-    for(unsigned int i = tensor->num_elems/2; i < tensor->num_elems; i++){
-      data_arr[i] = 2.0;    
+    for (unsigned int i = tensor->num_elems / 2; i < tensor->num_elems; i++) {
+      data_arr[i] = 2.0;
+void fillTensorWithVal(void *tensor_ptr, float target_value) {
-void fillTensorWithVal(void* tensor_ptr, float target_value){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
   hpvm_request_tensor(tensor, 0);
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = target_value;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = target_value;
+void fillTensorWithNegOnes(void *tensor_ptr) {
-void fillTensorWithNegOnes(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
   hpvm_request_tensor(tensor, 0);
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = -1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = -1.0;
+void fillTensorVals(void *tensor_ptr) {
-void fillTensorVals(void* tensor_ptr){
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = i + 1;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = i + 1;
+void printTensorValues(void *tensor_ptr) {
-void printTensorValues(void* tensor_ptr){
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
   hpvm_request_tensor(tensor, 0);
   // printing is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      printf("%f,", data_arr[i]);    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      printf("%f,", data_arr[i]);
+void printTensorDims(void *tensor_ptr) {
-void printTensorDims(void* tensor_ptr){
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
   printf("Num_elems = %lu \n", tensor->num_elems);
-  for (int i = 0; i < tensor->dims.num_dims; i++){
+  for (int i = 0; i < tensor->dims.num_dims; i++) {
     printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
+void compareTensors(void *tensor1_ptr, void *tensor2_ptr) {
-void compareTensors(void* tensor1_ptr, void* tensor2_ptr){
-  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
-  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;
+  struct Tensor *tensor1 = (struct Tensor *)tensor1_ptr;
+  struct Tensor *tensor2 = (struct Tensor *)tensor2_ptr;
   hpvm_request_tensor(tensor1, 0);
   hpvm_request_tensor(tensor2, 0);
-  float* tensor_data1 = (float*) tensor1->host_data;
-  float* tensor_data2 = (float*) tensor2->host_data;
-  for(unsigned int i = 0; i < tensor1->num_elems; i++){
-    if(tensor_data1[i] != tensor_data2[i]){
+  float *tensor_data1 = (float *)tensor1->host_data;
+  float *tensor_data2 = (float *)tensor2->host_data;
+  for (unsigned int i = 0; i < tensor1->num_elems; i++) {
+    if (tensor_data1[i] != tensor_data2[i]) {
       printf("Tensor data mismatch at index %d \n", i);
+void compareValues(void *tensor_ptr, float *data, size_t num_elems) {
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
-void compareValues(void* tensor_ptr, float* data, size_t num_elems){
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
   hpvm_request_tensor(tensor, 0);
-  float* tensor_data = (float*) tensor->host_data;
-  for(unsigned int i = 0; i < num_elems; i++){
-    if(tensor_data[i] != data[i]){
+  float *tensor_data = (float *)tensor->host_data;
+  for (unsigned int i = 0; i < num_elems; i++) {
+    if (tensor_data[i] != data[i]) {
       printf("Tensor data mismatch");
-void* readInputTensor(const char* file_name, int data_type, int dim1_size, int dim2_size,
-		      int dim3_size, int dim4_size){
+void *readInputTensor(const char *file_name, int data_type, int dim1_size,
+                      int dim2_size, int dim3_size, int dim4_size) {
   int type_size = 4; // NOTE: Assuming floating point tensors
   int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
   int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  uint8_t* file_data = (uint8_t*) malloc(sizeof(char) * num_elems);
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  uint8_t *file_data = (uint8_t *)malloc(sizeof(char) * num_elems);
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = 16;
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(file_data, 1, sizeof(uint8_t) * num_elems, file);
-  for (size_t i = 0; i < num_elems; ++i){
-    tensor_data[i] = (float) file_data[i] / 255.0f;
+  for (size_t i = 0; i < num_elems; ++i) {
+    tensor_data[i] = (float)file_data[i] / 255.0f;
   // NOTE: Using NCHW format
-  struct Tensor* input = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					dim3_size, dim4_size);
+  struct Tensor *input = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
   initTensorData(input, tensor_data, size_in_bytes);
   //  compareValues(input, tensor_data, num_elems);
-  return input;  
+  return input;
 //*** FIXIT: Move this to CPU-only
-struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
+struct Tensor *readTrainedWeightsCPU(const char *file_name, int data_type,
+                                     int dim1_size, int dim2_size,
+                                     int dim3_size, int dim4_size) {
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = 0;
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
   printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   return weights;
-struct Tensor* readTrainedWeights(const char* file_name, int data_type,
-				  long int dim1_size, long int dim2_size,
-				  long int dim3_size, long int dim4_size){
+struct Tensor *readTrainedWeights(const char *file_name, int data_type,
+                                  long int dim1_size, long int dim2_size,
+                                  long int dim3_size, long int dim4_size) {
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   printf("size_in_bytes  = %lu \n", size_in_bytes);
   int file_header_size = 0;
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
-  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
+  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes,
+  // bytes_read);
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   return weights;
-struct Tensor* readInputBatch(const char* file_name, int data_type,
-			      int start, int end,
-			      int dim2_size, int dim3_size, int dim4_size){
+struct Tensor *readInputBatch(const char *file_name, long data_type, long start,
+                              long end, long dim2_size, long dim3_size,
+                              long dim4_size) {
   long int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
   long int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  long int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  long int file_header_size =
+      type_size * start * dim2_size * dim3_size * dim4_size;
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
-  //printf ("FIXED input BATCH read \n");
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
+  // printf ("FIXED input BATCH read \n");
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
   initTensorData(weights, tensor_data, size_in_bytes);
   return weights;
+void *copyInputBatch(const char *file_name, long start, long end,
+                     long dim2_size, long dim3_size, long dim4_size,
+                     void *inputTensor_ptr) {
+  struct Tensor *inputTensor = (struct Tensor *)inputTensor_ptr;
-void* copyInputBatch(const char* file_name, 
-		    int start, int end,
-		    int dim2_size, int dim3_size, int dim4_size,
-		    void* inputTensor_ptr){
-  struct Tensor* inputTensor = (struct Tensor*) inputTensor_ptr;
   int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  long int file_header_size =
+      type_size * start * dim2_size * dim3_size * dim4_size;
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
   initTensorData(inputTensor, tensor_data, size_in_bytes);
   printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims);
-  if(inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
+  if (inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
     printf("ERROR: NULL data pointers \n");
-  // Chaning Tensor Placement to HOST 
+  // Changing Tensor Placement to HOST
   changeTensorPlacement(inputTensor, HOST);
   return inputTensor;
+uint8_t *readLabels(const char *labels_file, int num_labels) {
-uint8_t* readLabels(const char* labels_file, int num_labels){
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
@@ -406,17 +381,15 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
   size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
   return labels;
+uint32_t *readLabels3(const char *labels_file, int num_labels) {
-uint32_t* readLabels3(const char* labels_file, int num_labels){
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
@@ -424,264 +397,248 @@ uint32_t* readLabels3(const char* labels_file, int num_labels){
   size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
   return labels;
-uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+uint8_t *readLabelsBatch(const char *labels_file, int start, int end) {
   int num_labels = end - start;
   int file_header_size = sizeof(uint8_t) * start;
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
   // printf("--labels bytes_read = %lu \n", bytes_read);
   return labels;
-uint32_t* readLabelsBatch3(const char* labels_file, int start, int end){
+uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) {
   int num_labels = end - start;
   int file_header_size = sizeof(uint32_t) * start;
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
+  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
   return labels;
+void computeAccuracy(const char *labels_file, int num_labels,
+                     void *result_ptr) {
+  struct Tensor *result = (struct Tensor *)result_ptr;
-void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  uint8_t* labels = readLabels(labels_file, num_labels);
+  uint8_t *labels = readLabels(labels_file, num_labels);
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
-  for(int i = 0; i < batch_dim; i++){
+  for (int i = 0; i < batch_dim; i++) {
     int chosen = 0;
-    for (int id = 1; id < 10; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
+    for (int id = 1; id < 10; ++id) {
+      if (data[i * channels + chosen] < data[i * channels + id])
+        chosen = id;
-    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
-    if(chosen != labels[i])
+    // printf("chosen = %d, label = %d \n", chosen, labels[i]);
+    if (chosen != labels[i])
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
+// NOTE: batch_size and num_classes are Unused arguments
+float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr,
+                       size_t num_classes = 10) {
+  struct Tensor *result = (struct Tensor *)result_ptr;
-// NOTE: batch_size and num_classes are Unused arguments 
-float computeAccuracy2(uint8_t* labels, int batch_size,
-		       void* result_ptr, size_t num_classes = 10){
-  struct Tensor* result = (struct Tensor*) result_ptr;
   size_t batch_dim = result->dims.dim_sizes[0];
   num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes);
-  for(unsigned int i = 0; i < batch_dim; i++){ 
+  for (unsigned int i = 0; i < batch_dim; i++) {
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
-    if(chosen != labels[i])
-      num_errors++;
+    if (chosen != labels[i])
+      num_errors++;
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  return accuracy;    
+  return accuracy;
+float computeAccuracy3(uint32_t *labels, void *result_ptr) {
+  struct Tensor *result = (struct Tensor *)result_ptr;
-float computeAccuracy3(uint32_t* labels, void* result_ptr){
-  struct Tensor* result = (struct Tensor*) result_ptr;
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
   printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
-  for(int i = 0; i < batch_dim; i++){
+  for (int i = 0; i < batch_dim; i++) {
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
-    if(chosen != labels[i])
+    if (chosen != labels[i])
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  return accuracy;    
+  return accuracy;
-struct ClassProb{
+struct ClassProb {
   float prob;
   int index;
-bool descendFloatComp(ClassProb obj1, ClassProb obj2){
+bool descendFloatComp(ClassProb obj1, ClassProb obj2) {
   return obj1.prob > obj2.prob;
+float computeTop5Accuracy(uint8_t *labels, int num_labels, void *result_ptr,
+                          unsigned num_classes = 10) {
+  struct Tensor *result = (struct Tensor *)result_ptr;
-float computeTop5Accuracy(uint8_t* labels, int num_labels,
-			  void* result_ptr, unsigned num_classes = 10){
-  struct Tensor* result = (struct Tensor*) result_ptr;
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  for(int i = 0; i < num_labels; i++){
+  for (int i = 0; i < num_labels; i++) {
     std::vector<ClassProb> elem_probs;
-    for (int id = 0; id < num_classes; ++id){
+    for (int id = 0; id < num_classes; ++id) {
       ClassProb cProb;
       cProb.prob = data[i * channels + id];
       cProb.index = id;
-      elem_probs.push_back(cProb);   
+      elem_probs.push_back(cProb);
-    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
+  std:
+    sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
     // Check if any of top-5 predictions matches
     bool matched = false;
-    for(int j = 0; j < 5; j++){
+    for (int j = 0; j < 5; j++) {
       ClassProb cProb = elem_probs[j];
-      if(cProb.index == labels[i])
+      if (cProb.index == labels[i])
         matched = true;
-    if(!matched)
-      num_errors +=1; 
+    if (!matched)
+      num_errors += 1;
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  return accuracy;    
+  return accuracy;
-void dumpFinalAccuracy(float accuracy){
+void dumpFinalAccuracy(float accuracy) {
   printf("\n\n **** Final Accuracy = %f \n", accuracy);
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
@@ -690,44 +647,37 @@ void dumpFinalAccuracy(float accuracy){
+void dumpAvgPSNR(float avg_psnr) {
-void dumpAvgPSNR(float avg_psnr){
-  FILE* fp = fopen("avg_psnr", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("avg_psnr", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << avg_psnr;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
+void dumpPSNRStd(float psnr_std) {
-void dumpPSNRStd(float psnr_std){
-  FILE* fp = fopen("psnr_std.txt", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("psnr_std.txt", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << psnr_std;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
+void dumpExecutionAccuracies() {
-void dumpExecutionAccuracies(){
-  FILE* fp = fopen("run_accuracies.txt", "w+");
-  if(fp != NULL){  
-    for (int i = 0; i < run_accuracies.size(); i++){
+  FILE *fp = fopen("run_accuracies.txt", "w+");
+  if (fp != NULL) {
+    for (int i = 0; i < run_accuracies.size(); i++) {
       float accuracy = run_accuracies[i];
       std::ostringstream ss;
       ss << std::fixed << accuracy;
@@ -735,63 +685,60 @@ void dumpExecutionAccuracies(){
       fwrite(print_str.c_str(), 1, print_str.length(), fp);
       fwrite("\n", 1, 1, fp);
-float readPSNRFromFile(const char* file_name){
+float readPSNRFromFile(const char *file_name) {
   float psnr;
-  FILE* pFile = fopen(file_name, "r");
-  if(pFile == NULL){
+  FILE *pFile = fopen(file_name, "r");
+  if (pFile == NULL) {
     printf("ERROR: psnr.txt not found! \n");
   fscanf(pFile, "%f", &psnr);
   printf("**** PSNR read = %f \n\n", psnr);
-  return psnr; 
+  return psnr;
+float computePSNRViolation(void *gold_ptr, void *approx_ptr,
+                           float PSNR_threshold) {
-float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){
   PSNR_threshold = readPSNRFromFile("psnr.txt");
   std::vector<float> psnr_list;
-  struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
-  struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;
-  size_t* dim_sizes = gold_tensor->dims.dim_sizes;
+  struct Tensor *gold_tensor = (struct Tensor *)gold_ptr;
+  struct Tensor *approx_tensor = (struct Tensor *)approx_ptr;
+  size_t *dim_sizes = gold_tensor->dims.dim_sizes;
   size_t batch_dim = dim_sizes[0];
   size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
   printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
-  float* gold_data = (float*) gold_tensor->host_data;
-  float* approx_data = (float*) approx_tensor->host_data;
-  FILE* fp = fopen("img_psnr.txt", "w+");
+  float *gold_data = (float *)gold_tensor->host_data;
+  float *approx_data = (float *)approx_tensor->host_data;
+  FILE *fp = fopen("img_psnr.txt", "w+");
   float sum_psnr = 0.0;
-  int num_errors = 0;  
-  for(size_t i = 0; i < batch_dim; i++){
+  int num_errors = 0;
+  for (size_t i = 0; i < batch_dim; i++) {
     float mse_sum = 0.0;
-    float max_val = -999999;     
+    float max_val = -999999;
     size_t offset = i * image_size;
-    for(size_t j = 0; j < image_size; j++){
+    for (size_t j = 0; j < image_size; j++) {
       float diff = gold_data[offset + j] - approx_data[offset + j];
       float diff_square = diff * diff;
       mse_sum += diff_square;
-      if(max_val < gold_data[offset + j]){
-	max_val = gold_data[offset + j];
-      }   
+      if (max_val < gold_data[offset + j]) {
+        max_val = gold_data[offset + j];
+      }
     mse_sum = mse_sum / image_size;
@@ -799,7 +746,7 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol
     sum_psnr += psnr;
     if (psnr < PSNR_threshold)
-      num_errors += 1;    
+      num_errors += 1;
     printf("PSNR value = %f \n", psnr);
@@ -817,39 +764,35 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol
   float avg_psnr = sum_psnr / batch_dim;
   printf("*** avg_psnr =  %f \n\n", avg_psnr);
   float success_rate = 100.0 - violation_rate;
   float var = 0.0;
-  for(size_t i = 0; i < batch_dim; i++){
-    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); 
+  for (size_t i = 0; i < batch_dim; i++) {
+    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr);
   var /= batch_dim;
   float std = sqrt(var);
-  return violation_rate;  
+  return violation_rate;
-void dumpOutput(void* output_ptr, const char* file_name){
+void dumpOutput(void *output_ptr, const char *file_name) {
-  struct Tensor* out_tensor = (struct Tensor*) output_ptr;  
+  struct Tensor *out_tensor = (struct Tensor *)output_ptr;
   size_t size_in_bytes = out_tensor->size_in_bytes;
-  printf ("** Output size = %lu \n", size_in_bytes);
-  float* host_data = (float*) out_tensor->host_data; 
-  FILE* fd = fopen(file_name, "w+");
+  printf("** Output size = %lu \n", size_in_bytes);
+  float *host_data = (float *)out_tensor->host_data;
+  FILE *fd = fopen(file_name, "w+");
   fwrite(host_data, 1, size_in_bytes, fd);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/lit.cfg.py b/hpvm/test/dnn_benchmarks/hpvm-c/lit.cfg.py
new file mode 100644
index 0000000000000000000000000000000000000000..74703a5c140ee2b05cec7024d7335c166af4a1e4
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/lit.cfg.py
@@ -0,0 +1,27 @@
+# -*- Python -*-
+# Configuration file for the 'lit' test runner.
+import os
+import lit.formats
+from lit.llvm import llvm_config
+# name: The name of this test suite.
+config.name = "HPVM-DNN"
+# testFormat: The test format to use to interpret tests.
+config.test_format = lit.formats.ShTest(False)
+# suffixes: A list of file extensions to treat as test files. This is overridden
+# by individual lit.local.cfg files in the test subdirectories.
+config.suffixes = [".test"]
+# test_source_root: The root path where tests are located.
+# test_exec_root: The root path where tests should be run.
+current_source_dir = os.path.dirname(os.path.relpath(__file__, config.llvm_src_root))
+current_binary_dir = os.path.join(config.llvm_obj_root, current_source_dir)
+config.test_source_root = config.test_exec_root = current_binary_dir
+# Tweak the PATH to include the tools dir.
+llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True)
diff --git a/hpvm/test/dnn_benchmarks/profiling/CMakeLists.txt b/hpvm/test/dnn_benchmarks/profiling/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..712741c0e347acfc84e37bc2c91d998f549c7077
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/CMakeLists.txt
@@ -0,0 +1,14 @@
+# --[ llvm-lit test setup
+# lit.cfg.py finds the tests in the source directory but runs them in
+# CMAKE_CURRENT_BINARY_DIR (see lit.cfg.py), as most of the tests require
+# some kind of compilation / generation which is best done over there.
+  ../../lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+add_lit_testsuite(check-hpvm-profiler "Run tests for package hpvm-profiler"
+  DEPENDS dnn_benchmarks  # Requires all dnn benchmarks
diff --git a/hpvm/test/dnn_benchmarks/profiling/alexnet2_cifar10.test b/hpvm/test/dnn_benchmarks/profiling/alexnet2_cifar10.test
new file mode 100644
index 0000000000000000000000000000000000000000..455a3e75a7aff4ac76123cb62e860701e8397713
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/alexnet2_cifar10.test
@@ -0,0 +1 @@
+RUN: test_hpvm_c_profiling.py alexnet2_cifar10
\ No newline at end of file
diff --git a/hpvm/test/dnn_benchmarks/profiling/alexnet_cifar10.test b/hpvm/test/dnn_benchmarks/profiling/alexnet_cifar10.test
new file mode 100644
index 0000000000000000000000000000000000000000..62c667a249e514a17f8ea809f364c4e65c3332dd
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/alexnet_cifar10.test
@@ -0,0 +1 @@
+RUN: test_hpvm_c_profiling.py alexnet_cifar10
\ No newline at end of file
diff --git a/hpvm/test/dnn_benchmarks/profiling/jetson_clocks.sh b/hpvm/test/dnn_benchmarks/profiling/jetson_clocks.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a15af4ba731b3ec175f4cfc09602a519831d79d6
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/jetson_clocks.sh
@@ -0,0 +1,400 @@
+# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+NC='\e[0m' # No Color
+	if [ "$1" != "" ]; then
+		echo -e ${RED}"$1"${NC}
+	fi
+		cat >& 2 <<EOF
+Maximize jetson performance by setting static max frequency to CPU, GPU and EMC clocks.
+jetson_clocks.sh [options]
+  options,
+  --show             display current settings
+  --store [file]     store current settings to a file (default: \${HOME}/l4t_dfs.conf)
+  --restore [file]   restore saved settings from a file (default: \${HOME}/l4t_dfs.conf)
+  run jetson_clocks.sh without any option to set static max frequency to CPU, GPU and EMC clocks.
+	exit 0
+	for conf in `cat "${CONF_FILE}"`; do
+		file=`echo $conf | cut -f1 -d :`
+		data=`echo $conf | cut -f2 -d :`
+		case "${file}" in
+			/sys/devices/system/cpu/cpu*/online |\
+			/sys/kernel/debug/clk/override*/state)
+				if [ `cat $file` -ne $data ]; then
+					echo "${data}" > "${file}"
+				fi
+				;;
+			/sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq |\
+			/sys/kernel/debug/tegra_cpufreq/*_CLUSTER/cc3/enable)
+				echo "${data}" > "${file}" 2>/dev/null
+				;;
+			*)
+				echo "${data}" > "${file}"
+				ret=$?
+				if [ ${ret} -ne 0 ]; then
+					echo "Error: Failed to restore $file"
+				fi
+				;;
+		esac
+	done
+	for file in $@; do
+		if [ -e "${file}" ]; then
+			echo "${file}:`cat ${file}`" >> "${CONF_FILE}"
+		fi
+	done
+	# Jetson-TK1 CPU fan is always ON.
+	if [ "${machine}" = "jetson-tk1" ] ; then
+			return
+	fi
+	if [ ! -w /sys/kernel/debug/tegra_fan/target_pwm ]; then
+		echo "Can't access Fan!"
+		return
+	fi
+	case "${ACTION}" in
+		show)
+			echo "Fan: speed=`cat /sys/kernel/debug/tegra_fan/target_pwm`"
+			;;
+		store)
+			store "/sys/kernel/debug/tegra_fan/target_pwm"
+			;;
+		*)
+			FAN_SPEED=255
+			echo "${FAN_SPEED}" > /sys/kernel/debug/tegra_fan/target_pwm
+			;;
+	esac
+	case "${ACTION}" in
+		show)
+			if [ -d "/sys/kernel/cluster" ]; then
+				ACTIVE_CLUSTER=`cat /sys/kernel/cluster/active`
+				echo "CPU Cluster Switching: Active Cluster ${ACTIVE_CLUSTER}"
+			else
+				echo "CPU Cluster Switching: Disabled"
+			fi
+			;;
+		store)
+			if [ -d "/sys/kernel/cluster" ]; then
+				store "/sys/kernel/cluster/immediate"
+				store "/sys/kernel/cluster/force"
+				store "/sys/kernel/cluster/active"
+			fi
+			;;
+		*)
+			if [ -d "/sys/kernel/cluster" ]; then
+				echo 1 > /sys/kernel/cluster/immediate
+				echo 0 > /sys/kernel/cluster/force
+				echo G > /sys/kernel/cluster/active
+			fi
+			;;
+	esac
+	case "${ACTION}" in
+		show)
+			echo "Online CPUs: `cat /sys/devices/system/cpu/online`"
+			;;
+		store)
+			for file in /sys/devices/system/cpu/cpu[0-9]/online; do
+				store "${file}"
+			done
+			;;
+		*)
+			if [ "${SOCFAMILY}" != "tegra186" ]; then
+				for file in /sys/devices/system/cpu/cpu*/online; do
+					if [ `cat $file` -eq 0 ]; then
+						echo 1 > "${file}"
+					fi
+				done
+			fi
+	esac
+	FREQ_GOVERNOR="cpufreq/scaling_governor"
+	CPU_MIN_FREQ="cpufreq/scaling_min_freq"
+	CPU_MAX_FREQ="cpufreq/scaling_max_freq"
+	CPU_CUR_FREQ="cpufreq/scaling_cur_freq"
+	CPU_SET_SPEED="cpufreq/scaling_setspeed"
+	INTERACTIVE_SETTINGS="/sys/devices/system/cpu/cpufreq/interactive"
+	SCHEDUTIL_SETTINGS="/sys/devices/system/cpu/cpufreq/schedutil"
+	case "${ACTION}" in
+		show)
+			for folder in /sys/devices/system/cpu/cpu[0-9]; do
+				CPU=`basename ${folder}`
+				if [ -e "${folder}/${FREQ_GOVERNOR}" ]; then
+					echo "$CPU: Gonvernor=`cat ${folder}/${FREQ_GOVERNOR}`" \
+						"MinFreq=`cat ${folder}/${CPU_MIN_FREQ}`" \
+						"MaxFreq=`cat ${folder}/${CPU_MAX_FREQ}`" \
+						"CurrentFreq=`cat ${folder}/${CPU_CUR_FREQ}`"
+				fi
+			done
+			;;
+		store)
+			store "/sys/module/qos/parameters/enable"
+			for file in \
+				/sys/devices/system/cpu/cpu[0-9]/cpufreq/scaling_min_freq; do
+				store "${file}"
+			done
+			if [ "${SOCFAMILY}" = "tegra186" ]; then
+				store "/sys/kernel/debug/tegra_cpufreq/M_CLUSTER/cc3/enable"
+				store "/sys/kernel/debug/tegra_cpufreq/B_CLUSTER/cc3/enable"
+			fi
+			;;
+		*)
+			echo 0 > /sys/module/qos/parameters/enable
+			if [ "${SOCFAMILY}" = "tegra186" ]; then
+				echo 0 > /sys/kernel/debug/tegra_cpufreq/M_CLUSTER/cc3/enable 2>/dev/null
+				echo 0 > /sys/kernel/debug/tegra_cpufreq/B_CLUSTER/cc3/enable 2>/dev/null
+			fi
+			for folder in /sys/devices/system/cpu/cpu[0-9]; do
+				cat "${folder}/${CPU_MAX_FREQ}" > "${folder}/${CPU_MIN_FREQ}" 2>/dev/null
+			done
+			;;
+	esac
+	case "${SOCFAMILY}" in
+		tegra186)
+			GPU_MIN_FREQ="/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"
+			GPU_MAX_FREQ="/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"
+			GPU_CUR_FREQ="/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"
+			GPU_RAIL_GATE="/sys/devices/17000000.gp10b/railgate_enable"
+			;;
+		tegra210)
+			GPU_MIN_FREQ="/sys/devices/57000000.gpu/devfreq/57000000.gpu/min_freq"
+			GPU_MAX_FREQ="/sys/devices/57000000.gpu/devfreq/57000000.gpu/max_freq"
+			GPU_CUR_FREQ="/sys/devices/57000000.gpu/devfreq/57000000.gpu/cur_freq"
+			GPU_RAIL_GATE="/sys/devices/57000000.gpu/railgate_enable"
+			;;
+		*)
+			echo "Error! unsupported SOC ${SOCFAMILY}"
+			exit 1;
+			;;
+	esac
+	case "${ACTION}" in
+		show)
+			echo "GPU MinFreq=`cat ${GPU_MIN_FREQ}`" \
+				"MaxFreq=`cat ${GPU_MAX_FREQ}`" \
+				"CurrentFreq=`cat ${GPU_CUR_FREQ}`"
+			;;
+		store)
+			store "${GPU_MIN_FREQ}"
+			store "${GPU_RAIL_GATE}"
+			;;
+		*)
+			echo 0 > "${GPU_RAIL_GATE}"
+			cat "${GPU_MAX_FREQ}" > "${GPU_MIN_FREQ}"
+			ret=$?
+			if [ ${ret} -ne 0 ]; then
+				echo "Error: Failed to max GPU frequency!"
+			fi
+			;;
+	esac
+	case "${SOCFAMILY}" in
+		tegra186)
+			EMC_ISO_CAP="/sys/kernel/nvpmodel_emc_cap/emc_iso_cap"
+			EMC_MIN_FREQ="/sys/kernel/debug/bpmp/debug/clk/emc/min_rate"
+			EMC_MAX_FREQ="/sys/kernel/debug/bpmp/debug/clk/emc/max_rate"
+			EMC_CUR_FREQ="/sys/kernel/debug/clk/emc/clk_rate"
+			EMC_UPDATE_FREQ="/sys/kernel/debug/bpmp/debug/clk/emc/rate"
+			EMC_FREQ_OVERRIDE="/sys/kernel/debug/bpmp/debug/clk/emc/mrq_rate_locked"
+			;;
+		tegra210)
+			EMC_MIN_FREQ="/sys/kernel/debug/tegra_bwmgr/emc_min_rate"
+			EMC_MAX_FREQ="/sys/kernel/debug/tegra_bwmgr/emc_max_rate"
+			EMC_CUR_FREQ="/sys/kernel/debug/clk/override.emc/clk_rate"
+			EMC_UPDATE_FREQ="/sys/kernel/debug/clk/override.emc/clk_update_rate"
+			EMC_FREQ_OVERRIDE="/sys/kernel/debug/clk/override.emc/clk_state"
+			;;
+		*)
+			echo "Error! unsupported SOC ${SOCFAMILY}"
+			exit 1;
+			;;
+	esac
+	if [ "${SOCFAMILY}" = "tegra186" ]; then
+		emc_cap=`cat "${EMC_ISO_CAP}"`
+		emc_fmax=`cat "${EMC_MAX_FREQ}"`
+		if [ "$emc_cap" -gt 0 ] && [ "$emc_cap" -lt  "$emc_fmax" ]; then
+		fi
+	fi
+	case "${ACTION}" in
+		show)
+			echo "EMC MinFreq=`cat ${EMC_MIN_FREQ}`" \
+				"MaxFreq=`cat ${EMC_MAX_FREQ}`" \
+				"CurrentFreq=`cat ${EMC_CUR_FREQ}`" \
+				"FreqOverride=`cat ${EMC_FREQ_OVERRIDE}`"
+			;;
+		store)
+			store "${EMC_FREQ_OVERRIDE}"
+			;;
+		*)
+			cat "${EMC_MAX_FREQ}" > "${EMC_UPDATE_FREQ}"
+			echo 1 > "${EMC_FREQ_OVERRIDE}"
+			;;
+	esac
+main ()
+	while [ -n "$1" ]; do
+		case "$1" in
+			--show)
+				echo "SOC family:${SOCFAMILY}  Machine:${machine}"
+				ACTION=show
+				;;
+			--store)
+				[ -n "$2" ] && CONF_FILE=$2
+				ACTION=store
+				shift 1
+				;;
+			--restore)
+				[ -n "$2" ] && CONF_FILE=$2
+				ACTION=restore
+				shift 1
+				;;
+			-h|--help)
+				usage
+				exit 0
+				;;
+			*)
+				usage "Unknown option: $1"
+				exit 1
+				;;
+		esac
+		shift 1
+	done
+	[ `whoami` != root ] && \
+		echo Error: Run this script\($0\) as a root user && exit 1
+	case $ACTION in
+		store)
+			if [ -e "${CONF_FILE}" ]; then
+				echo "File $CONF_FILE already exists. Can I overwrite it? Y/N:"
+				read answer
+				case $answer in
+					y|Y)
+						rm -f $CONF_FILE
+						;;
+					*)
+						echo "Error: file $CONF_FILE already exists!"
+						exit 1
+						;;
+				esac
+			fi
+			;;
+		restore)
+			if [ ! -e "${CONF_FILE}" ]; then
+				echo "Error: $CONF_FILE file not found !"
+				exit 1
+			fi
+			restore
+			exit 0
+			;;
+	esac
+	do_hotplug
+	do_clusterswitch
+	do_cpu
+	do_gpu
+	do_emc
+	do_fan
+if [ -e "/sys/devices/soc0/family" ]; then
+	SOCFAMILY="`cat /sys/devices/soc0/family`"
+	if [ -e "/sys/devices/soc0/machine" ]; then
+		machine=`cat /sys/devices/soc0/machine`
+	fi
+elif [ -e "/proc/device-tree/compatible" ]; then
+	grep "nvidia,tegra210" /proc/device-tree/compatible &>/dev/null
+	if [ $? -eq 0 ]; then
+		SOCFAMILY="tegra210"
+	else
+		grep "nvidia,tegra186" /proc/device-tree/compatible &>/dev/null
+		if [ $? -eq 0 ]; then
+			SOCFAMILY="tegra186"
+		fi
+	fi
+	if [ -e "/proc/device-tree/model" ]; then
+		machine="`cat /proc/device-tree/model`"
+	fi
+main $@
+exit 0
diff --git a/hpvm/test/dnn_benchmarks/profiling/lenet_mnist.test b/hpvm/test/dnn_benchmarks/profiling/lenet_mnist.test
new file mode 100644
index 0000000000000000000000000000000000000000..88856a8913f2c9fb275187d65d443c50aa8bf583
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/lenet_mnist.test
@@ -0,0 +1 @@
+RUN: test_hpvm_c_profiling.py lenet_mnist
\ No newline at end of file
diff --git a/hpvm/test/dnn_benchmarks/profiling/lit.cfg.py b/hpvm/test/dnn_benchmarks/profiling/lit.cfg.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3584478209402a308ed17ba2c3e5994a49dab76
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/lit.cfg.py
@@ -0,0 +1,34 @@
+# -*- Python -*-
+# Configuration file for the 'lit' test runner.
+import os
+import lit.formats
+from lit.llvm import llvm_config
+# name: The name of this test suite.
+config.name = "HPVM-Profiler"
+# testFormat: The test format to use to interpret tests.
+config.test_format = lit.formats.ShTest(False)
+# suffixes: A list of file extensions to treat as test files. This is overridden
+# by individual lit.local.cfg files in the test subdirectories.
+config.suffixes = [".test"]
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+# test_exec_root: The root path where tests should be run.
+current_source_dir = os.path.dirname(os.path.relpath(__file__, config.llvm_src_root))
+current_binary_dir = os.path.join(config.llvm_obj_root, current_source_dir)
+config.test_exec_root = current_binary_dir
+# Tweak the PATH to include the tools dir.
+llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True)
+# Add substitution for our main script in this directory.
+llvm_config.add_tool_substitutions(["test_hpvm_c_profiling.py"], config.test_source_root)
diff --git a/hpvm/test/dnn_benchmarks/profiling/mobilenet_cifar10.test b/hpvm/test/dnn_benchmarks/profiling/mobilenet_cifar10.test
new file mode 100644
index 0000000000000000000000000000000000000000..a40981c9408b52f45ae9a58ab3895e12889bf665
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/mobilenet_cifar10.test
@@ -0,0 +1 @@
+RUN: test_hpvm_c_profiling.py mobilenet_cifar10
\ No newline at end of file
diff --git a/hpvm/test/dnn_benchmarks/profiling/resnet18_cifar10.test b/hpvm/test/dnn_benchmarks/profiling/resnet18_cifar10.test
new file mode 100644
index 0000000000000000000000000000000000000000..5d09297309e6f2ac48c23e0c529021144d6734e7
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/resnet18_cifar10.test
@@ -0,0 +1 @@
+RUN: test_hpvm_c_profiling.py resnet18_cifar10
\ No newline at end of file
diff --git a/hpvm/test/dnn_benchmarks/profiling/test_hpvm_c_profiling.py b/hpvm/test/dnn_benchmarks/profiling/test_hpvm_c_profiling.py
new file mode 100755
index 0000000000000000000000000000000000000000..5f4a96740cedb05295e4fcde0c5dfa65a0be34cc
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/profiling/test_hpvm_c_profiling.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+from pathlib import Path
+from sys import argv
+from hpvm_profiler import profile_configs, read_hpvm_configs
+# relative to cwd()
+benchmarks_bindir = Path("../hpvm-c")
+# relative to location of this file
+benchmarks_srcdir = Path(__file__).parent / "../hpvm-c/benchmarks"
+# We're called in the "current" binary directory.
+# For example (depending on where build dir is),
+# "hpvm/build/tools/hpvm/test/dnn_benchmarks/profiling".
+# So we know where the benchmark binaries are due to source directory structure,
+# and this is not hardcoding.
+dnn = argv[1]
+bench_bin_file = benchmarks_bindir / dnn
+config_file = benchmarks_srcdir / dnn / "data/tuner_confs.txt"
+out_config_file = f"./{dnn}.txt"
+profile_configs(bench_bin_file, config_file, out_config_file)
diff --git a/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py b/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py
index 19f17366459a7684c6df8a940438b661cf7f6029..2fb1de17ee226571e6cd6b808640bf35280932db 100644
--- a/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py
+++ b/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py
@@ -39,7 +39,7 @@ for model_cls, nch, img_size, batch_size, pathname in benchmarks:
         params / "test_input.bin", params / "test_labels.bin", dataset_shape
     model: Module = model_cls()
-    checkpoint = self_folder / "../model_params" / f"{pathname}.pth.tar"
+    checkpoint = self_folder / "../model_params/pytorch" / f"{pathname}.pth.tar"
     build_dir = codegen_dir / "build"
diff --git a/hpvm/test/dnn_benchmarks/scripts/run_dnn.py b/hpvm/test/dnn_benchmarks/scripts/run_dnn.py
deleted file mode 100644
index 2eed6739a76c7251ea60ed77df9730b0be9ac034..0000000000000000000000000000000000000000
--- a/hpvm/test/dnn_benchmarks/scripts/run_dnn.py
+++ /dev/null
@@ -1,212 +0,0 @@
-import os.path
-from os import path
-import sys
-#import matplotlib.pyplot as plt 
-binary_dir = "../../../build/tools/hpvm/test/dnn_benchmarks/"
-accuracy_file = "final_accuracy"
-profile_file = "profile_data.txt"
-profile_file_prefix = "profile_info_"
-temp_file_name = "temp.txt"
-pred_binary_prefix = "test_"
-pred_binary_suffix = "_pred" 
-rt_binary_suffix = "_rt_pred"
-max_num_runs = 20
-def max_num_configs (config_file):
-    num_configs = 0
-    with open(config_file, "r") as f:
-        for line in f:
-            if "conf" in line:
-                num_configs = num_configs + 1
-    return (num_configs + 1)
-def read_and_write_config (config_file, config_num, temp_file):
-    config = ""
-    print("--CONFIG FILE: " + config_file)
-    print("--CONFIG NUM: " + str(config_num))
-    print("--TEMP FILE: " + temp_file)
-    with open(config_file, "r") as f:
-        conf = "conf" + str(config_num)
-        read_config = False
-        read_first_line = False
-        for line in f:
-            if read_first_line == False:
-                config = config + line
-                read_first_line = True
-                continue
-            if "-----" in line and read_config == True:
-                read_config = False
-                config = config + line
-                break
-            if read_config == True:
-                config = config + line
-                continue
-            if conf in line:
-                read_config = True
-                config = config + "+++++\n"
-                config = config + line
-    print("config: ")
-    print(config)
-    with open(temp_file, "w") as f:
-        f.write(config)
-def get_avg_exec_time(profile_file_path, config_num):
-    prof_file = profile_file_path + profile_file_prefix + str(config_num) + ".txt"
-    print("PROFILE FILE: " + prof_file)
-    with open(prof_file, "r") as f:
-            for line in f:
-                if "Total Time" in line:
-                    print("LINE: " + line)
-                    time = line.strip().split() [3]
-                    print("TIME: " + time)
-                    return float(time)
-    print("ERROR")
-    sys.exit()
-    return float(-1)
-def get_exec_time(config_file):
-    print("CONFIG FILE: " + config_file)
-    with open(config_file, "r") as f:
-        for line in f:
-            if "conf" in line:
-                print("LINE: " + line)
-                time = line.strip().split() [1]
-                print("TIME: " + time)
-                return float(time)
-    print("ERROR")
-    sys.exit()
-    return float(-1)
-def get_avg_exec_accuracy(file_name):
-    with open(file_name, "r") as f:
-        for line in f:
-            accuracy = line.strip().split() [0]
-            print("ACCURACY: " + accuracy)
-            return float(accuracy)
-    print("ERROR")
-    sys.exit()
-    return float(-1)
-def get_exec_accuracy(config_file):
-    with open(config_file, "r") as f:
-        for line in f:
-            if "conf" in line:
-                print("LINE: " + line)
-                acc = line.strip().split() [4]
-                print("ACCURACY: " + acc)
-                return float(acc)
-    print("ERROR")
-    sys.exit()
-    return float(-1)
-def predictive_tuning_exec(dnn_name):
-    #num_args = len(sys.argv)
-    #binary_files = list()
-    #arg = 2
-    #while arg < num_args:
-    #    binary_files.append(sys.argv[arg])
-    #    arg = arg + 1
-    #for dnn_name in binary_files:
-    dnn_dir = "../benchmarks/" + dnn_name
-    binary_name = binary_dir + pred_binary_prefix + dnn_name + pred_binary_suffix
-    pred_dir = dnn_dir + "/predictive/"
-    config_file = pred_dir + dnn_name + ".txt"
-    temp_file = pred_dir + temp_file_name
-    print("dnn_dir: " + dnn_dir)
-    print("binary name: " + binary_name)
-    print("pred_dir: " + pred_dir)
-    print("config_file: " + config_file)
-    print("temp_file: " + temp_file)
-    exec_command = "rm " + temp_file + " " + accuracy_file + " " + profile_file + " " + pred_dir + "profile*"
-    print(exec_command)
-    os.system(exec_command)
-    config_num = 1
-    max_configs = max_num_configs(config_file)
-    baseline_time = 0
-    baseline_acc = 0
-    print("MAX CONFIGS: " + str(max_configs))
-    perf_list = list()
-    acc_list = list()
-    while config_num < max_configs:
-        read_and_write_config(config_file, config_num, temp_file)
-        exec_command = binary_name
-        print(exec_command) 
-        os.system(exec_command)
-        time = get_avg_exec_time(pred_dir, config_num - 1)
-        acc = get_avg_exec_accuracy(accuracy_file)
-        config_time = get_exec_time(temp_file)
-        config_acc = get_exec_accuracy(temp_file)
-        if config_num == 1:
-            baseline_time = time
-            baseline_acc = acc 
-        else:
-            print("SPEEDUP: ")
-            print(baseline_time/time)
-            #time.append(baseline_time/time)
-            print("CONFIG TIME: ")
-            print(config_time)
-            print("ACC LOSS: ")
-            print(baseline_acc - acc)
-            #acc_list.append(baseline_acc - acc)
-            print("CONFIG ACC: ")
-            print(config_acc)
-        config_num = config_num + 1
-    #plt.plot(perf_list, acc_list)
-    #plt.xlabel("Speedups")
-    #plt.ylabel("Accurancy loss")
-    #plt.savefig(pred_dir + "tradeoff.pdf")
-    #exec_command = "rm " + temp_file + " " + accuracy_file + " " + profile_file + " " + pred_dir + "profile*"
-    #print(exec_command)
-    #os.system(exec_command)
-def runtime_tuning_exec():
-    num_args = len(sys.argv)
-    binary_files = list()
-    arg = 2
-    while arg < num_args:
-        binary_files.append(sys.argv[arg])
-        arg = arg + 1
-    for dnn_name in binary_files:
-        binary_dir = "../benchmarks/" + dnn_name
-        binary_name = binary_dir + rt_binary_suffix
-        conf_dir = binary_dir + "/data"
-        print("binary_dir: " + binary_dir)
-        print("binary name: " + binary_name)
-        run = 0
-        while run < max_num_runs:
-            exec_command = binary_name
-            print(exec_command)
-            os.system(exec_command)
-            exec_command = "/home/nvidia/poll 13"
-            print(exec_command)
-            os.system(exec_command)
-            exec_command = "mv " + conf_dir + "/profile_info_0.txt " + conf_dir + "/profile_info_out-run-" + str(run) + ".txt"
-            print(exec_command)
-            os.system(exec_command)
-            run = run + 1
-        exec_command = "rm -rf " + conf_dir +  "/run_data"
-        print(exec_command)
-        os.system(exec_command)
-        exec_command = "mkdir " + conf_dir + "/run_data"  
-        print(exec_command)
-        os.system(exec_command)  
-if __name__ == "__main__":
-    if sys.argv[1] == "--runtime_tuning":
-        runtime_tuning_exec()
-    else:
-        predictive_tuning_exec(sys.argv[1])
diff --git a/hpvm/test/dnn_benchmarks/scripts/run_dnns.py b/hpvm/test/dnn_benchmarks/scripts/run_dnns.py
deleted file mode 100644
index 0de85c7847309532db985d95aedbba02f2715059..0000000000000000000000000000000000000000
--- a/hpvm/test/dnn_benchmarks/scripts/run_dnns.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import os
-import sys
-dnns = ["alexnet", "alexnet2", "vgg16_cifar10", "vgg16_cifar100", "resnet18", "mobilenet_cifar10", "alexnet_imagenet", "resnet50_imagenet", "vgg16_imagenet", "lenet_mnist"]
-#dnns = ["resnet50_imagenet","alexnet"]
-#if sys.argv[1] == "--runtime":
-#	exec_command = "python3 run_dnn.py" + " --runtime_tuning  "  + dnns
-#	print(exec_command)
-#	os.system(exec_command)
-if __name__ == "__main__":
-    for dnn in dnns:
-        exec_command = "python3 run_dnn.py " + dnn 
-        print(exec_command)
-        os.system(exec_command)
diff --git a/hpvm/test/regressionTests/BuildDFG/AllocationNode.ll b/hpvm/test/hpvm_pass/BuildDFG/AllocationNode.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/AllocationNode.ll
rename to hpvm/test/hpvm_pass/BuildDFG/AllocationNode.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/CreateNode.ll b/hpvm/test/hpvm_pass/BuildDFG/CreateNode.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/CreateNode.ll
rename to hpvm/test/hpvm_pass/BuildDFG/CreateNode.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/CreateNodeAndEdge.ll b/hpvm/test/hpvm_pass/BuildDFG/CreateNodeAndEdge.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/CreateNodeAndEdge.ll
rename to hpvm/test/hpvm_pass/BuildDFG/CreateNodeAndEdge.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/LeafBindEdge.ll b/hpvm/test/hpvm_pass/BuildDFG/LeafBindEdge.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/LeafBindEdge.ll
rename to hpvm/test/hpvm_pass/BuildDFG/LeafBindEdge.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/LeafInComingBindEdge.ll b/hpvm/test/hpvm_pass/BuildDFG/LeafInComingBindEdge.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/LeafInComingBindEdge.ll
rename to hpvm/test/hpvm_pass/BuildDFG/LeafInComingBindEdge.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/LeafNodeGetters.ll b/hpvm/test/hpvm_pass/BuildDFG/LeafNodeGetters.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/LeafNodeGetters.ll
rename to hpvm/test/hpvm_pass/BuildDFG/LeafNodeGetters.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/OneRoot.ll b/hpvm/test/hpvm_pass/BuildDFG/OneRoot.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/OneRoot.ll
rename to hpvm/test/hpvm_pass/BuildDFG/OneRoot.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/ThreeLevel.ll b/hpvm/test/hpvm_pass/BuildDFG/ThreeLevel.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/ThreeLevel.ll
rename to hpvm/test/hpvm_pass/BuildDFG/ThreeLevel.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/ThreeLevelEdge.ll b/hpvm/test/hpvm_pass/BuildDFG/ThreeLevelEdge.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/ThreeLevelEdge.ll
rename to hpvm/test/hpvm_pass/BuildDFG/ThreeLevelEdge.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/TwoLaunch.ll b/hpvm/test/hpvm_pass/BuildDFG/TwoLaunch.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/TwoLaunch.ll
rename to hpvm/test/hpvm_pass/BuildDFG/TwoLaunch.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/TwoLevel.ll b/hpvm/test/hpvm_pass/BuildDFG/TwoLevel.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/TwoLevel.ll
rename to hpvm/test/hpvm_pass/BuildDFG/TwoLevel.ll
diff --git a/hpvm/test/regressionTests/BuildDFG/TwoLevelGraph.ll b/hpvm/test/hpvm_pass/BuildDFG/TwoLevelGraph.ll
similarity index 100%
rename from hpvm/test/regressionTests/BuildDFG/TwoLevelGraph.ll
rename to hpvm/test/hpvm_pass/BuildDFG/TwoLevelGraph.ll
diff --git a/hpvm/test/hpvm_pass/CMakeLists.txt b/hpvm/test/hpvm_pass/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fe929f4d6316b54e4b4b19db1221ae146534ee0b
--- /dev/null
+++ b/hpvm/test/hpvm_pass/CMakeLists.txt
@@ -0,0 +1,20 @@
+  ../lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  opt hpvm-rt.bc
+  # Passes:
+  # Test utils:
+  FileCheck count not
+add_lit_testsuite(check-hpvm-pass "Running HPVM test cases for the passes"
diff --git a/hpvm/test/regressionTests/ClearDFG/ThreeLevel.CPU.ll b/hpvm/test/hpvm_pass/ClearDFG/ThreeLevel.CPU.ll
similarity index 100%
rename from hpvm/test/regressionTests/ClearDFG/ThreeLevel.CPU.ll
rename to hpvm/test/hpvm_pass/ClearDFG/ThreeLevel.CPU.ll
diff --git a/hpvm/test/regressionTests/ClearDFG/ThreeLevel.OpenCLToCPU.ll b/hpvm/test/hpvm_pass/ClearDFG/ThreeLevel.OpenCLToCPU.ll
similarity index 100%
rename from hpvm/test/regressionTests/ClearDFG/ThreeLevel.OpenCLToCPU.ll
rename to hpvm/test/hpvm_pass/ClearDFG/ThreeLevel.OpenCLToCPU.ll
diff --git a/hpvm/test/regressionTests/ClearDFG/ThreeLevel.constmem.OpenCLToCPU.ll b/hpvm/test/hpvm_pass/ClearDFG/ThreeLevel.constmem.OpenCLToCPU.ll
similarity index 100%
rename from hpvm/test/regressionTests/ClearDFG/ThreeLevel.constmem.OpenCLToCPU.ll
rename to hpvm/test/hpvm_pass/ClearDFG/ThreeLevel.constmem.OpenCLToCPU.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/OneLevel.codeGen.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/OneLevel.codeGen.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/OneLevel.codeGen.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/OneLevel.codeGen.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/OneRootBasic.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/OneRootBasic.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/OneRootBasic.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/OneRootBasic.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.OpenCL.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.OpenCL.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.OpenCL.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.OpenCL.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.codeGen.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.codeGen.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.codeGen.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.codeGen.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.cond.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.cond.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.cond.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.cond.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.constmem.OpenCL.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.constmem.OpenCL.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/ThreeLevel.constmem.OpenCL.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/ThreeLevel.constmem.OpenCL.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/TwoLevel.codeGen.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/TwoLevel.codeGen.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/TwoLevel.codeGen.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/TwoLevel.codeGen.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_CPU/TwoRoot.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CPU/TwoRoot.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_CPU/TwoRoot.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_CPU/TwoRoot.ll
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/batchNorm.hpvm.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/batchNorm.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3afd9273f86c1bb58f85fbab9dce3ec14d8243ab
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/batchNorm.hpvm.ll
@@ -0,0 +1,151 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_CUDNN.so -S -inplace -dfg2llvm-cudnn < %s | FileCheck %s
+; ModuleID = 'batchNorm.ll'
+source_filename = "batchNorm.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9relu_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_mS_mS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initTensorRt(i32 0)
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupTensorRt()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NEXT: call void @hpvm_request_tensor(
+; CHECK-NEXT: call void @hpvm_request_tensor(
+; CHECK-NEXT: call void @hpvm_request_tensor(
+; CHECK-NEXT: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.batchnorm(i8* %t1, i8* %t2, i8* %t3, i8* %t4, i8* %t5, double 1.000000e-03)
+; CHECK: call i8* @tensorBatchNorm(i8* %t1, i8* %t2, i8* %t3, i8* %t4, i8* %t5, double 1.000000e-03)
+; CHECK: ret
+; CHECK-LABEL: @_Z9relu_nodePvm_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.relu(i8* %t1)
+; CHECK: call i8* @tensorRelu(i8* %t1)
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 96) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_mS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !5
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.relu(i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9relu_nodePvm @_Z9relu_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.relu(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9relu_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9relu_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9relu_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.batchnorm(i8*, i8*, i8*, i8*, i8*, double) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2, i8* in %t3, i64 %bytes_t3, i8* in %t4, i64 %bytes_t4, i8* in %t5, i64 %bytes_t5) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.batchnorm(i8* %t1, i8* %t2, i8* %t3, i8* %t4, i8* %t5, double 1.000000e-03)
+  %returnStruct = insertvalue %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z4rootPvmS_mS_mS_mS_m @_Z4rootPvmS_mS_mS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %batch_normalization_1_gamma, i64 %batch_normalization_1_gamma_bytes, i8* in %batch_normalization_1_beta, i64 %batch_normalization_1_beta_bytes, i8* in %batch_normalization_1_mean, i64 %batch_normalization_1_mean_bytes, i8* in %batch_normalization_1_variance, i64 %batch_normalization_1_variance_bytes) #4 {
+  %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 3, i32 3, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 4, i32 4, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 5, i32 5, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 6, i32 6, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 7, i32 7, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 8, i32 8, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 9, i32 9, i1 false)
+  %_Z9relu_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_mS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_cudnn = !{!2, !3}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{!4}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_promise = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5c964d17b48694847d60e6755519cbfa0603770f)"}
+!2 = !{%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned}
+!3 = !{%struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned}
+!4 = !{%struct.out._Z4rootPvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_mS_mS_m_cloned}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"_ZTS6RootIn", !7, i64 0, !10, i64 8, !7, i64 16, !10, i64 24, !7, i64 32, !10, i64 40, !7, i64 48, !10, i64 56, !7, i64 64, !10, i64 72, !11, i64 80}
+!7 = !{!"any pointer", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!"long", !8, i64 0}
+!11 = !{!"_ZTS5ret_t", !7, i64 0, !10, i64 8}
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/matMul.hpvm.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/matMul.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..978f3f87001249e111b56584fd85f6a28bfb30f7
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/matMul.hpvm.ll
@@ -0,0 +1,135 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_CUDNN.so -S -inplace -dfg2llvm-cudnn < %s | FileCheck %s
+; ModuleID = 'matMul.ll'
+source_filename = "matMul.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9relu_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z11matMul_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initTensorRt(i32 0)
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupTensorRt()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z11matMul_nodePvmS_m_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NEXT: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.mul(i8* %t1, i8* %t2)
+; CHECK: call i8* @tensorGemmGPU(i8* %t1, i8* %t2)
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 48) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_m (i8*, i64, i8*, i64)* @_Z4rootPvmS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !5
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.relu(i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9relu_nodePvm @_Z9relu_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.relu(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9relu_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9relu_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9relu_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.mul(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z11matMul_nodePvmS_m @_Z11matMul_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.mul(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z11matMul_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z11matMul_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z11matMul_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z4rootPvmS_m @_Z4rootPvmS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %m1, i64 %m1_bytes) #4 {
+  %_Z11matMul_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z11matMul_nodePvmS_m (i8*, i64, i8*, i64)* @_Z11matMul_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z11matMul_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matMul_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matMul_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matMul_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z9relu_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z11matMul_nodePvmS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z11matMul_nodePvmS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_cudnn = !{!2, !3}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{!4}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_promise = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5c964d17b48694847d60e6755519cbfa0603770f)"}
+!2 = !{%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned}
+!3 = !{%struct.out._Z11matMul_nodePvmS_m (i8*, i64, i8*, i64)* @_Z11matMul_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z4rootPvmS_m (i8*, i64, i8*, i64)* @_Z4rootPvmS_m_cloned}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"_ZTS6RootIn", !7, i64 0, !10, i64 8, !7, i64 16, !10, i64 24, !11, i64 32}
+!7 = !{!"any pointer", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!"long", !8, i64 0}
+!11 = !{!"_ZTS5ret_t", !7, i64 0, !10, i64 8}
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/meanPoolWithSoftmax.hpvm.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/meanPoolWithSoftmax.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c42ce49d803ca82cd559f7ab3b3c8107fe0a9b0f
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/meanPoolWithSoftmax.hpvm.ll
@@ -0,0 +1,190 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_CUDNN.so -S -inplace -dfg2llvm-cudnn < %s | FileCheck %s
+; ModuleID = 'meanPoolWithSoftmax.ll'
+source_filename = "softmaxActivation.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9conv_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z13bias_add_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z12softmax_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z14pool_mean_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initTensorRt(i32 0)
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupTensorRt()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z9conv_nodePvmS_m_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.convolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1)
+; CHECK: call i8* @tensorConvolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1, i32 1, i32 0)
+; CHECK: ret
+; CHECK-LABEL: @_Z13bias_add_nodePvmS_m_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+; CHECK: call i8* @tensorAdd(i8* %t1, i8* %t2)
+; CHECK: ret
+; CHECK-LABEL: @_Z12softmax_nodePvm_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.softmax(i8* %t1)
+; CHECK: call i8* @tensorSoftmax(i8* %t1)
+; CHECK: ret
+; CHECK-LABEL: @_Z14pool_mean_nodePvm_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.pool.mean(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+; CHECK: call i8* @tensorPooling(i8* %t1, i32 1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !7
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9conv_nodePvmS_m @_Z9conv_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.convolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1)
+  %returnStruct = insertvalue %struct.out._Z9conv_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9conv_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z9conv_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13bias_add_nodePvmS_m @_Z13bias_add_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z13bias_add_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13bias_add_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z13bias_add_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.softmax(i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z12softmax_nodePvm @_Z12softmax_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.softmax(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z12softmax_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z12softmax_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z12softmax_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.mean(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z14pool_mean_nodePvm @_Z14pool_mean_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.mean(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  %returnStruct = insertvalue %struct.out._Z14pool_mean_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z14pool_mean_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z14pool_mean_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* nocapture readnone %conv2d_1_b, i64 %conv2d_1_b_bytes) #4 {
+  %_Z9conv_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z13bias_add_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 4, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 5, i32 3, i1 false)
+  %_Z12softmax_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z12softmax_nodePvm (i8*, i64)* @_Z12softmax_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z12softmax_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z12softmax_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  %_Z14pool_mean_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z14pool_mean_nodePvm (i8*, i64)* @_Z14pool_mean_nodePvm_cloned to i8*))
+  %output4 = call i8* @llvm.hpvm.createEdge(i8* %_Z12softmax_nodePvm_cloned.node, i8* %_Z14pool_mean_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output5 = call i8* @llvm.hpvm.createEdge(i8* %_Z12softmax_nodePvm_cloned.node, i8* %_Z14pool_mean_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z14pool_mean_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z14pool_mean_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_cudnn = !{!2, !3, !4, !5}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{!6}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_promise = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5c964d17b48694847d60e6755519cbfa0603770f)"}
+!2 = !{%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z12softmax_nodePvm (i8*, i64)* @_Z12softmax_nodePvm_cloned}
+!5 = !{%struct.out._Z14pool_mean_nodePvm (i8*, i64)* @_Z14pool_mean_nodePvm_cloned}
+!6 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTS6RootIn", !9, i64 0, !12, i64 8, !9, i64 16, !12, i64 24, !9, i64 32, !12, i64 40, !13, i64 48}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C++ TBAA"}
+!12 = !{!"long", !10, i64 0}
+!13 = !{!"_ZTS5ret_t", !9, i64 0, !12, i64 8}
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/singleConvLayer.hpvm.ll b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/singleConvLayer.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..91586784b04f12a5919f4dc48a6b786dc89481a0
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_CUDNN/singleConvLayer.hpvm.ll
@@ -0,0 +1,192 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_CUDNN.so -S -inplace -dfg2llvm-cudnn < %s | FileCheck %s
+; ModuleID = 'singleConvLayer.ll'
+source_filename = "singleConv.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9conv_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z13bias_add_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z9tanh_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+; Expected output of -dfg2llvm-cudnn. The negative assertions name the
+; @llvm.hpvm.tensor.* intrinsics that this file's input IR actually calls; the
+; source-level __hpvm__tensor_* spellings do not occur in this file's IR, so
+; matching on them could never fail. Each intrinsic is asserted absent both
+; before and after the replacement runtime call in the generated *_cudnn
+; function.
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initTensorRt(i32 0)
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupTensorRt()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z9conv_nodePvmS_m_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.convolution(
+; CHECK: call i8* @tensorConvolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1, i32 1, i32 0)
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.convolution(
+; CHECK: ret
+; CHECK-LABEL: @_Z13bias_add_nodePvmS_m_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.add(
+; CHECK: call i8* @tensorAdd(i8* %t1, i8* %t2)
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.add(
+; CHECK: ret
+; CHECK-LABEL: @_Z9tanh_nodePvm_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.tanh(
+; CHECK: call i8* @tensorTanh(i8* %t1)
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.tanh(
+; CHECK: ret
+; CHECK-LABEL: @_Z13pool_max_nodePvm_cloned_cudnn(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.pool.max(
+; CHECK: call i8* @tensorPooling(i8* %t1, i32 0, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.pool.max(
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+; Host entry point of the test input: initializes HPVM, launches the graph
+; rooted at @_Z4rootPvmS_mS_m_cloned with a 64-byte malloc'd buffer as the
+; launch argument, waits on the returned graph handle, requests the tensor
+; whose pointer is stored at the start of that buffer, then cleans up and
+; returns 0.
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  ; Reinterpret the buffer as i8** and load the pointer stored at offset 0.
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !7
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+; Convolution node body: calls the HPVM convolution intrinsic on %t1 and %t2
+; with trailing i32 operands 2, 2, 1, 1 and returns the result pointer in
+; field 0 of the output struct, with field 1 set to 0. %bytes_t1 and
+; %bytes_t2 are not read here.
+define dso_local %struct.out._Z9conv_nodePvmS_m @_Z9conv_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.convolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1)
+  %returnStruct = insertvalue %struct.out._Z9conv_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9conv_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z9conv_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+; Bias-add node body: calls the HPVM tensor add intrinsic on %t1 and %t2 and
+; returns the result pointer in field 0 of the output struct, with field 1
+; set to 0. %bytes_t1 and %bytes_t2 are not read here.
+define dso_local %struct.out._Z13bias_add_nodePvmS_m @_Z13bias_add_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z13bias_add_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13bias_add_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z13bias_add_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: nounwind uwtable
+; Tanh node body: calls the HPVM tanh intrinsic on %t1 and returns the result
+; pointer in field 0 of the output struct, with field 1 set to 0. %bytes_t1
+; is not read here.
+define dso_local %struct.out._Z9tanh_nodePvm @_Z9tanh_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9tanh_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9tanh_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9tanh_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+; Max-pool node body: calls the HPVM pool.max intrinsic on %t1 with trailing
+; i32 operands 2, 2, 0, 0, 2, 2 and returns the result pointer in field 0 of
+; the output struct, with field 1 set to 0. %bytes_t1 is not read here.
+define dso_local %struct.out._Z13pool_max_nodePvm @_Z13pool_max_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+; Graph root: creates one node each for conv, bias_add, tanh and pool_max and
+; chains them conv -> bias_add -> tanh -> pool_max with two edges per hop
+; (index pairs 0,0 and 1,1). Index pairs 0-3 are bound on the conv node, and
+; pairs (4,2) and (5,3) on the bias_add node. Outputs 0 and 1 of the pool_max
+; node are bound as outputs. The function itself returns an undef struct.
+; NOTE(review): the i32 pairs are presumably (parent index, child index) --
+; confirm against the HPVM intrinsic specification.
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* nocapture readnone %conv2d_1_b, i64 %conv2d_1_b_bytes) #4 {
+  %_Z9conv_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z13bias_add_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 4, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 5, i32 3, i1 false)
+  %_Z9tanh_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  %_Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned to i8*))
+  %output4 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output5 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_cudnn = !{!2, !3, !4, !5}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{!6}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_promise = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5c964d17b48694847d60e6755519cbfa0603770f)"}
+!2 = !{%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned}
+!5 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned}
+!6 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTS6RootIn", !9, i64 0, !12, i64 8, !9, i64 16, !12, i64 24, !9, i64 32, !12, i64 40, !13, i64 48}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C++ TBAA"}
+!12 = !{!"long", !10, i64 0}
+!13 = !{!"_ZTS5ret_t", !9, i64 0, !12, i64 8}
diff --git a/hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.atomic.ll b/hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.atomic.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.atomic.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.atomic.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.cond.const.ll b/hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.cond.const.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.cond.const.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.cond.const.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.cond.ll b/hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.cond.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.cond.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.cond.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.constmem.ll b/hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.constmem.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.constmem.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.constmem.ll
diff --git a/hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.ll b/hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.ll
similarity index 100%
rename from hpvm/test/regressionTests/DFG2LLVM_OPENCL/ThreeLevel.ll
rename to hpvm/test/hpvm_pass/DFG2LLVM_OPENCL/ThreeLevel.ll
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapAddTanh.ll b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapAddTanh.ll
new file mode 100644
index 0000000000000000000000000000000000000000..33c431a78d1fbae8404ff7f4b0c449165a950053
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapAddTanh.ll
@@ -0,0 +1,140 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_WrapperAPI.so -S -inplace -dfg2llvm-wrapperapi --configuration-inputs-filename=conf_file.txt < %s | FileCheck %s
+; ModuleID = 'addTanh.hpvm.ll'
+source_filename = "addTanh.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9tanh_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z11matadd_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_m = type <{ i8*, i64 }>
+; Expected output of -dfg2llvm-wrapperapi: @main gains the ApproxHPVM runtime
+; and runtime-controller init calls immediately before @llvm.hpvm.init(), and
+; the matching teardown calls ahead of @llvm.hpvm.cleanup(). The matadd and
+; tanh nodes each get a *_wrapper_api function that requests its tensor
+; inputs and then calls the corresponding wrapper_tensor* routine.
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initApproxhpvmRt(i32 0)
+; CHECK-NEXT: call void @llvm_hpvm_initializeRuntimeController(
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupApproxhpvmRt()
+; CHECK: call void @llvm_hpvm_clearRuntimeController()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z11matadd_nodePvmS_m_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_tensorAdd(
+; CHECK: ret
+; CHECK-LABEL: @_Z9tanh_nodePvm_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_tensorTanh(
+; CHECK: ret
+; Function Attrs: norecurse uwtable
+; Host entry point of the test input: initializes HPVM, launches the graph
+; rooted at @_Z4rootPvmS_m_cloned with a 48-byte malloc'd buffer as the
+; launch argument, waits on the returned graph handle, requests the tensor
+; whose pointer is stored at the start of that buffer, then cleans up and
+; returns 0.
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 48) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_m (i8*, i64, i8*, i64)* @_Z4rootPvmS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  ; Reinterpret the buffer as i8** and load the pointer stored at offset 0.
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !5
+  tail call void @hpvm_request_tensor(i8* %0, i32 1)
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: uwtable
+; Tanh node body: calls the HPVM tanh intrinsic on %t1 and returns the result
+; pointer in field 0 of the output struct, with field 1 set to 0. %bytes_t1
+; is not read here.
+define dso_local %struct.out._Z9tanh_nodePvm @_Z9tanh_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9tanh_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9tanh_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9tanh_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: uwtable
+; Matrix-add node body: calls the HPVM tensor add intrinsic on %t1 and %t2
+; and returns the result pointer in field 0 of the output struct, with field
+; 1 set to 0. %bytes_t1 and %bytes_t2 are not read here.
+define dso_local %struct.out._Z11matadd_nodePvmS_m @_Z11matadd_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z11matadd_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z11matadd_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z11matadd_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: uwtable
+; Graph root: creates the matadd and tanh nodes, binds index pairs 0-3 on the
+; matadd node, connects matadd -> tanh with two edges (index pairs 0,0 and
+; 1,1), and binds outputs 0 and 1 of the tanh node as outputs. The function
+; itself returns an undef struct.
+; NOTE(review): the i32 pairs are presumably (parent index, child index) --
+; confirm against the HPVM intrinsic specification.
+define dso_local %struct.out._Z4rootPvmS_m @_Z4rootPvmS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %m1, i64 %m1_bytes) #4 {
+  %_Z11matadd_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z11matadd_nodePvmS_m (i8*, i64, i8*, i64)* @_Z11matadd_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z11matadd_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matadd_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matadd_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matadd_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z9tanh_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z11matadd_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z11matadd_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9tanh_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9tanh_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5c2a920901bb51fcc2e51f920c0f726cbd6d3f0d)"}
+!2 = !{%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned}
+!3 = !{%struct.out._Z11matadd_nodePvmS_m (i8*, i64, i8*, i64)* @_Z11matadd_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z4rootPvmS_m (i8*, i64, i8*, i64)* @_Z4rootPvmS_m_cloned}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"_ZTS6RootIn", !7, i64 0, !10, i64 8, !7, i64 16, !10, i64 24, !11, i64 32}
+!7 = !{!"any pointer", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!"long", !8, i64 0}
+!11 = !{!"_ZTS5ret_t", !7, i64 0, !10, i64 8}
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapBatchNormRelu.ll b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapBatchNormRelu.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f93d325a7ff0107f01510f639a3f09eb840f917e
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapBatchNormRelu.ll
@@ -0,0 +1,151 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_WrapperAPI.so -S -inplace -dfg2llvm-wrapperapi --configuration-inputs-filename=conf_file.txt < %s | FileCheck %s
+; ModuleID = 'batchNorm.hpvm.ll'
+source_filename = "batchNorm.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9relu_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_mS_mS_m = type <{ i8*, i64 }>
+; Expected output of -dfg2llvm-wrapperapi: @main gains the ApproxHPVM runtime
+; and runtime-controller init calls immediately before @llvm.hpvm.init(), and
+; the matching teardown calls ahead of @llvm.hpvm.cleanup(). The batch-norm
+; node's *_wrapper_api function requests five tensors before calling
+; wrapper_tensorBatchNorm; the relu node's requests one before calling
+; wrapper_tensorRelu.
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initApproxhpvmRt(i32 0)
+; CHECK-NEXT: call void @llvm_hpvm_initializeRuntimeController(
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupApproxhpvmRt()
+; CHECK: call void @llvm_hpvm_clearRuntimeController()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_tensorBatchNorm(
+; CHECK: ret
+; CHECK-LABEL: @_Z9relu_nodePvm_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_tensorRelu(
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+; Host entry point of the test input: initializes HPVM, launches the graph
+; rooted at @_Z4rootPvmS_mS_mS_mS_m_cloned with a 96-byte malloc'd buffer as
+; the launch argument, waits on the returned graph handle, requests the
+; tensor whose pointer is stored at the start of that buffer, then cleans up
+; and returns 0.
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 96) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_mS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  ; Reinterpret the buffer as i8** and load the pointer stored at offset 0.
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !5
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.relu(i8*) #3
+; Function Attrs: nounwind uwtable
+; ReLU node body: calls the HPVM relu intrinsic on %t1 and returns the result
+; pointer in field 0 of the output struct, with field 1 set to 0. %bytes_t1
+; is not read here.
+define dso_local %struct.out._Z9relu_nodePvm @_Z9relu_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.relu(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9relu_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9relu_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9relu_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.batchnorm(i8*, i8*, i8*, i8*, i8*, double) #3
+; Function Attrs: nounwind uwtable
+; Batch-norm node body: calls the HPVM batchnorm intrinsic on %t1..%t5 with a
+; trailing double operand of 1.0e-03 and returns the result pointer in field
+; 0 of the output struct, with field 1 set to 0. The %bytes_t* arguments are
+; not read here.
+define dso_local %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2, i8* in %t3, i64 %bytes_t3, i8* in %t4, i64 %bytes_t4, i8* in %t5, i64 %bytes_t5) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.batchnorm(i8* %t1, i8* %t2, i8* %t3, i8* %t4, i8* %t5, double 1.000000e-03)
+  %returnStruct = insertvalue %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+; Graph root: creates the batch-norm and relu nodes, binds index pairs 0-9 on
+; the batch-norm node, connects batch-norm -> relu with two edges (index
+; pairs 0,0 and 1,1), and binds outputs 0 and 1 of the relu node as outputs.
+; The function itself returns an undef struct.
+; NOTE(review): the i32 pairs are presumably (parent index, child index) --
+; confirm against the HPVM intrinsic specification.
+define dso_local %struct.out._Z4rootPvmS_mS_mS_mS_m @_Z4rootPvmS_mS_mS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %batch_normalization_1_gamma, i64 %batch_normalization_1_gamma_bytes, i8* in %batch_normalization_1_beta, i64 %batch_normalization_1_beta_bytes, i8* in %batch_normalization_1_mean, i64 %batch_normalization_1_mean_bytes, i8* in %batch_normalization_1_variance, i64 %batch_normalization_1_variance_bytes) #4 {
+  %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 3, i32 3, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 4, i32 4, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 5, i32 5, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 6, i32 6, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 7, i32 7, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 8, i32 8, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i32 9, i32 9, i1 false)
+  %_Z9relu_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_mS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5c964d17b48694847d60e6755519cbfa0603770f)"}
+!2 = !{%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned}
+!3 = !{%struct.out._Z19batchNormLayer_nodePvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z19batchNormLayer_nodePvmS_mS_mS_mS_m_cloned}
+!4 = !{%struct.out._Z4rootPvmS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_mS_mS_m_cloned}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"_ZTS6RootIn", !7, i64 0, !10, i64 8, !7, i64 16, !10, i64 24, !7, i64 32, !10, i64 40, !7, i64 48, !10, i64 56, !7, i64 64, !10, i64 72, !11, i64 80}
+!7 = !{!"any pointer", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!"long", !8, i64 0}
+!11 = !{!"_ZTS5ret_t", !7, i64 0, !10, i64 8}
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapConv.ll b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapConv.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3a0ef959449b8057a7d3440b5da3fc2a671905ad
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapConv.ll
@@ -0,0 +1,136 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_WrapperAPI.so -S -inplace -dfg2llvm-wrapperapi --configuration-inputs-filename=conf_file.txt < %s | FileCheck %s
+; ModuleID = 'fuseConv.hpvm_.ll'
+source_filename = "fuseConv.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initApproxhpvmRt(i32 0)
+; CHECK-NEXT: call void @llvm_hpvm_initializeRuntimeController(
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupApproxhpvmRt()
+; CHECK: call void @llvm_hpvm_clearRuntimeController()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_ConvLayer2(
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+; Host-side driver for this test. It brackets one launch of the dataflow graph
+; rooted at @_Z4rootPvmS_mS_m_cloned with llvm.hpvm.init / llvm.hpvm.cleanup.
+; The FileCheck expectations at the top of this file require the pass to add
+; runtime init/cleanup calls around these two intrinsics.
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  ; 64-byte buffer passed to the launch intrinsic below.
+  ; NOTE(review): presumably the packed RootIn argument struct described by the
+  ; TBAA metadata of this module (48 bytes of inputs + 16-byte ret_t) - confirm.
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  ; Reinterpret the buffer as i8** and read the pointer stored at offset 0.
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !4
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+; Root node of the dataflow graph. It creates a single child node for the fused
+; conv/bias_add/tanh/pool_max function and wires all six of its own arguments
+; (three pointer/size pairs) to that child, then exposes the child's two outputs.
+; NOTE(review): the operands of bind.input / bind.output look like
+; (node, source index, destination index, flag) - confirm against the HPVM
+; intrinsic definitions.
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* in %conv2d_1_b, i64 %conv2d_1_b_bytes) #4 {
+  %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64, i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned to i8*))
+  ; Indices 0..5 are bound one-to-one (same index on both sides).
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 3, i32 3, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 4, i32 4, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 5, i32 5, i1 false)
+  ; Both outputs of the child (indices 0 and 1) are bound to the same indices here.
+  call void @llvm.hpvm.bind.output(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ; The returned struct is undef; this body only issues graph-construction intrinsics.
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+; Fused leaf node: chains convolution -> add -> tanh -> pool.max on its tensor
+; arguments, each intrinsic consuming the previous result. The FileCheck
+; expectations at the top of this file require the pass to emit a "_wrapper_api"
+; variant of this function that calls @wrapper_ConvLayer2.
+; The i64 *_bytes parameters are not read in this body.
+define %struct.out._Z13pool_max_nodePvm @_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned(i8* in %s_s_s_t1, i64 %s_s_s_bytes_t1, i8* in %s_s_s_t2, i64 %s_s_s_bytes_t2, i8* in %s_s_d_t2, i64 %s_s_d_bytes_t2) {
+  ; NOTE(review): the four i32 operands are presumably padding/stride settings - confirm.
+  %s_s_s_call1 = call i8* @llvm.hpvm.tensor.convolution(i8* %s_s_s_t1, i8* %s_s_s_t2, i32 2, i32 2, i32 1, i32 1)
+  %s_s_call1 = call i8* @llvm.hpvm.tensor.add(i8* %s_s_s_call1, i8* %s_s_d_t2)
+  %s_call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %s_s_call1)
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %s_call1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  ; Pack the result as { pooled tensor pointer, 0 } in the node's output struct.
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5ccb2a532b5a0d82cee5c0d29a629a29dec2307c)"}
+!2 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!3 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64, i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned}
+!4 = !{!5, !6, i64 0}
+!5 = !{!"_ZTS6RootIn", !6, i64 0, !9, i64 8, !6, i64 16, !9, i64 24, !6, i64 32, !9, i64 40, !10, i64 48}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"}
+!9 = !{!"long", !7, i64 0}
+!10 = !{!"_ZTS5ret_t", !6, i64 0, !9, i64 8}
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapGroupPoolSoftmax.ll b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapGroupPoolSoftmax.ll
new file mode 100644
index 0000000000000000000000000000000000000000..25344f98f56bd7954952094028a2f30bbdb62bb3
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapGroupPoolSoftmax.ll
@@ -0,0 +1,164 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_WrapperAPI.so -S -inplace -dfg2llvm-wrapperapi --configuration-inputs-filename=conf_file.txt < %s | FileCheck %s
+; ModuleID = 'groupPoolSoftmax.hpvm.ll'
+source_filename = "groupPoolSoftmax.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z14groupConv_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z12softmax_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initApproxhpvmRt(i32 0)
+; CHECK-NEXT: call void @llvm_hpvm_initializeRuntimeController(
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupApproxhpvmRt()
+; CHECK: call void @llvm_hpvm_clearRuntimeController()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z14groupConv_nodePvmS_m_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_tensorGroupConvolution(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @2, i32 0, i32 0), i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 32)
+; CHECK: ret
+; CHECK-LABEL: @_Z13pool_max_nodePvm_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_tensorPooling(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @3, i32 0, i32 0), i8* %t1, i32 0, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+; CHECK: ret
+; CHECK-LABEL: @_Z12softmax_nodePvm_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_tensorSoftmax(
+; CHECK: ret
+; Function Attrs: norecurse uwtable
+; Host-side driver for this test. It brackets one launch of the dataflow graph
+; rooted at @_Z4rootPvmS_m_cloned with llvm.hpvm.init / llvm.hpvm.cleanup.
+; The FileCheck expectations at the top of this file require the pass to add
+; runtime init/cleanup calls around these two intrinsics.
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  ; 48-byte buffer passed to the launch intrinsic below.
+  ; NOTE(review): presumably the packed RootIn argument struct described by the
+  ; TBAA metadata of this module (32 bytes of inputs + 16-byte ret_t) - confirm.
+  %call = tail call noalias i8* @malloc(i64 48) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_m (i8*, i64, i8*, i64)* @_Z4rootPvmS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  ; Reinterpret the buffer as i8** and read the pointer stored at offset 0.
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !6
+  tail call void @hpvm_request_tensor(i8* %0, i32 1)
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.group.convolution(i8*, i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: uwtable
+; Leaf node wrapping a single group-convolution intrinsic on %t1 and %t2.
+; The FileCheck expectations at the top of this file match a
+; @wrapper_tensorGroupConvolution call carrying %t1, %t2 and the same six i32
+; operands (1, 1, 1, 1, 1, 32), so names and constants here must stay as they are.
+; The i64 bytes_* parameters are not read in this body.
+define dso_local %struct.out._Z14groupConv_nodePvmS_m @_Z14groupConv_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.group.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1, i32 1, i32 32)
+  ; Pack the result as { tensor pointer, 0 } in the node's output struct.
+  %returnStruct = insertvalue %struct.out._Z14groupConv_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z14groupConv_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z14groupConv_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.softmax(i8*) #3
+; Function Attrs: uwtable
+; Leaf node wrapping a single softmax intrinsic on %t1. The FileCheck
+; expectations at the top of this file require a "_wrapper_api" variant of this
+; function that calls @wrapper_tensorSoftmax.
+; The i64 %bytes_t1 parameter is not read in this body.
+define dso_local %struct.out._Z12softmax_nodePvm @_Z12softmax_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.softmax(i8* %t1)
+  ; Pack the result as { tensor pointer, 0 } in the node's output struct.
+  %returnStruct = insertvalue %struct.out._Z12softmax_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z12softmax_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z12softmax_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: uwtable
+; Leaf node wrapping a single max-pooling intrinsic on %t1. The FileCheck
+; expectations at the top of this file match a @wrapper_tensorPooling call that
+; carries %t1 and the same six i32 operands (2, 2, 0, 0, 2, 2) after an extra
+; leading i32 0, so names and constants here must stay as they are.
+; The i64 %bytes_t1 parameter is not read in this body.
+define dso_local %struct.out._Z13pool_max_nodePvm @_Z13pool_max_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  ; Pack the result as { tensor pointer, 0 } in the node's output struct.
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: uwtable
+; Root node of the dataflow graph. It builds a three-node chain
+; groupConv -> pool_max -> softmax: its four arguments feed the groupConv node,
+; each stage's two outputs are connected to the next stage's two inputs, and the
+; softmax node's outputs are exposed as the graph outputs.
+; NOTE(review): the operands of bind.* look like (node, source index,
+; destination index, flag) and those of createEdge like (source node,
+; destination node, flag, source index, destination index, flag) - confirm
+; against the HPVM intrinsic definitions.
+define dso_local %struct.out._Z4rootPvmS_m @_Z4rootPvmS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %gconv, i64 %gconv_bytes) #4 {
+  ; Stage 1: group convolution node, inputs 0..3 bound one-to-one.
+  %_Z14groupConv_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z14groupConv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z14groupConv_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z14groupConv_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z14groupConv_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z14groupConv_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z14groupConv_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  ; Stage 2: pooling node, fed by both outputs of the group convolution node.
+  ; The edge handles (%output, %output1, ...) are not used afterwards.
+  %_Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z14groupConv_nodePvmS_m_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z14groupConv_nodePvmS_m_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  ; Stage 3: softmax node, fed by both outputs of the pooling node.
+  %_Z12softmax_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z12softmax_nodePvm (i8*, i64)* @_Z12softmax_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13pool_max_nodePvm_cloned.node, i8* %_Z12softmax_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13pool_max_nodePvm_cloned.node, i8* %_Z12softmax_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z12softmax_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z12softmax_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ; The returned struct is undef; this body only issues graph-construction intrinsics.
+  ret %struct.out._Z4rootPvmS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4, !5}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5c2a920901bb51fcc2e51f920c0f726cbd6d3f0d)"}
+!2 = !{%struct.out._Z14groupConv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z14groupConv_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z12softmax_nodePvm (i8*, i64)* @_Z12softmax_nodePvm_cloned}
+!4 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned}
+!5 = !{%struct.out._Z4rootPvmS_m (i8*, i64, i8*, i64)* @_Z4rootPvmS_m_cloned}
+!6 = !{!7, !8, i64 0}
+!7 = !{!"_ZTS6RootIn", !8, i64 0, !11, i64 8, !8, i64 16, !11, i64 24, !12, i64 32}
+!8 = !{!"any pointer", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!11 = !{!"long", !9, i64 0}
+!12 = !{!"_ZTS5ret_t", !8, i64 0, !11, i64 8}
diff --git a/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapMatMul.ll b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapMatMul.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c1546533d844bc71003a8ff7148e6c09a932deaf
--- /dev/null
+++ b/hpvm/test/hpvm_pass/DFG2LLVM_WrapperAPI/wrapMatMul.ll
@@ -0,0 +1,153 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMDFG2LLVM_WrapperAPI.so -S -inplace -dfg2llvm-wrapperapi --configuration-inputs-filename=conf_file.txt < %s | FileCheck %s
+; ModuleID = 'fuseMatMul.hpvm.ll'
+source_filename = "fuseMatMul.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+%struct.out._Z9tanh_nodePvm = type <{ i8*, i64 }>
+; CHECK-LABEL: i32 @main(
+; CHECK: call void @llvm_hpvm_initApproxhpvmRt(i32 0)
+; CHECK-NEXT: call void @llvm_hpvm_initializeRuntimeController(
+; CHECK-NEXT: call void @llvm.hpvm.init()
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @llvm_hpvm_cleanupApproxhpvmRt()
+; CHECK: call void @llvm_hpvm_clearRuntimeController()
+; CHECK-NEXT: call void @llvm.hpvm.cleanup()
+; CHECK-LABEL: @_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call void @hpvm_request_tensor(
+; CHECK: call i8* @wrapper_FCLayer(
+; CHECK: ret
+; CHECK-LABEL: @_Z13pool_max_nodePvm_cloned_wrapper_api(
+; CHECK: call void @hpvm_request_tensor(i8* %t1, i32 1)
+; CHECK: call i8* @wrapper_tensorPooling(
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+; Host-side driver for this test. It brackets one launch of the dataflow graph
+; rooted at @_Z4rootPvmS_mS_m_cloned with llvm.hpvm.init / llvm.hpvm.cleanup.
+; The FileCheck expectations at the top of this file require the pass to add
+; runtime init/cleanup calls around these two intrinsics.
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  ; 64-byte buffer passed to the launch intrinsic below.
+  ; NOTE(review): presumably the packed RootIn argument struct described by the
+  ; TBAA metadata of this module (48 bytes of inputs + 16-byte ret_t) - confirm.
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  ; Reinterpret the buffer as i8** and read the pointer stored at offset 0.
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !5
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.mul(i8*, i8*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+; Leaf node wrapping a single max-pooling intrinsic on %t1. The FileCheck
+; expectations at the top of this file match a request for %t1 by name followed
+; by a @wrapper_tensorPooling call in the "_wrapper_api" variant, so the
+; parameter name must stay as it is.
+; The i64 %bytes_t1 parameter is not read in this body.
+define dso_local %struct.out._Z13pool_max_nodePvm @_Z13pool_max_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  ; Pack the result as { tensor pointer, 0 } in the node's output struct.
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+; Root node of the dataflow graph. It builds a two-node chain: the fused
+; matmul/bias_add/tanh node receives all six root arguments, its two outputs
+; feed the pool_max node, and the pool_max node's outputs are exposed as the
+; graph outputs.
+; NOTE(review): the operands of bind.* look like (node, source index,
+; destination index, flag) and those of createEdge like (source node,
+; destination node, flag, source index, destination index, flag) - confirm
+; against the HPVM intrinsic definitions.
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %matmul2d_1_w, i64 %matmul2d_1_w_bytes, i8* in %matmul2d_1_b, i64 %matmul2d_1_b_bytes) #4 {
+  ; Stage 1: fused node, inputs 0..5 bound one-to-one.
+  %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9tanh_nodePvm (i8*, i64, i8*, i64, i8*, i64)* @_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i32 3, i32 3, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i32 4, i32 4, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i32 5, i32 5, i1 false)
+  ; Stage 2: pooling node, fed by both outputs of the fused node.
+  ; The edge handles (%output4, %output5) are not used afterwards.
+  %_Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned to i8*))
+  %output4 = call i8* @llvm.hpvm.createEdge(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output5 = call i8* @llvm.hpvm.createEdge(i8* %_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ; The returned struct is undef; this body only issues graph-construction intrinsics.
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+; Fused leaf node: chains mul -> add -> tanh on its tensor arguments, each
+; intrinsic consuming the previous result. The FileCheck expectations at the
+; top of this file require the pass to emit a "_wrapper_api" variant of this
+; function that calls @wrapper_FCLayer.
+; The i64 *_bytes parameters are not read in this body.
+define %struct.out._Z9tanh_nodePvm @_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned(i8* in %s_s_t1, i64 %s_s_bytes_t1, i8* in %s_s_t2, i64 %s_s_bytes_t2, i8* in %s_d_t2, i64 %s_d_bytes_t2) {
+  %s_s_call1 = call i8* @llvm.hpvm.tensor.mul(i8* %s_s_t1, i8* %s_s_t2)
+  %s_call1 = call i8* @llvm.hpvm.tensor.add(i8* %s_s_call1, i8* %s_d_t2)
+  %call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %s_call1)
+  ; Pack the result as { tensor pointer, 0 } in the node's output struct.
+  %returnStruct = insertvalue %struct.out._Z9tanh_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9tanh_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9tanh_nodePvm %returnStruct2
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5ccb2a532b5a0d82cee5c0d29a629a29dec2307c)"}
+!2 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned}
+!3 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!4 = !{%struct.out._Z9tanh_nodePvm (i8*, i64, i8*, i64, i8*, i64)* @_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned}
+!5 = !{!6, !7, i64 0}
+!6 = !{!"_ZTS6RootIn", !7, i64 0, !10, i64 8, !7, i64 16, !10, i64 24, !7, i64 32, !10, i64 40, !11, i64 48}
+!7 = !{!"any pointer", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!"long", !8, i64 0}
+!11 = !{!"_ZTS5ret_t", !7, i64 0, !10, i64 8}
diff --git a/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseConv.hpvm.ll b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseConv.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4c60c749ff808d4d04f599be39e4992bbc7c8db1
--- /dev/null
+++ b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseConv.hpvm.ll
@@ -0,0 +1,177 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMFuseHPVMTensorNodes.so -S -inplace -hpvm-fuse < %s | FileCheck %s
+; ModuleID = 'fuseConv.ll'
+source_filename = "fuseConv.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9conv_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z13bias_add_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z9tanh_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: @_Z4rootPvmS_mS_m_cloned(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: ret
+; CHECK-LABEL: @_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.convolution(
+; CHECK-NEXT: call i8* @llvm.hpvm.tensor.add(
+; CHECK-NEXT: call i8* @llvm.hpvm.tensor.tanh(
+; CHECK-NEXT: call i8* @llvm.hpvm.tensor.pool.max(
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !7
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9conv_nodePvmS_m @_Z9conv_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.convolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1)
+  %returnStruct = insertvalue %struct.out._Z9conv_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9conv_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z9conv_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13bias_add_nodePvmS_m @_Z13bias_add_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z13bias_add_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13bias_add_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z13bias_add_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9tanh_nodePvm @_Z9tanh_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9tanh_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9tanh_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9tanh_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13pool_max_nodePvm @_Z13pool_max_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* in %conv2d_1_b, i64 %conv2d_1_b_bytes) #4 {
+  %_Z9conv_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z13bias_add_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 4, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 5, i32 3, i1 false)
+  %_Z9tanh_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  %_Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned to i8*))
+  %output4 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output5 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4, !5}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{!6}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5ccb2a532b5a0d82cee5c0d29a629a29dec2307c)"}
+!2 = !{%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned}
+!5 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned}
+!6 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTS6RootIn", !9, i64 0, !12, i64 8, !9, i64 16, !12, i64 24, !9, i64 32, !12, i64 40, !13, i64 48}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C++ TBAA"}
+!12 = !{!"long", !10, i64 0}
+!13 = !{!"_ZTS5ret_t", !9, i64 0, !12, i64 8}
diff --git a/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseConvNoPool.hpvm.ll b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseConvNoPool.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3e521bb5297f6d8087fec1b2e381c06d290f2d0f
--- /dev/null
+++ b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseConvNoPool.hpvm.ll
@@ -0,0 +1,159 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMFuseHPVMTensorNodes.so -S -inplace -hpvm-fuse < %s | FileCheck %s
+; ModuleID = 'fuseConvNoPool.ll'
+source_filename = "fuseConvNoPool.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; CHECK-LABEL: @_Z4rootPvmS_mS_m_cloned(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: ret
+; CHECK-LABEL: @_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9relu_nodePvm_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.convolution(
+; CHECK: call i8* @llvm.hpvm.tensor.add(
+; CHECK: call i8* @llvm.hpvm.tensor.relu(
+; CHECK: ret
+%struct.out._Z9conv_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z13bias_add_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z9relu_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !6
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9conv_nodePvmS_m @_Z9conv_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.convolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1)
+  %returnStruct = insertvalue %struct.out._Z9conv_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9conv_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z9conv_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13bias_add_nodePvmS_m @_Z13bias_add_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z13bias_add_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13bias_add_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z13bias_add_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.relu(i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9relu_nodePvm @_Z9relu_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.relu(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9relu_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9relu_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9relu_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* nocapture readnone %conv2d_1_b, i64 %conv2d_1_b_bytes) #4 {
+  %_Z9conv_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z13bias_add_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 4, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 5, i32 3, i1 false)
+  %_Z9relu_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9relu_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z9relu_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4, !5}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5ccb2a532b5a0d82cee5c0d29a629a29dec2307c)"}
+!2 = !{%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z9relu_nodePvm (i8*, i64)* @_Z9relu_nodePvm_cloned}
+!5 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!6 = !{!7, !8, i64 0}
+!7 = !{!"_ZTS6RootIn", !8, i64 0, !11, i64 8, !8, i64 16, !11, i64 24, !8, i64 32, !11, i64 40, !12, i64 48}
+!8 = !{!"any pointer", !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
+!11 = !{!"long", !9, i64 0}
+!12 = !{!"_ZTS5ret_t", !8, i64 0, !11, i64 8}
diff --git a/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseDiffTargets.hpvm.ll b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseDiffTargets.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5bdfe023bb6854e85d7f725a32c3920c5edc01d8
--- /dev/null
+++ b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseDiffTargets.hpvm.ll
@@ -0,0 +1,195 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMFuseHPVMTensorNodes.so -S -inplace -hpvm-fuse < %s | FileCheck %s
+; ModuleID = 'fuseDiffTargets.ll'
+source_filename = "fuseDiffTargets.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9conv_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z13bias_add_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z9tanh_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: @_Z9conv_nodePvmS_m_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.convolution(
+; CHECK: ret
+; CHECK-LABEL: @_Z13bias_add_nodePvmS_m_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.add(
+; CHECK: ret
+; CHECK-LABEL: @_Z9tanh_nodePvm_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.tanh(
+; CHECK: ret
+; CHECK-LABEL: @_Z13pool_max_nodePvm_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.pool.max(
+; CHECK: ret
+; CHECK-LABEL: @_Z4rootPvmS_mS_m_cloned(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !7
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9conv_nodePvmS_m @_Z9conv_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.convolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1)
+  %returnStruct = insertvalue %struct.out._Z9conv_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9conv_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z9conv_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13bias_add_nodePvmS_m @_Z13bias_add_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z13bias_add_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13bias_add_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z13bias_add_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9tanh_nodePvm @_Z9tanh_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9tanh_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9tanh_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9tanh_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13pool_max_nodePvm @_Z13pool_max_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* in %conv2d_1_b, i64 %conv2d_1_b_bytes) #4 {
+  %_Z9conv_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z13bias_add_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 4, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 5, i32 3, i1 false)
+  %_Z9tanh_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  %_Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned to i8*))
+  %output4 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output5 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_gpu = !{!2, !3, !4}
+!hpvm_hint_cpu = !{!5, !6}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!hpvm_hint_promise = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5ccb2a532b5a0d82cee5c0d29a629a29dec2307c)"}
+!2 = !{%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned}
+!4 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned}
+!5 = !{%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned}
+!6 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTS6RootIn", !9, i64 0, !12, i64 8, !9, i64 16, !12, i64 24, !9, i64 32, !12, i64 40, !13, i64 48}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C++ TBAA"}
+!12 = !{!"long", !10, i64 0}
+!13 = !{!"_ZTS5ret_t", !9, i64 0, !12, i64 8}
diff --git a/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseMatMul.hpvm.ll b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseMatMul.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e888631a3027a4cb30468ffca223e3429caa772f
--- /dev/null
+++ b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/fuseMatMul.hpvm.ll
@@ -0,0 +1,187 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMFuseHPVMTensorNodes.so -S -inplace -hpvm-fuse < %s | FileCheck %s
+; ModuleID = 'fuseMatMul.ll'
+source_filename = "fuseMatMul.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z11matmul_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z13bias_add_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z9tanh_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: @_Z13pool_max_nodePvm_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.pool.max(
+; CHECK: ret
+; CHECK-LABEL: @_Z4rootPvmS_mS_m_cloned(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK-NEXT: call i8* @llvm.hpvm.createEdge(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: ret
+; CHECK-LABEL: @_Z11matmul_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.mul(
+; CHECK: call i8* @llvm.hpvm.tensor.add(
+; CHECK: call i8* @llvm.hpvm.tensor.tanh(
+; CHECK-NOT: call i8* @llvm.hpvm.tensor.pool.max(
+; CHECK: ret
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !7
+  tail call void @hpvm_request_tensor(i8* %0, i32 1) #3
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.mul(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z11matmul_nodePvmS_m @_Z11matmul_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.mul(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z11matmul_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z11matmul_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z11matmul_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13bias_add_nodePvmS_m @_Z13bias_add_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z13bias_add_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13bias_add_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z13bias_add_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z9tanh_nodePvm @_Z9tanh_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9tanh_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9tanh_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9tanh_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z13pool_max_nodePvm @_Z13pool_max_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind uwtable
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %matmul2d_1_w, i64 %matmul2d_1_w_bytes, i8* in %matmul2d_1_b, i64 %matmul2d_1_b_bytes) #4 {
+  %_Z11matmul_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z11matmul_nodePvmS_m (i8*, i64, i8*, i64)* @_Z11matmul_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z11matmul_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z13bias_add_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z11matmul_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z11matmul_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 4, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 5, i32 3, i1 false)
+  %_Z9tanh_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  %_Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned to i8*))
+  %output4 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output5 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4, !5, !6}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 5ccb2a532b5a0d82cee5c0d29a629a29dec2307c)"}
+!2 = !{%struct.out._Z11matmul_nodePvmS_m (i8*, i64, i8*, i64)* @_Z11matmul_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned}
+!5 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned}
+!6 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTS6RootIn", !9, i64 0, !12, i64 8, !9, i64 16, !12, i64 24, !9, i64 32, !12, i64 40, !13, i64 48}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C++ TBAA"}
+!12 = !{!"long", !10, i64 0}
+!13 = !{!"_ZTS5ret_t", !9, i64 0, !12, i64 8}
diff --git a/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/multiLaunch.hpvm.ll b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/multiLaunch.hpvm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6c40d752cc71b5bec7931257e5e431041073668e
--- /dev/null
+++ b/hpvm/test/hpvm_pass/FuseHPVMTensorNodes/multiLaunch.hpvm.ll
@@ -0,0 +1,184 @@
+; RUN: opt -load LLVMBuildDFG.so -load LLVMInPlaceDFGAnalysis.so -load LLVMFuseHPVMTensorNodes.so -S -inplace -hpvm-fuse < %s | FileCheck %s
+; ModuleID = 'multiLaunch.ll'
+source_filename = "multiLaunch.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+%struct.out._Z9conv_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z13bias_add_nodePvmS_m = type <{ i8*, i64 }>
+%struct.out._Z9tanh_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z13pool_max_nodePvm = type <{ i8*, i64 }>
+%struct.out._Z4rootPvmS_mS_m = type <{ i8*, i64 }>
+; CHECK-LABEL: @_Z4rootPvmS_mS_m_cloned(
+; CHECK: call i8* @llvm.hpvm.createNode(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK-NEXT: call void @llvm.hpvm.bind.input(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: call void @llvm.hpvm.bind.output(
+; CHECK: ret
+; CHECK-LABEL: @_Z9conv_nodePvmS_m_cloned__Z13bias_add_nodePvmS_m_cloned__Z9tanh_nodePvm_cloned__Z13pool_max_nodePvm_cloned(
+; CHECK: call i8* @llvm.hpvm.tensor.convolution(
+; CHECK-NEXT: call i8* @llvm.hpvm.tensor.add(
+; CHECK-NEXT: call i8* @llvm.hpvm.tensor.tanh(
+; CHECK-NEXT: call i8* @llvm.hpvm.tensor.pool.max(
+; CHECK: ret
+; Function Attrs: norecurse uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+  call void @llvm.hpvm.init()
+  %call = tail call noalias i8* @malloc(i64 64) #3
+  %call1 = tail call noalias i8* @malloc(i64 64) #3
+  %graphID = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID)
+  %input = bitcast i8* %call to i8**
+  %0 = load i8*, i8** %input, align 1, !tbaa !7
+  tail call void @hpvm_request_tensor(i8* %0, i32 1)
+  %graphID1 = call i8* @llvm.hpvm.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned to i8*), i8* %call1, i1 false)
+  call void @llvm.hpvm.wait(i8* %graphID1)
+  %input4 = bitcast i8* %call1 to i8**
+  %1 = load i8*, i8** %input4, align 1, !tbaa !7
+  tail call void @hpvm_request_tensor(i8* %1, i32 1)
+  call void @llvm.hpvm.cleanup()
+  ret i32 0
+; Function Attrs: nofree nounwind
+declare dso_local noalias i8* @malloc(i64) local_unnamed_addr #1
+declare dso_local void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #3
+; Function Attrs: uwtable
+define dso_local %struct.out._Z9conv_nodePvmS_m @_Z9conv_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.convolution(i8* %t1, i8* %t2, i32 2, i32 2, i32 1, i32 1)
+  %returnStruct = insertvalue %struct.out._Z9conv_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9conv_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z9conv_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.add(i8*, i8*) #3
+; Function Attrs: uwtable
+define dso_local %struct.out._Z13bias_add_nodePvmS_m @_Z13bias_add_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.add(i8* %t1, i8* %t2)
+  %returnStruct = insertvalue %struct.out._Z13bias_add_nodePvmS_m undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13bias_add_nodePvmS_m %returnStruct, i64 0, 1
+  ret %struct.out._Z13bias_add_nodePvmS_m %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.tanh(i8*) #3
+; Function Attrs: uwtable
+define dso_local %struct.out._Z9tanh_nodePvm @_Z9tanh_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.tanh(i8* %t1)
+  %returnStruct = insertvalue %struct.out._Z9tanh_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z9tanh_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z9tanh_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #3
+; Function Attrs: uwtable
+define dso_local %struct.out._Z13pool_max_nodePvm @_Z13pool_max_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #4 {
+  %call1 = call i8* @llvm.hpvm.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2)
+  %returnStruct = insertvalue %struct.out._Z13pool_max_nodePvm undef, i8* %call1, 0
+  %returnStruct2 = insertvalue %struct.out._Z13pool_max_nodePvm %returnStruct, i64 0, 1
+  ret %struct.out._Z13pool_max_nodePvm %returnStruct2
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createNode(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.input(i8*, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.createEdge(i8*, i8*, i1, i32, i32, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.bind.output(i8*, i32, i32, i1) #3
+; Function Attrs: uwtable
+define dso_local %struct.out._Z4rootPvmS_mS_m @_Z4rootPvmS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* in %conv2d_1_b, i64 %conv2d_1_b_bytes) #4 {
+  %_Z9conv_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned to i8*))
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 2, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z9conv_nodePvmS_m_cloned.node, i32 3, i32 3, i1 false)
+  %_Z13bias_add_nodePvmS_m_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned to i8*))
+  %output = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output1 = call i8* @llvm.hpvm.createEdge(i8* %_Z9conv_nodePvmS_m_cloned.node, i8* %_Z13bias_add_nodePvmS_m_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 4, i32 2, i1 false)
+  call void @llvm.hpvm.bind.input(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i32 5, i32 3, i1 false)
+  %_Z9tanh_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned to i8*))
+  %output2 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output3 = call i8* @llvm.hpvm.createEdge(i8* %_Z13bias_add_nodePvmS_m_cloned.node, i8* %_Z9tanh_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  %_Z13pool_max_nodePvm_cloned.node = call i8* @llvm.hpvm.createNode(i8* bitcast (%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned to i8*))
+  %output4 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 0, i32 0, i1 false)
+  %output5 = call i8* @llvm.hpvm.createEdge(i8* %_Z9tanh_nodePvm_cloned.node, i8* %_Z13pool_max_nodePvm_cloned.node, i1 true, i32 1, i32 1, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 0, i32 0, i1 false)
+  call void @llvm.hpvm.bind.output(i8* %_Z13pool_max_nodePvm_cloned.node, i32 1, i32 1, i1 false)
+  ret %struct.out._Z4rootPvmS_mS_m undef
+; Function Attrs: nounwind
+declare void @llvm.hpvm.init() #3
+; Function Attrs: nounwind
+declare i8* @llvm.hpvm.launch(i8*, i8*, i1) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.wait(i8*) #3
+; Function Attrs: nounwind
+declare void @llvm.hpvm.cleanup() #3
+attributes #0 = { norecurse uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nofree nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+!hpvm_hint_promise = !{!2, !3, !4, !5, !6}
+!hpvm_hint_gpu = !{}
+!hpvm_hint_cpu = !{}
+!hpvm_hint_cpu_gpu = !{}
+!hpvm_hint_cudnn = !{}
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 9.0.0 (https://gitlab.engr.illinois.edu/llvm/hpvm.git 0bba65ccd19f0ff92a84eaadbc0450fa7a0f0ccc)"}
+!2 = !{%struct.out._Z9conv_nodePvmS_m (i8*, i64, i8*, i64)* @_Z9conv_nodePvmS_m_cloned}
+!3 = !{%struct.out._Z13bias_add_nodePvmS_m (i8*, i64, i8*, i64)* @_Z13bias_add_nodePvmS_m_cloned}
+!4 = !{%struct.out._Z9tanh_nodePvm (i8*, i64)* @_Z9tanh_nodePvm_cloned}
+!5 = !{%struct.out._Z13pool_max_nodePvm (i8*, i64)* @_Z13pool_max_nodePvm_cloned}
+!6 = !{%struct.out._Z4rootPvmS_mS_m (i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_m_cloned}
+!7 = !{!8, !9, i64 0}
+!8 = !{!"_ZTS6RootIn", !9, i64 0, !12, i64 8, !9, i64 16, !12, i64 24, !9, i64 32, !12, i64 40, !13, i64 48}
+!9 = !{!"any pointer", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C++ TBAA"}
+!12 = !{!"long", !10, i64 0}
+!13 = !{!"_ZTS5ret_t", !9, i64 0, !12, i64 8}
diff --git a/hpvm/test/regressionTests/GenHPVM/AllocationNode.ll b/hpvm/test/hpvm_pass/GenHPVM/AllocationNode.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/AllocationNode.ll
rename to hpvm/test/hpvm_pass/GenHPVM/AllocationNode.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/CreateNode.ll b/hpvm/test/hpvm_pass/GenHPVM/CreateNode.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/CreateNode.ll
rename to hpvm/test/hpvm_pass/GenHPVM/CreateNode.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/CreateNodeAndEdge.ll b/hpvm/test/hpvm_pass/GenHPVM/CreateNodeAndEdge.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/CreateNodeAndEdge.ll
rename to hpvm/test/hpvm_pass/GenHPVM/CreateNodeAndEdge.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/LeafBindEdge.ll b/hpvm/test/hpvm_pass/GenHPVM/LeafBindEdge.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/LeafBindEdge.ll
rename to hpvm/test/hpvm_pass/GenHPVM/LeafBindEdge.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/LeafNodeGetters.ll b/hpvm/test/hpvm_pass/GenHPVM/LeafNodeGetters.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/LeafNodeGetters.ll
rename to hpvm/test/hpvm_pass/GenHPVM/LeafNodeGetters.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/PopAndPush.ll b/hpvm/test/hpvm_pass/GenHPVM/PopAndPush.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/PopAndPush.ll
rename to hpvm/test/hpvm_pass/GenHPVM/PopAndPush.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/ThreeLevel.ll b/hpvm/test/hpvm_pass/GenHPVM/ThreeLevel.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/ThreeLevel.ll
rename to hpvm/test/hpvm_pass/GenHPVM/ThreeLevel.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/ThreeLevelEdge.ll b/hpvm/test/hpvm_pass/GenHPVM/ThreeLevelEdge.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/ThreeLevelEdge.ll
rename to hpvm/test/hpvm_pass/GenHPVM/ThreeLevelEdge.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/TwoLaunch.ll b/hpvm/test/hpvm_pass/GenHPVM/TwoLaunch.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/TwoLaunch.ll
rename to hpvm/test/hpvm_pass/GenHPVM/TwoLaunch.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/TwoLevel.ll b/hpvm/test/hpvm_pass/GenHPVM/TwoLevel.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/TwoLevel.ll
rename to hpvm/test/hpvm_pass/GenHPVM/TwoLevel.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/oneLaunchAlloca.ll b/hpvm/test/hpvm_pass/GenHPVM/oneLaunchAlloca.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/oneLaunchAlloca.ll
rename to hpvm/test/hpvm_pass/GenHPVM/oneLaunchAlloca.ll
diff --git a/hpvm/test/regressionTests/GenHPVM/oneLaunchMalloc.ll b/hpvm/test/hpvm_pass/GenHPVM/oneLaunchMalloc.ll
similarity index 100%
rename from hpvm/test/regressionTests/GenHPVM/oneLaunchMalloc.ll
rename to hpvm/test/hpvm_pass/GenHPVM/oneLaunchMalloc.ll
diff --git a/hpvm/test/hpvm_pass/lit.cfg.py b/hpvm/test/hpvm_pass/lit.cfg.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aae30a33698eabcc040ec6c7476c003679569c2
--- /dev/null
+++ b/hpvm/test/hpvm_pass/lit.cfg.py
@@ -0,0 +1,34 @@
+# -*- Python -*-
+# Configuration file for the 'lit' test runner.
+import os
+import lit.formats
+from lit.llvm import llvm_config
+# name: The name of this test suite.
+config.name = "HPVM-PASS"
+# test_format: The test format to use to interpret tests.
+config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
+# suffixes: A list of file extensions to treat as test files. This is overridden
+# by individual lit.local.cfg files in the test subdirectories.
+config.suffixes = [".ll"]
+# test_source_root: The root path where tests are located.
+config.test_source_root = os.path.dirname(__file__)
+# test_exec_root: The root path where tests should be run.
+current_source_dir = os.path.dirname(os.path.relpath(__file__, config.llvm_src_root))
+current_binary_dir = os.path.join(config.llvm_obj_root, current_source_dir)
+config.test_exec_root = current_binary_dir
+# Tweak the PATH to include the tools dir.
+llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True)
+tools = ["opt"]
+llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)
diff --git a/hpvm/test/unitTests/ThreeLevel.ll b/hpvm/test/hpvm_pass/unitTests/ThreeLevel.ll
similarity index 100%
rename from hpvm/test/unitTests/ThreeLevel.ll
rename to hpvm/test/hpvm_pass/unitTests/ThreeLevel.ll
diff --git a/hpvm/test/unitTests/ThreeLevel.opt.ll b/hpvm/test/hpvm_pass/unitTests/ThreeLevel.opt.ll
similarity index 100%
rename from hpvm/test/unitTests/ThreeLevel.opt.ll
rename to hpvm/test/hpvm_pass/unitTests/ThreeLevel.opt.ll
diff --git a/hpvm/test/unitTests/TwoLevel.ll b/hpvm/test/hpvm_pass/unitTests/TwoLevel.ll
similarity index 100%
rename from hpvm/test/unitTests/TwoLevel.ll
rename to hpvm/test/hpvm_pass/unitTests/TwoLevel.ll
diff --git a/hpvm/test/lit.cfg.py b/hpvm/test/lit.cfg.py
deleted file mode 100644
index b4e55fbeae9b40c978b0dc047cb76ed4909efc66..0000000000000000000000000000000000000000
--- a/hpvm/test/lit.cfg.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# -*- Python -*-
-# Configuration file for the 'lit' test runner.
-import os
-import sys
-import lit.util
-import lit.formats
-from lit.llvm import llvm_config
-# name: The name of this test suite.
-config.name = 'HPVM'
-# testFormat: The test format to use to interpret tests.
-config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
-# suffixes: A list of file extensions to treat as test files. This is overriden
-# by individual lit.local.cfg files in the test subdirectories.
-config.suffixes = ['.ll']
-# excludes: A list of directories to exclude from the testsuite. The 'Inputs'
-# subdirectories contain auxiliary inputs for various tests in their parent
-# directories.
-config.excludes = ['Inputs', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt', 'benchmarks', 'dnn_benchmarks']
-# test_source_root: The root path where tests are located.
-config.test_source_root = os.path.dirname(__file__)
-# test_exec_root: The root path where tests should be run.
-config.test_exec_root = os.path.join(config.llvm_obj_root, 'test')
-# Tweak the PATH to include the tools dir.
-llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True)
-# Propagate some variables from the host environment.
-opt_viewer_cmd = '%s %s/tools/opt-viewer/opt-viewer.py' % (sys.executable, config.llvm_src_root)
-tools = ['opt']
-llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)
-# Targets
-config.targets = frozenset(config.targets_to_build.split())
-for arch in config.targets_to_build.split():
-    config.available_features.add(arch.lower() + '-registered-target')
diff --git a/hpvm/test/lit.site.cfg.py.in b/hpvm/test/lit.site.cfg.py.in
index 3fc50039dfa49299c6151fa7bb009e4de4e5d1f8..0ed68ccfa0d05e797463dcd2e0a1f9030a20b99a 100644
--- a/hpvm/test/lit.site.cfg.py.in
+++ b/hpvm/test/lit.site.cfg.py.in
@@ -1,64 +1,13 @@
+# This file is shared between test suites.
+# It is generated into the build directory repeatedly, each time with a different CMAKE_CURRENT_SOURCE_DIR.
-import sys
-config.host_triple = "@LLVM_HOST_TRIPLE@"
-config.target_triple = "@TARGET_TRIPLE@"
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
 config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@"
-config.llvm_shlib_dir = "@SHLIBDIR@"
-config.llvm_shlib_ext = "@SHLIBEXT@"
-config.llvm_exe_ext = "@EXEEXT@"
-config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
-config.python_executable = "@PYTHON_EXECUTABLE@"
-config.gold_executable = "@GOLD_EXECUTABLE@"
-config.ld64_executable = "@LD64_EXECUTABLE@"
-config.ocamlfind_executable = "@OCAMLFIND@"
-config.have_ocamlopt = @HAVE_OCAMLOPT@
-config.have_ocaml_ounit = @HAVE_OCAML_OUNIT@
-config.ocaml_flags = "@OCAMLFLAGS@"
-config.include_go_tests = @LLVM_INCLUDE_GO_TESTS@
-config.go_executable = "@GO_EXECUTABLE@"
-config.enable_shared = @ENABLE_SHARED@
-config.enable_assertions = @ENABLE_ASSERTIONS@
-config.targets_to_build = "@TARGETS_TO_BUILD@"
-config.native_target = "@LLVM_NATIVE_ARCH@"
-config.llvm_bindings = "@LLVM_BINDINGS@".split(' ')
-config.host_os = "@HOST_OS@"
-config.host_cc = "@HOST_CC@"
-config.host_cxx = "@HOST_CXX@"
-config.host_ldflags = "@HOST_LDFLAGS@"
-config.llvm_use_intel_jitevents = @LLVM_USE_INTEL_JITEVENTS@
-config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
-config.have_zlib = @HAVE_LIBZ@
-config.have_libxar = @HAVE_LIBXAR@
-config.have_dia_sdk = @LLVM_ENABLE_DIA_SDK@
-config.enable_ffi = @LLVM_ENABLE_FFI@
-config.build_examples = @LLVM_BUILD_EXAMPLES@
-config.enable_threads = @LLVM_ENABLE_THREADS@
-config.build_shared_libs = @BUILD_SHARED_LIBS@
-config.link_llvm_dylib = @LLVM_LINK_LLVM_DYLIB@
-config.llvm_libxml2_enabled = @LLVM_LIBXML2_ENABLED@
-config.llvm_host_triple = '@LLVM_HOST_TRIPLE@'
-config.host_arch = "@HOST_ARCH@"
-config.have_opt_viewer_modules = @LLVM_HAVE_OPT_VIEWER_MODULES@
-config.libcxx_used = @LLVM_LIBCXX_USED@
-config.has_plugins = @LLVM_ENABLE_PLUGINS@
-# Support substitution of the tools_dir with user parameters. This is
-# used when we can't determine the tool dir at configuration time.
-    config.llvm_tools_dir = config.llvm_tools_dir % lit_config.params
-    config.llvm_shlib_dir = config.llvm_shlib_dir % lit_config.params
-except KeyError:
-    e = sys.exc_info()[1]
-    key, = e.args
-    lit_config.fatal("unable to find %r parameter, use '--param=%s=VALUE'" % (key,key))
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
 # Let the main config do the real work.
-lit_config.load_config(config, "@LLVM_SOURCE_DIR@/tools/hpvm/test/lit.cfg.py")
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg.py")
diff --git a/hpvm/tools/py-approxhpvm/CMakeLists.txt b/hpvm/tools/py-approxhpvm/CMakeLists.txt
index 60fbc66aadd362e6aceb507dec5f1bec1223c418..f93f96dd0578a67630cc859bba2e24e071b39299 100644
--- a/hpvm/tools/py-approxhpvm/CMakeLists.txt
+++ b/hpvm/tools/py-approxhpvm/CMakeLists.txt
 # and does not export its file location.
 # Keep this in sync with hpvm/projects/hpvm-rt/CMakeLists.txt.
 set(HPVM_RT_PATH ${LLVM_BUILD_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc)
-    "$<TARGET_FILE:tensor_runtime>"
-    "$<TARGET_FILE:gpu_profiler>"
-    "$<TARGET_FILE:promise_profiler>"
+set(TENSOR_RUNTIME_LIB "$<TARGET_FILE:tensor_runtime>")
-    gpu_profiler
-    promise_profiler
diff --git a/hpvm/tools/py-approxhpvm/main.py.in b/hpvm/tools/py-approxhpvm/main.py.in
index fdbbaec1ccc070f87bedcd0f0c646e12531d99fe..af706a1eaa7a53879e525d87dd5034caf814db38 100644
--- a/hpvm/tools/py-approxhpvm/main.py.in
+++ b/hpvm/tools/py-approxhpvm/main.py.in
@@ -9,7 +9,7 @@ PathLike = Union[Path, str]
 HPVM_PROJECT_DIR = Path("@LLVM_PROJECT_DIR@") / "tools/hpvm"
 LLVM_BUILD_DIR = Path("@LLVM_BUILD_DIR@") / "bin"
@@ -22,7 +22,7 @@ INCLUDE_DIRS = [
     HPVM_PROJECT_DIR / "test/dnn_benchmarks/hpvm-c/include",  # hpvm-c intrinsics decl dir
     CUDA_TOOLKIT_ROOT_DIR / "include",  # CUDA include dir
     "pthread", "cudart", "curand", "cudnn", "cublas", "cufft", "OpenCL", "stdc++fs", "omp", "m"
@@ -119,10 +119,12 @@ def link_hpvm_rt(src_file: PathLike, target_file: PathLike) -> List[str]:
 def link_binary(src_file: PathLike, target_file: PathLike) -> List[str]:
-    linker_dir_flags = [f"-L{path}" for path in LINK_DIRS]
+    linker_dir_flags = []
+    for path in LINK_DIRS:
+        linker_dir_flags.extend([f"-L{path}", f"-Wl,-rpath={path}"])
     linker_lib_flags = [f"-l{lib}" for lib in LINK_LIBS]
     return [
-        str(LLVM_BUILD_DIR / "clang++"), str(src_file), *TENSOR_RUNTIME_LIBS, "-o", str(target_file),
+        str(LLVM_BUILD_DIR / "clang++"), str(src_file), str(TENSOR_RUNTIME_LIB), "-o", str(target_file),
         *linker_dir_flags, *linker_lib_flags