diff --git a/.gitmodules b/.gitmodules
index aeaea73f16d7cae575ca0786682e737c775f2c03..f7d3d37cde7f947d3d7d7f4f9d8d7879b60e33e6 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,4 @@
 [submodule "hpvm/projects/predtuner"]
 	path = hpvm/projects/predtuner
-	url = git@gitlab.engr.illinois.edu:yifanz16/predtuner.git
+	url = ../predtuner.git
+	branch = hpvm
diff --git a/README.md b/README.md
index acd94086990ccc9e85d2dc63fa8d5a81ae1b08cc..c2e4f3be4e11856dc2bb2b35e61cfbf04ae626af 100644
--- a/README.md
+++ b/README.md
@@ -65,9 +65,7 @@ HPVM has not been tested but might work on other CPUs supported by LLVM Backend,
 Checkout HPVM and go to directory `./hpvm` under project root:
 
 ```shell
-git clone --recursive https://gitlab.engr.illinois.edu/llvm/hpvm.git
-cd hpvm/
-git checkout approx_hpvm_reorg
+git clone --recursive -b approx_hpvm_reorg --single-branch https://gitlab.engr.illinois.edu/llvm/hpvm.git
 cd hpvm/
 ```
 
diff --git a/hpvm/CMakeLists.txt b/hpvm/CMakeLists.txt
index df1657a54ebf05fe2392bcaef8a7abdce86f4243..809a30cfa52e16f436dac4e22843f4c5a3add3d9 100644
--- a/hpvm/CMakeLists.txt
+++ b/hpvm/CMakeLists.txt
@@ -1,8 +1,9 @@
-cmake_minimum_required(VERSION 3.17)
+cmake_minimum_required(VERSION 3.18)
 project(hpvm CUDA CXX)
 get_filename_component(
   CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}/../.." ABSOLUTE
 )  # Set CUDA_TOOLKIT_ROOT_DIR by our own, to the parent folder of cuda nvcc
+message(STATUS "CUDA Architecture: ${CMAKE_CUDA_ARCHITECTURES}")
 
 # find_package will use the auxillary cmake/Find*.cmake we provide
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
diff --git a/hpvm/install.sh b/hpvm/install.sh
index 692c6195d28b23c418f37d8a574d91ef744e2b33..dd737034f043e2022710a94982467e60456d2bd4 100755
--- a/hpvm/install.sh
+++ b/hpvm/install.sh
@@ -1,11 +1,6 @@
 #!/bin/bash
-
-SCRIPTS_DIR=scripts
-
-BASH=/bin/bash
-
 # Run installer script
-$BASH $SCRIPTS_DIR/llvm_installer.sh
-
-# Run the tests
-$BASH $SCRIPTS_DIR/automated_tests.sh
+# Pass args on to the installer, which can parse them
+scripts/hpvm_installer.py "$@"
+# Set path.
+export PATH=$BUILD_DIR/bin:$PATH
diff --git a/hpvm/llvm_patches/construct_patch.sh b/hpvm/llvm_patches/construct_patch.sh
index cc50fcc4226f2c8ba31a9345b74c719186053e8f..b957c853e71f59bc17e7def6d544c86eefd382b6 100644
--- a/hpvm/llvm_patches/construct_patch.sh
+++ b/hpvm/llvm_patches/construct_patch.sh
@@ -1,29 +1,12 @@
 #!/bin/sh
 
 #### Computing Header Diff
-diff -u  $LLVM_SRC_ROOT/include/llvm/Bitcode/LLVMBitCodes.h  include/Bitcode/LLVMBitCodes.h > include/Bitcode/LLVMBitCodes.h.patch 
-
-diff -u  $LLVM_SRC_ROOT/include/llvm/IR/Attributes.td   include/IR/Attributes.td   > include/IR/Attributes.td.patch
-
-diff -u  $LLVM_SRC_ROOT/include/llvm/IR/Intrinsics.td   include/IR/Intrinsics.td > include/IR/Intrinsics.td.patch
-
-diff -u  $LLVM_SRC_ROOT/include/llvm/Support/Debug.h   include/Support/Debug.h > include/Support/Debug.h.patch
-
-
+for file in Bitcode/LLVMBitCodes.h IR/Attributes.td IR/Intrinsics.td Support/Debug.h; do
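+    # diff exits nonzero when the files differ, so '|| true' keeps that from being treated as a failure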
+    diff -u $LLVM_SRC_ROOT/include/llvm/$file include/$file > include/$file.patch || true
+done
 #### Computing Source File Diff
-
-diff -u  $LLVM_SRC_ROOT/lib/AsmParser/LLLexer.cpp   lib/AsmParser/LLLexer.cpp > lib/AsmParser/LLLexer.cpp.patch 
-
-diff -u  $LLVM_SRC_ROOT/lib/AsmParser/LLLexer.h   lib/AsmParser/LLLexer.h > lib/AsmParser/LLLexer.h.patch
-
-diff -u  $LLVM_SRC_ROOT/lib/AsmParser/LLParser.cpp   lib/AsmParser/LLParser.cpp > lib/AsmParser/LLParser.cpp.patch
-
-diff -u  $LLVM_SRC_ROOT/lib/AsmParser/LLParser.h   lib/AsmParser/LLParser.h > lib/AsmParser/LLParser.h.patch
-
-diff -u  $LLVM_SRC_ROOT/lib/AsmParser/LLToken.h   lib/AsmParser/LLToken.h > lib/AsmParser/LLToken.h.patch
-
-diff -u  $LLVM_SRC_ROOT/lib/IR/Attributes.cpp   lib/IR/Attributes.cpp > lib/IR/Attributes.cpp.patch
-
-diff -u  $LLVM_SRC_ROOT/lib/Bitcode/Reader/BitcodeReader.cpp   lib/Bitcode/Reader/BitcodeReader.cpp > lib/Bitcode/Reader/BitcodeReader.cpp.patch
-
-diff -u  $LLVM_SRC_ROOT/lib/Bitcode/Writer/BitcodeWriter.cpp   lib/Bitcode/Writer/BitcodeWriter.cpp > lib/Bitcode/Writer/BitcodeWriter.cpp.patch
+for file in AsmParser/LLLexer.cpp AsmParser/LLLexer.h AsmParser/LLParser.cpp \
+            AsmParser/LLParser.h AsmParser/LLToken.h IR/Attributes.cpp \
+            Bitcode/Reader/BitcodeReader.cpp Bitcode/Writer/BitcodeWriter.cpp; do
+    diff -u $LLVM_SRC_ROOT/lib/$file lib/$file > lib/$file.patch || true
+done
diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
index 4d88baa7840d98686e2f2135658f7595fa6bce30..6dece968058e4c52189611533998f3608f147cd3 100644
--- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -59,7 +59,7 @@ set(
   RUNTIME_SRCS_FILENAME
   approx_knobs_utils.cc approx_simulation.cu approx_techniques.cu
   configuration.cpp
-  debug.cpp device_math.cu
+  debug.cpp
   error.cu
   fp16_gemm.cu freq_utils.cc
   global_data.cc group_conv.cu
diff --git a/hpvm/projects/hpvm-tensor-rt/README.md b/hpvm/projects/hpvm-tensor-rt/README.md
index e492c4c838df969f00baa85e6e3adcafdfc0a7f9..d3aad77a43a957148a6bf76d18ee4be15cab066e 100644
--- a/hpvm/projects/hpvm-tensor-rt/README.md
+++ b/hpvm/projects/hpvm-tensor-rt/README.md
@@ -10,7 +10,7 @@
 
 - cuDNN-7.0 or above
 
-- `cmake >= 3.17`
+- `cmake >= 3.18`
 
 - `make >= 4`
 
diff --git a/hpvm/projects/hpvm-tensor-rt/docs/Changes.md b/hpvm/projects/hpvm-tensor-rt/docs/Changes.md
index ebf0295ed664bf618195d79a461b58b576b0c071..85ac974dc448d8f0e920caf96507465810e61e52 100644
--- a/hpvm/projects/hpvm-tensor-rt/docs/Changes.md
+++ b/hpvm/projects/hpvm-tensor-rt/docs/Changes.md
@@ -56,7 +56,7 @@
 - Add rules to /llvm/projects/CmakeLists.txt
 - Automate the generation of 'tensor_runtime.ll`
 - Add CUDNN, CUDA paths to the template environment setup file
-- Move to using Cmake-3.17 (earlier 3.15)
+- Move to using Cmake-3.18 (earlier 3.15)
 
 
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
index 6b0f835f7361fb54b9826bdec7e1819333f989df..cac6b6fd686234cadf78096a729eecb1a3203250 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
@@ -3,8 +3,6 @@
 
 #include "tensor.h"
 
-#include "device_math.h"
-
 extern "C" {
 
 // NOTE: API for tensorGroupConvolution
@@ -59,19 +57,6 @@ void *tensorConvSampSim2(void *input_ptr, void *filter_ptr, int vertical_pad,
                          int skip_rate, int skip_offset,
                          float interpolation_rate);
 
-void *autotuner_tensorFft(void *input, bool inverse);
-
-void *autotuner_tensorReduce(void *input, size_t axis, MathOp func);
-
-void *autotuner_tensorProjectiveT(void *input, void *transformation);
-
-void *autotuner_tensorMap1(MathOp func, void *input);
-
-void *autotuner_tensorMap2(MathOp func, void *input1, void *input2);
-
-void *autotuner_tensorMap3(MathOp func, void *input1, void *input2,
-                           void *input3);
-
 void *tensorConvInputHalf(void *input_ptr, void *filter_ptr, int vertical_pad,
                           int horizontal_pad, int vertical_stride,
                           int horizontal_stride, int conv_mode, int conv_groups,
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index 3b52cce9f62504753d63015a599d214194d48d98..4cab25ab593a40f11d17a96d4045fd11afa36530 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -53,15 +53,7 @@ public:
     POOL_MEAN,
     POOL_MIN,
     SOFTMAX,
-    FFT,
-    REDUCE,
-    PROJECTIVE_T,
-    MAP1,
-    MAP2,
-    MAP3,
-    //    STENCIL,
-    //    COSINE_T,
-    //  ADDITIONAL_TENSOR_OPERATION
+    // ADDITIONAL_TENSOR_OPERATION
     TENSOR_OP_END
   };
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
deleted file mode 100644
index 83781b148c4bb41619bbbb54d9e69cc9fc7f2543..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-device_math.h
-Provides pointer to CUDA math function and other properties ofa math operator
-(one among MathOp) on a certain scalar type.
-*/
-#ifndef DEVICE_MATH_H
-#define DEVICE_MATH_H
-
-#include <cuda_fp16.h>
-#include <device_launch_parameters.h>
-#include <limits>
-#include <stdexcept>
-
-#include "debug.h"
-
-enum class MathOp {
-  Hypot,
-  Atan2,
-  Add,
-  Sub,
-  Mul,
-  Div,
-  Sqrt,
-  Max,
-  Min,
-  Avg3,
-  Blend2,
-  AddWeighted,
-  PSNR
-};
-
-// Find the CUDA function for math operator `op`.
-// This is ONLY defined (through template specialization, in device_math.cu) for
-// float and half (see below).
-template <typename T> void *mathOpToFunc(MathOp op);
-
-template <> void *mathOpToFunc<float>(MathOp op);
-
-template <> void *mathOpToFunc<half>(MathOp op);
-
-// Returns the identity element of math operator `op`, for example, -inf for
-// MAX, 0 for ADD.
-// Specialization exists for half type.
-template <typename T> T reduceOpToIdentity(MathOp op) {
-  switch (op) {
-  case MathOp::Hypot:
-    return T(0.0f);
-  case MathOp::Add:
-    return T(0.0f);
-  case MathOp::Max:
-    return -std::numeric_limits<T>::max();
-  case MathOp::Min:
-    return std::numeric_limits<T>::max();
-  default:
-    ERROR("Operator does not have id value\n");
-  }
-  return T(); // For some compilers
-}
-
-template <> half reduceOpToIdentity<half>(MathOp op);
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
index 8a8ff8435db96607917fc627036e72318409ef9b..a472fcaa36484950de98f858a74f185900ab80b7 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
@@ -22,7 +22,6 @@
 #include "op_overheads.h"
 #include "half_precision_api.h"
 #include "approx_utils.h"
-#include "device_math.h"
 #include "global_data.h"
 #include "approx_knob_utils.h"
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
index c18ffcea26f93fe752500983f4d4a3fcfe59ded2..063112ab78641dc57f4d79259df837fa12177b42 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
@@ -144,24 +144,6 @@ void GPUNodeConfiguration::print() {
     case G_TENSOR_OP::SOFTMAX:
       printf("softmax");
       break;
-    case G_TENSOR_OP::FFT:
-      printf("fft");
-      break;
-    case G_TENSOR_OP::REDUCE:
-      printf("reduce");
-      break;
-    case G_TENSOR_OP::PROJECTIVE_T:
-      printf("projectiveT");
-      break;
-    case G_TENSOR_OP::MAP1:
-      printf("map1");
-      break;
-    case G_TENSOR_OP::MAP2:
-      printf("map2");
-      break;
-    case G_TENSOR_OP::MAP3:
-      printf("map3");
-      break;
       // TODO additional operations to be printed here
     default:
       ERROR("Unknown tensor operation.");
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
deleted file mode 100644
index 032443bd7a63a1640e463c0457dd362e09733be3..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
+++ /dev/null
@@ -1,189 +0,0 @@
-/* device_math.cu defines */
-#include "device_math.h"
-#include "error.h"
-
-#include <thrust/complex.h>
-
-#define DEF_FUNC_PTR(fname) __device__ void *fname##_ptr = (void *)(fname);
-
-#define DEF_FUNC_PTR_CAST(fname, castf)                                        \
-  __device__ void *fname##_ptr = (void *)(castf);
-
-#define CASE_FUNC(ename, fname)                                                \
-  case MathOp::ename: {                                                        \
-    void *v_func_ptr = nullptr;                                                \
-    checkCudaErrors(cudaMemcpyFromSymbol(&v_func_ptr, _internal::fname##_ptr,  \
-                                         sizeof(void *)));                     \
-    return v_func_ptr;                                                         \
-  }
-
-namespace _internal {
-
-// The following functions are not used, but they reference their cuda
-// counterpart which is necessary, otherwise ptx compilation breaks
-__device__ float hypotf_(float x, float y) { return hypotf(x, y); }
-
-__device__ float atan2f_(float x, float y) { return atan2f(x, y); }
-
-__device__ float sqrtf_(float x) { return sqrtf(x); }
-
-__device__ float fmax_(float x, float y) { return fmax(x, y); }
-
-__device__ float fmin_(float x, float y) { return fmin(x, y); }
-
-__device__ float add(float x, float y) { return x + y; }
-
-__device__ float sub(float x, float y) { return x - y; }
-
-__device__ float mul(float x, float y) { return x * y; }
-
-__device__ float div(float x, float y) { return x / y; }
-
-__device__ float favg3(float x) { return __fdividef(x, 3.0f); }
-
-__device__ float blend2(float bg, float fg) { return bg * 0.6 + fg * 0.4; }
-
-__device__ float addWeighted(float blurred, float image) {
-  return 0.7 * image + 0.3 * blurred;
-}
-
-__device__ float psnr(float x) { return -10 * log10(x); }
-
-__device__ float2 f2mul(float2 x1, float2 x2) {
-  return {x1.x * x2.x - x1.y * x2.y, x1.x * x2.y + x1.y * x2.x};
-}
-
-__device__ half2 h2mul(half2 x1, half2 x2) {
-  return {x1.x * x2.x - x1.y * x2.y, x1.x * x2.y + x1.y * x2.x};
-}
-
-__device__ half2 h2hypot(half2 x, half2 y) {
-  return h2sqrt(__hfma2(x, x, __hmul2(y, y)));
-}
-
-__device__ half2 h2max(half2 x, half2 y) {
-  return __hfma2(__hgt2(x, y), x, __hmul2(__hle2(x, y), y));
-}
-
-__device__ half2 h2min(half2 x, half2 y) {
-  return __hfma2(__hlt2(x, y), x, __hmul2(__hge2(x, y), y));
-}
-
-__device__ half2 h2avg3(half2 x) {
-  half2 three = __floats2half2_rn(3.0f, 3.0f);
-  return __h2div(x, three);
-}
-
-__device__ half2 h2blend2(half2 bg, half2 fg) {
-  half2 c1 = __floats2half2_rn(0.6f, 0.6f), c2 = __floats2half2_rn(0.4f, 0.4f);
-  return __hfma2(bg, c1, __hmul2(fg, c2));
-}
-
-__device__ half2 h2addWeighted(half2 blurred, half2 image) {
-  half2 c1 = __floats2half2_rn(0.7f, 0.7f), c2 = __floats2half2_rn(0.3f, 0.3f);
-  return __hfma2(image, c1, __hmul2(blurred, c2));
-}
-
-DEF_FUNC_PTR(hypotf)
-DEF_FUNC_PTR(atan2f)
-DEF_FUNC_PTR(add)
-DEF_FUNC_PTR(sub)
-DEF_FUNC_PTR(mul)
-DEF_FUNC_PTR(div)
-DEF_FUNC_PTR(sqrtf)
-DEF_FUNC_PTR_CAST(fmax, (float (*)(float, float))fmax)
-DEF_FUNC_PTR_CAST(fmin, (float (*)(float, float))fmin)
-DEF_FUNC_PTR(favg3)
-DEF_FUNC_PTR(blend2)
-DEF_FUNC_PTR(addWeighted)
-DEF_FUNC_PTR(psnr)
-
-DEF_FUNC_PTR(f2mul)
-
-DEF_FUNC_PTR(h2mul)
-
-DEF_FUNC_PTR(h2hypot)
-DEF_FUNC_PTR(__hadd2)
-DEF_FUNC_PTR(__hsub2)
-DEF_FUNC_PTR(__h2div)
-DEF_FUNC_PTR(h2sqrt)
-DEF_FUNC_PTR(h2max)
-DEF_FUNC_PTR(h2min)
-DEF_FUNC_PTR(h2avg3)
-DEF_FUNC_PTR(h2blend2)
-DEF_FUNC_PTR(h2addWeighted)
-
-} // namespace _internal
-
-template <> void *mathOpToFunc<float2>(MathOp op) {
-  switch (op) {
-    CASE_FUNC(Mul, f2mul)
-  default:
-    ERROR("Float2 function not found\n");
-    return nullptr; // For some compilers
-  }
-}
-
-template <> void *mathOpToFunc<half2>(MathOp op) {
-  switch (op) {
-    CASE_FUNC(Mul, h2mul)
-  default:
-    ERROR("Half2 function not found\n");
-    return nullptr; // For some compilers
-  }
-}
-
-template <> void *mathOpToFunc<float>(MathOp op) {
-  switch (op) {
-    CASE_FUNC(Hypot, hypotf)
-    CASE_FUNC(Atan2, atan2f)
-    CASE_FUNC(Add, add)
-    CASE_FUNC(Sub, sub)
-    CASE_FUNC(Mul, mul)
-    CASE_FUNC(Div, div)
-    CASE_FUNC(Sqrt, sqrtf)
-    CASE_FUNC(Max, fmax)
-    CASE_FUNC(Min, fmin)
-    CASE_FUNC(Avg3, favg3)
-    CASE_FUNC(Blend2, blend2)
-    CASE_FUNC(AddWeighted, addWeighted)
-    CASE_FUNC(PSNR, psnr)
-  default:
-    ERROR("Float function not found\n");
-  }
-  return nullptr; // For some compilers
-}
-
-template <> void *mathOpToFunc<half>(MathOp op) {
-  switch (op) {
-    CASE_FUNC(Hypot, h2hypot)
-    CASE_FUNC(Add, __hadd2)
-    CASE_FUNC(Sub, __hsub2)
-    CASE_FUNC(Div, __h2div)
-    CASE_FUNC(Sqrt, h2sqrt)
-    CASE_FUNC(Max, h2max)
-    CASE_FUNC(Min, h2min)
-    CASE_FUNC(Avg3, h2avg3)
-    CASE_FUNC(Blend2, h2blend2)
-    CASE_FUNC(AddWeighted, h2addWeighted)
-  default:
-    ERROR("Half function not found\n");
-  }
-  return nullptr; // For some compilers
-}
-
-template <> half reduceOpToIdentity<half>(MathOp op) {
-  switch (op) {
-  case MathOp::Hypot:
-    return 0.0f;
-  case MathOp::Add:
-    return 0.0f;
-  case MathOp::Max:
-    return -65504.0f;
-  case MathOp::Min:
-    return 65504.0f;
-  default:
-    ERROR("Operator does not have id value\n");
-  }
-  return 0.0f; // For some compilers
-}
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 66e8e3d1baf987d5d3c74a35f8b58cc957d5983b..b17285b5e05f6a8576c1e3991ad9f4ffa8735d9e 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -268,10 +268,12 @@ double ProfileInfo::getCurrentIterationComputeEnergy() {
 void ProfileInfo::set_out_file_name(std::string &str) { out_file_name = str; }
 
 void ProfileInfo::printToFile() {
-
   INFO("Writing Runtime Profile Info File...\n");
-  std::ofstream s_out(out_file_name.c_str());
 
+  if (control_time_info.size() == 0)
+    return;
+
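+  // Otherwise, open the output file and write the collected profile data.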
+  std::ofstream s_out(out_file_name.c_str());
   if (!s_out) {
     ERROR("Failed to open output file.");
     abort();
@@ -795,36 +797,6 @@ void RuntimeController::readConfigurationFile(const char *str) {
           NodeConf->pushNewTensorOperation(
               GPUNodeConfiguration::TENSOR_OP::SOFTMAX);
           idx++;
-        } else if (tokens[idx] == "fft") {
-          DEBUG("Found fft operation\n");
-          NodeConf->pushNewTensorOperation(
-              GPUNodeConfiguration::TENSOR_OP::FFT);
-          idx++;
-        } else if (tokens[idx] == "reduce") {
-          DEBUG("Found reduce operation\n");
-          NodeConf->pushNewTensorOperation(
-              GPUNodeConfiguration::TENSOR_OP::REDUCE);
-          idx++;
-        } else if (tokens[idx] == "projectiveT") {
-          DEBUG("Found projectiveT operation\n");
-          NodeConf->pushNewTensorOperation(
-              GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T);
-          idx++;
-        } else if (tokens[idx] == "map1") {
-          DEBUG("Found map1 operation\n");
-          NodeConf->pushNewTensorOperation(
-              GPUNodeConfiguration::TENSOR_OP::MAP1);
-          idx++;
-        } else if (tokens[idx] == "map2") {
-          DEBUG("Found map2 operation\n");
-          NodeConf->pushNewTensorOperation(
-              GPUNodeConfiguration::TENSOR_OP::MAP2);
-          idx++;
-        } else if (tokens[idx] == "map3") {
-          DEBUG("Found map3 operation\n");
-          NodeConf->pushNewTensorOperation(
-              GPUNodeConfiguration::TENSOR_OP::MAP3);
-          idx++;
         } else /*Not a new operation. This means an approximation option*/
             if (tokens[idx] == "fp32") {
           DEBUG("Found fp32 option\n");
@@ -1390,14 +1362,14 @@ uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
       ERROR("Data file %s is not found. Aborting...\n", labels_file);
       abort();
     }
-    
+
     // Get number of labels
     fseek(file, 0, SEEK_END);
     long size = ftell(file);
     fseek(file, 0, SEEK_SET); // return file pointer to beginning
 
     // Allocate memory for labels
-    labels_from_file = (uint32_t *) malloc(size);
+    labels_from_file = (uint32_t *)malloc(size);
     if (labels_from_file == NULL) {
       ERROR("Memory allocation for labels unsucessfull. Aborting...\n");
       abort();
@@ -1474,7 +1446,6 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
   return accuracy;
 }
 
-
 #define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl
 //#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
 //#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl
diff --git a/hpvm/projects/keras/README.md b/hpvm/projects/keras/README.md
index 227cd2fed9f24f905f779f200dafee22169a8477..73dc8548f02652facf1b1161917edbb93fa8184b 100644
--- a/hpvm/projects/keras/README.md
+++ b/hpvm/projects/keras/README.md
@@ -2,7 +2,7 @@
 
 ## Installing Dependencies
 
-### Updating pip
+### Updating pip:
 The pip version required in this subproject must be >= `19.3`.
 
 To upgrade pip:
@@ -17,19 +17,23 @@ To check installed pip version:
 pip -V
 ```
 
-### Importing Conda Environment:
+### Importing and Creating the Required Conda Environment:
 
 ```
 conda env create -f keras_environment.yml --name ${KERAS_ENV_NAME}
 ```
 Note: pip version MUST be > 19.3
 
+This is a **one-time** installation step.
+
 ### Activating Conda Environment:
 
 ```
 conda activate ${KERAS_ENV_NAME}
 ```
 
+**NOTE:** This step must be performed in each shell session in which the frontend is to be used.
+
 ### Building and Installing Frontend:
 
 ```
@@ -37,6 +41,19 @@ python setup.py build
 
 python setup.py install
 ```
+**NOTE:** This step must be performed in each shell session in which the frontend is to be used.
 
 
 ## Download CNN Model Files 
@@ -54,59 +71,113 @@ git lfs checkout
 
 Benchmarks under `./src/` 
 
+**NOTE:** Activate the conda environment (above) before running the benchmarks.
+
 List of benchmarks and the expected accuracies:
 
 | Benchmark       | Accuracy    |
 | ----------- | ----------- |
-| AlexNet-CIFAR10      | 79.16       |
-| AlexNet2-CIFAR10   | 85.10        |
+| AlexNet-CIFAR10      | 79.28       |
+| AlexNet2-CIFAR10   | 84.98        |
 | AlexNet-ImageNet | 56.30 |
-| LeNet-MNIST | 99.11 | todo: fix broken
-| MobileNet-CIFAR10 | 82.40 |
-| ResNet18-CIFAR10 | 89.52 |
+| LeNet-MNIST | 98.70 | 
+| MobileNet-CIFAR10 | 84.42 |
+| ResNet18-CIFAR10 | 89.56 |
 | ResNet50-ImageNet | 75.10 |
-| VGG16-CIFAR10 | 89.42 |
-| VGG16-CIFAR100 | 66.20 |
+| VGG16-CIFAR10 | 89.96 |
+| VGG16-CIFAR100 | 66.50 |
 | VGG16-ImageNet | 69.46 |
 
-Activate conda environment (above) before running benchmarks 
 
 ### Synopsis
 
 ```
-python src/${BENCH_NAME}.py  [hpvm_reload|keras_reload]  [frontend|keras_dump] 
+python src/${BENCH_NAME}.py  [hpvm_reload|keras_reload]  [frontend] [compile]
 
 ```
 
-**Parameters:**
 
-`hpvm_reload` : Reloads HPVM weights (format used in `model_params` found here: [ADD link to Google Drive]) from directory specified in Benchmark constructor. 
+**Command-line Parameters**
 
-`keras_reload`: Reloads weights in Keras `.h5` file format 
+`hpvm_reload` : Reloads HPVM weights (the `.bin` binary format used in `model_params`, found here: https://gitlab.engr.illinois.edu/llvm/hpvm/-/tree/approx_hpvm_reorg_keras/hpvm/test/dnn_benchmarks/model_params) from the directory path specified by the `reload_dir` parameter set in code - this is described in "Parameters to Change in Code" (below).
 
-`frontend`: Invokes the HPVM frontend and dumps weights in directory specified in constructor
+`keras_reload`: Alternatively, reloads weights from a Keras `.h5` file, with the path to the file specified in `keras_model_file`, described in "Parameters to Change in Code" (below).
 
-`keras_dump`: Dumps keras .h5 format model weights in directory specified in constructor
+`frontend`: Invokes the HPVM frontend and dumps weights (in HPVM `.bin` format) in the specified output directory. The directories where data and source files are dumped are controlled by the `data_dir` and `src_dir` parameters, respectively. These are described below.
+
+`compile`: Optional parameter. When specified, it compiles the HPVM-C code generated by the frontend into an HPVM binary under the directory specified by `src_dir` (described below). If the `src_dir` path already exists, a new unique directory (with a unique ID appended) is created. 
+The binary is built with the name `HPVM_binary`. 
+
+**NOTE:** Before running `HPVM_binary`, it is necessary to set the CUDA/cuDNN paths with:
+
+```
+source ${PATH_TO_YOUR_HPVM_ROOT}/hpvm/set_paths.sh
+```
 
+**Parameters to Change in Code** 
 
+The AlexNet source is commented with explanations of how to use the Keras frontend interface; the source is [here](https://gitlab.engr.illinois.edu/llvm/hpvm/-/blob/approx_hpvm_reorg_keras/hpvm/projects/keras/src/alexnet.py). A sketch showing these parameters together is given after the list below.
 
-### Building New Benchmarks 
+* `NAME`: Benchmark name - can be set to any desired value
 
-All benchmarks inherit from the commom parent `Benchmark` class. 
-Each benchmark overrides virtual functions for building the model, training, inference, 
-and data preprocessing. 
+* `reload_dir`: Path to the directory from which to reload weights in HPVM format. This directory is used to reload weights when the `hpvm_reload` command-line option is used.
+
+* `keras_model_file`: Path to the Keras `.h5` model file to reload weights from. Either `reload_dir` or `keras_model_file` can be used. 
+`keras_model_file` is used when the `keras_reload` command-line parameter is passed to the benchmark script.
+
+* `data_dir`: Directory to dump weights, specified in the [constructor](https://gitlab.engr.illinois.edu/llvm/hpvm/-/blob/approx_hpvm_reorg_keras/hpvm/projects/keras/src/Benchmark.py#L21)
+ 
+* `src_dir`: Directory to dump ApproxHPVM sources in HPVM-C (C with HPVM compiler intrinsics), specified in the [constructor](https://gitlab.engr.illinois.edu/llvm/hpvm/-/blob/approx_hpvm_reorg_keras/hpvm/projects/keras/src/Benchmark.py#L22)
+
+* `num_classes`: Number of output classes, dependent on the dataset used: 10 for CIFAR10, 100 for CIFAR100, and 1000 for ImageNet.
+
+* `batch_size`: Controls the size of each batch that is processed in HPVM. The batch size should be kept as large as the GPU memory 
+can support, and should be adapted according to the memory size of the deployed device.
+
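+Putting these together, here is a minimal sketch with hypothetical values (see `src/alexnet.py` for real usage):
+
+```
+NAME = "alexnet_cifar10"                        # any desired benchmark name
+reload_dir = "model_params/alexnet_cifar10/"    # HPVM-format weights, used by hpvm_reload
+keras_model_file = "alexnet_cifar10.h5"         # Keras weights, used by keras_reload
+data_dir = "data/alexnet_cifar10/"              # where the frontend dumps weights
+src_dir = "src_gen/alexnet_cifar10/"            # where HPVM-C sources are dumped
+num_classes = 10                                # CIFAR10
+batch_size = 500                                # as large as GPU memory allows
+```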
+
+
+### Using the Frontend with Custom (New) Benchmarks 
+
+Any new benchmark must inherit from the common parent `Benchmark` class 
+and override the virtual functions for building the model, training, 
+and data preprocessing. These methods are described below:
         
     
 `def buildModel(self)`:
-returns a keras model
+Constructs and returns a Keras model
 
 `def data_preprocess(self)`:
-returns X_train, y_train, X_test, y_test, X_tuner, and y_tuner data — in that order; this data will be directly used later for training and inference
+Returns X_train, y_train, X_test, y_test, X_tuner, and y_tuner data (in that order). 
+These are described here:
+
+* `X_train:` Training data (fp32) in NCHW format
+* `y_train:` Training labels (int32)
+
+* `X_test:` Testing/Evaluation data in NCHW format
+* `y_test:` Testing/Evaluation labels
+
+* `X_tuner:` Data to be used for autotuning 
+* `y_tuner:` Labels corresponding to tuning data
+
 
 `def trainModel(self, model, X_train, y_train, X_test, y_test)`:
-returns a trained keras model
+Trains the Keras model constructed in `buildModel` and is expected to return the 
+trained Keras model; training parameters should be tuned here. A sketch of a custom benchmark is given below.
+
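+Below is a minimal sketch of a custom benchmark. It assumes the `Benchmark` parent class in `src/Benchmark.py` and a channels-first (NCHW) Keras configuration; everything outside the three methods above is illustrative:
+
+```
+from keras.datasets import cifar10
+from keras.models import Sequential
+from keras.layers import Conv2D, Activation, Flatten, Dense
+from keras.utils import to_categorical
+from Benchmark import Benchmark  # parent class under src/
+
+class MyNet(Benchmark):
+
+    def buildModel(self):
+        model = Sequential()
+        model.add(Conv2D(32, (3, 3), input_shape=(3, 32, 32)))  # NCHW input
+        model.add(Activation("relu"))
+        model.add(Flatten())
+        model.add(Dense(self.num_classes))  # num_classes is set via the constructor
+        model.add(Activation("softmax"))    # softmax as a separate, final operator
+        return model
+
+    def data_preprocess(self):
+        (X_train, y_train), (X_test, y_test) = cifar10.load_data()
+        # Keras loads CIFAR10 as NHWC; the frontend expects fp32 NCHW
+        X_train = X_train.transpose(0, 3, 1, 2).astype("float32") / 255.0
+        X_test = X_test.transpose(0, 3, 1, 2).astype("float32") / 255.0
+        # This sketch reuses a slice of the test set as the tuning set
+        return X_train, y_train, X_test, y_test, X_test[:5000], y_test[:5000]
+
+    def trainModel(self, model, X_train, y_train, X_test, y_test):
+        model.compile(optimizer="adam", loss="categorical_crossentropy",
+                      metrics=["accuracy"])
+        model.fit(X_train, to_categorical(y_train, self.num_classes),
+                  batch_size=128, epochs=10)
+        return model
+```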
+
+## Automated Tests 
+
+`scripts/test_benchmarks.py` is an automated test script that evaluates the accuracy of each benchmark in Keras and in HPVM (after compilation with the HPVM compiler), and compares the accuracy of each binary against the known correct accuracy. Run from the root of `projects/keras/` as:
+
+```
+python scripts/test_benchmarks.py
+```
+
 
+## Supported Operations
 
+The list of supported operations and their limitations is detailed in https://gitlab.engr.illinois.edu/llvm/hpvm/-/blob/approx_hpvm_reorg_keras/hpvm/projects/keras/docs/Support.md
 
 
 
diff --git a/hpvm/projects/keras/docs/Support.md b/hpvm/projects/keras/docs/Support.md
new file mode 100644
index 0000000000000000000000000000000000000000..e5e7b1a1a2125940cd0749e9c957c43bf2205aa3
--- /dev/null
+++ b/hpvm/projects/keras/docs/Support.md
@@ -0,0 +1,40 @@
+
+## Supported Keras Operators 
+
+The Keras frontend supports `Sequential()` Keras models.
+The list of supported operations is as follows:
+
+* `Conv2D`
+* `DepthwiseConv2D`
+* `Dense`
+* `BatchNormalization`
+* `MaxPooling2D`
+* `AveragePooling2D`
+* `Flatten`
+* `Add`
+* `ZeroPadding2D`
+* `Activation` 
+   * `relu`
+   * `tanh`
+   * `softmax`
+
+
+
+## Limitations 
+
+* Currently, we support Convolutional Neural Networks (CNNs) that include the supported operators (above); RNNs/LSTMs are not supported
+* We currently only support models in NCHW format (NHWC is not supported)
+* The Softmax operator should be the last operation in the CNN pipeline 
+* The Softmax operation must be a separate operator (not specified as the activation of another Keras operator). An example of what works:
+
+```
+Activation ("softmax")
+```
+
+Example of what is NOT supported:
+
+```
+Dense(num_classes, activation="softmax")
+```
+
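+For reference, a minimal model built only from the supported operators (an illustrative fragment; assumes a channels-first/NCHW configuration):
+
+```
+model = Sequential()
+model.add(Conv2D(32, (3, 3), input_shape=(3, 32, 32)))
+model.add(Activation("relu"))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Flatten())
+model.add(Dense(10))
+model.add(Activation("softmax"))  # separate, final softmax
+```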
+
diff --git a/hpvm/projects/keras/frontend/approxhpvm_translator.py b/hpvm/projects/keras/frontend/approxhpvm_translator.py
index 59bc0d17c0605672153a542e850d98cf9d868d77..f3bc0076c6024847620174d0bd311017b8f1c6e5 100644
--- a/hpvm/projects/keras/frontend/approxhpvm_translator.py
+++ b/hpvm/projects/keras/frontend/approxhpvm_translator.py
@@ -211,7 +211,7 @@ class TensorRtTranslator:
     self.knobs_str = ""
     self.cur_height = 32    
     self.cur_width = 32     
-    self.op_count = 1       
+    self.op_count = 0       
     
     
 
@@ -521,17 +521,16 @@ class TensorRtTranslator:
       activation_type = cur_node.activation_type
       out_var_name3 = self.getVariableName(cur_node)    
 
-      inst_str = genActivationCallStr(out_var_name1, out_var_name3, activation_type)
-      self.program_str += inst_str  
-
       if activation_type == "softmax":
         print ("Softmax canNOT be part of Dense/Conv Op. Insert: Activation('softmax');")
         sys.exit(0)
+        
+      inst_str = genActivationCallStr(out_var_name1, out_var_name3, activation_type)
+      self.program_str += inst_str  
 
-      #self.json_str += activation_type + "_" + str(self.op_count) + " : 0, \n"
-      #self.op_count += 1
       self.addBaselineKnob(activation_type)
-      
+
+        
 
     if layer_type == "BatchNormalization":
       input_var_name = self.getSingleInputName(cur_node)
@@ -946,7 +945,7 @@ class TensorRtTranslator:
     f.close()
 
 
-  def dumpJsonFile(self, dir_prefix):
+  def dumpJsonFile(self, dir_prefix, weights_dir):
 
     f = open(dir_prefix + "/tuner.json", "w+")
     f.write("{ \n\n")
@@ -966,7 +965,15 @@ class TensorRtTranslator:
     
     layer_knobs_str = " \"op_knobs\" : { \n"
     layer_knobs_str += self.knobs_str[:-3]
-    layer_knobs_str += " \n\n } \n\n"
+    layer_knobs_str += " \n\n }, \n\n"
+
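+    # Top-level metadata consumed by the autotuner: baseline knob, label/config paths, and FIFO endpoints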
+    layer_knobs_str += "\"baseline_knob\" : " + baseline_knobs + ", \n"
+    labels_path = weights_dir + "/tune_labels.bin"
+    layer_knobs_str += "\"tune_labels_path\" : \"" + labels_path + "\", \n"
+    layer_knobs_str += "\"conf_path\" : \"tuner_confs.txt\", \n"
+    layer_knobs_str += "\"fifo_path_r\": \"hpvm_fifo_r\", \n"
+    layer_knobs_str += "\"fifo_path_w\": \"hpvm_fifo_w\" \n"
+    
     f.write(layer_knobs_str)
 
     f.write("\n\n}")
@@ -978,7 +985,7 @@ class TensorRtTranslator:
 
     self.add_header()
     
-    dir_path = "std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + std::string(\"" + weights_dir +  "\"); \n"
+    dir_path = "std::string dir_prefix = std::string(\"" + weights_dir +  "\"); \n"
     self.weight_str += dir_path
 
     if test_data is not None:
@@ -997,7 +1004,7 @@ class TensorRtTranslator:
 
     self.generateSourceProgram(src_dir)
     
-    self.dumpJsonFile(src_dir)
+    self.dumpJsonFile(src_dir, weights_dir)
     
 
 
@@ -1074,6 +1081,15 @@ def getUniquePath(weights_dir):
   
 
 
+def createRecursiveDir(target_dir):
+
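+  # Create each missing intermediate directory along target_dir (akin to os.makedirs)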
+  toks = target_dir.split("/")
+  for i in range(len(toks)):
+    path_str = "/".join(toks[0:i+1])
+    if not os.path.exists(path_str):
+      os.mkdir(path_str)
+  
+
 
 #***** Top level External Function ******* 
 def translate_to_approxhpvm(model,
@@ -1088,10 +1104,11 @@ def translate_to_approxhpvm(model,
 
   if not reload_weights:
     weights_dir = getUniquePath(weights_dir)
-    os.mkdir(weights_dir)   
+    createRecursiveDir(weights_dir)
+    
 
   src_dir = getUniquePath(src_dir)
-  os.mkdir(src_dir)   
+  createRecursiveDir(src_dir)
     
   dfg = DFG()    
   for i in range(len(model.layers)):
@@ -1117,6 +1134,11 @@ def translate_to_approxhpvm(model,
   hpvmTranslator = HPVMTranslator(dfg, weight_str, input_str, filter_names)    
   hpvmTranslator.translate(model, src_dir, test_data, tuner_data, batch_size)
 
+  promiseTranslator = PromiseRtTranslator(dfg, weight_str)
+  promiseTranslator.translate(model, src_dir, test_data)
+
+  
+  
   if reload_weights:
     print ("NOTE: Using existing pretrained weights \n")
   else:
@@ -1127,5 +1149,5 @@ def translate_to_approxhpvm(model,
   print ("-- ApproxHPVM src  : ", src_dir + "approxhpvm_src.cc")
 
   
-  return weights_dir
+  return src_dir
 
diff --git a/hpvm/projects/keras/frontend/hpvm_dfg_translator.py b/hpvm/projects/keras/frontend/hpvm_dfg_translator.py
index 2c229a0dd2b100cf83882f5640c1d2707c78398d..d14c64bbf406e991b0b623fa3b56f938153e7fe8 100644
--- a/hpvm/projects/keras/frontend/hpvm_dfg_translator.py
+++ b/hpvm/projects/keras/frontend/hpvm_dfg_translator.py
@@ -1,4 +1,5 @@
 
+import os
 import sys
 from frontend.utils import *
 from frontend.hpvm_intrinsics import *
@@ -17,6 +18,7 @@ class HPVMTranslator:
     self.root_str = ""
     self.root_struct_str = ""
     self.main_func_str = ""
+    self.tuner_main_func_str = ""
     self.file_header_str = ""
     self.hpvm_node_names = {}
     
@@ -554,8 +556,8 @@ class HPVMTranslator:
     headers += "#include <cstring> \n"
     
     headers += "#include <" + HPVM_header +  "> \n"
-    if LLVM_9_BRANCH:
-       headers += "#include \"config.h\" \n"
+    #if LLVM_9_BRANCH:
+    #   headers += "#include \"config.h\" \n"
     
     headers += "#include <tensorTypes.h> \n"
     headers += "#include <tensorUtils.h> \n\n"
@@ -626,12 +628,12 @@ class HPVMTranslator:
 
 
 
-  def genBatchLoop(self, test_data, batch_size):
+  def genBatchLoop(self, data_shape, batch_size):
 
-    chans = test_data.shape[1]
-    width = test_data.shape[2]
-    height = test_data.shape[3]    
-    test_input_size = test_data.shape[0]
+    chans = data_shape[1]
+    width = data_shape[2]
+    height = data_shape[3]    
+    test_input_size = data_shape[0]
 
     func_str = "unsigned int batch_size = " + str(batch_size) + "; \n"
     func_str += "unsigned int test_input_size = " +  str(test_input_size) +  "; \n"
@@ -645,8 +647,17 @@ class HPVMTranslator:
 
     func_str += "unsigned int start = i * batch_size; \n"
     func_str += "unsigned int end = (i + 1) * batch_size;  \n"
-   
-    func_str += "void* input = readInputBatch(input_path.c_str(), 0, start, end," + str(chans) + "," + str(width) + "," + str(height) +  ");  \n\n"
+
+    return func_str 
+    
+
+  def genBatchInput(self, data_shape, input_pth):
+
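+    # Emits the readInputBatch call for one batch; input_pth names a std::string variable in the generated C++ source (e.g. "input_path")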
+    chans = data_shape[1]
+    width = data_shape[2]
+    height = data_shape[3]    
+
+    func_str = "void* input = readInputBatch(" + input_pth + ".c_str(), 0, start, end," + str(chans) + "," + str(width) + "," + str(height) +  ");  \n\n"
    
     func_str += "args->input = input;  \n"
     func_str += "args->input_bytes = 0; \n\n"
@@ -664,7 +675,7 @@ class HPVMTranslator:
 
     return func_str
 
-  # FIXIT
+  
   def handleTuneTestData(self):
 
     input_str = "void* input = test_input; \n"
@@ -695,7 +706,8 @@ class HPVMTranslator:
        main_func_str += "args->" + f_name + " = " + f_name + "; \n"
        main_func_str += "args->" + f_name + "_bytes = 0; \n"       
     
-     main_func_str += self.genBatchLoop(test_data, batch_size)
+     main_func_str += self.genBatchLoop(test_data.shape, batch_size)
+     main_func_str += self.genBatchInput(test_data.shape, "input_path")
     
      main_func_str += "void* dfg = " + HPVM_launch + "(0, root, (void*) args); \n\n"
      main_func_str += HPVM_wait + "(dfg); \n\n"
@@ -709,20 +721,127 @@ class HPVMTranslator:
      main_func_str += "llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); \n"
   
      main_func_str += self.endBatchLoop()
-
      main_func_str += HPVM_cleanup + "(); \n "
   
-     ####main_func_str += "computeAccuracy3(labels, result); \n"    
      main_func_str += "return 0; \n\n"
      main_func_str += "} \n"    
     
      self.main_func_str += main_func_str
 
 
+
+  def genTunerMainFunction(self, src_dir, test_data, batch_size):    
+
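+     # Emits an alternate main() for use with the autotuner: it blocks in fifo_wait(),
+     # runs the requested test or tune inputs, and streams batch results back over a FIFO.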
+     tuner_main_func_str = "int main(int argc, char* argv[]){ \n\n"
+     tuner_main_func_str += self.weight_str
+     tuner_main_func_str += self.input_str
+     tuner_main_func_str += "RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); \n\n"
+
+     tuner_main_func_str += self.handleTuneTestData()  
+ 
+     for f_name in self.filter_names:    
+       tuner_main_func_str += "args->" + f_name + " = " + f_name + "; \n"
+       tuner_main_func_str += "args->" + f_name + "_bytes = 0; \n"       
+
+     tuner_main_func_str += "\nint ret = 0; \n"
+     tuner_main_func_str += "while ((ret = fifo_wait())) { \n"
+     tuner_main_func_str += "\n" + HPVM_init + "(); \n\n"
+     tuner_main_func_str += "std::string input_pth = (ret == 1 ? test_input_path : tune_input_path); \n"
+     tuner_main_func_str += "std::string labels_pth = (ret == 1 ? test_labels_path : tune_labels_path); \n"
+
+     abs_src_path = str(os.getcwd()) + "/" + src_dir 
+     tuner_main_func_str += "auto* fp = open_fifo(\"" + abs_src_path + "/hpvm_fifo_w\", \"wb\"); \n\n"
+     tuner_main_func_str += "float total_accuracy = 0; \n"
+     
+     tuner_main_func_str += self.genBatchLoop(test_data.shape, batch_size)
+     tuner_main_func_str += self.genBatchInput(test_data.shape, "input_pth")
+
+     tuner_main_func_str += "void* dfg = " + HPVM_launch + "(0, root, (void*) args); \n\n"
+     tuner_main_func_str += HPVM_wait + "(dfg); \n\n"
+
+     if LLVM_4_BRANCH:
+       tuner_main_func_str += "void *result = static_cast<RootIn*>(args)->input; \n"
+     elif LLVM_9_BRANCH:
+       tuner_main_func_str += "void *result = static_cast<RootIn *>(args)->r.tensor; \n"
+    
+     tuner_main_func_str += "hpvm_request_tensor(result, 0); \n\n"
+     tuner_main_func_str += "uint32_t* labels = readLabelsBatch3(labels_pth.c_str(), start, end); \n"
+     tuner_main_func_str += "total_accuracy += computeAccuracy3(labels, result) * batch_size  ; \n"
+
+     tuner_main_func_str += "\nfifo_write_batch(fp, result); \n"     
+
+     tuner_main_func_str += self.endBatchLoop()
+
+     tuner_main_func_str += "write_accuracy(total_accuracy / test_input_size); \n"
+     tuner_main_func_str += "fclose(fp); \n"
+     tuner_main_func_str += HPVM_cleanup + "(); \n "
+
+     tuner_main_func_str += "\n}\n\n"  # End of FIFO loop
   
+     tuner_main_func_str += "return 0; \n\n"
+     tuner_main_func_str += "} \n"    
     
+     self.tuner_main_func_str += tuner_main_func_str
+     
+
+  def addFIFORoutines(self, src_dir):
 
-  def generateSourceProgram(self, dir_prefix):
+    abs_src_dir = str(os.getcwd()) + "/" + src_dir 
+
+    FIFO_str = """
+   
+ FILE *open_fifo(const char *path, const char *mode) { 
+  auto* fd = fopen(path, mode);
+  if (!fd) {
+    std::cerr << \"Error opening FIFO file: \" << strerror(errno);
+    abort(); 
+  }
+
+   return fd;
+}
+
+
+int fifo_wait() {
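+    // Blocks until the autotuner writes a command into the read FIFO:
+    // returns 1 for \"test\", 2 for \"tune\", and 0 for \"stop\".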
+    auto* fp = open_fifo(\"""" + abs_src_dir + """/hpvm_fifo_r\", \"r\");
+    const int maxn = 100;
+    char linebuf[maxn];
+    fgets(linebuf, maxn, fp);
+    fclose(fp);
+    std::string line(linebuf);
+    if (line == \"test\")
+      return 1;
+    if (line == \"tune\")
+      return 2;
+    if (line == \"stop\")
+      return 0;
+    std::cerr << \"Invalid fifo file content \" << line;
+    abort();
+}
+
+void fifo_write_batch(FILE *fp, void *output_ptr) {
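+    // Serializes one output tensor: rank, then dimension sizes, then raw host data.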
+    auto *output = (Tensor *) output_ptr;
+    const auto &dim = output->dims;
+    size_t num_dims = dim.num_dims;
+    fwrite(&num_dims, sizeof(size_t), 1, fp);
+    fwrite(dim.dim_sizes, sizeof(size_t), dim.num_dims, fp);
+    fwrite(output->host_data, 1, output->size_in_bytes, fp);
+}
+
+
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
+
+
+
+"""
+
+    return FIFO_str
+  
+
+
+  def generateTestProgram(self, dir_prefix):
     
     program_str = self.file_header_str + self.node_str + self.root_str
     program_str += self.root_struct_str + self.main_func_str
@@ -733,19 +852,38 @@ class HPVMTranslator:
     f.write(program_str)
     f.close()
 
+
+
+  def generateTunerProgram(self, dir_prefix, FIFO_str):
+    
+    program_str = self.file_header_str + FIFO_str + self.node_str + self.root_str
+    program_str += self.root_struct_str + self.tuner_main_func_str
+
+    DEBUG (program_str)
+    
+    f = open(dir_prefix + "/approxhpvm_tuner_src.cc", "w+")
+    f.write(program_str)
+    f.close()
+
     
   
   def translate(self, model, src_dir, test_data, tuner_data, batch_size):
 
     self.genFileHeader()
+    
     self.genRootNodeHeader()
     self.genRootStructure()
+    
     self.codegen(self.dfg)
     self.genRootNodeFooter()
+    
     self.genMainFunction(test_data, batch_size)
+    self.genTunerMainFunction(src_dir, test_data, batch_size)
 
     # dump generated program string to source file
-    self.generateSourceProgram(src_dir)
-    
+    self.generateTestProgram(src_dir)
+
+    FIFO_str = self.addFIFORoutines(src_dir)
+    self.generateTunerProgram(src_dir, FIFO_str)
   
 
diff --git a/hpvm/projects/keras/frontend/knobs.py b/hpvm/projects/keras/frontend/knobs.py
index 291221acb544dbcdf88c810b9401356d2da91be7..942df9021cacdffd21950db9ce98bce80957571a 100644
--- a/hpvm/projects/keras/frontend/knobs.py
+++ b/hpvm/projects/keras/frontend/knobs.py
@@ -31,8 +31,8 @@ knobs_speedups[268] = 2
 knobs_speedups[269] = 2
 
 
-conv_knobs = "12, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 261, 262, 263, 264, 265, 266, 267, 268, 269"
+conv_knobs = "\"12\", \"151\", \"152\", \"153\", \"154\", \"155\", \"156\", \"157\", \"158\", \"159\", \"160\", \"161\", \"162\", \"163\", \"164\", \"165\", \"166\", \"167\", \"168\", \"261\", \"262\", \"263\", \"264\", \"265\", \"266\", \"267\", \"268\", \"269\""
 
-baseline_knobs = "12"
+baseline_knobs = "\"12\""
 
 
diff --git a/hpvm/projects/keras/frontend/promise_translator.py b/hpvm/projects/keras/frontend/promise_translator.py
index 30acf47cd8aeedaaae8ee1ba0fda637a7d931940..015c1c562a0be295b9e3df0381f582e454192f16 100644
--- a/hpvm/projects/keras/frontend/promise_translator.py
+++ b/hpvm/projects/keras/frontend/promise_translator.py
@@ -185,6 +185,7 @@ class PromiseRtTranslator:
     self.quant_ranges = {}
     # Used to generate PromiseSim Info
     self.layer_str = ""
+    self.cur_layer_id = 1
     self.layer_size_str = "" 
     self.layer_input_sizes = {}
     self.unique_op_types = {}
@@ -238,9 +239,7 @@ class PromiseRtTranslator:
       
     first_op = state.getFirstOp()
     layer_name = first_op.layer_name
-
-    #print("** layer_name = ", layer_name)    
-          
+        
     unique_id = 0
     if promise_layer_type not in self.unique_op_types:
       self.unique_op_types[promise_layer_type] = 1
@@ -261,7 +260,6 @@ class PromiseRtTranslator:
     
     weights_shape = central_op.weights.shape
     input_size = self.layer_input_sizes[layer_name]  
-    #print ("layer_name = ", layer_name, " input_size = ", input_size)
     N = self.batch_size
     C = input_size[1]
 
@@ -294,41 +292,48 @@ class PromiseRtTranslator:
 
   def appendLayerString(self, promise_layer_type, state):
 
-    layer_str = ""
+    
+    layer_str = str(self.cur_layer_id) + " gpu "
+    self.cur_layer_id += 1
+    
     for op in state.ops:
       op_type = op.layer_type    
       if op_type == "Conv2D":
-        layer_str += "conv  "
+        layer_str += "conv fp32 1 "
         if op.use_bias:
-          layer_str += "add  "
+          layer_str += "add fp32 1 "
         if op.activation_type != "linear":
-          layer_str += "activation  "
+          layer_str += op.activation_type + " fp32 1 "
 
       if op_type == "DepthwiseConv2D":
-        layer_str += "depthwise_conv  "
+        layer_str += "group_conv fp32 1 "
         if op.use_bias:
-          layer_str += "add  "
+          layer_str += "add fp32 1 "
         if op.activation_type != "linear":
-          layer_str += "activation  "
+          layer_str += op.activation_type + " fp32 1 "
 
       if op_type == "BatchNormalization":
-        layer_str += "batchnorm  "
+        layer_str += "batchnorm fp32 1 "
           
       if op_type == "Dense":
-        layer_str += "dense  "
+        layer_str += "mul fp32 1 "
         if op.use_bias:
-          layer_str += "add  "
+          layer_str += "add fp32 1 "
         if op.activation_type != "linear":
-          layer_str += "activation  "
-      
-      if "Pooling" in op_type:
-        layer_str += "pool  "
+          layer_str += op.activation_type + " fp32 1 "
+
+          
+      if op_type == "MaxPooling2D":
+        layer_str += "pool_max fp32 1 "
+
+      if op_type == "AveragePooling2D":
+        layer_str += "pool_mean fp32 1 "
       
       if op_type == "Add":    
-        layer_str += "add  "
+        layer_str += "add fp32 1 "
 
       if op_type == "Activation":
-        layer_str += "activation  "
+        layer_str += op.activation_type + " fp32 1 "
 
     layer_str += "\n"
 
@@ -355,13 +360,10 @@ class PromiseRtTranslator:
   # Retrieve input name of the previous layer
   def getInputLayerName(self, cur_node):
 
-    #print (cur_node.layer_name)
     # Assumption: If no inputs, the previous layer must be input layer
     if len(cur_node.inputs) == 0:
       return "input"
 
-    #print ("Input_type = ", cur_node.inputs[0].layer_type)
-
     pred_layer_type = cur_node.inputs[0].layer_type
     # FIXME: Assuming the 'inference' phase - hence skipping Dropout
     #if pred_layer_type == "Flatten" or pred_layer_type == "Dropout":
@@ -381,7 +383,6 @@ class PromiseRtTranslator:
   # Retrieve input name of the previous layer
   def getSingleInputName(self, cur_node):
 
-    #print (cur_node.layer_name)
     # Assumption: If no inputs, the previous layer must be input layer
     if len(cur_node.inputs) == 0:
       return "input"
@@ -396,7 +397,6 @@ class PromiseRtTranslator:
   
     # get input to the layer
     input_node_name = cur_node.inputs[0].layer_name  # get the input layer ID
-
     
     input_var_name = ""
     if input_node_name in self.output_map:
@@ -450,12 +450,6 @@ class PromiseRtTranslator:
 
     weights = cur_node.weights
 
-    #min_val = np.amin(weights)
-    #max_val = np.amax(weights)
-
-    #min_val = np.percentile(weights, 0.5)
-    #max_val = np.percentile(weights, 99.5)
-
     (min_val, max_val) = get_best_quant_range(weights)
     
     
@@ -488,8 +482,6 @@ class PromiseRtTranslator:
     prev_layer_name = self.getInputLayerName(first_op)
     cur_layer_name = last_op.layer_name
 
-    # print ("prev_layer_name ", prev_layer_name , " cur_layer_name = ", cur_layer_name)
-
     if prev_layer_name not in self.quant_ranges or cur_layer_name not in self.quant_ranges:
       print ("ERROR: Layer_name = ", prev_layer_name ," or ", cur_layer_name, " not found in quant_range")
       sys.exit(0)
@@ -506,8 +498,6 @@ class PromiseRtTranslator:
     
   def genDenseLayer(self, state):
     
-    print ("\n\n Layer = ", state.op_string, "\n\n")
-
     first_op = state.getFirstOp()
     dense_op = state.getDenseOp()
     last_op = state.getLastOp()
@@ -520,40 +510,16 @@ class PromiseRtTranslator:
     b_min, b_max = self.getBiasRange(dense_op)   
     
     activation_id = state.getActivationID()
-
-    # NOTE: retrieve the quantization ranges for inputs and ouputs
-    input_quant_range, output_quant_range = self.getQuantRange(state)
-    
-    promise_layer_str = "void* " + output_var + " = FCLayer_PROMISE(" + input_var + ", "
-    promise_layer_str += str(input_quant_range[0]) + ", "  + str(input_quant_range[1]) + ", "
-    promise_layer_str += w_name + ", " + str(w_min) + ", " + str(w_max) + ", "
-    promise_layer_str += b_name + ", " + str(b_min) + ", " + str(b_max) + ", "
-    promise_layer_str += str(activation_id) + ", "
-    promise_layer_str += str(output_quant_range[0]) + ", "  + str(output_quant_range[1]) + ", "
-    promise_layer_str += str(self.swing_value) 
-    promise_layer_str += "); \n"
-    
-    print (promise_layer_str)
-
-    self.program_str += promise_layer_str
-
     
     self.appendLayerString("FC", state)
     
     state.clear()
 
 
-    # NOTE: This dumps quantization range files needed for HPVM wrapper backend
-    dumpQuantizeRanges(self.weights_dir, input_quant_range[0], input_quant_range[1],\
-                       w_min, w_max, b_min, b_max, \
-                       output_quant_range[0], output_quant_range[1])
-
 
     
   def genConvLayer(self, state):
     
-    print ("\n\n Layer = ", state.op_string, "\n\n")
-
     first_op = state.getFirstOp()
     conv_op = state.getConvOp()
     last_op = state.getLastOp()
@@ -570,41 +536,15 @@ class PromiseRtTranslator:
     pool_id, pool_size = state.getPoolInfo()
     strides = state.getStrides()
 
-    # NOTE: retrieve the quantization ranges for inputs and ouputs
-    input_quant_range, output_quant_range = self.getQuantRange(state)
-
-    # NOTE: Assuming symmetric K*K pool size
-    promise_layer_str = "void* " + output_var + " = ConvLayer_PROMISE(" + input_var + ", "
-    promise_layer_str += str(input_quant_range[0]) + ", "  + str(input_quant_range[1]) + ", "
-    promise_layer_str += w_name + ", " + str(w_min) + ", " + str(w_max) + ", "
-    promise_layer_str += b_name + ", " + str(b_min) + ", " + str(b_max) + ", "
-    promise_layer_str += str(padding) + ", " + str(padding) + ", "
-    promise_layer_str += str(strides[0]) + ", " + str(strides[1]) + ", "
-    promise_layer_str += str(pool_id) + ", " + str(pool_size[0]) + ", "
-    promise_layer_str += str(activation_id) + ", "
-    promise_layer_str += str(output_quant_range[0]) + ", "  + str(output_quant_range[1]) + ", "    
-    promise_layer_str += str(self.swing_value) 
-    promise_layer_str += "); \n"
-
-    print (promise_layer_str)
-    
-    self.program_str += promise_layer_str
-
     self.appendLayerString("Conv", state)
 
     state.clear()
 
-
-    # NOTE: This dumps quantization range files needed for HPVM wrapper backend
-    dumpQuantizeRanges(self.weights_dir, input_quant_range[0], input_quant_range[1],\
-                       w_min, w_max, b_min, b_max, \
-                       output_quant_range[0], output_quant_range[1])
     
 
 
   def genDepthwiseConvLayer(self, state):
-    print ("\n\n Layer = ", state.op_string, "\n\n")
-
+  
     conv_op = state.getDepthwiseConvOp()
     first_op = state.getFirstOp()
     last_op = state.getLastOp()
@@ -618,44 +558,7 @@ class PromiseRtTranslator:
     padding = state.getPadding()
     pool_id, pool_size = state.getPoolInfo()
     strides = state.getStrides()
-
-    promise_layer_str = "void* " + output_var + " = "
-    promise_layer_str += "tensorConvolution(" + input_var + ", "
-    promise_layer_str += w_name  + ", "
-    promise_layer_str += str(padding) + ", "
-    promise_layer_str += str(padding) + ", "
-    promise_layer_str += str(strides[0]) + ", "
-    promise_layer_str += str(strides[1]) + ", "
-    promise_layer_str += "1, "
-
-    C = conv_op.weights.shape[2]
-    promise_layer_str += str(C) + "); \n"
-
-    # FIX: ADD code for TensorAdd and ACTIVATION
-    # TODO: ADD code for TensorAdd and ACTIVATION
-
-    input_var = output_var
-    if nodeHasBias(conv_op):
-      output_var2 = self.getVariableName(conv_op)    
-      promise_layer_str += "void* " + output_var2 + " = "
-      promise_layer_str += "tensorAdd(" + input_var + ", "
-      promise_layer_str += conv_op.layer_name + "_b"
-      promise_layer_str += "); \n"
-
-      # Update variable that holds input for next operation
-      input_var = output_var2
-
-
-    if nodeHasActivation(conv_op):
-      activation_type = conv_op.activation_type
-      output_var = self.getVariableName(conv_op)    
-      promise_layer_str += genActivationCallStr(input_var, output_var, activation_type)  
-
-      
-    print (promise_layer_str)    
-    self.program_str += promise_layer_str
-
-       
+    
     self.appendLayerString("DepthwiseConv", state)
 
     state.clear()
@@ -670,17 +573,6 @@ class PromiseRtTranslator:
     input_var = self.getSingleInputName(first_op)
     output_var = self.getVariableName(last_op)
 
-    promise_layer_str = "void* " + output_var + " = "
-    promise_layer_str += "tensorBatchNorm(" + input_var + ", "
-    promise_layer_str += first_op.layer_name + "_gamma, "
-    promise_layer_str += first_op.layer_name + "_beta, "
-    promise_layer_str += first_op.layer_name + "_mean, "
-    promise_layer_str += first_op.layer_name + "_variance, "
-    promise_layer_str += str(first_op.epsilon)
-    promise_layer_str += "); \n"
-
-    self.program_str += promise_layer_str
-
     self.appendLayerString("BatchNorm", state)
 
     state.clear()
@@ -689,25 +581,17 @@ class PromiseRtTranslator:
     
 
   def genSoftmaxLayer(self, state):
-    print ("\n\n Layer = ", state.op_string, "\n\n")
-
+  
     first_op = state.getFirstOp()
     last_op = state.getLastOp()
 
-    input_var = self.getSingleInputName(first_op)
-    output_var = self.getVariableName(last_op)
-    
-    promise_layer_str = "void* " + output_var + " = tensorSoftmax(" + input_var + "); \n"
-    print (promise_layer_str)
-
-    self.program_str += promise_layer_str
+    self.layer_str += str(self.cur_layer_id) + " gpu softmax fp32 1\n"  
     
     state.clear()
 
 
   def genAddLayer(self, state):
-    print ("\n\n Layer = ", state.op_string, "\n\n")
-
+  
     first_op = state.getFirstOp()
     last_op = state.getLastOp()
 
@@ -729,8 +613,7 @@ class PromiseRtTranslator:
     
     
   def genActivationLayer(self, state):
-    print ("\n\n Layer = ", state.op_string, "\n\n")
-
+  
     first_op = state.getFirstOp()
     input_var = self.getSingleInputName(first_op)
     output_var = self.getVariableName(first_op)
@@ -757,8 +640,7 @@ class PromiseRtTranslator:
     
   # FIXME: Only supporting single AveragePooling layers
   def genPoolLayer(self, state):
-    print ("\n\n Layer = ", state.op_string, "\n\n")
-
+  
     # For single pool layer should be all same
     pool_op = state.getPoolOp()
 
@@ -830,7 +712,6 @@ class PromiseRtTranslator:
       return  
 
     layer_name = cur_node.layer_name
-    print (layer_name)
     self.visited_nodes[layer_name] = True
 
     self.genPreviousLayer(state)
@@ -847,7 +728,6 @@ class PromiseRtTranslator:
       return  
 
     layer_name = cur_node.layer_name
-    print ("handle_conv", layer_name)
     self.visited_nodes[layer_name] = True
 
     self.genPreviousLayer(state)
@@ -864,7 +744,6 @@ class PromiseRtTranslator:
       return  
 
     layer_name = cur_node.layer_name
-    print ("handle_depthwise_conv", layer_name)
     self.visited_nodes[layer_name] = True
 
     self.genPreviousLayer(state)
@@ -881,7 +760,7 @@ class PromiseRtTranslator:
       return  
 
     layer_name = cur_node.layer_name
-    print ("handle_batchnorm", layer_name)
+    #print ("handle_batchnorm", layer_name)
     self.visited_nodes[layer_name] = True
 
     self.genPreviousLayer(state)
@@ -901,7 +780,6 @@ class PromiseRtTranslator:
       return
 
     layer_name = cur_node.layer_name
-    print (layer_name)
     self.visited_nodes[layer_name] = True
 
     self.genPreviousLayer(state)
@@ -920,7 +798,6 @@ class PromiseRtTranslator:
       return
 
     layer_name = cur_node.layer_name
-    print (layer_name)
     self.visited_nodes[layer_name] = True
 
     # NOTE: If end of DNN
@@ -942,7 +819,6 @@ class PromiseRtTranslator:
       return
 
     layer_name = cur_node.layer_name
-    print (layer_name)
     self.visited_nodes[layer_name] = True
 
     self.genPreviousLayer(state)
@@ -957,7 +833,6 @@ class PromiseRtTranslator:
       return
   
     layer_name = cur_node.layer_name
-    print (layer_name)
     self.visited_nodes[layer_name] = True
 
     layer_type = cur_node.layer_type
@@ -978,7 +853,6 @@ class PromiseRtTranslator:
   def handleLayers(self, output_node, state):
 
     layer_type = output_node.layer_type
-    #print ("layer_type", layer_type)
 
     if layer_type == "ZeroPadding2D":
       self.handle_padding(output_node, state)
@@ -1006,7 +880,7 @@ class PromiseRtTranslator:
         
     if(self.isForwardLayer(layer_type)):
       layer_name = output_node.layer_name
-      print ("NOTE: Skippping = ", layer_name)
+      #print ("NOTE: Skippping = ", layer_name)
       self.visited_nodes[layer_name] = True
       self.traverseSuccessors(output_node, state)   
 
@@ -1036,7 +910,6 @@ class PromiseRtTranslator:
       layer_it += 1
 
     batch_size = 1000
-    #batch_size = len(x_test)
     input_size = len(x_test)
     num_batches = input_size // batch_size 
 
@@ -1048,8 +921,6 @@ class PromiseRtTranslator:
       
       start = i * batch_size
       end = (i + 1) * batch_size
-
-      print ("start = ", start, " end = , ", end)
       
       # Inference over test set
       layer_outs = functor([x_test[start:end], 1.])
@@ -1062,11 +933,9 @@ class PromiseRtTranslator:
       ind = 0
       for layer_out in layer_outs:
         layer_name = model.layers[ind].name
-        print ("layer_name = ", layer_name)
     
-        (min_val, max_val) = get_best_quant_range(layer_out)
-      
-        print ("min_val = ", min_val, " max_val = ", max_val)
+        (min_val, max_val) = get_best_quant_range(layer_out)    
+        #print ("min_val = ", min_val, " max_val = ", max_val)
 
         layer_ranges[layer_name].append((min_val, max_val))
         #self.quant_ranges[layer_name] = (min_val, max_val)
@@ -1094,7 +963,7 @@ class PromiseRtTranslator:
         
       self.quant_ranges[layer_name] = (min_val, max_val)    
 
-      print ("---- min = ", min_val, "  max = ", max_val, " ----- \n\n")
+      #print ("---- min = ", min_val, "  max = ", max_val, " ----- \n\n")
 
       ind += 1
 
@@ -1111,8 +980,8 @@ class PromiseRtTranslator:
         continue
 
       layer_name = layer.name
-      print ("layer_name = ", layer_name)
-      print ("layer_shape = ", layer.input.shape)
+      #print ("layer_name = ", layer_name)
+      #print ("layer_shape = ", layer.input.shape)
       self.layer_input_sizes[layer_name] = layer.input.shape
 
 
@@ -1163,18 +1032,14 @@ class PromiseRtTranslator:
   def endBatchLoop(self):
 
     end_loop_str = ""
-    #end_loop_str += "\nuint8_t* labels = readLabelsBatch2(labels_path.c_str(),start,end); \n"
-    #end_loop_str += "\nuint32_t* labels = readLabelsBatch2(labels_path.c_str(),start,end); \n"
     end_loop_str += "\nuint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); \n"
 
     
     last_node = self.dfg.last_node
     output_var = self.output_map[last_node.layer_name]
-    #accuracy_call = "\nfloat accuracy = computeAccuracy2(labels, batch_size, " + output_var + "); \n"
     accuracy_call = "\nfloat accuracy = computeAccuracy3(labels, " + output_var + "); \n"
     end_loop_str += accuracy_call
  
-    #end_loop_str += "float accuracy = computeAccuracy2(labels, batch_size, var_60); "
     end_loop_str += "final_accuracy += accuracy; \n"
     end_loop_str += "freeBatchMemory(); \n "
     end_loop_str += "\n}\n\n"
@@ -1220,9 +1085,7 @@ class PromiseRtTranslator:
     if test_data is not None and self.dfg.last_node is not None:
       last_node = self.dfg.last_node
       output_var = self.output_map[last_node.layer_name]
-      #accuracy_call = "\ncomputeAccuracy2(labels," + str(len(test_data)) + "," + output_var + "); \n"
-      #footer_str += accuracy_call
-
+ 
     accuracy_call =  "\ndumpExecutionAccuracies(); \n"
     footer_str += accuracy_call
     
@@ -1238,14 +1101,20 @@ class PromiseRtTranslator:
 
   def dumpLayerStr(self, dir_prefix):
 
-    f = open(dir_prefix + "/layer_composition.txt", "w+")
-    f.write(self.layer_str)
-    f.close()
+    config_str = "0\n"
+    config_str += "+++++\n"
+    config_str += "conf1 1 1 100 0\n"
+    config_str += self.layer_str
+    config_str += "-----"
 
-    f = open(dir_prefix + "/layers.txt", "w+")
-    f.write(self.layer_size_str)
+    f = open(dir_prefix + "/tuner_confs.txt", "w+")
+    f.write(config_str)
     f.close()
 
+    #f = open(dir_prefix + "/layers.txt", "w+")
+    #f.write(self.layer_size_str)
+    #f.close()
+
     
       
   def dumpProgramString(self, final_str, dir_prefix):
@@ -1258,8 +1127,6 @@ class PromiseRtTranslator:
     
   def generateSourceProgram(self, weights_dir, x_test):
 
-    print(self.program_str)
-    
     final_str = ""
     header_str = self.genHeader()
     final_str += header_str
@@ -1270,7 +1137,6 @@ class PromiseRtTranslator:
     loop_str = self.genBatchLoop(x_test)
     final_str += loop_str
     
-    #final_str += "\n\n" + self.weight_str + "\n\n"
     final_str += self.program_str
 
     end_loop_str = self.endBatchLoop()
@@ -1281,7 +1147,7 @@ class PromiseRtTranslator:
 
     footer_str = self.genFooter(x_test)
     final_str += footer_str    
-    print (final_str)
+    #print (final_str)
     
     self.dumpProgramString(final_str, weights_dir)
     
@@ -1291,7 +1157,7 @@ class PromiseRtTranslator:
     
   def translate(self, model, weights_dir, x_test):
 
-    print ("\n\n\n **** PromiseRT Translator ****** \n\n\n")
+    #print ("\n\n\n **** PromiseRT Translator ****** \n\n\n")
     root_node = self.dfg.root_node
     state = State()
 
@@ -1299,13 +1165,10 @@ class PromiseRtTranslator:
     
     self.findLayerInputSizes(model, x_test)
     
-    self.findQuantizeRanges(model, x_test)
-    
     self.handleLayers(root_node, state)
 
-    print ("\n *** Generated PROMISE Layers **** \n ")
-    
-    self.generateSourceProgram(weights_dir, x_test)
+    # Commented out Promise code-gen - Not needed in this release version
+    #self.generateSourceProgram(weights_dir, x_test)
 
     self.dumpLayerStr(weights_dir)
 
diff --git a/hpvm/projects/keras/scripts/test_benchmarks.py b/hpvm/projects/keras/scripts/test_benchmarks.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ac145b9dd0c0a7af0b6bbb9618eb41bc085cd6c
--- /dev/null
+++ b/hpvm/projects/keras/scripts/test_benchmarks.py
@@ -0,0 +1,189 @@
+
+
+import os
+import subprocess
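+
+# Usage (a sketch; run from the Keras project root so the src/*.py paths below resolve):
+#   python test_benchmarks.py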
+
+class Benchmark:
+
+    def __init__(self, binary_path, test_accuracy):
+
+        self.binary_path = binary_path
+        self.test_accuracy = test_accuracy
+        self.epsilon = 0.05 # Adding some slack for accuracy difference
+
+
+    def getPath(self):
+        return self.binary_path
+
+    
+    def readAccuracy(self, accuracy_file):
+
+        # File with final benchmark accuracy
+        with open(accuracy_file, "r") as f:
+            return float(f.read())
+    
+        
+    def runKeras(self):
+
+        # Test benchmark accuracy with pretrained weights (hpvm_reload)
+        run_cmd = "python " + self.binary_path + " hpvm_reload "
+        try:
+            subprocess.call(run_cmd, shell=True)
+        except:
+            return False
+
+        accuracy = self.readAccuracy("final_accuracy")
+
+        print ("accuracy = ", accuracy, " test_accuracy = ", self.test_accuracy) 
+
+        test_success = False
+        if (abs(self.test_accuracy - accuracy) < self.epsilon):
+            print ("Test for " + self. binary_path + " Passed ")
+            test_success = True
+        else:
+            print ("Test Failed for " + self.binary_path)
+            test_success = False
+
+        return test_success
+
+
+    def runHPVM(self):
+
+        # Test benchmark accuracy with pretrained weights (hpvm_reload)
+        run_cmd = "python " + self.binary_path + " hpvm_reload frontend compile"
+        try:
+            subprocess.call(run_cmd, shell=True)
+        except:
+            return False
+
+        with open("working_dir.txt") as f:
+            working_dir = f.read()
+        cur_dir = os.getcwd()
+
+        os.chdir(working_dir)
+        binary_path = "./HPVM_binary"
+        
+        try:
+            subprocess.call(binary_path, shell=True)
+        except:
+            return False
+        
+        accuracy = self.readAccuracy("final_accuracy")
+        print ("accuracy = ", accuracy, " test_accuracy = ", self.test_accuracy) 
+
+        test_success = False
+        if (abs(self.test_accuracy - accuracy) < self.epsilon):
+            print ("Test for " + self. binary_path + " Passed ")
+            test_success = True
+        else:
+            print ("Test Failed for " + self.binary_path)
+            test_success = False
+
+        os.chdir(cur_dir)  # Change back to original working directory
+        
+        return test_success
+ 
+        
+
+class BenchmarkTests:
+
+    def __init__(self):
+
+        self.benchmarks = []
+        self.passed_tests = []
+        self.failed_tests = []
+        self.passed_hpvm_tests = []
+        self.failed_hpvm_tests = []
+
+
+    def addBenchmark(self, benchmark):
+
+        self.benchmarks.append(benchmark)
+
+
+    def runKerasTests(self):
+
+        for benchmark in self.benchmarks:
+            test_success = benchmark.runKeras()
+
+            if not test_success:
+                self.failed_tests.append(benchmark.getPath())
+            else:
+                self.passed_tests.append(benchmark.getPath())
+
+
+    def runHPVMTests(self):
+
+        for benchmark in self.benchmarks:
+            test_success = benchmark.runHPVM()
+
+            if not test_success:
+                self.failed_hpvm_tests.append(benchmark.getPath())
+            else:
+                self.passed_hpvm_tests.append(benchmark.getPath())
+
+                
+    def printKerasSummary(self):
+
+        failed_test_count = len(self.failed_tests)
+        passed_test_count = len(self.passed_tests)
+        
+        print (" Tests Passed  = " + str(passed_test_count) + " / " + str(len(self.benchmarks)))
+        print ("******* Passed Tests ** \n")
+        for passed_test in self.passed_tests:
+            print ("Passed: " + passed_test)
+
+        print (" Tests Failed  = " + str(failed_test_count) + " / " + str(len(self.benchmarks)))
+        print ("****** Failed Tests *** \n")
+        for failed_test in self.failed_tests:
+            print ("Failed: " + failed_test)
+            
+
+    def printHPVMSummary(self):
+
+        failed_test_count = len(self.failed_hpvm_tests)
+        passed_test_count = len(self.passed_hpvm_tests)
+        
+        print (" Tests Passed  = " + str(passed_test_count) + " / " + str(len(self.benchmarks)))
+        print ("******* Passed Tests ** \n")
+        for passed_test in self.passed_hpvm_tests:
+            print ("Passed: " + passed_test)
+
+        print (" Tests Failed  = " + str(failed_test_count) + " / " + str(len(self.benchmarks)))
+        print ("****** Failed Tests *** \n")
+        for failed_test in self.failed_hpvm_tests:
+            print ("Failed: " + failed_test)
+            
+
+        
+            
+if __name__ == "__main__":
+
+    testMgr = BenchmarkTests()
+    AlexNet = Benchmark("src/alexnet.py", 79.28)
+    AlexNet_ImageNet = Benchmark("src/alexnet_imagenet.py", 56.30)
+    AlexNet2 = Benchmark("src/alexnet2.py", 84.98)
+    LeNet = Benchmark("src/lenet.py", 98.70)
+    MobileNet = Benchmark("src/mobilenet_cifar10.py", 84.42)
+    ResNet18 = Benchmark("src/resnet18_cifar10.py", 89.56)
+    ResNet50 = Benchmark("src/resnet50_imagenet.py", 75.10)
+    VGG16_cifar10 = Benchmark("src/vgg16_cifar10.py", 89.96)
+    VGG16_cifar100 = Benchmark("src/vgg16_cifar100.py", 66.50)
+    VGG16_ImageNet = Benchmark("src/vgg16_imagenet.py", 69.46)
+
+    testMgr.addBenchmark(AlexNet)
+    testMgr.addBenchmark(AlexNet_ImageNet)
+    testMgr.addBenchmark(AlexNet2)
+    testMgr.addBenchmark(LeNet)
+    testMgr.addBenchmark(MobileNet)
+    testMgr.addBenchmark(ResNet18)
+    testMgr.addBenchmark(ResNet50)
+    testMgr.addBenchmark(VGG16_cifar10)
+    testMgr.addBenchmark(VGG16_cifar100)
+    testMgr.addBenchmark(VGG16_ImageNet)
+
+    testMgr.runKerasTests()
+    testMgr.printKerasSummary()
+    
+    testMgr.runHPVMTests()
+    testMgr.printHPVMSummary()
+
diff --git a/hpvm/projects/keras/src/Benchmark.py b/hpvm/projects/keras/src/Benchmark.py
index 3610b2e9a5ad10c2b3d90795eb20b3d6839b730f..aaa7bdacc47ee570c21ab5e4d797737e5f193811 100644
--- a/hpvm/projects/keras/src/Benchmark.py
+++ b/hpvm/projects/keras/src/Benchmark.py
@@ -3,6 +3,7 @@
 import sys
 import os
 import shutil
+import subprocess
 from keras.utils.np_utils import to_categorical
 from keras.models import load_model
 from frontend.approxhpvm_translator import translate_to_approxhpvm
@@ -39,18 +40,41 @@ class Benchmark:
 
     # Compiles frontend generated sources
     def compileSource(self, working_dir):
+              
+        src_file = os.getcwd() + "/" + working_dir + "/approxhpvm_src.cc"
+        target_binary = os.getcwd() + "/" + working_dir + "/HPVM_binary"
+        approx_conf_file = "tuner_confs.txt"
 
-        # set LLVM_SRC_ROOT
-        os.environ["CFLAGS"] = ""
-        os.environ["CXXFLAGS"] = ""
-
-        dest_file = working_dir + "CMakeLists.txt"
-        shutil.copy("cmake_template/CMakeLists.txt", dest_file)
-
-        # Cmake ../
-        # make
-
-
+        # Probe for approxhpvm.py on PATH, discarding its help output
+        try:
+            subprocess.run(
+                ["approxhpvm.py", "-h"],
+                check=True, stdout=subprocess.DEVNULL
+            )
+            
+        except:
+            print ("\n\n ERROR: Could not find approxhpvm.py (HPVM compile script)!! \n\n")
+            print ("To Compile, Must set PATH to include approxhpvm.py script. Do the following: ")
+            print ("**** export PATH=${PATH_TO_YOUR_HPVM_INSTALLATION}/build/bin/:$PATH *****")
+            sys.exit(1)
+
+
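+        # The effective compile command is roughly (paths illustrative):
+        #   approxhpvm.py <cwd>/<working_dir>/approxhpvm_src.cc \
+        #       <cwd>/<working_dir>/HPVM_binary -t tensor --conf-file tuner_confs.txt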
+        try:
+            subprocess.run([
+                "approxhpvm.py", src_file, target_binary,
+                "-t", "tensor", "--conf-file", approx_conf_file
+            ], check=True)
+        except:
+            print ("\n\n ERROR: HPVM Compilation Failed!! \n\n")
+            sys.exit(1)
+
+        f = open("working_dir.txt", "w+")
+        f.write(working_dir)
+        f.close()
+       
+            
+        
     def printUsage(self):
 
         print ("Usage: python ${benchmark.py} [hpvm_reload|train] [frontend] [compile]")
@@ -90,13 +114,20 @@ class Benchmark:
       score = model.evaluate(X_test, to_categorical(y_test, self.num_classes), verbose=0)
       print('Test accuracy2:', score[1])
 
+      f = open("final_accuracy", "w+")
+      f.write(str(score[1] * 100))
+      f.close()
+
 
       if len(argv) > 2:
         if argv[2] == "frontend":
+
+          if argv[1] == "hpvm_reload": # If reloading HPVM weights, use reload_dir as the directory the generated HPVM-C source loads from
+              self.data_dir = self.reload_dir
           
           # Main call to ApproxHPVM-Keras Frontend
           working_dir = translate_to_approxhpvm(model,
-                                                self.data_dir, self.src_dir,  ##  "data/test_src/", 
+                                                self.data_dir, self.src_dir,   
                                                 X_test, y_test,
                                                 X_tuner, y_tuner,
                                                 self.batch_size, # FIXIT
diff --git a/hpvm/projects/keras/src/Config.py b/hpvm/projects/keras/src/Config.py
index 2edc5c1add5542edabdd052097ccb4b45d608472..99e696d632c50db4ae8098a2f4836ca994b672aa 100644
--- a/hpvm/projects/keras/src/Config.py
+++ b/hpvm/projects/keras/src/Config.py
@@ -1,3 +1,13 @@
 
+import pathlib
+
+
 # Path Relative to Model Params Directory
-MODEL_PARAMS_DIR = "../../../hpvm/test/dnn_benchmarks/model_params/"
+abs_path = pathlib.Path(__file__).parent.absolute()
+MODEL_PARAMS_DIR = str(abs_path) + "/../../../../hpvm/test/dnn_benchmarks/model_params/"
+
+
+if __name__ == "__main__":
+
+    abs_path = pathlib.Path(__file__).parent.absolute()
+    print (abs_path)
diff --git a/hpvm/projects/keras/src/alexnet.py b/hpvm/projects/keras/src/alexnet.py
index 4b23fd995ffcc5a4f3234566a8a76dac8c12c6aa..0eefe1b3d3dfa28cd009d74806a9bff41f6d597b 100644
--- a/hpvm/projects/keras/src/alexnet.py
+++ b/hpvm/projects/keras/src/alexnet.py
@@ -21,9 +21,11 @@ from Benchmark import Benchmark
 from Config import MODEL_PARAMS_DIR
 
 
-
+# Inherits from Benchmark class defined in src/Benchmark.py
 class AlexNet_CIFAR10(Benchmark):
 
+    # buildModel overrides the buildModel declared in src/Benchmark.py
+    # Goal: Build a Keras Sequential Model (other model types not supported) and return the (uninitialized/untrained) Model
     def buildModel(self):
 
         activation_type = 'tanh'
@@ -51,36 +53,41 @@ class AlexNet_CIFAR10(Benchmark):
         model.add(Dropout(0.4))
 
         model.add(Flatten())
-        #model.add(Flatten())
-        #model.add(Dense(256))
         model.add(Dense(self.num_classes))
         model.add(Activation('softmax'))
         
         return model
 
-    
+
+    # This routine is called from the common `run` method in src/Benchmark.py
+    # Goal: Return Training and Testing data after preprocessing/normalization
     def data_preprocess(self):
 
         (X_train, y_train), (X_val, y_val) = cifar10.load_data()
 
         X_train = X_train / 255.0
-        X_val = X_val / 255.0
-
+ 
         mean = np.mean(X_train)
         std = np.std(X_train)
         X_train = (X_train - mean) / (std + 1e-7)
-        X_val = (X_val - mean) / (std + 1e-7)  
 
-        X_test = X_val[0:5000]
-        y_test = y_val[0:5000]
-        X_tuner = X_val[5000:]
-        y_tuner = y_val[5000:]
+        X_test = np.fromfile(MODEL_PARAMS_DIR + '/alexnet_cifar10/test_input.bin', dtype=np.float32)
+        y_test = np.fromfile(MODEL_PARAMS_DIR + '/alexnet_cifar10/test_labels.bin', dtype=np.uint32)
+
+        X_test = X_test.reshape((-1,3,32,32))
+
+
+        X_tuner = np.fromfile(MODEL_PARAMS_DIR + '/alexnet_cifar10/tune_input.bin', dtype=np.float32)
+        y_tuner = np.fromfile(MODEL_PARAMS_DIR + '/alexnet_cifar10/tune_labels.bin', dtype=np.uint32)
+
+        X_tuner = X_tuner.reshape((-1,3,32,32))
 
         return X_train, y_train, X_test, y_test, X_tuner, y_tuner
-    
 
+
+    # Goal: Given a Keras Sequential Model - set up the training parameters, train, and return the trained Model
     def trainModel(self, model, X_train, y_train, X_test, y_test):
-        
+
         y_train = to_categorical(y_train, self.num_classes)
         y_test = to_categorical(y_test, self.num_classes)
 
@@ -126,22 +133,27 @@ class AlexNet_CIFAR10(Benchmark):
 
     
 if __name__ == '__main__':
-  
+
+    # Using GPU ID 0 - change to use a different GPU
     os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-    # Changing to NCHW format
+    # Changing to NCHW format - HPVM currently supports only NCHW; NHWC is not supported
     K.set_image_data_format('channels_first')
 
 
-    ### Parameters specific to each benchmark
+    # *** Below are Parameters specific to each benchmark *****
     reload_dir = MODEL_PARAMS_DIR + '/alexnet_cifar10/'
+    ## Either the HPVM weights are loaded from reload_dir (above) or the Keras model from the path below
     keras_model_file = MODEL_PARAMS_DIR + '/alexnet_cifar10/weights.h5'
-    data_dir = '/alexnet_cifar10/' 
-    src_dir = 'data/alexnet_cifar10_src/'
-    num_classes = 10
-    batch_size = 500
-        
+    data_dir = ''   # if reloading weights, data_dir can be set to empty string (value is ignored)
+ 
+    src_dir = 'data/alexnet_cifar10_src/'  # Directory where the frontend-generated HPVM sources are emitted
+    num_classes = 10  # Number of output classes - CIFAR10 has 10 classes
+    batch_size = 500  # Batch Size set to 500 - Adjust this value based on your GPU memory 
+
+    # All classes inherit from the `Benchmark` class in src/Benchmark.py and share a common constructor
     model = AlexNet_CIFAR10('AlexNet_CIFAR10', reload_dir, keras_model_file, data_dir, src_dir, num_classes, batch_size)
-    
+
+    # This invokes the common run function in src/Benchmark.py 
     model.run(sys.argv)
 
     
diff --git a/hpvm/projects/keras/src/alexnet2.py b/hpvm/projects/keras/src/alexnet2.py
index de69d8c12972df7a1fa51338b30676ffafc65f4e..d2c7d566bb2793a848bdb88c19e2905e6030d588 100644
--- a/hpvm/projects/keras/src/alexnet2.py
+++ b/hpvm/projects/keras/src/alexnet2.py
@@ -63,26 +63,30 @@ class AlexNet2_CIFAR10(Benchmark):
         (X_train, y_train), (X_val, y_val) = cifar10.load_data()
 
         X_train = X_train / 255.0
-        X_val = X_val / 255.0
 
         mean = np.mean(X_train)
         std = np.std(X_train)
         X_train = (X_train - mean) / (std + 1e-7)
-        X_val = (X_val - mean) / (std + 1e-7)  
 
-        X_test = X_val[0:5000]
-        y_test = y_val[0:5000]
-        X_tuner = X_val[5000:]
-        y_tuner = y_val[5000:]
+        X_test = np.fromfile(MODEL_PARAMS_DIR + '/alexnet2_cifar10/test_input.bin', dtype=np.float32)
+        y_test = np.fromfile(MODEL_PARAMS_DIR + '/alexnet2_cifar10/test_labels.bin', dtype=np.uint32)
+
+        X_test = X_test.reshape((-1,3,32,32))
+
+
+        X_tuner = np.fromfile(MODEL_PARAMS_DIR + '/alexnet2_cifar10/tune_input.bin', dtype=np.float32)
+        y_tuner = np.fromfile(MODEL_PARAMS_DIR + '/alexnet2_cifar10/tune_labels.bin', dtype=np.uint32)
+
+        X_tuner = X_tuner.reshape((-1,3,32,32))
 
         return X_train, y_train, X_test, y_test, X_tuner, y_tuner
-    
+
 
     def trainModel(self, model, X_train, y_train, X_test, y_test):
-                
+
         y_train = to_categorical(y_train, self.num_classes)
         y_test = to_categorical(y_test, self.num_classes)
-        
+
         model.compile(
             loss='categorical_crossentropy',
             optimizer=Adam(lr=0.0001),
@@ -133,7 +137,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/alexnet2_cifar10/'
     keras_model_file = MODEL_PARAMS_DIR + '/alexnet2_cifar10/weights.h5'
-    data_dir = '/alexnet2_cifar10/' 
+    data_dir = '' 
     src_dir = 'data/alexnet2_cifar10_src/'
     num_classes = 10
     batch_size = 500
diff --git a/hpvm/projects/keras/src/alexnet_imagenet.py b/hpvm/projects/keras/src/alexnet_imagenet.py
index e3ab937e9bb355fde74a63664c8657c76d6343f5..1cfe7a79c2a1350689d09d07fdc50f3ce998d8af 100644
--- a/hpvm/projects/keras/src/alexnet_imagenet.py
+++ b/hpvm/projects/keras/src/alexnet_imagenet.py
@@ -94,7 +94,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/alexnet_imagenet/'
     keras_model_file = MODEL_PARAMS_DIR + '/alexnet_imagenet/weights.h5'
-    data_dir = '/alexnet_imagenet/' 
+    data_dir = '' 
     src_dir = 'data/alexnet_imagenet_src/'
     num_classes = 1000
     batch_size = 50
@@ -104,4 +104,4 @@ if __name__ == '__main__':
     model.run(sys.argv)
 
 
-    
\ No newline at end of file
+    
diff --git a/hpvm/projects/keras/src/lenet.py b/hpvm/projects/keras/src/lenet.py
index 01c84719e6b90d317f7e0dce012577b08b33fcbf..70dd73a66ad49cee83a0f061d1240522332c469c 100644
--- a/hpvm/projects/keras/src/lenet.py
+++ b/hpvm/projects/keras/src/lenet.py
@@ -103,7 +103,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/lenet_mnist/'
     keras_model_file = MODEL_PARAMS_DIR + '/lenet_mnist/weights.h5'
-    data_dir = '/lenet_mnist/' 
+    data_dir = '' 
     src_dir = 'data/lenet_mnist_src/'
     num_classes = 10
     batch_size = 500
diff --git a/hpvm/projects/keras/src/mobilenet_cifar10.py b/hpvm/projects/keras/src/mobilenet_cifar10.py
index 367a4dfc6244228b7b1336d1a63044273cebd2fb..34335b0f1a7e3e414f7915a5eb9305086b7344d8 100644
--- a/hpvm/projects/keras/src/mobilenet_cifar10.py
+++ b/hpvm/projects/keras/src/mobilenet_cifar10.py
@@ -105,26 +105,32 @@ class MobileNet_CIFAR10(Benchmark):
         (X_train, y_train), (X_val, y_val) = cifar10.load_data()
 
         X_train = X_train / 255.0
-        X_val = X_val / 255.0
+        #X_val = X_val / 255.0
 
         mean = np.mean(X_train)
         std = np.std(X_train)
         X_train = (X_train - mean) / (std + 1e-7)
-        X_val = (X_val - mean) / (std + 1e-7)  
+        #X_val = (X_val - mean) / (std + 1e-7)
+
+        X_test = np.fromfile(MODEL_PARAMS_DIR + '/mobilenet_cifar10/test_input.bin', dtype=np.float32)
+        y_test= np.fromfile(MODEL_PARAMS_DIR + '/mobilenet_cifar10/test_labels.bin', dtype=np.uint32)
+
+        X_test = X_test.reshape((-1,3,32,32))
+
+        X_tuner = np.fromfile(MODEL_PARAMS_DIR + '/mobilenet_cifar10/tune_input.bin', dtype=np.float32)
+        y_tuner = np.fromfile(MODEL_PARAMS_DIR + '/mobilenet_cifar10/tune_labels.bin', dtype=np.uint32)
+
+        X_tuner = X_tuner.reshape((-1,3,32,32))
 
-        X_test = X_val[0:5000]
-        y_test = y_val[0:5000]
-        X_tuner = X_val[5000:]
-        y_tuner = y_val[5000:]
 
         return X_train, y_train, X_test, y_test, X_tuner, y_tuner
-    
+
 
     def trainModel(self, model, X_train, y_train, X_test, y_test):
 
         y_train = to_categorical(y_train, self.num_classes)
         y_test = to_categorical(y_test, self.num_classes)
-        
+
         # data augmentation, horizontal flips only
         datagen = ImageDataGenerator(
                 featurewise_center=False,
@@ -177,7 +183,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/mobilenet_cifar10/'
     keras_model_file = MODEL_PARAMS_DIR + '/mobilenet_cifar10/weights.h5'
-    data_dir = '/mobilenet_cifar10/' 
+    data_dir = '' 
     src_dir = 'data/mobilenet_cifar10_src/'
     num_classes = 10
     batch_size = 500
diff --git a/hpvm/projects/keras/src/resnet18_cifar10.py b/hpvm/projects/keras/src/resnet18_cifar10.py
index 74abc7ad9f860963c770aaa6bea27b7c16d59738..02753f9eac83a252e5b128f29981b39c14f35d2c 100644
--- a/hpvm/projects/keras/src/resnet18_cifar10.py
+++ b/hpvm/projects/keras/src/resnet18_cifar10.py
@@ -438,34 +438,20 @@ class ResNet18_CIFAR10(Benchmark):
         (X_train, y_train), (X_val, y_val) = cifar10.load_data()
 
         X_train = X_train / 255.0
-        X_val = X_val / 255.0
-
         mean = np.mean(X_train)
         std = np.std(X_train)
-#         X_train = (X_train - mean) / (std + 1e-7)
-#         X_val = (X_val - mean) / (std + 1e-7)
         X_train = (X_train - mean)
-        X_val = (X_val - mean)
-
-
-        X_test_val = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/test_input.bin', dtype=np.float32)
-        Y_test_val = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/test_labels.bin', dtype=np.uint32)
-
-        X_test_val = X_test_val.reshape((-1,3,32,32))
-
-
-        X_tune_val = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/tune_input.bin', dtype=np.float32)
-        Y_tune_val = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/tune_labels.bin', dtype=np.uint32)
 
-        X_tune_val = X_tune_val.reshape((-1,3,32,32))
+        X_test = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/test_input.bin', dtype=np.float32)
+        y_test = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/test_labels.bin', dtype=np.uint32)
 
+        X_test = X_test.reshape((-1,3,32,32))
 
-        X_test = X_test_val[:5000]
-        y_test= Y_test_val[:5000]
 
-        X_tuner = X_tune_val[:5000]
-        y_tuner = Y_tune_val[:5000]
+        X_tuner = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/tune_input.bin', dtype=np.float32)
+        y_tuner = np.fromfile(MODEL_PARAMS_DIR + '/resnet18_cifar10/tune_labels.bin', dtype=np.uint32)
 
+        X_tuner = X_tuner.reshape((-1,3,32,32))
 
         return X_train, y_train, X_test, y_test, X_tuner, y_tuner
 
@@ -569,7 +555,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/resnet18_cifar10/'
     keras_model_file = MODEL_PARAMS_DIR + '/resnet18_cifar10/weights.h5'
-    data_dir = '/resnet18_cifar10/' 
+    data_dir = '' 
     src_dir = 'data/resnet18_cifar10_src/'
     num_classes = 10
     batch_size = 500
diff --git a/hpvm/projects/keras/src/resnet50_imagenet.py b/hpvm/projects/keras/src/resnet50_imagenet.py
index 0c3006213d7880f6133e1f8030256d50d25ea35d..de42ae48d834b6f55e7827138f60baeefe8fb897 100644
--- a/hpvm/projects/keras/src/resnet50_imagenet.py
+++ b/hpvm/projects/keras/src/resnet50_imagenet.py
@@ -142,7 +142,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/resnet50_imagenet/'
     keras_model_file = MODEL_PARAMS_DIR + '/resnet50_imagenet/weights.h5'
-    data_dir = '/resnet50_imagenet/' 
+    data_dir = '' 
     src_dir = 'data/resnet50_imagenet_src/'
     num_classes = 1000
     batch_size = 50
@@ -152,4 +152,4 @@ if __name__ == '__main__':
     model.run(sys.argv)
 
 
-    
\ No newline at end of file
+    
diff --git a/hpvm/projects/keras/src/vgg16_cifar10.py b/hpvm/projects/keras/src/vgg16_cifar10.py
index 873e23b766ffbd58c1d5db89141da60fee88126e..9a5071ee94a54e4832eade954f779d64ebd3416e 100644
--- a/hpvm/projects/keras/src/vgg16_cifar10.py
+++ b/hpvm/projects/keras/src/vgg16_cifar10.py
@@ -103,34 +103,40 @@ class VGG16_CIFAR10(Benchmark):
         (X_train, y_train), (X_val, y_val) = cifar10.load_data()
 
         X_train = X_train / 255.0
-        X_val = X_val / 255.0
+        #X_val = X_val / 255.0
 
         mean = np.mean(X_train)
         std = np.std(X_train)
         X_train = (X_train - mean) / (std + 1e-7)
-        X_val = (X_val - mean) / (std + 1e-7)  
+        #X_val = (X_val - mean) / (std + 1e-7)
+
+        X_test= np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar10/test_input.bin', dtype=np.float32)
+        y_test = np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar10/test_labels.bin', dtype=np.uint32)
+
+        X_test = X_test.reshape((-1,3,32,32))
+
+        X_tuner= np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar10/tune_input.bin', dtype=np.float32)
+        y_tuner = np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar10/tune_labels.bin', dtype=np.uint32)
+
+        X_tuner = X_tuner.reshape((-1,3,32,32))
 
-        X_test = X_val[0:5000]
-        y_test = y_val[0:5000]
-        X_tuner = X_val[5000:]
-        y_tuner = y_val[5000:]
 
         return X_train, y_train, X_test, y_test, X_tuner, y_tuner
 
-    
+
     def trainModel(self, model, X_train, y_train, X_test, y_test):
 
         y_train = to_categorical(y_train, self.num_classes)
         y_test = to_categorical(y_test, self.num_classes)
-        
+
         batch_size = 128
         learning_rate = 0.01
         lr_drop = 20
 
-        
+
         def lr_scheduler(epoch):
             return learning_rate * (0.5 ** (epoch // lr_drop))
-        
+
         reduce_lr = keras.callbacks.LearningRateScheduler(lr_scheduler)
 
         #data augmentation
@@ -179,7 +185,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/vgg16_cifar10/'
     keras_model_file = MODEL_PARAMS_DIR + '/vgg16_cifar10/weights.h5'
-    data_dir = '/vgg16_cifar10/' 
+    data_dir = '' 
     src_dir = 'data/vgg16_cifar10_src/'
     num_classes = 10
     batch_size = 500
diff --git a/hpvm/projects/keras/src/vgg16_cifar100.py b/hpvm/projects/keras/src/vgg16_cifar100.py
index 03bb852e00bb61a7b17836f5c4df5bbf56c4b466..0fd51ebe03c56ecd622cfab970c51f3096a7d2f4 100644
--- a/hpvm/projects/keras/src/vgg16_cifar100.py
+++ b/hpvm/projects/keras/src/vgg16_cifar100.py
@@ -119,34 +119,39 @@ class VGG16_CIFAR100(Benchmark):
         (X_train, y_train), (X_val, y_val) = cifar100.load_data()
 
         X_train = X_train / 255.0
-        X_val = X_val / 255.0
+        #X_val = X_val / 255.0
 
         mean = np.mean(X_train)
         std = np.std(X_train)
         X_train = (X_train - mean) / (std + 1e-7)
-        X_val = (X_val - mean) / (std + 1e-7)  
+        #X_val = (X_val - mean) / (std + 1e-7)
 
-        X_test = X_val[0:5000]
-        y_test = y_val[0:5000]
-        X_tuner = X_val[5000:]
-        y_tuner = y_val[5000:]
+        X_test = np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar100/test_input.bin', dtype=np.float32)
+        y_test = np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar100/test_labels.bin', dtype=np.uint32)
+
+        X_test = X_test.reshape((-1,3,32,32))
+
+        X_tuner = np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar100/tune_input.bin', dtype=np.float32)
+        y_tuner = np.fromfile(MODEL_PARAMS_DIR + '/vgg16_cifar100/tune_labels.bin', dtype=np.uint32)
+
+        X_tuner = X_tuner.reshape((-1,3,32,32))
 
         return X_train, y_train, X_test, y_test, X_tuner, y_tuner
-    
-    
+
+
     def trainModel(self,model, X_train, y_train, X_test, y_test):
 
         y_train = to_categorical(y_train, self.num_classes)
         y_test = to_categorical(y_test, self.num_classes)
-        
+
         batch_size = 128
         learning_rate = 0.1
         lr_drop = 30
-        
- 
+
+
         def lr_scheduler(epoch):
             return learning_rate * (0.5 ** (epoch // lr_drop))
-        
+
         reduce_lr = keras.callbacks.LearningRateScheduler(lr_scheduler)
 
         #data augmentation
@@ -195,7 +200,7 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/vgg16_cifar100/'
     keras_model_file = MODEL_PARAMS_DIR + '/vgg16_cifar100/weights.h5'
-    data_dir = '/vgg16_cifar100/' 
+    data_dir = '' 
     src_dir = 'data/vgg16_cifar100_src/'
     num_classes = 100
     batch_size = 100
diff --git a/hpvm/projects/keras/src/vgg16_imagenet.py b/hpvm/projects/keras/src/vgg16_imagenet.py
index 35ab92479e545ba44cf2175cb7b8abcec84c4fed..6b9458b5378c421f5ef8f8811e4721056fd19643 100644
--- a/hpvm/projects/keras/src/vgg16_imagenet.py
+++ b/hpvm/projects/keras/src/vgg16_imagenet.py
@@ -127,14 +127,14 @@ if __name__ == '__main__':
     ### Parameters specific to each benchmark
     reload_dir = MODEL_PARAMS_DIR + '/vgg16_imagenet/'
     keras_model_file = MODEL_PARAMS_DIR + '/vgg16_imagenet/weights.h5'
-    data_dir = '/vgg16_imagenet/' 
+    data_dir = '' 
     src_dir = 'data/vgg16_imagenet_src/'
     num_classes = 1000
-    batch_size = 50
+    batch_size = 25
 
     alexnet = VGG16('VGG16_imagenet', reload_dir, keras_model_file, data_dir, src_dir, num_classes, batch_size)
     
     alexnet.run(sys.argv)
 
 
-    
\ No newline at end of file
+    
diff --git a/hpvm/projects/predtuner b/hpvm/projects/predtuner
index 6ff4237cf4386ebb4fcaeb5e448ef6eac8a41c91..9c2482aeb8db796b9f5578d0c342b5e5d0e8b376 160000
--- a/hpvm/projects/predtuner
+++ b/hpvm/projects/predtuner
@@ -1 +1 @@
-Subproject commit 6ff4237cf4386ebb4fcaeb5e448ef6eac8a41c91
+Subproject commit 9c2482aeb8db796b9f5578d0c342b5e5d0e8b376
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in
index 015b0aea3c67ff038b45ee0021388408162ab41a..94a8e0a534c04b323b4b66f369ab2d624a2a745f 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in
@@ -1,3 +1,4 @@
+#include <fstream>
 #include <string>
 #include <array>
 #include <hpvm.h>
@@ -49,6 +50,11 @@ void fifo_write_batch(FILE *fp, void *output_ptr) {
   fwrite(output->host_data, 1, output->size_in_bytes, fp);
 }
 
+void write_accuracy(float accuracy) {
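+  // The Python-side test harness reads this file back; see
+  // readAccuracy("final_accuracy") in keras/scripts/test_benchmarks.py.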
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
+
 {% for node in nodes %}
 void var_{{node.idx}}_node(
 {%- for n in range(node.input_size) -%}
@@ -118,10 +124,11 @@ int main(){
 {% endfor %}
 
   RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn)));
-  void* {{input_name}} = create4DTensor(0, nchw, batch_size, {{input_shape|join(', ')}});
 {% for n in root_inputs %}
+{% if n != input_name %}
   args->{{n}} = {{n}};
   args->{{n}}_bytes = 0;
+{% endif %}
 {% endfor %}
 
   int ret = 0;
@@ -133,20 +140,27 @@ int main(){
 
     // Keep this open so the other side knows we have more batches to write
     auto* fp = open_fifo("{{fifo_path_w}}", "wb");
+    float total_accuracy = 0;
     for (int i = 0; i < batch_count; i++){
       int start = i * batch_size, end = start + batch_size;
-      copyInputBatch(input_pth, start, end, {{input_shape|join(', ')}}, {{input_name}});
+      void *{{input_name}} = readInputBatch(input_pth, 0, start, end, {{input_shape|join(', ')}});
+      args->input = {{input_name}};
+      args->input_bytes = 0;
 
       void* dfg = __hpvm__launch(0, root, (void*) args);
       __hpvm__wait(dfg);
       void *result = static_cast<RootIn*>(args)->r.tensor;
       hpvm_request_tensor(result, 0);
 
-      llvm_hpvm_invokeRtControl(result, labels_pth, start, end);
+      uint32_t* labels = readLabelsBatch3(labels_pth, start, end);
+      float accuracy = computeAccuracy3(labels, result);
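+      // Weight the per-batch accuracy by batch size so the division by
+      // input_size after the loop yields the average over all inputs.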
+      total_accuracy += accuracy * batch_size;
+
       fifo_write_batch(fp, result);
       freeBatchMemory();
     }
     fclose(fp);
+    write_accuracy(total_accuracy / input_size);
     __hpvm__cleanup();
   }
 
diff --git a/hpvm/scripts/hpvm_installer.py b/hpvm/scripts/hpvm_installer.py
new file mode 100755
index 0000000000000000000000000000000000000000..11ad3045528543d8398828394cc7236fc849bb30
--- /dev/null
+++ b/hpvm/scripts/hpvm_installer.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+from pathlib import Path
+from argparse import ArgumentParser, Namespace
+from subprocess import check_call
+from os import makedirs, chdir, environ
+
+VERSION = "9.0.0"
+URL = "http://releases.llvm.org"
+WGET = "wget"
+CLANG_DIR = f"cfe-{VERSION}.src"
+CLANG_TARBALL = f"{CLANG_DIR}.tar.xz"
+LLVM_DIR = f"llvm-{VERSION}.src"
+LLVM_TARBALL = f"{LLVM_DIR}.tar.xz"
+
+ROOT_DIR = Path.cwd()
+BUILD_DIR = ROOT_DIR / "build"
+TEST_DIR = ROOT_DIR / "test"
+LLVM_LIT = BUILD_DIR / "bin/llvm-lit"
+
+LINKS = [
+    "CMakeLists.txt",
+    "cmake",
+    "include",
+    "lib",
+    "projects",
+    "test",
+    "tools",
+]
+MAKE_TARGETS = ["approxhpvm.py"]
+MAKE_TEST_TARGETS = ["hpvm-check"]
+
+
+def parse_args():
+    parser = ArgumentParser(
+        "hpvm_installer", description="Script for automatic HPVM installation."
+    )
+    parser.add_argument(
+        "-m",
+        "--no-build",
+        action="store_true",
+        help="Configure but don't build HPVM. "
+        "This will require you to install HPVM manually using cmake and make. "
+        "For more details, refer to README.md. Default: False.",
+    )
+    parser.add_argument(
+        "-j",
+        "--parallel",
+        type=int,
+        default=2,
+        help="How many threads to build with. This argument is relayed on to 'make'. Default: 2",
+    )
+    parser.add_argument(
+        "-t",
+        "--targets",
+        type=str,
+        default="all",
+        help="Build target(s) for LLVM such as X86, ARM. "
+        'Use semicolon to separate multiple targets such as "X86;ARM". '
+        'Defaults to "all" which is to build all supported targets. '
+        "Supported targets: AArch64, AMDGPU, ARM, BPF, Hexagon, Mips, MSP430, NVPTX, PowerPC, "
+        "Sparc, SystemZ, X86, XCore.",
+    )
+    parser.add_argument(
+        "-r", "--run-tests", action="store_true", help="Build and run test cases"
+    )
+    return parser.parse_args()
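+
+
+# Example invocations (a sketch; flags as defined in parse_args above):
+#   hpvm_installer.py                  (no flags: interactive prompt mode)
+#   hpvm_installer.py -m               (configure only; build manually later)
+#   hpvm_installer.py -j 8 -t X86 -r   (8 build threads, X86 only, run tests)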
+
+
+def prompt_args():
+    def parse_yn(s: str):
+        table = {"y": True, "n": False}
+        return table.get(s)
+
+    def parse_int(s: str):
+        try:
+            v = int(s)
+        except ValueError:
+            return None
+        if v <= 0:
+            return None
+        return v
+
+    def parse_targets(s: str):
+        if " " in s:
+            return None
+        return s
+
+    print("No Flags found. Using command line prompts.")
+    print("Alternatively, please call this script with -h for all available options.")
+    auto_build = input_with_check(
+        "Build and install HPVM automatically? [y/n]: ", parse_yn, "Please enter y or n"
+    )
+    if not auto_build:
+        # Just stuff something in the other fields. We won't need them.
+        return Namespace(no_build=True, parallel="", targets="", run_tests=False)
+    threads = input_with_check(
+        "Number of threads: ", parse_int, "Please enter a positive integer"
+    )
+    print(
+        "These build targets are supported: AArch64, AMDGPU, ARM, BPF, Hexagon, "
+        "Mips, MSP430, NVPTX, PowerPC, Sparc, SystemZ, X86, XCore.\n"
+        "If building for multiple targets, seperate options with semicolon:\n"
+        "e.g. X86;ARM"
+    )
+    targets = input_with_check(
+        "Build target: ", parse_targets, "Input shouldn't contain space"
+    )
+
+    run_tests = input_with_check(
+        "Build and run tests? [y/n]: ", parse_yn, "Please enter y or n"
+    )
+    return Namespace(
+        no_build=not auto_build, parallel=threads, targets=targets, run_tests=run_tests
+    )
+
+
+def print_args(args):
+    print("Running with the following options:")
+    print(f"  Automated: {not args.no_build}")
+    print(f"  Threads: {args.parallel}")
+    print(f"  Targets: {args.targets}")
+    print(f"  Run tests: {args.run_tests}")
+
+
+def check_download_llvm_clang():
+    if Path("llvm/").is_dir():
+        print("Found LLVM, not extracting it again.")
+    else:
+        if Path(LLVM_TARBALL).is_file():
+            print(f"Found {LLVM_TARBALL}, not downloading it again.")
+        else:
+            print(f"Downloading {LLVM_TARBALL}...")
+            print(f"=============================")
+            check_call([WGET, f"{URL}/{VERSION}/{LLVM_TARBALL}"])
+        check_call(["tar", "xf", LLVM_TARBALL])
+        check_call(["mv", LLVM_DIR, "llvm"])
+    tools = Path("llvm/tools")
+    assert tools.is_dir(), "Problem with LLVM download. Exiting!"
+    if Path(LLVM_TARBALL).is_file():
+        Path(LLVM_TARBALL).unlink()  # Remove tarball
+    environ["LLVM_SRC_ROOT"] = str(ROOT_DIR / "llvm")
+
+    if (tools / "clang/").is_dir():
+        print("Found clang, not downloading it again.")
+        return
+    chdir(tools)
+    print(f"Downloading {CLANG_TARBALL}...")
+    print(f"=============================")
+    check_call([WGET, f"{URL}/{VERSION}/{CLANG_TARBALL}"])
+    check_call(["tar", "xf", CLANG_TARBALL])
+    check_call(["mv", CLANG_DIR, "clang"])
+    assert Path("clang/").is_dir(), "Problem with clang download. Exiting!"
+    if Path(CLANG_TARBALL).is_file():
+        Path(CLANG_TARBALL).unlink()
+    chdir(ROOT_DIR)
+
+
+def link_and_patch():
+    from os import symlink
+
+    hpvm = ROOT_DIR / "llvm/tools/hpvm"
+    print("Adding HPVM sources to tree...")
+    makedirs(hpvm, exist_ok=True)
+    for link in LINKS:
+        if not (hpvm / link).exists():
+            print(ROOT_DIR / link, hpvm / link)
+            symlink(ROOT_DIR / link, hpvm / link)
+    print("Applying HPVM patches...")
+    chdir("llvm_patches")
+    check_call(["bash", "./construct_patch.sh"])
+    check_call(["bash", "./apply_patch.sh"])
+    print("Patches applied.")
+    chdir("..")
+
+
+def maybe_build(build: bool, nthreads: int, targets: str, build_test_targets: bool):
+    if not build:
+        print(
+            """
+HPVM not installed.
+To complete installation, follow these instructions:
+- Create and navigate to a folder "./build" 
+- Run "cmake ../llvm [options]". Find potential options in README.md.
+- Run "make -j<number of threads> approxhpvm.py" and then "make install"
+For more details refer to README.md.
+"""
+        )
+        return
+    print("Now building...")
+    print(f"Using {nthreads} threads to build HPVM.")
+    makedirs(BUILD_DIR, exist_ok=True)
+
+    chdir(BUILD_DIR)
+    cmake_args = [
+        "cmake",
+        "../llvm",
+        "-DCMAKE_C_COMPILER=gcc",
+        "-DCMAKE_CXX_COMPILER=g++",
+        f"-DLLVM_TARGETS_TO_BUILD={targets}",
+    ]
+    print(f"CMake: {' '.join(cmake_args)}")
+    print(f"=============================")
+    check_call(cmake_args)
+    make_args = ["make", f"-j{nthreads}", *MAKE_TARGETS]
+    if build_test_targets:
+        make_args += MAKE_TEST_TARGETS
+    print(f"Make: {' '.join(make_args)}")
+    print(f"=============================")
+    check_call(make_args)
+    chdir(ROOT_DIR)
+
+
+def run_tests():
+    chdir(BUILD_DIR)
+    # Run regression tests
+    check_call([LLVM_LIT, "-v", TEST_DIR / "regressionTests"])
+    # Run unit tests
+    check_call([LLVM_LIT, "-v", TEST_DIR / "unitTests"])
+
+
+def input_with_check(prompt: str, parse, prompt_when_invalid: str):
+    input_str = input(prompt)
+    value = parse(input_str)
+    while value is None:
+        print(f"{prompt_when_invalid}; got {input_str}")
+        input_str = input(prompt)
+        value = parse(input_str)
+    return value
+
+
+def main():
+    from sys import argv
+
+    # Don't parse args if no args given -- use prompt mode
+    args = prompt_args() if len(argv) == 1 else parse_args()
+    print_args(args)
+    check_download_llvm_clang()
+    link_and_patch()
+    maybe_build(not args.no_build, args.parallel, args.targets, args.run_tests)
+    if args.run_tests:
+        run_tests()
+    else:
+        print("Skipping tests.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hpvm/scripts/llvm_installer.sh b/hpvm/scripts/llvm_installer.sh
deleted file mode 100755
index a8fa022047fb7983c466b618863a7b2a66a50f92..0000000000000000000000000000000000000000
--- a/hpvm/scripts/llvm_installer.sh
+++ /dev/null
@@ -1,255 +0,0 @@
-#!/bin/bash
-
-function read_yn {
-  read -p "$1 [y/n]: " read_value
-  while [ ! $read_value == "y" ] && [ ! $read_value == "n" ]; do
-    echo "Please answer y or n; got $read_value"
-    read -p "$1 [y/n]:" read_value
-  done
-  eval $2=$read_value
-}
-
-VERSION="9.0.0"
-
-URL="http://releases.llvm.org"
- 
-WGET=wget
-
-CURRENT_DIR=`pwd`
-INSTALL_DIR=`pwd`/install
-BUILD_DIR=$CURRENT_DIR/build
-
-# Using 2 threads by default
-NUM_THREADS=2
-
-SUFFIX=".tar.xz"
-CLANG_SRC="cfe-$VERSION.src"
-LLVM_SRC="llvm-$VERSION.src"
-
-HPVM_RT=hpvm-rt/hpvm-rt.bc
-
-
-TARGET=all
-TARGET_INPUT=all
-FLAGGED=false
-
-# Get flags
-while getopts 'hmj:t:' opt; do
-  case $opt in
-    h) 
-      echo
-      echo
-      echo "This is the help menu for HPVM installation"
-      echo
-      echo "There are 3 options for installation:"
-      echo
-      echo "-m is a manual installation flag. This will require you to install HPVM manually by running cmake and make manually." 
-      echo "For more details, refer to README.md. Defaults to automatic installation."
-      echo
-      echo "-j is the threads flag. Accepts one argument: how many threads to build with." 
-      echo "To build with 2 threads, enter -j2. Defaults to 2 threads."
-      echo
-      echo "-t is the build target flag. Accepts one argument: which build target(s) you would like to build to." 
-      echo "For single target, enter -a ARM. For multiple targets, enter -t \"X86;ARM\"." 
-      echo "Supports the following targets: AArch64, AMDGPU, ARM, BPF, Hexagon, Mips, MSP430, NVPTX, PowerPC, Sparc, SystemZ, X86, XCore."
-      echo "Defaults to targeting all supported architectures."
-      echo
-      echo "If no flags are provided, the script will use command line prompts for all options."
-      echo
-      exit
-      ;;
-    m) 
-      AUTOMATE=false
-      FLAGGED=true
-      ;;
-    j) 
-      if ! [[ $OPTARG =~ ^[0-9]+$ ]]; then
-        echo "Invalid argument for # of threads: $OPTARG"
-        exit -1;
-      else
-        NUM_THREADS=$OPTARG
-        FLAGGED=true
-      fi
-      ;;
-    t) 
-      TARGET=$OPTARG
-      FLAGGED=true
-      ;;
-  esac
-done
-
-if $FLAGGED; then
-  echo "Running with the following options:"
-  echo Automated: $AUTOMATE
-  echo Threads: $NUM_THREADS
-  echo Targets: $TARGET
-  echo
-else
-  echo "No Flags found. Using command line prompts."
-  read -p "Build and install HPVM automatically? (y or n): " AUTOMATE_INPUT
-
-  if [[ $AUTOMATE_INPUT == "" ]]; then
-    echo "No input given. Using default: $AUTOMATE"
-  elif [[ ! $AUTOMATE_INPUT == "y" ]] && [[ ! $AUTOMATE_INPUT == "n" ]]; then 
-    echo "Invalid input. Using default: $AUTOMATE"
-  elif [[ $AUTOMATE_INPUT == "n" ]]; then
-    AUTOMATE=false
-  fi
-
-
-  echo
-  read -p "Number of threads: " NUM_THREADS_INPUT
-
-  if [[ $NUM_THREADS_INPUT == "" ]]; then
-    echo "No input given. Using default: $NUM_THREADS"
-  elif ! [[ $NUM_THREADS_INPUT =~ ^[0-9]+$ ]]; then
-    echo "Given input is not an integer. Using default: $NUM_THREADS"
-  elif [ ! $NUM_THREADS_INPUT -gt 0 ]; then
-    echo "Given input is not greater than 0. Using default: $NUM_THREADS"
-  else
-    NUM_THREADS=$NUM_THREADS_INPUT
-  fi
-  
-  echo
-  echo 
-  echo "Supports the following options: AArch64, AMDGPU, ARM, BPF, Hexagon, Mips, MSP430, NVPTX, PowerPC, Sparc, SystemZ, X86, XCore."
-  echo "If building for multiple targets, seperate options with semicolon:"
-  echo "e.g. X86;ARM"
-  read -p "Build target: " TARGET_INPUT
-  if [[ $TARGET_INPUT == "" ]]; then
-    echo "No input given. Using default: $TARGET"
-  else
-    TARGET=$TARGET_INPUT
-  fi
-  echo
-
-  echo "Running with the following options:"
-  echo Automated: $AUTOMATE
-  echo Threads: $NUM_THREADS
-  echo Targets: $TARGET
-  echo
-fi
-
-if [ -d $LLVM_SRC ]; then
-    echo Found $LLVM_SRC, not dowloading it again!
-elif [ -d llvm ]; then
-    echo Found LLVM, not downloading it again!
-else
-    echo $WGET $URL/$VERSION/$LLVM_SRC$SUFFIX
-    $WGET $URL/$VERSION/$LLVM_SRC$SUFFIX
-    tar xf $LLVM_SRC$SUFFIX
-    rm $LLVM_SRC$SUFFIX
-fi
-
-if [ -d $LLVM_SRC ]; then
-    echo Everything looks sane.
-    mv $LLVM_SRC llvm
-elif [ -d llvm ]; then
-    echo Everything looks sane.
-else
-    echo Problem with LLVM download. Exiting!
-    exit
-fi
-
-LLVM_SRC=llvm
-
-if [ -d $CURRENT_DIR/$LLVM_SRC/tools ]; then
-    cd $CURRENT_DIR/$LLVM_SRC/tools
-    echo In tools.
-else
-    echo Something is wrong with LLVM checkout. Exiting!
-    exit 1
-fi
-
-if [ -d clang ]; then
-    echo Found clang! Not downloading clang again.
-else
-    $WGET $URL/$VERSION/$CLANG_SRC$SUFFIX
-    tar xf $CLANG_SRC$SUFFIX
-    rm $CLANG_SRC$SUFFIX
-    mv $CLANG_SRC clang
-    if [ -d clang ]; then
-	echo Everything looks sane.
-    else
-	echo Problem with clang download. Exiting!
-	exit
-    fi
-fi
-
-cd $CURRENT_DIR
-
-HPVM_DIR=$CURRENT_DIR/$LLVM_SRC/tools/hpvm
-
-if [ ! -d $HPVM_DIR ]; then
-  echo Adding HPVM sources to tree
-  mkdir -p $HPVM_DIR
-  ln -s $CURRENT_DIR/CMakeLists.txt $HPVM_DIR
-  ln -s $CURRENT_DIR/cmake $HPVM_DIR/
-  ln -s $CURRENT_DIR/include $HPVM_DIR/
-  ln -s $CURRENT_DIR/lib $HPVM_DIR/
-  ln -s $CURRENT_DIR/projects $HPVM_DIR/
-  ln -s $CURRENT_DIR/test $HPVM_DIR/
-  ln -s $CURRENT_DIR/tools $HPVM_DIR/
-else
-  echo $CURRENT_DIR/$LLVM_SRC/tools/hpvm exists.
-fi
-
-export LLVM_SRC_ROOT=$CURRENT_DIR/$LLVM_SRC
-
-echo Applying HPVM patches
-cd $CURRENT_DIR/llvm_patches
-/bin/bash ./construct_patch.sh
-/bin/bash ./apply_patch.sh
-
-echo Patches applied.
-
-if ! $AUTOMATE ; then
-  echo
-  echo "HPVM not installed."
-  echo "To complete installation, follow these instructions:"
-  echo "  - Create and navigate to a folder \"./build\" "
-  echo "  - Run \"cmake ../llvm [options]\". Find potential options in README.md."
-  echo "  - Run \"make -j<number of threads> approxhpvm.py\" and then \"make install\""
-  echo "For more details refer to README.md."
-  echo 
-  echo "Exiting."
-  exit  
-fi
-
-echo
-echo Now building...
-
-echo Using $NUM_THREADS threads to build HPVM.
-echo
-
-cd $CURRENT_DIR
-
-if [ ! -d $BUILD_DIR ]; then
-  mkdir -p $BUILD_DIR
-fi
-
-if [ ! -d $INSTALL_DIR ]; then
-  mkdir -p $INSTALL_DIR
-fi
-
-export PATH=$BUILD_DIR/bin:$PATH
-
-cd $BUILD_DIR
-echo cmake ../$LLVM_SRC -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLLVM_TARGETS_TO_BUILD=$TARGET  -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR
-cmake ../$LLVM_SRC -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLLVM_TARGETS_TO_BUILD=$TARGET  -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR
-
-echo make -j$NUM_THREADS approxhpvm.py
-make -j$NUM_THREADS approxhpvm.py
-#make install
-
-if [ -f $BUILD_DIR/tools/hpvm/projects/$HPVM_RT ]; then
-    true
-else
-    echo $BUILD_DIR/tools/hpvm/projects/$HPVM_RT
-    echo HPVM not installed properly.
-    exit 0
-fi
-
-cd $CURRENT_DIR
-
-
diff --git a/hpvm/test/CMakeLists.txt b/hpvm/test/CMakeLists.txt
index 654197c16cf9c086d348e16cbc83e34130c3c39b..4c96ee124f066bbe35c2f8117ea29078f38df7ae 100644
--- a/hpvm/test/CMakeLists.txt
+++ b/hpvm/test/CMakeLists.txt
@@ -25,135 +25,23 @@ configure_lit_site_cfg(
 
 # Set the depends list as a variable so that it can grow conditionally.
 # NOTE: Sync the substitutions in test/lit.cfg when adding to this list.
-set(LLVM_TEST_DEPENDS
-          BugpointPasses
-          FileCheck
-          LLVMHello
-          UnitTests
-          bugpoint
-          count
-          llc
-          lli
-          lli-child-target
-          llvm-addr2line
-          llvm-ar
-          llvm-as
-          llvm-bcanalyzer
-          llvm-c-test
-          llvm-cat
-          llvm-cfi-verify
-          llvm-config
-          llvm-cov
-          llvm-cvtres
-          llvm-cxxdump
-          llvm-cxxfilt
-          llvm-cxxmap
-          llvm-diff
-          llvm-dis
-          llvm-dlltool
-          dsymutil
-          llvm-dwarfdump
-          llvm-dwp
-          llvm-elfabi
-          llvm-exegesis
-          llvm-extract
-          llvm-isel-fuzzer
-          llvm-jitlink
-          llvm-lib
-          llvm-link
-          llvm-lipo
-          llvm-lto2
-          llvm-mc
-          llvm-mca
-          llvm-modextract
-          llvm-mt
-          llvm-nm
-          llvm-objcopy
-          llvm-objdump
-          llvm-opt-fuzzer
-          llvm-opt-report
-          llvm-pdbutil
-          llvm-profdata
-          llvm-ranlib
-          llvm-rc
-          llvm-readobj
-          llvm-readelf
-          llvm-rtdyld
-          llvm-size
-          llvm-split
-          llvm-strings
-          llvm-strip
-          llvm-symbolizer
-          llvm-tblgen
-          llvm-undname
-          llvm-xray
-          not
-          obj2yaml
-          opt
-          sancov
-          sanstats
-          verify-uselistorder
-          yaml-bench
-          yaml2obj
-        )
-
-if(TARGET llvm-lto)
-  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} llvm-lto)
-endif()
-
-# If Intel JIT events are supported, depend on a tool that tests the listener.
-if( LLVM_USE_INTEL_JITEVENTS )
-  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} llvm-jitlistener)
-endif( LLVM_USE_INTEL_JITEVENTS )
-
-if(TARGET LLVMgold)
-  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} LLVMgold)
-endif()
-
-if(TARGET llvm-go)
-  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} llvm-go)
-endif()
-
-if(TARGET LTO)
-  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} LTO)
-endif()
-
-if(TARGET ocaml_llvm)
-  # Clear all non-OCaml cross-target dependencies when building out-of-tree.
-  if(LLVM_OCAML_OUT_OF_TREE)
-    set(LLVM_TEST_DEPENDS)
-  endif()
-
-  set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS}
-          ocaml_llvm
-          ocaml_llvm_all_backends
-          ocaml_llvm_analysis
-          ocaml_llvm_bitreader
-          ocaml_llvm_bitwriter
-          ocaml_llvm_executionengine
-          ocaml_llvm_irreader
-          ocaml_llvm_linker
-          ocaml_llvm_target
-          ocaml_llvm_ipo
-          ocaml_llvm_passmgr_builder
-          ocaml_llvm_scalar_opts
-          ocaml_llvm_transform_utils
-          ocaml_llvm_vectorize
-        )
-endif()
-
-#add_custom_target(llvm-test-depends DEPENDS ${LLVM_TEST_DEPENDS})
-#set_target_properties(llvm-test-depends PROPERTIES FOLDER "Tests")
-
-add_lit_testsuite(check-hpvm "Running the LLVM regression tests"
+set(
+  HPVM_TEST_DEPENDS
+  opt hpvm-rt.bc
+  # Passes:
+  LLVMGenHPVM LLVMBuildDFG LLVMLocalMem LLVMClearDFG
+  LLVMDFG2LLVM_CPU LLVMDFG2LLVM_OpenCL
+  # Test utils:
+  FileCheck count not
+)
+
+add_lit_testsuite(check-hpvm "Running the HPVM regression tests"
   ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS ${LLVM_TEST_DEPENDS}
-  )
+  DEPENDS ${HPVM_TEST_DEPENDS}
+)
 set_target_properties(check-hpvm PROPERTIES FOLDER "Tests")
 
-add_lit_testsuites(HPVM ${CMAKE_CURRENT_SOURCE_DIR}
-  DEPENDS ${LLVM_TEST_DEPENDS}
-  )
+add_lit_testsuites(HPVM ${CMAKE_CURRENT_SOURCE_DIR} DEPENDS ${HPVM_TEST_DEPENDS})
 
 # Setup a legacy alias for 'check-llvm'. This will likely change to be an
 # alias for 'check-all' at some point in the future.
diff --git a/hpvm/test/README.md b/hpvm/test/README.md
index 801af4c15a1832a467511804527e4e70043eb3df..7e8b408a0c127bf2365eaf7c7b8498178c7c11b1 100644
--- a/hpvm/test/README.md
+++ b/hpvm/test/README.md
@@ -1,5 +1,65 @@
-# Test Directory Organization
-The test directory is organized as follows:
-* unitTests: Includes unit tests for HPVM.
-* regressionTests: Includes regression tests for HPVM.
-* benchmarks: Includes the current benchmarks and a template, as well as directions for compiling and running HPVM benchmarks.
+# HPVM Test and Benchmarks
+
+## Directory Organization
+
+This directory is organized as follows:
+
+* `unitTests/` and `regressionTests/`: unit and regression tests for HPVM.
+  These are LLVM-bitcode test cases for HPVM passes.
+
+* `benchmarks/`: a few applications written in HPVM-C, a template, and directions for compiling and running these benchmarks.
+
+* `dnn_benchmarks/`: ten DNN benchmarks in HPVM-C, Keras, and PyTorch, supported by ApproxHPVM.
+  These test HPVM as well as the Keras and PyTorch frontends.
+
+  * `dnn_benchmarks/hpvm-c` contains the HPVM-C version of these DNNs.
+    Their organization and usage are similar to the benchmarks under `benchmarks/`.
+  * `dnn_benchmarks/keras` contains these DNNs implemented in Keras,
+    and code for generating them down to HPVM-C (testing the Keras frontend).
+  * `dnn_benchmarks/pytorch` contains these DNNs implemented in PyTorch,
+    and code for generating them down to HPVM-C (testing the PyTorch/ONNX frontend).
+
+  The code generated by the Keras and PyTorch frontends should be largely similar and functionally equivalent.
+
+## Running Test Cases and Benchmarks
+
+The easiest way to run `unitTests/` and `regressionTests/` is
+to build the target `check-hpvm` in the global build directory: `make -j check-hpvm`.
+`check-hpvm` doesn't automatically run `benchmarks/` and `dnn_benchmarks/`, as they are extremely time-consuming.
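+
+For example, assuming the global build directory is `build/` under the project root (your path may differ):
+
+```shell
+cd build/
+make -j check-hpvm
+```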
+
+`benchmarks/` can only be compiled in-source with `make`.
+We are working on migrating it to the CMake system.
+
+### HPVM-C DNN Benchmarks
+
+To build all benchmarks under `dnn_benchmarks/hpvm-c`, use `make -j dnn_benchmarks`.
+
+Alternatively, you can build a single DNN benchmark.
+The output of CMake shows a list of these benchmarks as target names, starting with
+> List of test dnn benchmarks: alexnet2_cifar10;alexnet2_cifar10_cudnn...
+
+Currently, there are 20 of them. These are:
+
+|                   |                         |
+|-------------------|-------------------------|
+| lenet_mnist       | lenet_mnist_cudnn       |
+| alexnet_cifar10   | alexnet_cifar10_cudnn   |
+| alexnet2_cifar10  | alexnet2_cifar10_cudnn  |
+| vgg16_cifar10     | vgg16_cifar10_cudnn     |
+| vgg16_cifar100    | vgg16_cifar100_cudnn    |
+| mobilenet_cifar10 | mobilenet_cifar10_cudnn |
+| resnet18_cifar10  | resnet18_cifar10_cudnn  |
+| alexnet_imagenet  | alexnet_imagenet_cudnn  |
+| vgg16_imagenet    | vgg16_imagenet_cudnn    |
+| resnet50_imagenet | resnet50_imagenet_cudnn |
+
+The `_cudnn` suffix indicates that the code is generated to call cuDNN functions;
+otherwise, the code is generated to call `tensor_runtime` DNN functions, which are hand-written in CUDA.
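+
+For example, from the same build directory (using `lenet_mnist` as the single target; any name from the table above works):
+
+```shell
+make -j dnn_benchmarks   # build all 20 DNN benchmarks
+make -j lenet_mnist      # build a single benchmark by its target name
+```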
+
+### DNN Frontends
+
+TODO: figure out how to
+
+1. Automatically run all HPVM-C DNN benchmarks
+2. Compare the output accuracy against the ground truth
+3. Automatically run the Keras and PyTorch tests (generating, compiling, and running all DNNs)
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/include/hpvm.h b/hpvm/test/dnn_benchmarks/hpvm-c/include/hpvm.h
index 0587311910512c7c35ee69b8df5a440096da1484..9e2e6bc36e488a0d3e61bf0e2e8171bdce064115 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/include/hpvm.h
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/include/hpvm.h
@@ -83,15 +83,6 @@ void *__hpvm__tensor_pool_mean(void *, int, int, int, int, int, int);
 void *__hpvm__tensor_relu(void *);
 void *__hpvm__tensor_tanh(void *);
 void *__hpvm__tensor_softmax(void *);
-// Tensor ops for image processing
-void *__hpvm__tensor_fft(void *);
-void *__hpvm__tensor_reduce(void *, int, void *);
-void *__hpvm__tensor_projectiveT(void *, void *);
-void *__hpvm__tensor_map1(void *, void *);
-void *__hpvm__tensor_map2(void *, void *, void *);
-void *__hpvm__tensor_map3(void *, void *, void *, void *);
-void *__hpvm__tensor_cosineT(void *);
-void *__hpvm__tensor_stencil(void *);
 
 // New HPVM intrinsic for Setting Node ID
 void *__hpvm__node_id(int);
diff --git a/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py b/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py
index 32d982187a0e1079e277d4b29ef401321833241e..d0451b70b44325a355345ad95ab9bf85154002c5 100644
--- a/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py
+++ b/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py
@@ -2,9 +2,9 @@ import os
 import shutil
 import site
 from pathlib import Path
-from subprocess import Popen
 
 import torch
+from predtuner import config_pylogger
 from predtuner.pipedbin import PipedBinaryApp
 from torch2hpvm import BinDataset, ModelExporter
 from torch.nn import Module
@@ -12,32 +12,38 @@ from torch.nn import Module
 site.addsitedir(os.path.dirname(__file__))
 import dnn
 
+# Set up logger to put log file in /tmp
+msg_logger = config_pylogger(output_dir="/tmp", verbose=True)
+
+
 benchmarks = [
-    (dnn.LeNet, 1, 28, 5000, "lenet_mnist"),
-    (dnn.AlexNet, 3, 32, 5000, "alexnet_cifar10"),
-    (dnn.AlexNet2, 3, 32, 5000, "alexnet2_cifar10"),
-    (dnn.AlexNetImageNet, 3, 224, 500, "alexnet_imagenet"),
-    (dnn.MobileNet, 3, 32, 5000, "mobilenet_cifar10"),
-    (dnn.ResNet18, 3, 32, 5000, "resnet18_cifar10"),
-    (dnn.ResNet50, 3, 224, 100, "resnet50_imagenet"),
-    (dnn.VGG16Cifar10, 3, 32, 5000, "vgg16_cifar10"),
-    (dnn.VGG16Cifar100, 3, 32, 5000, "vgg16_cifar100"),
-    (dnn.VGG16ImageNet, 3, 224, 100, "vgg16_imagenet"),
+    (dnn.LeNet, 1, 28, 500, "lenet_mnist"),
+    (dnn.AlexNet, 3, 32, 500, "alexnet_cifar10"),
+    (dnn.AlexNet2, 3, 32, 500, "alexnet2_cifar10"),
+    (dnn.AlexNetImageNet, 3, 224, 100, "alexnet_imagenet"),
+    (dnn.MobileNet, 3, 32, 500, "mobilenet_cifar10"),
+    (dnn.ResNet18, 3, 32, 500, "resnet18_cifar10"),
+    (dnn.ResNet50, 3, 224, 50, "resnet50_imagenet"),
+    (dnn.VGG16Cifar10, 3, 32, 500, "vgg16_cifar10"),
+    (dnn.VGG16Cifar100, 3, 32, 500, "vgg16_cifar100"),
+    (dnn.VGG16ImageNet, 3, 224, 50, "vgg16_imagenet"),
 ]
+model_param = Path(__file__).parent / "../model_params"
+
+
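+# Generate HPVM-C code for one benchmark and compile it to a binary under
+# /tmp/<pathname>_tune; returns the paths to the binary and the ops.json metadata file.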
+def generate(model_cls, nch, img_size, batch_size, pathname):
+    codegen_dir = Path(f"/tmp/{pathname}_tune")
+    build_dir = codegen_dir / "build"
+    metadata_file = codegen_dir / "ops.json"
+    binary_file = build_dir / pathname
+    # if binary_file.is_file() and metadata_file.is_file():
+    #     return binary_file, metadata_file
 
-self_folder = Path(__file__).parent
-model_cls, nch, img_size, batch_size, pathname = benchmarks[0]
-codegen_dir = Path(f"/tmp/{pathname}_tune")
-build_dir = codegen_dir / "build"
-metadata_file = codegen_dir / "ops.json"
-binary_file = build_dir / pathname
-conf_file = codegen_dir / ModelExporter.config_file_name
-if not binary_file.is_file() or not metadata_file.is_file():
     print(f"Generating {pathname} to {codegen_dir}")
     if codegen_dir.exists():
         shutil.rmtree(codegen_dir)
-
-    params = self_folder / "../model_params" / pathname
+    params = model_param / pathname
     dataset_shape = 5000, nch, img_size, img_size
     bin_tuneset = BinDataset(
         params / "tune_input.bin", params / "tune_labels.bin", dataset_shape
@@ -46,14 +52,29 @@ if not binary_file.is_file() or not metadata_file.is_file():
         params / "test_input.bin", params / "test_labels.bin", dataset_shape
     )
     model: Module = model_cls()
-    checkpoint = self_folder / "../model_params" / f"{pathname}.pth.tar"
+    checkpoint = model_param / f"{pathname}.pth.tar"
     model.load_state_dict(torch.load(checkpoint.as_posix()))
-
     exporter = ModelExporter(
         model, bin_tuneset, bin_testset, codegen_dir, target="hpvm_tensor_inspect"
     )
     exporter.generate(batch_size=batch_size).compile(binary_file, build_dir)
-app = PipedBinaryApp("test", binary_file, metadata_file)
-tuner = app.get_tuner()
-tuner.tune(100, 3.0, is_threshold_relative=True, perf_model="perf_linear", qos_model="qos_p1")
-tuner.dump_configs("configs.json")
+    return binary_file, metadata_file
+
+
+def main():
+    for model_cls, nch, img_size, batch_size, pathname in benchmarks:
+        print(f"Testing {pathname}")
+        binary_file, metadata_file = generate(
+            model_cls, nch, img_size, batch_size, pathname
+        )
+        app = PipedBinaryApp("test", binary_file, metadata_file)
+        tuner = app.get_tuner()
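+        # Run the tuner; presumably 100 iterations with QoS thresholds of 3.0
+        # (the exact argument meanings follow predtuner's tune() signature).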
+        tuner.tune(100, 3.0, 3.0, True, 50, cost_model="cost_linear")
+        tuner.dump_configs("configs.json")
+        fig = tuner.plot_configs(show_qos_loss=True)
+        fig.savefig("configs.png", dpi=300)
+        app.dump_hpvm_configs(tuner.best_configs, "hpvm_confs.txt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/hpvm/test/lit.cfg.py b/hpvm/test/lit.cfg.py
index 1f78a88103ecb76cd7b130ff302aead7f7ee8375..b4e55fbeae9b40c978b0dc047cb76ed4909efc66 100644
--- a/hpvm/test/lit.cfg.py
+++ b/hpvm/test/lit.cfg.py
@@ -4,15 +4,10 @@
 
 import os
 import sys
-import re
-import platform
-import subprocess
 
 import lit.util
 import lit.formats
 from lit.llvm import llvm_config
-from lit.llvm.subst import FindTool
-from lit.llvm.subst import ToolSubst
 
 # name: The name of this test suite.
 config.name = 'HPVM'
@@ -22,12 +17,12 @@ config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
 
 # suffixes: A list of file extensions to treat as test files. This is overriden
 # by individual lit.local.cfg files in the test subdirectories.
-config.suffixes = ['.ll', '.c', '.cxx', '.test', '.txt', '.s', '.mir']
+config.suffixes = ['.ll']
 
 # excludes: A list of directories to exclude from the testsuite. The 'Inputs'
 # subdirectories contain auxiliary inputs for various tests in their parent
 # directories.
-config.excludes = ['Inputs', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt']
+config.excludes = ['Inputs', 'CMakeLists.txt', 'README.txt', 'LICENSE.txt', 'benchmarks', 'dnn_benchmarks']
 
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)
@@ -42,292 +37,15 @@ llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True)
 llvm_config.with_system_environment(
     ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP', 'ASAN_SYMBOLIZER_PATH', 'MSAN_SYMBOLIZER_PATH'])
 
-
-# Set up OCAMLPATH to include newly built OCaml libraries.
-top_ocaml_lib = os.path.join(config.llvm_lib_dir, 'ocaml')
-llvm_ocaml_lib = os.path.join(top_ocaml_lib, 'llvm')
-
-llvm_config.with_system_environment('OCAMLPATH')
-llvm_config.with_environment('OCAMLPATH', top_ocaml_lib, append_path=True)
-llvm_config.with_environment('OCAMLPATH', llvm_ocaml_lib, append_path=True)
-
-llvm_config.with_system_environment('CAML_LD_LIBRARY_PATH')
-llvm_config.with_environment(
-    'CAML_LD_LIBRARY_PATH', llvm_ocaml_lib, append_path=True)
-
-
-# Set up OCAMLRUNPARAM to enable backtraces in OCaml tests.
-llvm_config.with_environment('OCAMLRUNPARAM', 'b')
-
-# Provide the path to asan runtime lib 'libclang_rt.asan_osx_dynamic.dylib' if
-# available. This is darwin specific since it's currently only needed on darwin.
-
-
-def get_asan_rtlib():
-    if not 'Address' in config.llvm_use_sanitizer or \
-       not 'Darwin' in config.host_os or \
-       not 'x86' in config.host_triple:
-        return ''
-    try:
-        import glob
-    except:
-        print('glob module not found, skipping get_asan_rtlib() lookup')
-        return ''
-    # The libclang_rt.asan_osx_dynamic.dylib path is obtained using the relative
-    # path from the host cc.
-    host_lib_dir = os.path.join(os.path.dirname(config.host_cc), '../lib')
-    asan_dylib_dir_pattern = host_lib_dir + \
-        '/clang/*/lib/darwin/libclang_rt.asan_osx_dynamic.dylib'
-    found_dylibs = glob.glob(asan_dylib_dir_pattern)
-    if len(found_dylibs) != 1:
-        return ''
-    return found_dylibs[0]
-
-
 llvm_config.use_default_substitutions()
 
-# Add site-specific substitutions.
-config.substitutions.append(('%llvmshlibdir', config.llvm_shlib_dir))
-config.substitutions.append(('%shlibext', config.llvm_shlib_ext))
-config.substitutions.append(('%exeext', config.llvm_exe_ext))
-
-
-lli_args = []
-# The target triple used by default by lli is the process target triple (some
-# triple appropriate for generating code for the current process) but because
-# we don't support COFF in MCJIT well enough for the tests, force ELF format on
-# Windows.  FIXME: the process target triple should be used here, but this is
-# difficult to obtain on Windows.
-if re.search(r'cygwin|windows-gnu|windows-msvc', config.host_triple):
-    lli_args = ['-mtriple=' + config.host_triple + '-elf']
-
-llc_args = []
-
-# Similarly, have a macro to use llc with DWARF even when the host is Windows
-if re.search(r'windows-msvc', config.target_triple):
-    llc_args = [' -mtriple=' +
-                config.target_triple.replace('-msvc', '-gnu')]
-
-# Provide the path to asan runtime lib if available. On darwin, this lib needs
-# to be loaded via DYLD_INSERT_LIBRARIES before libLTO.dylib in case the files
-# to be linked contain instrumented sanitizer code.
-ld64_cmd = config.ld64_executable
-asan_rtlib = get_asan_rtlib()
-if asan_rtlib:
-    ld64_cmd = 'DYLD_INSERT_LIBRARIES={} {}'.format(asan_rtlib, ld64_cmd)
-
-ocamlc_command = '%s ocamlc -cclib -L%s %s' % (
-    config.ocamlfind_executable, config.llvm_lib_dir, config.ocaml_flags)
-ocamlopt_command = 'true'
-if config.have_ocamlopt:
-    ocamlopt_command = '%s ocamlopt -cclib -L%s -cclib -Wl,-rpath,%s %s' % (
-        config.ocamlfind_executable, config.llvm_lib_dir, config.llvm_lib_dir, config.ocaml_flags)
-
 opt_viewer_cmd = '%s %s/tools/opt-viewer/opt-viewer.py' % (sys.executable, config.llvm_src_root)
 
-tools = [
-    ToolSubst('%lli', FindTool('lli'), post='.', extra_args=lli_args),
-    ToolSubst('%llc_dwarf', FindTool('llc'), extra_args=llc_args),
-    ToolSubst('%go', config.go_executable, unresolved='ignore'),
-    ToolSubst('%gold', config.gold_executable, unresolved='ignore'),
-    ToolSubst('%ld64', ld64_cmd, unresolved='ignore'),
-    ToolSubst('%ocamlc', ocamlc_command, unresolved='ignore'),
-    ToolSubst('%ocamlopt', ocamlopt_command, unresolved='ignore'),
-    ToolSubst('%opt-viewer', opt_viewer_cmd),
-    ToolSubst('%llvm-objcopy', FindTool('llvm-objcopy')),
-    ToolSubst('%llvm-strip', FindTool('llvm-strip')),
-]
-
-# FIXME: Why do we have both `lli` and `%lli` that do slightly different things?
-tools.extend([
-    'dsymutil', 'lli', 'lli-child-target', 'llvm-ar', 'llvm-as',
-    'llvm-bcanalyzer', 'llvm-config', 'llvm-cov', 'llvm-cxxdump', 'llvm-cvtres',
-    'llvm-diff', 'llvm-dis', 'llvm-dwarfdump', 'llvm-exegesis', 'llvm-extract',
-    'llvm-isel-fuzzer', 'llvm-jitlink', 'llvm-opt-fuzzer', 'llvm-lib',
-    'llvm-link', 'llvm-lto', 'llvm-lto2', 'llvm-mc', 'llvm-mca',
-    'llvm-modextract', 'llvm-nm', 'llvm-objcopy', 'llvm-objdump',
-    'llvm-pdbutil', 'llvm-profdata', 'llvm-ranlib', 'llvm-rc', 'llvm-readelf',
-    'llvm-readobj', 'llvm-rtdyld', 'llvm-size', 'llvm-split', 'llvm-strings',
-    'llvm-strip', 'llvm-tblgen', 'llvm-undname', 'llvm-c-test', 'llvm-cxxfilt',
-    'llvm-xray', 'yaml2obj', 'obj2yaml', 'yaml-bench', 'verify-uselistorder',
-    'bugpoint', 'llc', 'llvm-symbolizer', 'opt', 'sancov', 'sanstats'])
-
-# The following tools are optional
-tools.extend([
-    ToolSubst('llvm-go', unresolved='ignore'),
-    ToolSubst('llvm-mt', unresolved='ignore'),
-    ToolSubst('Kaleidoscope-Ch3', unresolved='ignore'),
-    ToolSubst('Kaleidoscope-Ch4', unresolved='ignore'),
-    ToolSubst('Kaleidoscope-Ch5', unresolved='ignore'),
-    ToolSubst('Kaleidoscope-Ch6', unresolved='ignore'),
-    ToolSubst('Kaleidoscope-Ch7', unresolved='ignore'),
-    ToolSubst('Kaleidoscope-Ch8', unresolved='ignore')])
+tools = ['opt']
 
 llvm_config.add_tool_substitutions(tools, config.llvm_tools_dir)
 
 # Targets
-
 config.targets = frozenset(config.targets_to_build.split())
-
 for arch in config.targets_to_build.split():
     config.available_features.add(arch.lower() + '-registered-target')
-
-# Features
-known_arches = ["x86_64", "mips64", "ppc64", "aarch64"]
-if (config.host_ldflags.find("-m32") < 0
-    and any(config.llvm_host_triple.startswith(x) for x in known_arches)):
-  config.available_features.add("llvm-64-bits")
-
-config.available_features.add("host-byteorder-" + sys.byteorder + "-endian")
-
-if sys.platform in ['win32']:
-    # ExecutionEngine, no weak symbols in COFF.
-    config.available_features.add('uses_COFF')
-else:
-    # Others/can-execute.txt
-    config.available_features.add('can-execute')
-
-# Loadable module
-if config.has_plugins:
-    config.available_features.add('plugins')
-
-# Static libraries are not built if BUILD_SHARED_LIBS is ON.
-if not config.build_shared_libs and not config.link_llvm_dylib:
-    config.available_features.add('static-libs')
-
-def have_cxx_shared_library():
-    readobj_exe = lit.util.which('llvm-readobj', config.llvm_tools_dir)
-    if not readobj_exe:
-        print('llvm-readobj not found')
-        return False
-
-    try:
-        readobj_cmd = subprocess.Popen(
-            [readobj_exe, '-needed-libs', readobj_exe], stdout=subprocess.PIPE)
-    except OSError:
-        print('could not exec llvm-readobj')
-        return False
-
-    readobj_out = readobj_cmd.stdout.read().decode('ascii')
-    readobj_cmd.wait()
-
-    regex = re.compile(r'(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)')
-    needed_libs = False
-    for line in readobj_out.splitlines():
-        if 'NeededLibraries [' in line:
-            needed_libs = True
-        if ']' in line:
-            needed_libs = False
-        if needed_libs and regex.search(line.lower()):
-            return True
-    return False
-
-if have_cxx_shared_library():
-    config.available_features.add('cxx-shared-library')
-
-if config.libcxx_used:
-    config.available_features.add('libcxx-used')
-
-# Direct object generation
-if not 'hexagon' in config.target_triple:
-    config.available_features.add('object-emission')
-
-# LLVM can be configured with an empty default triple
-# Some tests are "generic" and require a valid default triple
-if config.target_triple:
-    config.available_features.add('default_triple')
-
-import subprocess
-
-
-def have_ld_plugin_support():
-    if not os.path.exists(os.path.join(config.llvm_shlib_dir, 'LLVMgold' + config.llvm_shlib_ext)):
-        return False
-
-    ld_cmd = subprocess.Popen(
-        [config.gold_executable, '--help'], stdout=subprocess.PIPE, env={'LANG': 'C'})
-    ld_out = ld_cmd.stdout.read().decode()
-    ld_cmd.wait()
-
-    if not '-plugin' in ld_out:
-        return False
-
-    # check that the used emulations are supported.
-    emu_line = [l for l in ld_out.split('\n') if 'supported emulations' in l]
-    if len(emu_line) != 1:
-        return False
-    emu_line = emu_line[0]
-    fields = emu_line.split(':')
-    if len(fields) != 3:
-        return False
-    emulations = fields[2].split()
-    if 'elf_x86_64' not in emulations:
-        return False
-    if 'elf32ppc' in emulations:
-        config.available_features.add('ld_emu_elf32ppc')
-
-    ld_version = subprocess.Popen(
-        [config.gold_executable, '--version'], stdout=subprocess.PIPE, env={'LANG': 'C'})
-    if not 'GNU gold' in ld_version.stdout.read().decode():
-        return False
-    ld_version.wait()
-
-    return True
-
-
-if have_ld_plugin_support():
-    config.available_features.add('ld_plugin')
-
-
-def have_ld64_plugin_support():
-    if not os.path.exists(os.path.join(config.llvm_shlib_dir, 'libLTO' + config.llvm_shlib_ext)):
-        return False
-
-    if config.ld64_executable == '':
-        return False
-
-    ld_cmd = subprocess.Popen(
-        [config.ld64_executable, '-v'], stderr=subprocess.PIPE)
-    ld_out = ld_cmd.stderr.read().decode()
-    ld_cmd.wait()
-
-    if 'ld64' not in ld_out or 'LTO' not in ld_out:
-        return False
-
-    return True
-
-
-if have_ld64_plugin_support():
-    config.available_features.add('ld64_plugin')
-
-# Ask llvm-config about asserts and global-isel.
-llvm_config.feature_config(
-    [('--assertion-mode', {'ON': 'asserts'}),
-     ('--has-global-isel', {'ON': 'global-isel'})])
-
-if 'darwin' == sys.platform:
-    try:
-        sysctl_cmd = subprocess.Popen(['sysctl', 'hw.optional.fma'],
-                                      stdout=subprocess.PIPE)
-    except OSError:
-        print('Could not exec sysctl')
-    result = sysctl_cmd.stdout.read().decode('ascii')
-    if -1 != result.find('hw.optional.fma: 1'):
-        config.available_features.add('fma3')
-    sysctl_cmd.wait()
-
-# .debug_frame is not emitted for targeting Windows x64.
-if not re.match(r'^x86_64.*-(windows-gnu|windows-msvc)', config.target_triple):
-    config.available_features.add('debug_frame')
-
-if config.have_libxar:
-    config.available_features.add('xar')
-
-if config.enable_threads:
-    config.available_features.add('thread_support')
-
-if config.llvm_libxml2_enabled:
-    config.available_features.add('libxml2')
-
-if config.have_opt_viewer_modules:
-    config.available_features.add('have_opt_viewer_modules')
diff --git a/hpvm/test/unitTests/TwoLevel.ll b/hpvm/test/unitTests/TwoLevel.ll
index 0289319517b7d6a1f83f7b64d615bcbd72630821..840a2b5685d33d02584b72d96482fedda9a52fb6 100644
--- a/hpvm/test/unitTests/TwoLevel.ll
+++ b/hpvm/test/unitTests/TwoLevel.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define dso_local void @Func1(i32* %In, i64 %Insize, i32* %Out, i64 %Outsize) #0 {
 entry:
   tail call void @__hpvm__hint(i32 1) #3
-  tail call void (i32, ...) @__hpvm__attributes(i32 1, i32* %In, i32* %Out, i32 1, i32* %Out) #3
+  tail call void (i32, ...) @__hpvm__attributes(i32 2, i32* %In, i32* %Out, i32 1, i32* %Out) #3
   %0 = load i32, i32* %In, align 4, !tbaa !2
   store i32 %0, i32* %Out, align 4, !tbaa !2
   tail call void (i32, ...) @__hpvm__return(i32 1, i32* %Out) #3