diff --git a/.gitignore b/.gitignore
index 605cade94d773b32c3d3c79d2340e247aad51886..f3a8c7903b4dab089be74151a36c29481c1ec843 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,4 @@
-hpvm/build/
-hpvm/install/
+hpvm/build*/
 hpvm/llvm/
 hpvm/llvm-*.src.tar.xz
 hpvm/llvm-*.src/
@@ -23,7 +22,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..bd5edbd1a467666f67c66be132b3a9d9bbd2d540
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,28 @@
+image: hpvm/gitlab-ci
+variables:
+  GIT_SUBMODULE_STRATEGY: recursive
+  # Use a faster compressor (fastzip) for caches and artifacts
+  FF_USE_FASTZIP: "true"
+  # Output upload and download progress every 2 seconds
+  TRANSFER_METER_FREQUENCY: "2s"
+  # Use the fastest (lightest) compression level for the cache
+  CACHE_COMPRESSION_LEVEL: "fastest"
+cache:
+  key: "$CI_COMMIT_REF_SLUG"
+  paths:
+    - hpvm/build/
+    - hpvm/llvm/
+  when: always
+
+build:
+  stage: build
+  tags:
+    - hpvm
+  script:
+    - pwd
+    - source activate hpvm && cd hpvm
+    - ./install.sh -j32 -t "X86" DCMAKE_BUILD_TYPE=Release
+    - cd ..
+  only:
+    - hpvm-release-exp
+    - merge_requests
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..be6d1e0985998a369e994efc434ed3d417b26bc3
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+ARG IMAGE_NAME=nvidia/cuda
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
+
+# Install dependencies: python 3.6, curl, git, libboost
+RUN apt-get update && apt-get install -y --no-install-recommends python3 curl git libboost-dev
+
+# Install cmake
+RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.20.0/cmake-3.20.0-linux-x86_64.sh -o cmake.sh && \
+    bash ./cmake.sh --skip-license --prefix=/usr && rm cmake.sh
+
+# Install conda
+RUN curl https://repo.anaconda.com/archive/Anaconda3-2020.11-Linux-x86_64.sh -o anaconda.sh && \
+    bash anaconda.sh -b && rm anaconda.sh
+
+# Set PATH to include conda
+ENV PATH="/root/anaconda3/bin:${PATH}"
+
+# Copy the repository (including the conda env spec) into the container
+COPY . /root/hpvm/
+
+# Create conda env named hpvm based on spec
+RUN conda env create -n hpvm -f /root/hpvm/hpvm/env.yaml
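+
+# At run time the environment is activated with `source activate hpvm`
+# (as the CI job in .gitlab-ci.yml does).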
diff --git a/README.md b/README.md
index 7fb3736d8b631233cdc150a83b716564499f1e7b..8d1040a8f41d273728de7585d3d2aaa366728bba 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # The HPVM Compiler Infrastructure
 
 [![Documentation Status](https://readthedocs.org/projects/hpvm/badge/?version=latest)](https://hpvm.readthedocs.io/en/latest/?badge=latest)
+[![pipeline status](https://gitlab.engr.illinois.edu/llvm/hpvm/badges/hpvm-release-exp/pipeline.svg)](https://gitlab.engr.illinois.edu/llvm/hpvm/-/commits/hpvm-release-exp)
 
 This repository contains the source code and documentation for the HPVM Compiler Infrastructure.
 
diff --git a/hpvm/CMakeLists.txt b/hpvm/CMakeLists.txt
index 809a30cfa52e16f436dac4e22843f4c5a3add3d9..71e6de5999ad0127da5155d93fe3403c05cb3c7f 100644
--- a/hpvm/CMakeLists.txt
+++ b/hpvm/CMakeLists.txt
@@ -7,11 +7,16 @@ message(STATUS "CUDA Architecture: ${CMAKE_CUDA_ARCHITECTURES}")
 
 # find_package will use the auxiliary cmake/Find*.cmake we provide
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-find_package(CUDNN 7 EXACT REQUIRED)  # CUDNN_INCLUDE_PATH, CUDNN_LIBRARY_PATH
+find_package(CUDNN 7 EXACT REQUIRED)  # CUDNN_INCLUDE_PATH, CUDNN_LIBRARY_DIR and CUDNN::cudnn
+find_package(OpenCL)  # Defines ${OpenCL_INCLUDE_DIRS} and ${OpenCL_LIBRARY}
+if(${OpenCL_FOUND})
+  set(HPVM_USE_OPENCL 1)
+else()
+  message(WARNING "OpenCL not found. --opencl flag of hpvm-clang will be unavailable for this build.")
+  set(HPVM_USE_OPENCL 0)
+endif()
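+# HPVM_USE_OPENCL is consumed by the subprojects, e.g. hpvm-rt only compiles
+# its OpenCL code paths when this is 1 (see projects/hpvm-rt/CMakeLists.txt).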
 
 include_directories(./include/)
-# find_package will use the auxillary cmake/Find*.cmake we provide
-list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
 # Generate TENSOR_RT_PREFIX into config.h
 set(TENSOR_RT_PREFIX ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
diff --git a/hpvm/cmake/FindCUDNN.cmake b/hpvm/cmake/FindCUDNN.cmake
index e5a427f0317a6f3b8f7e7b2cc89fd176fd4362dc..bb4918f704a5fe210f0e25e893e1b33335189add 100644
--- a/hpvm/cmake/FindCUDNN.cmake
+++ b/hpvm/cmake/FindCUDNN.cmake
@@ -10,8 +10,9 @@
 # The following are set after configuration is done:
 #  CUDNN_FOUND
 #  CUDNN_INCLUDE_PATH
-#  CUDNN_LIBRARY_PATH
+#  CUDNN_LIBRARY_DIR
 #
+# It also provides the IMPORTed target CUDNN::cudnn.
 
 include(FindPackageHandleStandardArgs)
 
@@ -45,11 +46,8 @@ endif()
 find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME}
   PATHS ${CUDNN_LIBRARY}
   PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
-# Get director from filename ${CUDNN_LIBRARY_PATH}
-get_filename_component(
-  CUDNN_LIBRARY_PATH
-  "${CUDNN_LIBRARY_PATH}/.." ABSOLUTE
-)
+# Get directory from filename ${CUDNN_LIBRARY_PATH}
+get_filename_component(CUDNN_LIBRARY_DIR "${CUDNN_LIBRARY_PATH}/.." ABSOLUTE)
 
 # This version check is from OpenCV repo: https://github.com/opencv/opencv/blob/master/cmake/FindCUDNN.cmake
 # extract version from the include
@@ -80,4 +78,8 @@ find_package_handle_standard_args(
   VERSION_VAR CUDNN_VERSION
 )
 
+add_library(CUDNN::cudnn IMPORTED INTERFACE)
+target_include_directories(CUDNN::cudnn SYSTEM INTERFACE "${CUDNN_INCLUDE_PATH}")
+target_link_libraries(CUDNN::cudnn INTERFACE "${CUDNN_LIBRARY_PATH}")
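+# Consumers can now link the imported target directly, e.g.:
+#   target_link_libraries(my_tool PRIVATE CUDNN::cudnn)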
+
 mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY)
diff --git a/hpvm/docs/install.rst b/hpvm/docs/install.rst
index a4f386ff1b49d48e17e4c1e085fa9c0de41a5c19..1b6ad4c6068d43e21c983d689594698e586a8810 100644
--- a/hpvm/docs/install.rst
+++ b/hpvm/docs/install.rst
@@ -150,14 +150,14 @@ Some common options that can be used with CMake are:
 * ``-DCMAKE_BUILD_TYPE=type`` --- Valid options for type are Debug, Release, RelWithDebInfo, and MinSizeRel. Default is Debug.
 * ``-DLLVM_ENABLE_ASSERTIONS=On`` --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types).
 
-Now, compile the HPVM Compilation Tool ``approxhpvm.py`` using:
+Now, compile the HPVM Compilation Tool ``hpvm-clang`` using:
 
 .. code-block:: shell
 
-   make -j<number of threads> approxhpvm.py
+   make -j<number of threads> hpvm-clang
 
 With all the aforementioned steps, HPVM should be built, installed, tested and ready to use.
-In particular, ``approxhpvm.py`` should be an executable command from your command line.
+In particular, ``hpvm-clang`` should be an executable command from your command line.
 
 Benchmarks and Tests
 --------------------
@@ -166,7 +166,7 @@ We provide a number of general benchmarks, DNN benchmarks, and test cases, writt
 
 ``make`` targets ``check-hpvm-pass``, ``check-hpvm-dnn``, and ``check-hpvm-profiler``
 test various components of HPVM and are increasingly time-consuming.
-You can run tests similarly as how ``approxhpvm.py`` is compiled: for example,
+You can run these tests the same way ``hpvm-clang`` is compiled: for example,
 
 .. code-block:: shell
 
diff --git a/hpvm/include/SupportHPVM/HPVMUtils.h b/hpvm/include/SupportHPVM/HPVMUtils.h
index 9a91494a41d6109cda8a8b9b885919fd197fb768..781306956dad0eacc85eadaaf60be4c8ce0e7b21 100644
--- a/hpvm/include/SupportHPVM/HPVMUtils.h
+++ b/hpvm/include/SupportHPVM/HPVMUtils.h
@@ -404,6 +404,7 @@ bool tagIncludesTarget(hpvm::Target Tag, hpvm::Target T) {
       return false;
   default:
     assert(false && "Unknown Target\n");
+    return false; // Unreachable, but returning avoids missing-return warnings
+                  // from compilers that cannot prove it.
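+    // (LLVM's llvm_unreachable("Unknown Target") would express the same intent.)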
   }
 }
 
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/include/hpvm.h b/hpvm/include/hpvm.h
similarity index 100%
rename from hpvm/test/dnn_benchmarks/hpvm-c/include/hpvm.h
rename to hpvm/include/hpvm.h
diff --git a/hpvm/install.sh b/hpvm/install.sh
index dd737034f043e2022710a94982467e60456d2bd4..3676a61972ff32566102d0eebdb490f00eaccb4b 100755
--- a/hpvm/install.sh
+++ b/hpvm/install.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
-# Run installer script
-# Pass on args to installer that can parse them
+# Run installer script and pass on args to installer that can parse them
 scripts/hpvm_installer.py "$@"
-# Set path.
-export PATH=$BUILD_DIR/bin:$PATH
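+# Propagate the installer's exit status so callers (e.g. CI) can fail on error.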
+ret_code=$?
+echo "Installer returned with code $ret_code"
+exit $ret_code
diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index 7ebdc39c28bdd49c193c5ae980439439178492a3..e7293a0640b5d7e45614459ed9687768998142a4 100644
--- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -58,19 +58,7 @@ bool BuildDFG::runOnModule(Module &M) {
         Roots.push_back(Root);
         BuildGraph(Root, F);
 
-        for (DFGraph::children_iterator i = Root->getChildGraph()->begin(),
-                                        e = Root->getChildGraph()->end();
-             i != e; i++) {
-          DFNode *N = *i;
-          // DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n");
-        }
         Root->getChildGraph()->sortChildren();
-        for (DFGraph::children_iterator i = Root->getChildGraph()->begin(),
-                                        e = Root->getChildGraph()->end();
-             i != e; i++) {
-          DFNode *N = *i;
-          // DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n");
-        }
         viewDFGraph(Root->getChildGraph());
       }
     }
diff --git a/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp b/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
index a44b79b3fe20cb52383ff7458466a72d041b90fc..d5904bd83c0eadcbdd912a79443bd7126acc36c5 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
@@ -283,30 +283,6 @@ void CGT_CPU::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart,
   ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
 }
 
-Instruction *CGT_CPU::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
-                                          BasicBlock *Body) {
-  Module *M = Entry->getParent()->getParent();
-  Type *Int64Ty = Type::getInt64Ty(M->getContext());
-
-  // Insert a PHI instruction at the beginning of the condition block
-  Instruction *IB = Cond->getFirstNonPHI();
-  PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
-
-  ConstantInt *IConst =
-      ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
-  Instruction *CounterIncr =
-      BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
-                                "cnt_incr", Body->getTerminator());
-
-  // Set incoming values for Phi node
-  IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
-  CounterPhi->addIncoming(IConst, Entry);
-  CounterPhi->addIncoming(CounterIncr, Body);
-
-  // Return the pointer to the created PHI node in the corresponding argument
-  return CounterPhi;
-}
-
 /* Add Loop around the instruction I
  * Algorithm:
  * (1) Split the basic block of instruction I into three parts, where the
diff --git a/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp b/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
index 4653ad0f8a4c4ddd8d76e984b1750d9f94d813a9..0559e8136da6bff441e3c1fb0b948bcaaeb954ee 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
@@ -540,9 +540,6 @@ void CGT_CUDNN::codeGen(DFLeafNode *N) {
       case Intrinsic::hpvm_node_id: { /* llvm.hpvm.node.id */
         DEBUG(errs() << F_cudnn->getName()
                      << "\t: Handling Node ID Intrinsic \n");
-        // Get uint32 argument
-        Value *Op = II->getOperand(0);
-
         // Argument list for the runtime call
         std::vector<Value *> Args;
         Args.push_back(II->getOperand(0));
diff --git a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt
index 4041df11ce8d79e39d6f72bdf0a1068eae449300..1e34f44f403e373607d2efc10aca7f796c81b938 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt
+++ b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt
@@ -10,6 +10,7 @@ add_llvm_library( LLVMDFG2LLVM_OpenCL
 
   DEPENDS
   intrinsics_gen
+  llvm-cbe  # Called within the pass
   PLUGIN_TOOL
   opt
   )
diff --git a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/DFG2LLVM_OpenCL.cpp b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/DFG2LLVM_OpenCL.cpp
index 5a58f272b3042a4ebfc2e3c7bb3606b5c19e8d84..286bcdebd2ed3322b134581f3e8fdac286ee325b 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/DFG2LLVM_OpenCL.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/DFG2LLVM_OpenCL.cpp
@@ -47,6 +47,8 @@
 #include "llvm/Support/ToolOutputFile.h"
 
 #include <sstream>
+#include <unistd.h>  // get_current_dir_name() (POSIX-only)
+#include <cstdlib>  // std::system
 
 #ifndef LLVM_BUILD_DIR
 #error LLVM_BUILD_DIR is not defined
@@ -147,7 +149,7 @@ static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
 static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
                               ValueToValueMapTy &, Instruction *,
                               const Twine &WGName = "WGSize");
-static std::string getPTXFilename(const Module &);
+static std::string getPTXFilePath(const Module &);
 static void changeDataLayout(Module &);
 static void changeTargetTriple(Module &);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
@@ -258,7 +260,7 @@ public:
     DEBUG(errs() << *KernelM);
   }
 
-  void writeKernelsModule();
+  void writeKModCompilePTX();
 };
 
 // Initialize the HPVM runtime API. This makes it easier to insert these calls
@@ -846,7 +848,7 @@ void CGT_OpenCL::codeGen(DFInternalNode *N) {
     // Now the remaining nodes to be visited should be ignored
     KernelLaunchNode = NULL;
     DEBUG(errs() << "Insert Runtime calls\n");
-    insertRuntimeCalls(N, kernel, getPTXFilename(M));
+    insertRuntimeCalls(N, kernel, getPTXFilePath(M));
 
   } else {
     DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
@@ -1901,7 +1903,7 @@ bool DFG2LLVM_OpenCL::runOnModule(Module &M) {
     CGTVisitor->visit(rootNode);
   }
 
-  CGTVisitor->writeKernelsModule();
+  CGTVisitor->writeKModCompilePTX();
 
   // TODO: Edit module epilogue to remove the HPVM intrinsic declarations
   delete CGTVisitor;
@@ -2010,16 +2012,18 @@ void CGT_OpenCL::addCLMetadata(Function *F) {
   MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 }
 
-void CGT_OpenCL::writeKernelsModule() {
+void CGT_OpenCL::writeKModCompilePTX() {
 
   // In addition to deleting all other functions, we also want to spiff it
   // up a little bit.  Do this now.
   legacy::PassManager Passes;
+  auto kmodName = getKernelsModuleName(M);
+  auto ptxPath = getPTXFilePath(M);
 
   DEBUG(errs() << "Writing to File --- ");
-  DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n");
+  DEBUG(errs() << kmodName << "\n");
   std::error_code EC;
-  ToolOutputFile Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
+  ToolOutputFile Out(kmodName.c_str(), EC, sys::fs::F_None);
   if (EC) {
     DEBUG(errs() << EC.message() << '\n');
   }
@@ -2030,6 +2034,13 @@ void CGT_OpenCL::writeKernelsModule() {
 
   // Declare success.
   Out.keep();
+
+  // Now invoke llvm-cbe to compile the kernels module.
+  auto llvmCBE = std::string(LLVM_BUILD_DIR_STR) + "/bin/llvm-cbe";
+  std::string command = llvmCBE + " " + kmodName + " -o " + ptxPath;
+  DEBUG(errs() << "Compiling PTX from kernel module:\n");
+  DEBUG(errs() << command << "\n");
+  if (std::system(command.c_str()) != 0)
+    DEBUG(errs() << "llvm-cbe invocation failed\n");
 }
 
 Function *CGT_OpenCL::transformFunctionToVoid(Function *F) {
@@ -2346,10 +2357,13 @@ static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize,
 }
 
 // Get generated PTX binary name
-static std::string getPTXFilename(const Module &M) {
+static std::string getPTXFilePath(const Module &M) {
   std::string moduleID = M.getModuleIdentifier();
-  moduleID.append(".kernels.cl");
-  return moduleID;
+  char *cwd_p = get_current_dir_name();
+  std::string cwd(cwd_p);
+  free(cwd_p);
+  std::string ptxPath = cwd + "/" + moduleID + ".kernels.cl";
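+  // e.g. a module identified as "foo.ll", compiled from /work, maps to "/work/foo.ll.kernels.cl"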
+  return ptxPath;
 }
 
 // Changes the data layout of the Module to be compiled with OpenCL backend
diff --git a/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py b/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
index 14d0f491b8dccb41f7136087f4146f64e0470693..4e91fbbe4a4af2c16b7583443360a09d88b0ac61 100644
--- a/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
+++ b/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
@@ -50,7 +50,7 @@ def profile_configs(
         # Run binary_path binary,
         # which generates `profile_filename` and `qos_filename` file in cwd.
         try:
-            check_call([str(binary_path), "-c", str(temp_file.name)], stdout=PIPE)
+            check_call([str(binary_path), "-c", str(temp_file.name)])
         except CalledProcessError as e:
             print("Output from the program:")
             print(e.output)
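+            # NB: e.output is None here unless output is captured;
+            # stdout=PIPE was dropped above so program output goes to the console.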
@@ -158,9 +158,12 @@ def read_hpvm_configs(config_file: PathLike) -> Tuple[str, List[Config]]:
 
 
 def write_hpvm_config(header: str, configs: Iterable[Config], to_file: PathLike):
+
     text_segs = [header] + [str(config) for config in configs]
     with open(to_file, "w") as f:
         f.write("\n".join(text_segs))
+        f.flush()
+
 
 
 def _read_profile_file(profile_file_path: Path):
diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt
index 6efd8d3d0a9d86236adc87657fb68b782f3daaa0..3147d00e892f8d89b71ebbe7e04aa0fa33210534 100644
--- a/hpvm/projects/hpvm-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-rt/CMakeLists.txt
@@ -3,16 +3,18 @@ add_definitions(-DNUM_CORES=8)
 SET(CMAKE_C_COMPILER ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang)
 SET(CMAKE_CXX_COMPILER ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang++)
 SET(CMAKE_CXX_STANDARD 11)
-# Defines ${OpenCL_INCLUDE_DIRS} and ${OpenCL_LIBRARY} if found
-find_package(OpenCL REQUIRED)
 
 # This puts libhpvm-rt.a in lib/ which we don't care about
 # we want ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/hpvm-rt.dir/hpvm-rt.cpp.o
 # which is a LLVM Bitcode file because of the -flto below.
 add_llvm_library(hpvm-rt hpvm-rt.cpp DEPENDS clang)
-target_compile_options(hpvm-rt PUBLIC -flto)
-target_include_directories(hpvm-rt PRIVATE ${OpenCL_INCLUDE_DIRS})
-target_link_directories(hpvm-rt PUBLIC ${OpenCL_LIBRARY})
+target_compile_options(hpvm-rt PUBLIC -flto)
+if(${HPVM_USE_OPENCL})
+  # The runtime sources test #ifdef HPVM_USE_OPENCL, so define the macro only
+  # when OpenCL is available (defining it as 0 would still enable those paths).
+  target_compile_definitions(hpvm-rt PUBLIC HPVM_USE_OPENCL=1)
+  target_include_directories(hpvm-rt PRIVATE ${OpenCL_INCLUDE_DIRS})
+  target_link_directories(hpvm-rt PUBLIC ${OpenCL_LIBRARY})
+else()
+  message(STATUS "hpvm-rt.bc is not using OpenCL.")
+endif()
 
 # Move and rename hpvm-rt.cpp.o to be an actual bc code
 add_custom_command(
@@ -22,4 +24,4 @@ add_custom_command(
     ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/hpvm-rt.dir/hpvm-rt.cpp.o
     ${CMAKE_CURRENT_BINARY_DIR}/hpvm-rt.bc
 )
-add_custom_target(hpvm-rt.bc ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/hpvm-rt.bc")
+add_custom_target(hpvm-rt-bc ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/hpvm-rt.bc")
diff --git a/hpvm/projects/hpvm-rt/hpvm-rt.cpp b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
index dff4f0e4a21e8e0eafad36bd7b14805c584cd10c..f0716378fe8c62639555396a3d268aae051534ba 100644
--- a/hpvm/projects/hpvm-rt/hpvm-rt.cpp
+++ b/hpvm/projects/hpvm-rt/hpvm-rt.cpp
@@ -1,4 +1,6 @@
-#include <CL/cl.h>
+
+// #define HPVM_USE_OPENCL 1  // Normally defined by the build when OpenCL is found.
+
 #include <algorithm>
 #include <cassert>
 #include <cstdio>
@@ -8,9 +10,14 @@
 #include <map>
 #include <pthread.h>
 #include <string>
-
 #include <unistd.h>
 
+#ifdef HPVM_USE_OPENCL
+
+#include <CL/cl.h>
+
+#endif
+
 #if _POSIX_VERSION >= 200112L
 #include <sys/time.h>
 #endif
@@ -40,6 +47,9 @@ typedef struct {
   std::vector<CircularBuffer<uint64_t> *> *isLastInputBuffers;
 } DFNodeContext_CPU;
 
+
+#ifdef HPVM_USE_OPENCL
+
 typedef struct {
   cl_context clOCLContext;
   cl_command_queue clCommandQue;
@@ -51,6 +61,9 @@ cl_context globalOCLContext;
 cl_device_id *clDevices;
 cl_command_queue globalCommandQue;
 
+#endif
+
+
 MemTracker MTracker;
 vector<DFGDepth> DStack;
 // Mutex to prevent concurrent access by multiple threads in pipeline
@@ -59,6 +72,8 @@ pthread_mutex_t ocl_mtx;
 #define NUM_TESTS 1
 hpvm_TimerSet kernel_timer;
 
+#ifdef HPVM_USE_OPENCL
+
 static const char *getErrorString(cl_int error) {
   switch (error) {
   // run-time and JIT compiler errors
@@ -209,6 +224,15 @@ static inline void checkErr(cl_int err, cl_int success, const char *name) {
   }
 }
 
+#endif
+
+
+void openCLAbort() {
+  cout << "ERROR: OpenCL not found! Please recompile with OpenCL available on the system.\n";
+  abort();
+}
+
+
 /************************* Depth Stack Routines ***************************/
 
 void llvm_hpvm_cpu_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
@@ -260,6 +284,9 @@ uint64_t llvm_hpvm_cpu_getDimInstance(unsigned level, unsigned dim) {
 /********************** Memory Tracking Routines **************************/
 
 void llvm_hpvm_track_mem(void *ptr, size_t size) {
+
+#ifdef HPVM_USE_OPENCL 
+  
   DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE != NULL) {
@@ -269,9 +296,19 @@ void llvm_hpvm_track_mem(void *ptr, size_t size) {
   DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n");
   MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr);
   DEBUG(MTracker.print());
+
+#else
+
+  openCLAbort();
+  
+#endif
+  
 }
 
 void llvm_hpvm_untrack_mem(void *ptr) {
+
+#ifdef HPVM_USE_OPENCL 
+
   DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE == NULL) {
@@ -284,11 +321,22 @@ void llvm_hpvm_untrack_mem(void *ptr) {
     clReleaseMemObject((cl_mem)MTE->getAddress());
   MTracker.remove(ptr);
   DEBUG(MTracker.print());
+
+#else
+
+  openCLAbort();
+  
+#endif
+  
 }
 
+
+#ifdef HPVM_USE_OPENCL 
+
 static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size,
                                        DFNodeContext_OCL *Context, bool isInput,
                                        bool isOutput) {
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "[OCL] Request memory: " << ptr
              << " for context: " << Context->clOCLContext << flush << "\n");
@@ -343,13 +391,20 @@ static void *llvm_hpvm_ocl_request_mem(void *ptr, size_t size,
   DEBUG(MTracker.print());
   pthread_mutex_unlock(&ocl_mtx);
   return d_input;
+  
 }
 
+#endif
+
+
 void *llvm_hpvm_cpu_argument_ptr(void *ptr, size_t size) {
   return llvm_hpvm_request_mem(ptr, size);
 }
 
 void *llvm_hpvm_request_mem(void *ptr, size_t size) {
+
+#ifdef HPVM_USE_OPENCL 
+
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "[CPU] Request memory: " << ptr << flush << "\n");
   MemTrackerEntry *MTE = MTracker.lookup(ptr);
@@ -386,6 +441,13 @@ void *llvm_hpvm_request_mem(void *ptr, size_t size) {
   DEBUG(MTracker.print());
   pthread_mutex_unlock(&ocl_mtx);
   return ptr;
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 /*************************** Timer Routines **********************************/
@@ -419,6 +481,9 @@ get_last_async(struct hpvm_TimerSet *timers) {
 }
 
 static void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) {
+
+#ifdef HPVM_USE_OPENCL
+  
   cl_int ciErrNum = CL_SUCCESS;
   struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
 
@@ -441,10 +506,20 @@ static void insert_marker(struct hpvm_TimerSet *tset, enum hpvm_TimerID timer) {
   if (ciErrNum != CL_SUCCESS) {
     fprintf(stderr, "Error Enqueueing Marker!\n");
   }
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 static void insert_submarker(struct hpvm_TimerSet *tset, char *label,
                              enum hpvm_TimerID timer) {
+
+#ifdef HPVM_USE_OPENCL
+  
   cl_int ciErrNum = CL_SUCCESS;
   struct hpvm_async_time_marker_list **new_event = &(tset->async_markers);
 
@@ -467,10 +542,20 @@ static void insert_submarker(struct hpvm_TimerSet *tset, char *label,
   if (ciErrNum != CL_SUCCESS) {
     fprintf(stderr, "Error Enqueueing Marker!\n");
   }
+
+#else
+
+  openCLAbort();
+
+#endif
+  
 }
 
 /* Assumes that all recorded events have completed */
 static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) {
+
+#ifdef HPVM_USE_OPENCL
+  
   struct hpvm_async_time_marker_list *next_interval = NULL;
   struct hpvm_async_time_marker_list *last_marker = get_last_async(tset);
   hpvm_Timestamp total_async_time = 0;
@@ -517,6 +602,13 @@ static hpvm_Timestamp record_async_times(struct hpvm_TimerSet *tset) {
     next_interval->timerID = INVALID_TIMERID;
 
   return total_async_time;
+
+#else
+
+  openCLAbort();
+
+#endif
+  
 }
 
 static void accumulate_time(hpvm_Timestamp *accum, hpvm_Timestamp start,
@@ -733,6 +825,9 @@ void hpvm_AddSubTimer(struct hpvm_TimerSet *timers, char *label,
 }
 
 void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) {
+
+#ifdef HPVM_USE_OPENCL
+  
   // cerr << "Switch to timer: " << timer << flush << "\n";
   /* Stop the currently running timer */
   if (timers->current != hpvm_TimerID_NONE) {
@@ -849,10 +944,21 @@ void hpvm_SwitchToTimer(struct hpvm_TimerSet *timers, enum hpvm_TimerID timer) {
     }
   }
   timers->current = timer;
+
+#else
+
+  openCLAbort();
+
+#endif
+
+
 }
 
 void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label,
                            enum hpvm_TimerID category) {
+
+#ifdef HPVM_USE_OPENCL
+  
   struct hpvm_SubTimerList *subtimerlist =
       timers->sub_timer_list[timers->current];
   struct hpvm_SubTimer *curr =
@@ -1001,6 +1107,13 @@ void hpvm_SwitchToSubTimer(struct hpvm_TimerSet *timers, char *label,
   }
 
   timers->current = category;
+
+#else
+
+  openCLAbort();
+
+#endif
+  
 }
 
 void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) {
@@ -1069,6 +1182,9 @@ void hpvm_PrintTimerSet(struct hpvm_TimerSet *timers) {
 }
 
 void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) {
+
+#ifdef HPVM_USE_OPENCL
+  
   /* clean up all of the async event markers */
   struct hpvm_async_time_marker_list *event = timers->async_markers;
   while (event != NULL) {
@@ -1106,6 +1222,13 @@ void hpvm_DestroyTimerSet(struct hpvm_TimerSet *timers) {
       free(timers->sub_timer_list[i]);
     }
   }
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 /**************************** Pipeline API ************************************/
@@ -1304,10 +1427,13 @@ void llvm_hpvm_cpu_wait(void *graphID) {
   DEBUG(cout << "\t... pthread Done!\n");
 }
 
+
+#ifdef HPVM_USE_OPENCL
+
 // Returns the platform name.
 std::string getPlatformName(cl_platform_id pid) {
+ 
   cl_int status;
-
   size_t sz;
   status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz);
   checkErr(status, CL_SUCCESS, "Query for platform name size failed");
@@ -1318,12 +1444,18 @@ std::string getPlatformName(cl_platform_id pid) {
 
   const auto &tmp = std::string(name, name + sz);
   delete[] name;
-  return tmp;
+  return tmp;
 }
 
+#endif
+
+
+#ifdef HPVM_USE_OPENCL
+
 // Searches all platforms for the first platform whose name
 // contains the search string (case-insensitive).
 cl_platform_id findPlatform(const char *platform_name_search) {
+  
   cl_int status;
 
   std::string search = platform_name_search;
@@ -1360,7 +1492,13 @@ cl_platform_id findPlatform(const char *platform_name_search) {
   assert(false && "No matching platform found!");
 }
 
+#endif
+
+
 void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(std::string Target = T == hpvm::GPU_TARGET ? "GPU" : "SPIR");
   DEBUG(cout << "Initializing Context for " << Target << " device\n");
@@ -1450,9 +1588,19 @@ void *llvm_hpvm_ocl_initContext(enum hpvm::Target T) {
 
   pthread_mutex_unlock(&ocl_mtx);
   return globalOCLContext;
+
+#else
+
+  openCLAbort();
+
+#endif
+  
 }
 
 void llvm_hpvm_ocl_clearContext(void *graphID) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Clear Context\n");
   DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
@@ -1464,9 +1612,19 @@ void llvm_hpvm_ocl_clearContext(void *graphID) {
   cout << "Printing HPVM Timer: KernelTimer\n";
   hpvm_PrintTimerSet(&kernel_timer);
   pthread_mutex_unlock(&ocl_mtx);
+
+#else
+
+  openCLAbort();
+
+#endif
+  
 }
 
 void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Shared Memory Input:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
@@ -1477,10 +1635,20 @@ void llvm_hpvm_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument");
   pthread_mutex_unlock(&ocl_mtx);
+
+#else
+
+  openCLAbort();
+
+#endif
+  
 }
 
 void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index,
                                    size_t size) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Scalar Input:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
@@ -1491,10 +1659,20 @@ void llvm_hpvm_ocl_argument_scalar(void *graphID, void *input, int arg_index,
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
   checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
   pthread_mutex_unlock(&ocl_mtx);
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index,
                                  size_t size, bool isInput, bool isOutput) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Pointer Input:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input
@@ -1518,9 +1696,19 @@ void *llvm_hpvm_ocl_argument_ptr(void *graphID, void *input, int arg_index,
   DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n");
   pthread_mutex_unlock(&ocl_mtx);
   return d_input;
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set device memory for Output Struct:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
@@ -1536,12 +1724,22 @@ void *llvm_hpvm_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
   DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n");
   pthread_mutex_unlock(&ocl_mtx);
   return d_output;
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 void llvm_hpvm_ocl_free(void *ptr) {}
 
 void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output,
                               size_t size) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Get Output:\n");
   DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output
@@ -1555,11 +1753,21 @@ void *llvm_hpvm_ocl_getOutput(void *graphID, void *h_output, void *d_output,
   checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output");
   pthread_mutex_unlock(&ocl_mtx);
   return h_output;
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim,
                                 const size_t *localWorkSize,
                                 const size_t *globalWorkSize) {
+
+#ifdef HPVM_USE_OPENCL 
+ 
   pthread_mutex_lock(&ocl_mtx);
 
   size_t GlobalWG[3];
@@ -1610,6 +1818,13 @@ void *llvm_hpvm_ocl_executeNode(void *graphID, unsigned workDim,
 
   pthread_mutex_unlock(&ocl_mtx);
   return NULL;
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1656,6 +1871,9 @@ static char *LoadProgSource(const char *Filename, size_t *szFinalLength) {
 }
 
 void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Launch OCL Kernel\n");
   // Initialize OpenCL
@@ -1717,14 +1935,31 @@ void *llvm_hpvm_ocl_launch(const char *FileName, const char *KernelName) {
 
   pthread_mutex_unlock(&ocl_mtx);
   return Context;
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 void llvm_hpvm_ocl_wait(void *graphID) {
+
+#ifdef HPVM_USE_OPENCL
+  
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Wait\n");
   DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
   clFinish(Context->clCommandQue);
   pthread_mutex_unlock(&ocl_mtx);
+
+#else
+
+  openCLAbort();
+
+#endif
+
 }
 
 void llvm_hpvm_switchToTimer(void **timerSet, enum hpvm_TimerID timer) {
@@ -1749,3 +1984,5 @@ void *llvm_hpvm_initializeTimerSet() {
   pthread_mutex_unlock(&ocl_mtx);
   return TS;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
index a142d524b69cb605b85c496aa140c806ad258dfd..cbabc8bbe0111a0ec6c99520176a8b37a530a4fb 100644
--- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -24,10 +24,8 @@ configure_file(
 # -- Default include directories
 set(
   INCLUDES
-  ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-  ${CUDNN_INCLUDE_PATH}
   ./tensor_runtime/include ${CMAKE_CURRENT_BINARY_DIR}/tensor_runtime/include
-  ./dnn_sources/include
+  ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_PATH}
 )
 
 # Build gpu_profiler and soc_simulator (dependencies)
@@ -39,13 +37,43 @@ add_library(soc_simulator SHARED soc_simulator/promise_timing_model.cpp)
 target_include_directories(soc_simulator PUBLIC soc_simulator/)
 
 # -- Link libraries
-find_package(OpenMP REQUIRED)  # Provides ${OpenMP_CXX_FLAGS}
-set(LINK_DIR CUDNN_LIBRARY_PATH)
-set(LINK_LIBS gpu_profiler soc_simulator stdc++fs cudnn curand cublas)
+find_package(OpenMP REQUIRED)  # Provides ${OpenMP_CXX_FLAGS} and OpenMP::OpenMP_CXX
+# This will use the CUDA found by CUDA language support in the root CMake,
+# but it exports the CUDA::* targets (used below) so we can freely add libraries to link to.
+find_package(CUDAToolkit REQUIRED)
+set(
+  LINK_LIBS
+  gpu_profiler soc_simulator
+  CUDA::cublas CUDA::curand CUDNN::cudnn
+  OpenMP::OpenMP_CXX
+)
 if(USE_GFLAGS)
   list(APPEND LINK_LIBS gflags)
 endif()
 
+# -- Manually list the directories (TRT_LINK_DIRS) and libraries (TRT_LINK_LIBS)
+# tensor_runtime links to, since CMake offers no direct way to query them.
+# Both hpvm-clang and the RPATH-setting step (below) need these lists.
+# First, infer the library paths from the imported targets' properties.
+foreach(interface_lib ${LINK_LIBS})
+  get_target_property(actual_libs ${interface_lib} INTERFACE_LINK_LIBRARIES)
+  foreach(actual_lib ${actual_libs})
+    # ${actual_lib} may not be a path, then taking the directory of it should return "".
+    get_filename_component(libdir ${actual_lib} DIRECTORY)
+    get_filename_component(libname ${actual_lib} NAME)
+    if(NOT ${libdir} STREQUAL "")
+      list(APPEND TRT_LINK_DIRS ${libdir})
+    endif()
+    if(NOT ${libname} STREQUAL "" AND NOT ${libname} STREQUAL "actual_libs-NOTFOUND")
+      list(APPEND TRT_LINK_LIBS ${libname})
+    endif()
+  endforeach()
+endforeach()
+# Dedup, just for shorter compiler arguments.
+list(REMOVE_DUPLICATES TRT_LINK_DIRS)
+list(REMOVE_DUPLICATES TRT_LINK_LIBS)
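+# (For illustration: TRT_LINK_DIRS typically ends up with the CUDA and cuDNN lib
+# directories, and TRT_LINK_LIBS with names like libcublas.so; values are system-dependent.)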
+
 # -- Definitions
 set(DEFS -DPROMISE_TUNER_ENABLED -DSIMULATION_MODE=true)
 if(USE_GFLAGS)
@@ -77,16 +105,27 @@ endforeach()
 # -- Adding tensor_runtime targets
 function(add_tensor_runtime target_name)
   add_library(${target_name} SHARED ${RUNTIME_SRCS})
-  set_property(TARGET ${target_name} PROPERTY CUDA_ARCHITECTURES 60)
+  set_target_properties(${target_name} PROPERTIES CUDA_ARCHITECTURES 60)
   target_compile_options(
     ${target_name} PRIVATE
     --expt-relaxed-constexpr -maxrregcount 32 -Xcompiler=${OpenMP_CXX_FLAGS}
     $<$<CONFIG:DEBUG>:-lineinfo -Xcompiler=-ggdb>
   )
   target_include_directories(${target_name} PUBLIC ${INCLUDES})
-  target_link_directories(${target_name} PUBLIC ${LINK_DIR})
-  target_link_libraries(${target_name} PUBLIC ${LINK_LIBS} ${OpenMP_CXX_FLAGS})
+  target_link_libraries(${target_name} PUBLIC ${LINK_LIBS})
   target_compile_definitions(${target_name} PRIVATE ${DEFS} ${ARGN})
+
+  # Set rpath manually; CMake does not propagate it for these imported libraries.
+  foreach(libdir ${TRT_LINK_DIRS})
+    target_link_options(${target_name} PRIVATE "-Wl,-rpath,${libdir}")
+  endforeach()
+  # Also slap TRT_LINK_DIRS and TRT_LINK_LIBS on this target
+  # so that hpvm-clang can read them. (we'll create our own properties.)
+  set_target_properties(
+    ${target_name} PROPERTIES
+    TRT_LINK_DIRS "${TRT_LINK_DIRS}"
+    TRT_LINK_LIBS "${TRT_LINK_LIBS}"
+  )
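+  # hpvm-clang's build can later read these back, e.g. (sketch):
+  #   get_target_property(trt_link_dirs tensor_runtime TRT_LINK_DIRS)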
 endfunction(add_tensor_runtime)
 
 # Adding new rule for building a cuDNN runtime library
@@ -101,8 +140,8 @@ add_tensor_runtime(tensor_runtime_online -DONLINE_PROFILING=true -DFP16_tuning=f
 add_dependencies(tensor_runtime_online tensor_runtime)
 
 # Adding rule for the debugging source
-add_executable(unit_tests dnn_sources/src/unit_tests.cc)
-target_link_libraries(unit_tests  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}  ${OpenMP_CXX_FLAGS})
+add_executable(unit_tests tests/unit_tests.cc)
+target_link_libraries(unit_tests tensor_runtime_online)
 
 # -- Compile tensor_runtime.ll if possible
 if(INDEP_BUILD)
@@ -126,6 +165,8 @@ endif()
 # If some clang-9 is found, create a tensor_runtime.ll from tensor_signatures.cc
 if(CLANG_NAME)
   message(STATUS "Creating tensor_runtime.ll in ${TENSOR_RT_LL_PREFIX}")
+  # Manually add cuda includes because add_custom_command doesn't handle them
+  # (unlike add_library which has CUDA-lang support).
   foreach(dir ${INCLUDES})
     list(APPEND INCLUDE_COMPILER_STRINGS "-I${dir}")
   endforeach()
@@ -136,59 +177,3 @@ if(CLANG_NAME)
     -o ${TENSOR_RT_LL_PREFIX}/tensor_runtime.ll
   )
 endif()
-
-
-#**************** FP32 TensorRT Source Builds *********** 
-
-add_executable(lenet_mnist_fp32 dnn_sources/src/fp32/lenet_mnist.cc)
-target_link_libraries(lenet_mnist_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet_cifar10_fp32 dnn_sources/src/fp32/alexnet_cifar10.cc)
-target_link_libraries(alexnet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet2_cifar10_fp32 dnn_sources/src/fp32/alexnet2_cifar10.cc)
-target_link_libraries(alexnet2_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(vgg16_cifar10_fp32 dnn_sources/src/fp32/vgg16_cifar10.cc)
-target_link_libraries(vgg16_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(resnet18_cifar10_fp32 dnn_sources/src/fp32/resnet18_cifar10.cc)
-target_link_libraries(resnet18_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(vgg16_cifar100_fp32 dnn_sources/src/fp32/vgg16_cifar100.cc)
-target_link_libraries(vgg16_cifar100_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(mobilenet_cifar10_fp32 dnn_sources/src/fp32/mobilenet.cc)
-target_link_libraries(mobilenet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet_imagenet_fp32 dnn_sources/src/fp32/alexnet_imagenet.cc)
-target_link_libraries(alexnet_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(vgg16_imagenet_fp32 dnn_sources/src/fp32/vgg16_imagenet.cc)
-target_link_libraries(vgg16_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(resnet50_imagenet_fp32 dnn_sources/src/fp32/resnet50_imagenet.cc)
-target_link_libraries(resnet50_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-#********* FP16 TensorRT Source Builds ****** 
-
-add_executable(lenet_mnist_fp16 dnn_sources/src/fp16/lenet_mnist_half.cc)
-target_link_libraries(lenet_mnist_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet_cifar10_fp16 dnn_sources/src/fp16/alexnet_cifar10_half.cc)
-target_link_libraries(alexnet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet2_cifar10_fp16 dnn_sources/src/fp16/alexnet2_cifar10_half.cc)
-target_link_libraries(alexnet2_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(resnet18_cifar10_fp16 dnn_sources/src/fp16/resnet18_cifar10_half.cc)
-target_link_libraries(resnet18_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(vgg16_cifar10_fp16 dnn_sources/src/fp16/vgg16_cifar10_half.cc)
-target_link_libraries(vgg16_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(vgg16_cifar100_fp16 dnn_sources/src/fp16/vgg16_cifar100_half.cc)
-target_link_libraries(vgg16_cifar100_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(mobilenet_cifar10_fp16 dnn_sources/src/fp16/mobilenet_half.cc)
-target_link_libraries(mobilenet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
diff --git a/hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.cpp b/hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.cpp
index 87150f45a3a967443a947cf1b95b0e8d6fbae842..5bcd8173a5966888d89238cd94c50484cd719a69 100644
--- a/hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/soc_simulator/promise_timing_model.cpp
@@ -27,7 +27,7 @@ Scratchpad::Scratchpad(const bool enable,
            const unsigned size,
            const double dram_latency,
            const double dram_bandwidth)
-    : enable_(enable), dram_(dram_latency, dram_bandwidth) {
+    : dram_(dram_latency, dram_bandwidth), enable_(enable) {
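+    // Initializer list reordered to match member declaration order (silences -Wreorder).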
 
     num_lines_ = size / line_size;
     lines_.resize(num_lines_);
@@ -78,7 +78,7 @@ std::pair<double, double> Scratchpad::access(const unsigned address,
 
     // Keep reading line by line until everything is read
     while (num_bytes_remaining > 0) {
-        if (lines_[getIndex(addr)] == address) {
+        if ((unsigned) lines_[getIndex(addr)] == address) {
             // Hit
             hits++;
         } else {
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
similarity index 50%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
rename to hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
index 61fd362afcc665e21a7ba8636c8df778ac95184e..d4961d19b9326daa4571d066dfe2b3177f6a78d4 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
@@ -3,23 +3,35 @@
 #ifndef UTILS_HEADER
 #define UTILS_HEADER
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
 #include <sstream>
 #include <vector>
 #include <bits/stdc++.h>
 #include <tensor_runtime.h>
 #include <tensor.h>
 #include <cmath>
-#include <string.h>
+
 
 std::vector<float> run_accuracies;
 std::string model_params_path = "../../test/dnn_benchmarks/model_params/";
 
+
+void printTensorInfo(void *tensor_ptr) {
+
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
+
+  if (tensor->gpu_data != NULL) {
+    printf("Successful cudaMalloc \n");
+  }
+
+  printf("tensor dims = %d \n", tensor->dims.num_dims);
+  printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]);
+  printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]);
+  printf("num_elems = %lu \n", tensor->num_elems);
+}
+
 // FIXIT: Move this to debug.h and include in all files
-void dumpWeightsToFile(const char *file_name, void *weights_ptr) {
+void dumpWeightsToFile(char *file_name, void *weights_ptr) {
+
   struct Tensor *weights = (Tensor *)weights_ptr;
   // Move data back to host
   hpvm_request_tensor(weights, 0);
@@ -31,6 +43,10 @@ void dumpWeightsToFile(const char *file_name, void *weights_ptr) {
     abort();
   }
 
+  // printf("size_in_bytes = %lu \n", weights->size_in_bytes);
+  size_t bytes_written =
+      fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
+  // printf("bytes_written = %lu \n", bytes_written);
   fclose(fp);
 }
 
@@ -58,21 +74,13 @@ void fillWithOnesAndTwos(void *tensor_ptr) {
   // initialization is specific to the floating point type
   if (tensor->data_type == CUDNN_DATA_FLOAT) {
     float *data_arr = (float *)tensor->host_data;
-
-    for (unsigned int i = 0; i < tensor->num_elems; i++) {
-      if (i % 2 == 0)
-        data_arr[i] = 1.0;
-      else
-        data_arr[i] = 2.0;
-    }
-
-    /*for(unsigned int i = 0; i < tensor->num_elems/2; i++){
+    for (unsigned int i = 0; i < tensor->num_elems / 2; i++) {
       data_arr[i] = 1.0;
     }
 
-    for(unsigned int i = tensor->num_elems/2; i < tensor->num_elems; i++){
+    for (unsigned int i = tensor->num_elems / 2; i < tensor->num_elems; i++) {
       data_arr[i] = 2.0;
-    }*/
+    }
   }
 }
 
@@ -106,6 +114,18 @@ void fillTensorWithNegOnes(void *tensor_ptr) {
   }
 }
 
+void fillTensorVals(void *tensor_ptr) {
+
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
+  // initialization is specific to the floating point type
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = i + 1;
+    }
+  }
+}
+
 void printTensorValues(void *tensor_ptr) {
 
   struct Tensor *tensor = (struct Tensor *)tensor_ptr;
@@ -113,14 +133,11 @@ void printTensorValues(void *tensor_ptr) {
   hpvm_request_tensor(tensor, 0);
 
   // printing is specific to the floating point type
-  if (tensor->data_type != CUDNN_DATA_FLOAT) {
-    // printf("\n WARNING: The tensor is non-float type tensor \n\n");
-  }
-
-  float *data_arr = (float *)tensor->host_data;
-
-  for (unsigned int i = 0; i < tensor->num_elems; i++) {
-    printf("%f,", data_arr[i]);
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      printf("%f,", data_arr[i]);
+    }
   }
 
   printf("\n");
@@ -131,11 +148,49 @@ void printTensorDims(void *tensor_ptr) {
   struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   printf("Num_elems = %lu \n", tensor->num_elems);
-  for (int i = 0; i < tensor->dims.num_dims; i++) {
+  for (unsigned int i = 0; i < tensor->dims.num_dims; i++) {
     printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
   }
 }
 
+void compareTensors(void *tensor1_ptr, void *tensor2_ptr) {
+
+  struct Tensor *tensor1 = (struct Tensor *)tensor1_ptr;
+  struct Tensor *tensor2 = (struct Tensor *)tensor2_ptr;
+
+  hpvm_request_tensor(tensor1, 0);
+  hpvm_request_tensor(tensor2, 0);
+
+  float *tensor_data1 = (float *)tensor1->host_data;
+  float *tensor_data2 = (float *)tensor2->host_data;
+
+  for (unsigned int i = 0; i < tensor1->num_elems; i++) {
+    if (tensor_data1[i] != tensor_data2[i]) {
+      printf("Tensor data mismatch at index %d \n", i);
+      abort();
+    }
+  }
+}
+
+void compareValues(void *tensor_ptr, float *data, size_t num_elems) {
+
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
+
+  hpvm_request_tensor(tensor, 0);
+
+  float *tensor_data = (float *)tensor->host_data;
+  for (unsigned int i = 0; i < num_elems; i++) {
+    if (tensor_data[i] != data[i]) {
+      printf("Tensor data mismatch");
+      abort();
+    }
+  }
+}
+
+
 struct Tensor *readTrainedWeights(const char *file_name, int data_type,
                                   long int dim1_size, long int dim2_size,
                                   long int dim3_size, long int dim4_size) {
@@ -146,7 +201,7 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   long int size_in_bytes =
       type_size * dim1_size * dim2_size * dim3_size * dim4_size;
   float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
-  // printf("size_in_bytes  = %lu \n", size_in_bytes);
+  printf("size_in_bytes  = %lu \n", size_in_bytes);
 
   int file_header_size = 0;
 
@@ -157,7 +212,11 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   }
 
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
-  fread(tensor_data, 1, size_in_bytes, file);
+  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+
+  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes,
+  // bytes_read);
+
   fclose(file);
 
   struct Tensor *weights = (struct Tensor *)create4DTensor(
@@ -170,9 +229,9 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   return weights;
 }
 
-struct Tensor *readInputBatch(const char *file_name, int data_type,
-                              long int start, long int end, long int dim2_size,
-                              long int dim3_size, long int dim4_size) {
+struct Tensor *readInputBatch(const char *file_name, long data_type,
+                              long start, long end, long dim2_size,
+                              long dim3_size, long dim4_size) {
 
   long int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
@@ -191,9 +250,12 @@ struct Tensor *readInputBatch(const char *file_name, int data_type,
   }
 
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  fread(tensor_data, 1, size_in_bytes, file);
+  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+
   fclose(file);
 
+  // printf ("FIXED input BATCH read \n");
+
   struct Tensor *weights = (struct Tensor *)create4DTensor(
       data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
@@ -203,10 +265,7 @@ struct Tensor *readInputBatch(const char *file_name, int data_type,
   return weights;
 }
 
-uint8_t *readLabelsBatch(const char *labels_file, int start, int end) {
-
-  int num_labels = end - start;
-  int file_header_size = sizeof(uint8_t) * start;
+uint8_t *readLabels(const char *labels_file, int num_labels) {
 
   uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
   FILE *file = fopen(labels_file, "rb");
@@ -215,14 +274,30 @@ uint8_t *readLabelsBatch(const char *labels_file, int start, int end) {
     abort();
   }
 
-  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  fread(labels, 1, sizeof(uint8_t) * num_labels, file);
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
+
+  fclose(file);
+
+  return labels;
+}
+
+uint32_t *readLabels3(const char *labels_file, int num_labels) {
+
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
+    printf("Data file %s is not found. Aborting...\n", labels_file);
+    abort();
+  }
+
+  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
+
   fclose(file);
 
-  // printf("--labels bytes_read = %lu \n", bytes_read);
   return labels;
 }
 
+
 uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
@@ -236,29 +311,31 @@ uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) {
   }
 
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  fread(labels, 1, sizeof(uint32_t) * num_labels, file);
+
+  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
+
   fclose(file);
 
   return labels;
 }
 
-// NOTE: batch_size and num_classes are Unused arguments
-float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr,
-                       size_t num_classes = 10) {
+
+
+float computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
   struct Tensor *result = (struct Tensor *)result_ptr;
 
   size_t batch_dim = result->dims.dim_sizes[0];
-  num_classes = result->dims.dim_sizes[1];
+  size_t num_classes = result->dims.dim_sizes[1];
   float *data = (float *)result->host_data;
   int num_errors = 0;
 
-  printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes);
+  printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
 
   for (unsigned int i = 0; i < batch_dim; i++) {
 
     int chosen = 0;
-    for (size_t id = 1; id < num_classes; ++id) {
+    for (unsigned int id = 1; id < num_classes; ++id) {
       if (data[i * num_classes + chosen] < data[i * num_classes + id])
         chosen = id;
     }
@@ -285,27 +362,49 @@ float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr,
   return accuracy;
 }
 
-float computeAccuracy3(uint32_t *labels, void *result_ptr) {
+struct ClassProb {
+  float prob;
+  int index;
+};
+
+bool descendFloatComp(ClassProb obj1, ClassProb obj2) {
+  return obj1.prob > obj2.prob;
+}
+
+float computeTop5Accuracy(uint8_t *labels, int num_labels, void *result_ptr,
+                          unsigned num_classes = 10) {
 
   struct Tensor *result = (struct Tensor *)result_ptr;
 
   size_t batch_dim = result->dims.dim_sizes[0];
-  size_t num_classes = result->dims.dim_sizes[1];
+  size_t channels = result->dims.dim_sizes[1];
   float *data = (float *)result->host_data;
   int num_errors = 0;
 
-  printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
+  printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
 
-  for (size_t i = 0; i < batch_dim; i++) {
+  for (unsigned int i = 0; i < num_labels; i++) {
 
-    uint32_t chosen = 0;
-    for (size_t id = 1; id < num_classes; ++id) {
-      if (data[i * num_classes + chosen] < data[i * num_classes + id])
-        chosen = id;
+    std::vector<ClassProb> elem_probs;
+    for (unsigned int id = 0; id < num_classes; ++id) {
+      ClassProb cProb;
+      cProb.prob = data[i * channels + id];
+      cProb.index = id;
+      elem_probs.push_back(cProb);
     }
 
-    if (chosen != labels[i])
-      num_errors++;
+    std::sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
+    // Check if any of top-5 predictions matches
+    bool matched = false;
+    for (int j = 0; j < 5; j++) {
+      ClassProb cProb = elem_probs[j];
+      if (cProb.index == labels[i])
+        matched = true;
+    }
+
+    if (!matched)
+      num_errors += 1;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
@@ -344,11 +443,38 @@ void dumpFinalAccuracy(float accuracy) {
   run_accuracies.push_back(accuracy);
 }
 
+void dumpAvgPSNR(float avg_psnr) {
+
+  FILE *fp = fopen("avg_psnr", "w+");
+  if (fp != NULL) {
+    std::ostringstream ss;
+    ss << std::fixed << avg_psnr;
+    std::string print_str = ss.str();
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+    fclose(fp);  // Close inside the check: fclose(NULL) is undefined.
+  }
+
+}
+
+void dumpPSNRStd(float psnr_std) {
+
+  FILE *fp = fopen("psnr_std.txt", "w+");
+  if (fp != NULL) {
+    std::ostringstream ss;
+    ss << std::fixed << psnr_std;
+    std::string print_str = ss.str();
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+    fclose(fp);
+  }
+
+}
+
+
 void dumpExecutionAccuracies() {
 
   FILE *fp = fopen("run_accuracies.txt", "w+");
   if (fp != NULL) {
-    for (size_t i = 0; i < run_accuracies.size(); i++) {
+    for (unsigned int i = 0; i < run_accuracies.size(); i++) {
       float accuracy = run_accuracies[i];
       std::ostringstream ss;
       ss << std::fixed << accuracy;
@@ -360,4 +486,110 @@ void dumpExecutionAccuracies() {
 
   fclose(fp);
 }
+
+float readPSNRFromFile(const char *file_name) {
+
+  float psnr;
+  FILE *pFile = fopen(file_name, "r");
+  if (pFile == NULL) {
+    printf("ERROR: psnr.txt not found! \n");
+    abort();
+  }
+
+  fscanf(pFile, "%f", &psnr);
+  fclose(pFile);
+  printf("**** PSNR read = %f \n\n", psnr);
+  return psnr;
+}
+
+float computePSNRViolation(void *gold_ptr, void *approx_ptr,
+                           float PSNR_threshold) {
+
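+  // NOTE: the PSNR_threshold argument is overridden by the value read from psnr.txt below.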
+  PSNR_threshold = readPSNRFromFile("psnr.txt");
+  std::vector<float> psnr_list;
+
+  struct Tensor *gold_tensor = (struct Tensor *)gold_ptr;
+  struct Tensor *approx_tensor = (struct Tensor *)approx_ptr;
+
+  size_t *dim_sizes = gold_tensor->dims.dim_sizes;
+  size_t batch_dim = dim_sizes[0];
+  size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
+
+  printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
+
+  float *gold_data = (float *)gold_tensor->host_data;
+  float *approx_data = (float *)approx_tensor->host_data;
+
+  FILE *fp = fopen("img_psnr.txt", "w+");
+
+  float sum_psnr = 0.0;
+  int num_errors = 0;
+  for (size_t i = 0; i < batch_dim; i++) {
+    float mse_sum = 0.0;
+    float max_val = -999999;
+    size_t offset = i * image_size;
+
+    for (size_t j = 0; j < image_size; j++) {
+      float diff = gold_data[offset + j] - approx_data[offset + j];
+      float diff_square = diff * diff;
+      mse_sum += diff_square;
+
+      if (max_val < gold_data[offset + j]) {
+        max_val = gold_data[offset + j];
+      }
+    }
+
+    mse_sum = mse_sum / image_size;
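+    // PSNR = 20 * log10(MAX / sqrt(MSE)); MAX = 255 assumes an 8-bit pixel range.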
+    float psnr = 20 * log10(255 / sqrt(mse_sum));
+
+    sum_psnr += psnr;
+    if (psnr < PSNR_threshold)
+      num_errors += 1;
+
+    printf("PSNR value = %f \n", psnr);
+    psnr_list.push_back(psnr);
+
+    std::ostringstream ss;
+    ss << std::fixed << psnr;
+    std::string print_str = ss.str();
+    fwrite(print_str.c_str(), 1, print_str.length(), fp);
+    fwrite("\n", 1, 1, fp);
+  }
+
+  float violation_rate = (num_errors * 1.0) / batch_dim * 100.0;
+  printf("*** violation_rate= %f \n\n", violation_rate);
+
+  float avg_psnr = sum_psnr / batch_dim;
+  printf("*** avg_psnr =  %f \n\n", avg_psnr);
+  dumpAvgPSNR(avg_psnr);
+
+  float success_rate = 100.0 - violation_rate;
+  dumpFinalAccuracy(success_rate);
+
+  fclose(fp);
+
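+  // Standard deviation of per-image PSNR across the batch.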
+  float var = 0.0;
+  for (size_t i = 0; i < batch_dim; i++) {
+    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr);
+  }
+
+  var /= batch_dim;
+  float std = sqrt(var);
+
+  dumpPSNRStd(std);
+
+  return violation_rate;
+}
+
+void dumpOutput(void *output_ptr, const char *file_name) {
+
+  struct Tensor *out_tensor = (struct Tensor *)output_ptr;
+  size_t size_in_bytes = out_tensor->size_in_bytes;
+  printf("** Output size = %lu \n", size_in_bytes);
+
+  float *host_data = (float *)out_tensor->host_data;
+  FILE *fd = fopen(file_name, "w+");
+  if (fd == NULL) {
+    printf("ERROR: failed to open %s for writing \n", file_name);
+    abort();
+  }
+  fwrite(host_data, 1, size_in_bytes, fd);
+  fclose(fd);
+}
+
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc
similarity index 99%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
rename to hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc
index 746f62bce19b25c3b74bec4908cdc3c87bee034a..ffb4c3a809b3e936f6c27ebd7c11aef5c4460104 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tests/unit_tests.cc
@@ -6,7 +6,7 @@
 #include <string.h>
 #include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
-#include "utils.h"
+#include "tensorUtils.h"
 #include "tensor_custom_ops_cpu.h"
 
 using namespace std;
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
index 922b6795ade457ba4c961af4d2e70ce150e22e92..172448a60d4f65fc4aafc09c9a76d9cb492ff7b0 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
@@ -172,7 +172,7 @@ class ModelExporter:
         from subprocess import run
 
         args = [
-            "approxhpvm.py",
+            "hpvm-clang",
             str(self.codefile),
             str(output_binary),
             *self.compile_args,
diff --git a/hpvm/scripts/hpvm_installer.py b/hpvm/scripts/hpvm_installer.py
index 26166d6b58443f65deb37b2c5a8370c35b91eeff..cce8b3f07928d9ab096df3166bd02f4e6f8e1f5d 100755
--- a/hpvm/scripts/hpvm_installer.py
+++ b/hpvm/scripts/hpvm_installer.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3
-from argparse import ArgumentParser, Namespace
+from argparse import ArgumentParser
 from os import chdir, environ, makedirs
 from pathlib import Path
 from subprocess import CalledProcessError, check_call
-from typing import List
+from typing import List, Union
 
 VERSION = "9.0.0"
 URL = "http://releases.llvm.org"
-WGET = "wget"
+DOWNLOADER = "curl"
 CLANG_DIR = f"cfe-{VERSION}.src"
 CLANG_TARBALL = f"{CLANG_DIR}.tar.xz"
 LLVM_DIR = f"llvm-{VERSION}.src"
@@ -30,7 +30,7 @@ LINKS = [
     "test",
     "tools",
 ]
-MAKE_TARGETS = ["approxhpvm.py"]
+MAKE_TARGETS = ["hpvm-clang"]
 MAKE_TEST_TARGETS = ["check-hpvm-dnn", "check-hpvm-pass"]
 
 # Relative to project root which is __file__.parent.parent
@@ -79,6 +79,11 @@ def parse_args(args=None):
         "Supported targets: AArch64, AMDGPU, ARM, BPF, Hexagon, Mips, MSP430, NVPTX, PowerPC, "
         "Sparc, SystemZ, X86, XCore.",
     )
+    parser.add_argument(
+        "--ninja",
+        action="store_true",
+        help="Use Ninja to build HPVM. Uses 'make' otherwise.",
+    )
     parser.add_argument(
         "-r", "--run-tests", action="store_true", help="Build and run test cases"
     )
@@ -150,7 +155,8 @@ Example: "DCMAKE_BUILD_TYPE=Release DCMAKE_INSTALL_PREFIX=install".
 Arguments: """
     )
     args.cmake_args = input()
-    args.cmake_args = [f"-{arg}" for arg in args.cmake_args.split(" ")]
+    if args.cmake_args.strip() != "":
+        args.cmake_args = [f"-{arg}" for arg in args.cmake_args.split(" ")]
+    else:
+        args.cmake_args = []
 
     args.no_params = not input_with_check(
         "Download DNN weights (recommended)? [y/n]: ", parse_yn, "Please enter y or n"
@@ -165,6 +171,8 @@ def print_args(args):
     print("Running with the following options:")
     print(f"  Automated build: {not args.no_build}")
     print(f"  Build directory: {args.build_dir}")
+    build_sys = "ninja" if args.ninja else "make"
+    print(f"  Build system: {build_sys}")
     print(f"  Threads: {args.parallel}")
     print(f"  Targets: {args.targets}")
     print(f"  Download DNN weights: {not args.no_params}")
@@ -181,7 +189,7 @@ def check_download_llvm_clang():
         else:
             print(f"Downloading {LLVM_TARBALL}...")
             print(f"=============================")
-            check_call([WGET, f"{URL}/{VERSION}/{LLVM_TARBALL}"])
+            download(f"{URL}/{VERSION}/{LLVM_TARBALL}", LLVM_TARBALL)
         check_call(["tar", "xf", LLVM_TARBALL])
         check_call(["mv", LLVM_DIR, "llvm"])
     tools = Path("llvm/tools")
@@ -196,7 +204,7 @@ def check_download_llvm_clang():
     chdir(tools)
     print(f"Downloading {CLANG_TARBALL}...")
     print(f"=============================")
-    check_call([WGET, f"{URL}/{VERSION}/{CLANG_TARBALL}"])
+    download(f"{URL}/{VERSION}/{CLANG_TARBALL}", CLANG_TARBALL)
     check_call(["tar", "xf", CLANG_TARBALL])
     check_call(["mv", CLANG_DIR, "clang"])
     assert Path("clang/").is_dir(), "Problem with clang download. Exiting!"
@@ -214,7 +222,7 @@ def check_download_model_params():
     else:
         print(f"Downloading DNN model parameters: {MODEL_PARAMS_TAR}...")
         print(f"=============================")
-        check_call([WGET, MODEL_PARAMS_LINK, "-O", MODEL_PARAMS_TAR])
+        download(MODEL_PARAMS_LINK, MODEL_PARAMS_TAR)
     print(
         f"Extracting DNN model parameters {MODEL_PARAMS_TAR} => {MODEL_PARAMS_DIR}..."
     )
@@ -255,7 +263,7 @@ def build(
     build_dir: Path,
     nthreads: int,
     targets: str,
-    build_test_targets: bool,
+    use_ninja: bool,
     cmake_additional_args: List[str],
 ):
     print("Now building...")
@@ -271,13 +279,17 @@ def build(
         f"-DLLVM_TARGETS_TO_BUILD={targets}",
         *cmake_additional_args,
     ]
+    if use_ninja:
+        cmake_args.append("-GNinja")
     print(f"CMake: {' '.join(cmake_args)}")
     print(f"=============================")
     check_call(cmake_args)
-    make_args = ["make", f"-j{nthreads}", *MAKE_TARGETS]
-    print(f"Make: {' '.join(make_args)}")
+
+    build_sys = "ninja" if use_ninja else "make"
+    build_args = [build_sys, f"-j{nthreads}", *MAKE_TARGETS]
+    print(f"Build system ({build_sys}): {' '.join(build_args)}")
     print(f"=============================")
-    check_call(make_args)
+    check_call(build_args)
     chdir(ROOT_DIR)
 
 
@@ -291,12 +303,13 @@ def install_py_packages():
         check_call([sys.executable, "-m", "pip", "install", str(package_home)])
 
 
-def run_tests(build_dir: Path, nthreads: int):
+def run_tests(build_dir: Path, use_ninja: bool, nthreads: int):
     chdir(build_dir)
-    make_args = ["make", f"-j{nthreads}", *MAKE_TEST_TARGETS]
-    print(f"Tests: {' '.join(make_args)}")
+    build_sys = "ninja" if use_ninja else "make"
+    build_args = [build_sys, f"-j{nthreads}", *MAKE_TEST_TARGETS]
+    print(f"Tests: {' '.join(build_args)}")
     print(f"=============================")
-    check_call(make_args)
+    check_call(build_args)
     chdir(ROOT_DIR)
 
 
@@ -310,6 +323,10 @@ def input_with_check(prompt: str, parse, prompt_when_invalid: str):
     return value
 
 
+def download(link: str, output: Union[Path, str]):
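+    """Download `link` to `output`; curl's -L flag follows redirects."""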
+    check_call(["curl", "-L", link, "-o", str(output)])
+
+
 def main():
     from sys import argv
 
@@ -328,21 +345,15 @@ HPVM not installed.
 To complete installation, follow these instructions:
 - Create and navigate to a folder "./build" 
 - Run "cmake ../llvm [options]". Find potential options in README.md.
-- Run "make -j<number of threads> approxhpvm.py" and then "make install"
+- Run "make -j<number of threads> hpvm-clang" and then "make install"
 For more details refer to README.md.
 """
         )
         return
     else:
-        build(
-            args.build_dir,
-            args.parallel,
-            args.targets,
-            args.run_tests,
-            args.cmake_args,
-        )
+        build(args.build_dir, args.parallel, args.targets, args.ninja, args.cmake_args)
     if args.run_tests:
-        run_tests(args.build_dir, args.parallel)
+        run_tests(args.build_dir, args.ninja, args.parallel)
     else:
         print("Skipping tests.")
 
diff --git a/hpvm/test/CMakeLists.txt b/hpvm/test/CMakeLists.txt
index 660003538fe72d45e3dbfc1178fd296cdc7156b5..3c4f26472317f511edaab98c5e4a4f8ed7ba2dfb 100644
--- a/hpvm/test/CMakeLists.txt
+++ b/hpvm/test/CMakeLists.txt
@@ -1,4 +1,12 @@
 include(../cmake/TestFile.cmake)  # Generation of `.test` files in CMake
+
+# clang and clang++ are used to compile benchmarks
+# in `benchmarks` and `dnn_benchmarks/hpvm-c`.
+set(CLANG_C ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang)
+set(CLANG_CXX ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang++)
+
 add_subdirectory(hpvm_pass)  # Passes test suite
-add_subdirectory(dnn_benchmarks/hpvm-c)  # DNN accuracy test suite
+add_subdirectory(benchmarks)
+add_subdirectory(dnn_benchmarks/hpvm-c)  # HPVM-C DNN accuracy test suite
+add_subdirectory(dnn_benchmarks/tensor-rt-src)  # tensor_runtime DNN (build only, no tests)
 add_subdirectory(dnn_benchmarks/profiling)  # hpvm-profiler test suite
diff --git a/hpvm/test/benchmarks/CMakeLists.txt b/hpvm/test/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4d4f0691fc0e70cd9b532947688da264b2ca4e7b
--- /dev/null
+++ b/hpvm/test/benchmarks/CMakeLists.txt
@@ -0,0 +1,38 @@
+macro(hpvm_compile_util_sources compiler_flags util_srcs language_mode)
+  # "Returns" ${util_bitcodes}, a list of paths to generated bitcode (.ll) files
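+  # (A macro is used instead of a function so that list(APPEND util_bitcodes ...)
+  # takes effect in the caller's scope.)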
+  foreach(src ${util_srcs})
+    set(src_path ${CMAKE_CURRENT_SOURCE_DIR}/${src})
+    get_filename_component(src_name ${src_path} NAME)
+    set(output_path ${CMAKE_CURRENT_BINARY_DIR}/${src_name}.ll)
+    add_custom_command(
+        OUTPUT ${output_path}
+        COMMAND ${CLANG_C} -x${language_mode}
+          ${compiler_flags} ${src_path} -emit-llvm -S -o ${output_path}
+        DEPENDS ${src_path}
+    )
+    list(APPEND util_bitcodes ${output_path})
+  endforeach()
+endmacro()
+
+function(add_hpvm_benchmark
+  target_name output_bin_name all_flags language_mode
+  main_src util_bitcodes
+)
+  set(output_bin_path ${CMAKE_CURRENT_BINARY_DIR}/${output_bin_name})
+  set(main_src_path ${CMAKE_CURRENT_SOURCE_DIR}/${main_src})
+  if(util_bitcodes)
+    set(bitcodes_arg -b ${util_bitcodes})
+  else()
+    set(bitcodes_arg "")
+  endif()
+  add_custom_command(
+    OUTPUT ${output_bin_path}
+    COMMAND hpvm-clang ${all_flags} -x${language_mode}
+      ${bitcodes_arg} -- ${main_src_path} ${output_bin_path}
+    DEPENDS ${main_src_path} ${util_bitcodes} hpvm-clang
+  )
+  add_custom_target(${target_name} DEPENDS ${output_bin_path})
+endfunction(add_hpvm_benchmark)
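+
+# Hypothetical usage (see hpvm-cava/ and pipeline/ for the real call sites):
+#   add_hpvm_benchmark("my_bench" "my-bench" "${all_flags}" "c" src/main.c "${util_bitcodes}")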
+
+add_subdirectory(hpvm-cava)
+add_subdirectory(pipeline)
diff --git a/hpvm/test/benchmarks/hpvm-cava/CMakeLists.txt b/hpvm/test/benchmarks/hpvm-cava/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d89a01a05669a0c49fecf1ccbd25029fce14143b
--- /dev/null
+++ b/hpvm/test/benchmarks/hpvm-cava/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(other_srcs src/load_cam_model.c src/cam_pipe_utility.c src/utility.c)
+set(
+  compiler_flags
+  # Insanity warning: only -O1 works for this benchmark.
+  -O1 -I ${CMAKE_CURRENT_SOURCE_DIR}/src
+  -DDMA_MODE -DDMA_INTERFACE_V3
+)
+
+# Sets ${util_bitcodes}
+hpvm_compile_util_sources("${compiler_flags}" "${other_srcs}" "c")
+
+set(all_flags_cpu ${compiler_flags} "-DDEVICE=CPU_TARGET" "-lpthread")
+add_hpvm_benchmark(
+  "hpvm_cava_cpu" "hpvm-cava-cpu" "${all_flags_cpu}" "c"
+  src/main.c "${util_bitcodes}"
+)
+
+if(HPVM_USE_OPENCL)
+  set(all_flags_gpu ${compiler_flags} "-DDEVICE=GPU_TARGET" "--opencl" "-lpthread")
+  add_hpvm_benchmark(
+    "hpvm_cava_gpu" "hpvm-cava-gpu" "${all_flags_gpu}" "c"
+    src/main.c "${util_bitcodes}"
+  )
+endif()
diff --git a/hpvm/test/benchmarks/hpvm-cava/Makefile b/hpvm/test/benchmarks/hpvm-cava/Makefile
deleted file mode 100644
index 58dfa72aacb172252ea1c13ed3331322fb600861..0000000000000000000000000000000000000000
--- a/hpvm/test/benchmarks/hpvm-cava/Makefile
+++ /dev/null
@@ -1,131 +0,0 @@
-# This Makefile compiles the HPVM-CAVA pilot project. 
-# It builds HPVM-related dependencies, then the native camera pipeline ISP code.
-#
-# Paths to some dependencies (e.g., HPVM, LLVM) must exist in Makefile.config,
-# which can be copied from Makefile.config.example for a start.
-
-CONFIG_FILE := ../include/Makefile.config
-
-ifeq ($(wildcard $(CONFIG_FILE)),)
-    $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example)
-endif
-include $(CONFIG_FILE)
-
-# Compiler Flags
-
-LFLAGS += -lm -lrt
-
-ifeq ($(TARGET),)
-    TARGET = gpu
-endif
-
-# Build dirs
-SRC_DIR = src/
-CAM_PIPE_SRC_DIR = $(SRC_DIR)
-BUILD_DIR = build/$(TARGET)
-CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-
-
-INCLUDES += -I$(SRC_DIR) -I$(CAM_PIPE_SRC_DIR)
-ifneq ($(CONFUSE_ROOT),)
-  INCLUDES += -I$(CONFUSE_ROOT)/include
-  LFLAGS += -L$(CONFUSE_ROOT)/lib
-endif
-
-EXE = cava-hpvm-$(TARGET)
-
-LFLAGS += -pthread
-
-## BEGIN HPVM MAKEFILE
-LANGUAGE=hpvm
-SRCDIR_OBJS=load_cam_model.ll cam_pipe_utility.ll utility.ll
-OBJS_SRC=src/load_cam_model.c src/cam_pipe_utility.c src/utility.c
-HPVM_OBJS=main.hpvm.ll
-APP = $(EXE)
-APP_CUDALDFLAGS=-lm -lstdc++
-APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3
-APP_CXXFLAGS=-ffast-math -O0 -I/opt/opencv/include
-APP_LDFLAGS=$(LFLAGS)
-
-CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
-OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
-CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
-LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
-
-HPVM_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/hpvm-rt
-
-HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.bc
-
-
-TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
-
-ifeq ($(TARGET),seq)
-  DEVICE = CPU_TARGET
-  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_CPU.so -load LLVMClearDFG.so -dfg2llvm-cpu -clearDFG
-  HPVM_OPTFLAGS += -hpvm-timers-cpu
-else
-  DEVICE = GPU_TARGET
-  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_OpenCL.so -load LLVMDFG2LLVM_CPU.so -load LLVMClearDFG.so -localmem -dfg2llvm-opencl -dfg2llvm-cpu -clearDFG
-  HPVM_OPTFLAGS += -hpvm-timers-cpu -hpvm-timers-ptx
-endif
-  TESTGEN_OPTFLAGS += -hpvm-timers-gen
-
-CFLAGS += -DDEVICE=$(DEVICE)
-CXXFLAGS += -DDEVICE=$(DEVICE)
-
-# Add BUILDDIR as a prefix to each element of $1
-INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
-
-.PRECIOUS: $(BUILD_DIR)/%.ll
-
-OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
-KERNEL = $(TEST_OBJS).kernels.ll
-
-ifeq ($(TARGET),gpu)
-  KERNEL_OCL = $(TEST_OBJS).kernels.cl
-endif
-
-HOST_LINKED = $(BUILD_DIR)/$(APP).linked.ll
-HOST = $(BUILD_DIR)/$(APP).host.ll
-
-ifeq ($(OPENCL_PATH),)
-FAILSAFE=no_opencl
-else 
-FAILSAFE=
-endif
-
-# Targets
-
-default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(EXE)
-
-clean :
-	if [ -f $(EXE) ]; then rm $(EXE); fi
-	if [ -f DataflowGraph.dot ]; then rm DataflowGraph.dot*; fi
-	if [ -d $(BUILD_DIR) ]; then rm -rf $(BUILD_DIR); fi
-
-$(KERNEL_OCL) : $(KERNEL)
-	$(OCLBE) $< -o $@
-
-$(EXE) : $(HOST_LINKED)
-	$(CXX) -O3 $(LDFLAGS) $< -o $@
-
-$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB)
-	$(LLVM_LINK) $^ -S -o $@
-
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS)
-	$(OPT) $(HPVM_OPTFLAGS) -S $< -o $(HOST)
-
-$(BUILD_DIR):
-	mkdir -p $(BUILD_DIR)
-
-$(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c
-	$(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $<
-
-$(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c
-	$(CC) $(CFLAGS) -emit-llvm -S -o $@ $<
-
-$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll
-	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
-
-## END HPVM MAKEFILE
diff --git a/hpvm/test/benchmarks/hpvm-cava/src/main.c b/hpvm/test/benchmarks/hpvm-cava/src/main.c
index 09430239d7c06a0b2fab2a59c7e6310babe617c6..0d8fb37d0103b55234e59e8d2b66742faed207b6 100644
--- a/hpvm/test/benchmarks/hpvm-cava/src/main.c
+++ b/hpvm/test/benchmarks/hpvm-cava/src/main.c
@@ -53,6 +53,7 @@ typedef struct __attribute__((__packed__)) {
 } RootIn;
 
 typedef enum _argnum {
+  CAM_MODEL,
   RAW_IMAGE_BIN,
   OUTPUT_IMAGE_BIN,
   NUM_REQUIRED_ARGS,
@@ -67,7 +68,7 @@ typedef struct _arguments {
 } arguments;
 
 static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n";
-static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary";
+static char args_doc[] = "path/to/cam-model path/to/raw-image-binary path/to/output-image-binary";
 static struct argp_option options[] = {
     {"num-inputs", 'n', "N", 0, "Number of input images"},
     {0},
@@ -724,14 +725,14 @@ int main(int argc, char *argv[]) {
       *transform_out, *gamut_out;
   float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
 
-  TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
+  TsTw = get_TsTw(args.args[CAM_MODEL], wb_index);
   float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
   free(TsTw);
   TsTw = trans;
-  ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
-  weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
-  coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
-  tone_map = get_tone_map("cam_models/NikonD7000/");
+  ctrl_pts = get_ctrl_pts(args.args[CAM_MODEL], num_ctrl_pts);
+  weights = get_weights(args.args[CAM_MODEL], num_ctrl_pts);
+  coefs = get_coefs(args.args[CAM_MODEL], num_ctrl_pts);
+  tone_map = get_tone_map(args.args[CAM_MODEL]);
 
   input_scaled = (float *)malloc_aligned(bytes_fimage);
   result_scaled = (float *)malloc_aligned(bytes_fimage);
diff --git a/hpvm/test/benchmarks/pipeline/CMakeLists.txt b/hpvm/test/benchmarks/pipeline/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..db2b8eeab1647f4ac72c05127877ecec1f949c90
--- /dev/null
+++ b/hpvm/test/benchmarks/pipeline/CMakeLists.txt
@@ -0,0 +1,15 @@
+find_package(OpenCV 2)
+if(${OpenCV_FOUND})
+  set(
+    all_flags
+    -O3 -I${OpenCV_INCLUDE_DIRS}
+    -ffast-math -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
+    -lpthread
+  )
+  add_hpvm_benchmark("pipeline_cpu" "pipeline-cpu" "${all_flags}" "c++" src/main.cc "")
+  if(HPVM_USE_OPENCL)
+    add_hpvm_benchmark("pipeline_gpu" "pipeline-gpu" "${all_flags};--opencl" "c++" src/main.cc "")
+  endif()
+else()
+  message(WARNING "opencv-2 not found; not compiling HPVM benchmark 'pipeline'.")
+endif()
diff --git a/hpvm/test/benchmarks/pipeline/Makefile b/hpvm/test/benchmarks/pipeline/Makefile
deleted file mode 100644
index 8a55393f241f30d840cbf85c31488e652c2023a0..0000000000000000000000000000000000000000
--- a/hpvm/test/benchmarks/pipeline/Makefile
+++ /dev/null
@@ -1,118 +0,0 @@
-# This Makefile compiles the HPVM-pipeline test.
-# It builds HPVM-related dependencies, then the native camera pipeline ISP code.
-#
-# Paths to some dependencies (e.g., HPVM, LLVM) must exist in Makefile.config,
-# which can be copied from Makefile.config.example for a start.
-
-CONFIG_FILE := ../include/Makefile.config
-
-ifeq ($(wildcard $(CONFIG_FILE)),)
-    $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example)
-endif
-include $(CONFIG_FILE)
-
-# Compiler Flags
-
-LFLAGS += -lm -lrt
-
-ifeq ($(TARGET),)
-    TARGET = gpu
-endif
-
-# Build dirs
-SRC_DIR = src/
-BUILD_DIR = build/$(TARGET)
-CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-
-EXE = pipeline-$(TARGET)
-
-INCLUDES += -I$(SRC_DIR)
-
-## BEGIN HPVM MAKEFILE
-SRCDIR_OBJS=
-HPVM_OBJS=main.hpvm.ll
-APP = $(EXE)
-APP_CFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
-APP_CXXFLAGS += $(INCLUDES) -ffast-math -O3 -fno-lax-vector-conversions -fno-vectorize -fno-slp-vectorize
-APP_LDFLAGS=`pkg-config opencv --libs`
-
-CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS)
-OBJS_CFLAGS = $(APP_CFLAGS) $(PLATFORM_CFLAGS)
-CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
-LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
-
-HPVM_RT_PATH = $(LLVM_BUILD_DIR)/tools/hpvm/projects/hpvm-rt
-HPVM_RT_LIB = $(HPVM_RT_PATH)/hpvm-rt.bc
-
-TESTGEN_OPTFLAGS = -load LLVMGenHPVM.so -genhpvm -globaldce
-
-ifeq ($(TARGET),seq)
-  DEVICE = CPU_TARGET
-  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_CPU.so -load LLVMClearDFG.so -dfg2llvm-cpu -clearDFG
-  HPVM_OPTFLAGS += -hpvm-timers-cpu
-else
-  DEVICE = GPU_TARGET
-  HPVM_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_OpenCL.so -load LLVMDFG2LLVM_CPU.so -load LLVMClearDFG.so -localmem -dfg2llvm-opencl -dfg2llvm-cpu -clearDFG
-  HPVM_OPTFLAGS += -hpvm-timers-cpu -hpvm-timers-ptx
-endif
-  TESTGEN_OPTFLAGS += -hpvm-timers-gen
-
-CFLAGS += -DDEVICE=$(DEVICE)
-CXXFLAGS += -DDEVICE=$(DEVICE)
-
-# Add BUILDDIR as a prefix to each element of $1
-INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
-
-.PRECIOUS: $(BUILD_DIR)/%.ll
-
-OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
-TEST_OBJS = $(call INBUILDDIR,$(HPVM_OBJS))
-KERNEL = $(TEST_OBJS).kernels.ll
-
-ifeq ($(TARGET),seq)
-else
-  KERNEL_OCL = $(TEST_OBJS).kernels.cl
-endif
-
-HOST_LINKED = $(BUILD_DIR)/$(APP).linked.ll
-HOST = $(BUILD_DIR)/$(APP).host.ll
-
-ifeq ($(OPENCL_PATH),)
-FAILSAFE=no_opencl
-else 
-FAILSAFE=
-endif
-
-# Targets
-default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(EXE)
-
-clean :
-	if [ -f $(EXE) ]; then rm $(EXE); fi
-	if [ -f DataflowGraph.dot ]; then rm DataflowGraph.dot*; fi
-	if [ -d $(BUILD_DIR) ]; then rm -rf $(BUILD_DIR); fi
-
-$(KERNEL_OCL) : $(KERNEL)
-	$(OCLBE) $< -o $@
-
-$(EXE) : $(HOST_LINKED)
-	$(CXX) -O3 $(LDFLAGS) $< -o $@
-
-$(HOST_LINKED) : $(HOST) $(OBJS) $(HPVM_RT_LIB)
-	$(LLVM_LINK) $^ -S -o $@
-
-$(HOST) $(KERNEL): $(BUILD_DIR)/$(HPVM_OBJS)
-	$(OPT) $(HPVM_OPTFLAGS) -S $< -o $(HOST)
-
-$(BUILD_DIR):
-	mkdir -p $(BUILD_DIR)
-
-$(BUILD_DIR)/%.ll : $(SRC_DIR)/%.cc
-	$(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $<
-
-$(BUILD_DIR)/main.ll : $(SRC_DIR)/main.cc
-	$(CC) $(CXXFLAGS) -emit-llvm -S -o $@ $<
-
-$(BUILD_DIR)/main.hpvm.ll : $(BUILD_DIR)/main.ll
-	$(OPT) $(TESTGEN_OPTFLAGS) $< -S -o $@
-
-## END HPVM MAKEFILE
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
index 3a0c6534e02ce82fdfd02f483f71c6be1a9ab433..9f34317d34157d57468c60cb854828b5c54f1cde 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
@@ -1,53 +1,21 @@
-# First get approxhpvm.py which we then use to compile benchmarks.
-get_filename_component(APPROXHPVM_PY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py REALPATH)
-
-# Configure config.h which tells the benchmarks where's the model parameter directory.
-# We can also use the one in tensor_runtime, but we're avoiding that so as to 
-# decouple things.
+# Each source file contains a @MODEL_PARAMS_DIR@ waiting to be filled in.
 set(MODEL_PARAMS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../model_params/")
-configure_file(
-  "include/config.h.in"
-  "${CMAKE_CURRENT_BINARY_DIR}/include/config.h"
-)
-# This will be an extra include directory (specific to these benchmarks)
-# and we'll give this to approxhpvm.py
-set(CONFIG_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/include")
-
-# --[ llvm-lit test setup
-# lit.cfg.py looks for tests in CMAKE_CURRENT_BINARY_DIR (see lit.cfg.py)
-# as most of the tests require some kind of compilation / generation
-# which is best done over there.
-configure_lit_site_cfg(
-  ../../lit.site.cfg.py.in
-  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
-  MAIN_CONFIG
-  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
-)
-add_lit_testsuite(check-hpvm-dnn "Running HPVM DNNs"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS dnn_benchmarks  # Compile all dnn benchmarks to run them
-  ARGS "-j1"  # Run DNN benchmarks sequentially
-)
-# Install an accuracy comparator under build/bin
-set(BIN_DIR ${LLVM_BINARY_DIR}/${LLVM_TOOLS_INSTALL_DIR})
-add_custom_command(
-  OUTPUT ${BIN_DIR}/check_dnn_acc.py
-  COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/check_dnn_acc.py ${BIN_DIR}
-  COMMAND chmod +x ${BIN_DIR}/check_dnn_acc.py
-  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/check_dnn_acc.py
-)
-
 set(test_compile_targets "")
-function(compile_hpvm_c bin_filename src_filepath codegen_target)
+function(compile_hpvm_c target_name src_filepath codegen_target)
+  set(generated_file_path "${CMAKE_CURRENT_BINARY_DIR}/${target_name}.cpp")
+  set(output_bin_path "${CMAKE_CURRENT_BINARY_DIR}/hpvm_${target_name}")
+  configure_file(${src_filepath} ${generated_file_path})
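+  # configure_file() substitutes @MODEL_PARAMS_DIR@ in the benchmark source.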
+  # Add an "hpvm_" prefix here because the Ninja generator doesn't allow
+  # an output file and a custom target to share the same name.
   add_custom_command(
-    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bin_filename}
-    DEPENDS ${src_filepath} approxhpvm.py
-    COMMAND ${APPROXHPVM_PY}
-      ${src_filepath} ${CMAKE_CURRENT_BINARY_DIR}/${bin_filename}
-      -t ${codegen_target} -I ${CONFIG_INCLUDE_DIR} ${ARGV}
+    OUTPUT ${output_bin_path}
+    DEPENDS ${generated_file_path} hpvm-clang
+    COMMAND hpvm-clang
+      ${generated_file_path} ${output_bin_path} -O3 -fno-exceptions
+      -t ${codegen_target} -I ${CMAKE_CURRENT_SOURCE_DIR}/include ${ARGN}
   )
-  add_custom_target(${bin_filename} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bin_filename})
-  set(test_compile_targets ${test_compile_targets} ${bin_filename} PARENT_SCOPE)
+  add_custom_target(${target_name} DEPENDS ${output_bin_path})
+  set(test_compile_targets ${test_compile_targets} ${target_name} PARENT_SCOPE)
 endfunction(compile_hpvm_c)
 
 function(hpvm_add_dnn_test benchmark_target)
@@ -61,7 +29,8 @@ function(hpvm_add_dnn_test benchmark_target)
   # Removes the final_accuracy file
   llvm_test_run(EXECUTABLE rm final_accuracy)
   # llvm_add_test creates .test file to given output path for given binary.
-  llvm_add_test(${benchmark_target}.test ${CMAKE_CURRENT_BINARY_DIR}/${benchmark_target})
+  # Also add the "hpvm_" prefix here to match the binary name compile_hpvm_c() produced.
+  llvm_add_test(${benchmark_target}.test ${CMAKE_CURRENT_BINARY_DIR}/hpvm_${benchmark_target})
   # TODO: add_dependencies
 endfunction(hpvm_add_dnn_test)
 
@@ -79,6 +48,31 @@ foreach(dir ${entries})
   hpvm_add_dnn_test(${dirname}_cudnn)
 endforeach(dir)
 
+# Install an accuracy comparator under build/bin for the test suite.
+set(BIN_DIR ${LLVM_BINARY_DIR}/${LLVM_TOOLS_INSTALL_DIR})
+add_custom_command(
+  OUTPUT ${BIN_DIR}/check_dnn_acc.py
+  COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/check_dnn_acc.py ${BIN_DIR}
+  COMMAND chmod +x ${BIN_DIR}/check_dnn_acc.py
+  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/check_dnn_acc.py
+)
+
 message(STATUS "List of HPVM-C DNN benchmarks: ${test_compile_targets}")
 add_custom_target(dnn_benchmarks DEPENDS ${test_compile_targets} ${BIN_DIR}/check_dnn_acc.py)
 message(STATUS "Target name for compiling all DNN benchmarks: dnn_benchmarks")
+
+# --[ llvm-lit test setup
+# lit.cfg.py looks for tests in CMAKE_CURRENT_BINARY_DIR (see lit.cfg.py)
+# as most of the tests require some kind of compilation / generation
+# which is best done over there.
+configure_lit_site_cfg(
+  ../../lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  MAIN_CONFIG
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+)
+add_lit_testsuite(check-hpvm-dnn "Running HPVM DNNs"
+  ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS dnn_benchmarks  # Compile all dnn benchmarks to run them
+  ARGS "-j1"  # Run DNN benchmarks sequentially
+)
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
index 860e3b6423bc78d073096a981f765bed10fb73a7..39f49784d76470c4e0bab213127369806e1e2531 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
@@ -429,7 +429,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet2_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
index f44e19dece121cb01a1f3e6a8bf9e27ea945e6ce..dafd1a6ae084c4e1bf819ce1ac94e667c696eb24 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
@@ -434,7 +434,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet2_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
index 6d8973ad982b1aa3b206a0cf40ee1888c37e293f..64350c590bb181fa4eaab4b2bf5fb37f69e11c09 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
@@ -380,7 +380,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint8_t *labels = readLabels(labels_path.c_str(), 5000);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
index b2a940d501d8b1c2e29dbe7240012ace8197bbb4..72af2ff4a1b33aabac427d203101c32c4a7403c7 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
@@ -386,7 +386,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
index 474ab64cadf3eac158d39e6e1e6686765c3bac36..37e7a34a51a14b6903d549f271d3c0c83822fec8 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
@@ -478,7 +478,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
index 10e95202f2e2188a9dcd1c12a168a612f897fcf9..1206d7bac4b9dcff2b4cfd7183f4a3e5f65d73d9 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
@@ -483,7 +483,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt
index 32a9642d38ab816246b9e5cca01c6efcec3a2d8d..b8224da41c52f093bd61a23bae05eb09f39148a0 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt
@@ -9,400 +9,400 @@ conf1 1 1 98.7 0.0
 -----
 +++++
 conf2 1.828613181003043 2.071721708828981 98.65 0.04999999999999716
-1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 156 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 261 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf3 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716
-1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 152 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf4 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716
-1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 152 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf5 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716
-1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 152 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf6 1.8247639611533713 2.0227145446958756 98.64 0.060000000000002274
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 261 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf7 1.8247639611533713 2.0227145446958756 98.64 0.060000000000002274
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 261 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf8 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf9 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf10 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf11 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274
-1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 155 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf12 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274
-1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 155 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf13 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274
-1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 155 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf14 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125
-1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf15 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125
-1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf16 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125
-1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf17 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125
-1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf18 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125
-1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf19 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125
-1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf20 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853
-1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 12 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf21 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853
-1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 12 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf22 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853
-1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 12 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf23 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf24 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf25 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf26 2.200653361151419 2.425091789360736 98.6 0.10000000000000853
-1 gpu conv samp_fp16 266 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv samp_fp16 266 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf27 2.200653361151419 2.425091789360736 98.6 0.10000000000000853
-1 gpu conv samp_fp16 266 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv samp_fp16 266 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf28 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf29 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf30 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455
-1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv fp16 11 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf31 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455
-1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 156 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf32 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455
-1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 156 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf33 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455
-1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 156 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf34 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455
-1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 158 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf35 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455
-1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 158 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf36 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455
-1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 158 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf37 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057
-1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 168 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf38 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057
-1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 168 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf39 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057
-1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 168 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf40 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057
-1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 157 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf41 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057
-1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 157 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf42 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057
-1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 157 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf43 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057
-1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 153 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf44 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057
-1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 153 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf45 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057
-1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 153 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf46 1.8698191484268973 2.13979218727595 98.54 0.1599999999999966
-1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 159 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf47 1.8698191484268973 2.13979218727595 98.54 0.1599999999999966
-1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 159 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf48 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682
-1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 165 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf49 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682
-1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 165 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf50 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682
-1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 165 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
 -----
 +++++
 conf51 1.8534621507951072 2.1231113105788597 98.44000000000001 0.2599999999999909
-1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1
-2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1
+1 gpu conv perf_fp16 159 add fp16 1 tanh fp16 12 pool_max fp16 12
+2 gpu conv samp_fp16 261 add fp16 1 tanh fp16 12 pool_max fp16 12
 3 gpu mul fp16 12 add fp16 1 tanh fp16 1
 4 gpu mul fp16 12 add fp16 1 tanh fp16 1
 5 gpu softmax fp32 1
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
index 5c42f6953cfd9256cea73b39868a7ec571f18565..d7ab4238ebac5598b92c432aced85a602bb5ce89 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
@@ -282,7 +282,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/lenet_mnist/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
index 0c2568f81b701cb474a257b190be61b4bba45f3e..26acc65a99287ea9f20e037dd996635315d76e48 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
@@ -287,7 +287,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/lenet_mnist/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
index 01d027341686291c83e605bdeee1bbcffa68d6e9..5f8c63dbfbfb800dc6f60f9ed9a6108dee0a9a48 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
@@ -1984,7 +1984,7 @@ int main(int argc, char *argv[]) {
   }
 
   std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
+      std::string("@MODEL_PARAMS_DIR@") + "/mobilenet_cifar10/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
index e51e85dd980dd910389ec4415174e6e005f75c41..2070089053ef0b6e7e0ca33c2c6cc4cea17b8e29 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
@@ -1989,7 +1989,7 @@ int main(int argc, char *argv[]) {
   }
 
   std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
+      std::string("@MODEL_PARAMS_DIR@") + "/mobilenet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
index fa83c534d0639241205758018f8f7c37401e6b22..5b580f26821e67cc96c8347e485b792f40105176 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
@@ -1318,7 +1318,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
index c7b789c2343a8dfd1e847652af2bd1d6adfd51f1..735e2c9abab91f00560faa5496e234321027b82c 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
@@ -1249,7 +1249,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
index 91d07e30469e675fd2027f29290e35a0db888174..160563064cc47effd463c4915b0c7f0d93bff56f 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
@@ -5151,7 +5151,7 @@ int main(int argc, char *argv[]) {
   }
 
   std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
+      std::string("@MODEL_PARAMS_DIR@") + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
index 932580e03e7ccc4495d8d76be2f7147369e36d68..c5cf2cb3a0177a5cce9ad0cf460484e63ded0ecd 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
@@ -4927,7 +4927,7 @@ int main(int argc, char *argv[]) {
   }
 
   std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
+      std::string("@MODEL_PARAMS_DIR@") + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
index 195c676c11d53b19e0d18ed4908198a929d188aa..bec6139c2d089e90d09fa239e1b15c9a835fd4ea 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
@@ -845,7 +845,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
index c304237ea57ba15d48cff0773860cdc469fc2a04..4fa7d5c121bacff122821fe983ed443e3c6db249 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
@@ -850,7 +850,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
index 4cd5c134293d85983146352175e278915ab1d2ba..8666030fba4390d29d9324f5a5c7d60324325f05 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
@@ -845,7 +845,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar100/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
index 532fca6b856f296624c21e9a18421763c4b70f48..6d01caa3b7c0875cff4f3e16131ddd09195e92b7 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
@@ -850,7 +850,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar100/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
index 8e299f40e6ddd04a3ce9f8d9dffff49b1de36189..b1b2b4f2e312b6372e10a2fce3ef12eab2dddded 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
@@ -893,7 +893,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
index 930a33e43c706e6e91475fc97671c39c23f63387..eb29e45805671072428318412f27b05d0da90199 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
@@ -898,7 +898,7 @@ int main(int argc, char *argv[]) {
     }
   }
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/include/config.h.in b/hpvm/test/dnn_benchmarks/hpvm-c/include/config.h.in
deleted file mode 100644
index 0eb8f3f24d0e51f2aaa12f1fd672043599490082..0000000000000000000000000000000000000000
--- a/hpvm/test/dnn_benchmarks/hpvm-c/include/config.h.in
+++ /dev/null
@@ -1 +0,0 @@
-#define MODEL_PARAMS_DIR "@MODEL_PARAMS_DIR@"
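
With config.h.in deleted, MODEL_PARAMS_DIR is no longer a preprocessor macro: each benchmark source now carries a literal `@MODEL_PARAMS_DIR@` token and is treated as a template that CMake's configure_file instantiates into the build tree (see the add_trt_source machinery below). Roughly, configure_file performs the substitution sketched here; the paths used are hypothetical placeholders:

```python
# Rough Python equivalent of CMake's configure_file @VAR@ substitution.
# Illustration only; the real substitution happens at CMake configure time.
import re

def configure_file(template_path: str, output_path: str, variables: dict) -> None:
    text = open(template_path).read()
    # Replace every @NAME@ token; undefined names become empty strings,
    # mirroring CMake's treatment of unset variables.
    text = re.sub(r"@(\w+)@", lambda m: str(variables.get(m.group(1), "")), text)
    open(output_path, "w").write(text)

# Hypothetical paths for illustration:
configure_file(
    "lenet_mnist.cpp",            # contains std::string("@MODEL_PARAMS_DIR@")
    "build/lenet_mnist.cpp",      # generated source that actually gets compiled
    {"MODEL_PARAMS_DIR": "/opt/hpvm/test/dnn_benchmarks/model_params"},
)
```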
diff --git a/hpvm/test/dnn_benchmarks/keras/Benchmark.py b/hpvm/test/dnn_benchmarks/keras/Benchmark.py
index c225ff97661aef8f4fceb123b79190cb7c819dd7..f3d8e9e6b2268618dc835e3d27374a8f7d738a86 100644
--- a/hpvm/test/dnn_benchmarks/keras/Benchmark.py
+++ b/hpvm/test/dnn_benchmarks/keras/Benchmark.py
@@ -52,20 +52,23 @@ class Benchmark:
         
         try:
             subprocess.run([
-                "approxhpvm.py", 
+                "hpvm-clang", 
                 "-h"
             ], check=True, stdout=FNULL)
             
         except:
-            print ("\n\n ERROR: Could not find approxhpvm.py (HPVM compile script)!! \n\n")
-            print ("To Compile, Must set PATH to include approxhpvm.py script. Do the following: ")
-            print ("**** export PATH=${PATH_TO_YOUR_HPVM_INSTALLATION}/build/bin/:$PATH *****")
+            print("""
+
+ERROR: Could not find hpvm-clang (HPVM compile script)!!
+
+hpvm-clang is installed into the Python environment that was active when HPVM was built.
+Please try rerunning 'make -j hpvm-clang'.""")
             sys.exit(1)
 
 
         try:
             subprocess.run([
-                "approxhpvm.py", src_file, target_binary,
+                "hpvm-clang", src_file, target_binary,
                 "-t", "tensor", "--conf-file", approx_conf_file
             ], check=True)
         except:
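
The rewritten Benchmark.py shells out to hpvm-clang exactly as in the hunk above. For reference, a standalone equivalent of that invocation (source, binary, and config file names are hypothetical placeholders):

```python
# Standalone equivalent of the hpvm-clang call in Benchmark.py.
import subprocess

subprocess.run(
    ["hpvm-clang", "lenet.cpp", "lenet_binary",
     "-t", "tensor",                     # use the approximable tensor backend
     "--conf-file", "tuner_confs.txt"],  # required whenever -t tensor is used
    check=True,                          # raise CalledProcessError on failure
)
```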
diff --git a/hpvm/test/dnn_benchmarks/profiling/test_hpvm_c_profiling.py b/hpvm/test/dnn_benchmarks/profiling/test_hpvm_c_profiling.py
index 5f4a96740cedb05295e4fcde0c5dfa65a0be34cc..230fdf8b73dfd7959cfaa98fe06eafe6a75087b1 100755
--- a/hpvm/test/dnn_benchmarks/profiling/test_hpvm_c_profiling.py
+++ b/hpvm/test/dnn_benchmarks/profiling/test_hpvm_c_profiling.py
@@ -14,7 +14,7 @@ benchmarks_srcdir = Path(__file__).parent / "../hpvm-c/benchmarks"
 # So we know where the benchmark binaries are due to source directory structure,
 # and this is not hardcoding.
 dnn = argv[1]
-bench_bin_file = benchmarks_bindir / dnn
+bench_bin_file = benchmarks_bindir / f"hpvm_{dnn}"
 config_file = benchmarks_srcdir / dnn / "data/tuner_confs.txt"
 out_config_file = f"./{dnn}.txt"
 profile_configs(bench_bin_file, config_file, out_config_file)
diff --git a/hpvm/test/dnn_benchmarks/tensor-rt-src/CMakeLists.txt b/hpvm/test/dnn_benchmarks/tensor-rt-src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6e22eba67471855971005bf9e57ed0aa38dafff8
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/CMakeLists.txt
@@ -0,0 +1,40 @@
+#**************** FP32 and FP16 TensorRT Source Builds *********** 
+# Don't put binaries in build/bin. This doesn't affect the global setting.
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+set(MODEL_PARAMS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../model_params/")
+set(test_compile_targets "")
+function(add_trt_source target_name filepath)
+  set(generated_file_path "${CMAKE_CURRENT_BINARY_DIR}/${target_name}.cpp")
+  configure_file(${filepath} ${generated_file_path})
+  add_executable(${target_name} ${generated_file_path})
+  target_link_libraries(${target_name} tensor_runtime_online)
+  set(test_compile_targets ${test_compile_targets} ${target_name} PARENT_SCOPE)
+endfunction(add_trt_source)
+
+set(
+  DNN_NAMES
+  alexnet_cifar10
+  alexnet2_cifar10
+  vgg16_cifar10
+  resnet18_cifar10
+  vgg16_cifar100
+  mobilenet_cifar10
+  alexnet_imagenet
+  vgg16_imagenet
+  resnet50_imagenet
+)
+foreach(dnn_name ${DNN_NAMES})
+  # FP32 version
+  if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/fp32/${dnn_name}.cc")
+    add_trt_source(${dnn_name}_fp32 "${CMAKE_CURRENT_SOURCE_DIR}/fp32/${dnn_name}.cc")
+  endif()
+  # FP16 version
+  if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/fp16/${dnn_name}_half.cc")
+    add_trt_source(${dnn_name}_fp16 "${CMAKE_CURRENT_SOURCE_DIR}/fp16/${dnn_name}_half.cc")
+  endif()
+endforeach()
+
+message(STATUS "List of tensor_runtime DNN benchmarks: ${test_compile_targets}")
+add_custom_target(trt_dnn_benchmarks DEPENDS ${test_compile_targets})
+message(STATUS "Target name for compiling all DNN benchmarks: trt_dnn_benchmarks")
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/alexnet2_cifar10_half.cc
similarity index 96%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/alexnet2_cifar10_half.cc
index 0b344035296bdbab2744e32604f3a8881feb6230..ab80718fd33d0b9787be4a0f183e3a7a65dc76e7 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/alexnet2_cifar10_half.cc
@@ -1,7 +1,8 @@
 
 
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 /* NOTE: Reference Architecture to use for profiling */
 void testCifarNet() {
@@ -9,8 +10,8 @@ void testCifarNet() {
   printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
 
   std::string dir_prefix =
-      model_params_path + std::string("/alexnet2_cifar10/");
-  std::string input_path = dir_prefix + std::string("test_input.bin");
+      std::string("@MODEL_PARAMS_DIR@") + "/alexnet2_cifar10/";  std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
 
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/alexnet_cifar10_half.cc
similarity index 95%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/alexnet_cifar10_half.cc
index 7d493b8720ab701f87fdd53b315da7eafecf6637..b3b69d6b695eca9286b90685f3e071e234887d27 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/alexnet_cifar10_half.cc
@@ -1,13 +1,14 @@
 
 
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet_cifar10/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/lenet_mnist_half.cc
similarity index 97%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/lenet_mnist_half.cc
index 03dc905bbfcb07ad9a266fc153cd1a6a0db9837e..44b78b9169707fd6c7b9ff6503a4a9aa8d2ec947 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/lenet_mnist_half.cc
@@ -1,7 +1,8 @@
 
 
 #include "tensor_runtime.h"
-#include "utils.h"
+#include "tensorUtils.h"
+
 
 /* NOTE: Reference Architecture to use for profiling */
 void testLenetTanh() {
@@ -11,7 +12,7 @@ void testLenetTanh() {
 
   int test_batch_size = 5000;
 
-  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/lenet_mnist/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/mobilenet_cifar10_half.cc
similarity index 99%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/mobilenet_cifar10_half.cc
index d6eaef755743ce961d3d9c2f013eef26a77579f7..d4423bf4345756e72ad46b140ae8cafc26eae264 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/mobilenet_cifar10_half.cc
@@ -1,15 +1,15 @@
 
 
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
   std::string dir_prefix =
-      model_params_path + std::string("/mobilenet_cifar10/");
-
+      std::string("@MODEL_PARAMS_DIR@") + "/mobilenet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
 
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/resnet18_cifar10_half.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/resnet18_cifar10_half.cc
index 40e128eb8a80f6e080c090589a3e91b80ffa082f..76dea5ef08713d22fe7086b678bb3274378d0fd9 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/resnet18_cifar10_half.cc
@@ -1,15 +1,15 @@
 
 
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
   std::string dir_prefix =
-      model_params_path + std::string("/resnet18_cifar10/");
-
+      std::string("@MODEL_PARAMS_DIR@") + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
 
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/vgg16_cifar100_half.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/vgg16_cifar100_half.cc
index eb3275b83009ec4300e9cb713f3b182727661db4..2772fd3da42d50aa2ff5391d1e3c85c610a4960a 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/vgg16_cifar100_half.cc
@@ -1,13 +1,14 @@
 
 
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar100/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/vgg16_cifar10_half.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/vgg16_cifar10_half.cc
index 2a4b5a6a914698f621284e0f5b19843b817068df..954f6778b899d2cefb2b28d68a32fad33d52f70c 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp16/vgg16_cifar10_half.cc
@@ -1,13 +1,14 @@
 
 
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet2_cifar10.cc
similarity index 96%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet2_cifar10.cc
index 396e9f11cae92c2f6613b5acb799caecbf025a59..e7431234d705449efa0fc5aafe23238e89be1d30 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet2_cifar10.cc
@@ -1,7 +1,8 @@
 
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
 
 /* NOTE: Reference Architecture to use for profiling */
 void testCifarNet() {
@@ -9,8 +10,8 @@ void testCifarNet() {
   printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
 
   std::string dir_prefix =
-      model_params_path + std::string("/alexnet2_cifar10/");
-  std::string input_path = dir_prefix + std::string("test_input.bin");
+      std::string("@MODEL_PARAMS_DIR@") + "/alexnet2_cifar10/";  std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
 
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet_cifar10.cc
similarity index 96%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet_cifar10.cc
index 600512078563baf850f440ea97e78cb2d73be170..12c304c9b401c586a0da4658b092f2b791268983 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet_cifar10.cc
@@ -1,13 +1,14 @@
 
 
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/alexnet_cifar10/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   // std::string labels_path = dir_prefix + std::string("labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet_imagenet.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet_imagenet.cc
index 29909e5938ca0f700c4ee22165ae2ad354e53a32..b57e60c0fef41b283ad57a7b203759a8f014252d 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/alexnet_imagenet.cc
@@ -1,15 +1,15 @@
 
 
 #include "tensor_runtime.h"
-#include "utils.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
   std::string dir_prefix =
-      model_params_path + std::string("/alexnet_imagenet/");
-
+      std::string("@MODEL_PARAMS_DIR@") + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/lenet_mnist.cc
similarity index 97%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/lenet_mnist.cc
index 61a0eeb441458ff6f91af8bc76ecc17a33428aec..9777670722b69c8b23a82a77312d17386f2d5c3f 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/lenet_mnist.cc
@@ -1,7 +1,8 @@
 
 
 #include "tensor_runtime.h"
-#include "../include/utils.h"
+#include "tensorUtils.h"
+
 
 int total_runs = 1;
 
@@ -10,7 +11,7 @@ void testLenetTanh() {
 
   int test_batch_size = 5000;
 
-  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/lenet_mnist/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/mobilenet_cifar10.cc
similarity index 99%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/mobilenet_cifar10.cc
index 85849126cf164693d12fb08aba8326033ca61b82..3e37bf7feb6641af3afdeb8fb9f3a65fdfcbdce3 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/mobilenet_cifar10.cc
@@ -1,15 +1,15 @@
 
 
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
   std::string dir_prefix =
-      model_params_path + std::string("/mobilenet_cifar10/");
-  std::string input_path = dir_prefix + std::string("test_input.bin");
+      std::string("@MODEL_PARAMS_DIR@") + "/mobilenet_cifar10/";  std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/resnet18_cifar10.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/resnet18_cifar10.cc
index bd3dd7dc1ea23f3cb8ad91e8632b347dd51a848b..c8a99419a81d19b374642c21c977a511413f9ae2 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/resnet18_cifar10.cc
@@ -1,15 +1,15 @@
 
 
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
   std::string dir_prefix =
-      model_params_path + std::string("/resnet18_cifar10/");
-
+      std::string("@MODEL_PARAMS_DIR@") + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
 
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/resnet50_imagenet.cc
similarity index 99%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/resnet50_imagenet.cc
index 0cccb124b0dca81d45887df50c4a9bcaf2a21db5..3aeabc22736e6955a9ad5ad07144fc38057616ea 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/resnet50_imagenet.cc
@@ -1,15 +1,15 @@
 
 
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
   std::string dir_prefix =
-      model_params_path + std::string("/resnet50_imagenet/");
-
+      std::string("@MODEL_PARAMS_DIR@") + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_cifar10.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_cifar10.cc
index 813874f0ed888ed5889b0574de454ca2720c944d..f7fffadfc36ba0fd248371efb35a1b7dfede68d3 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_cifar10.cc
@@ -1,13 +1,13 @@
 
 
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_cifar100.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_cifar100.cc
index 84164bf24bbff855b7a0975d7df2883e674b34c8..d3949c7cc568063f3b344d8497551fa1f4f4102c 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_cifar100.cc
@@ -1,13 +1,14 @@
 
 
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
+#include "tensor_runtime.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_cifar100/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
 
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_imagenet.cc
similarity index 98%
rename from hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
rename to hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_imagenet.cc
index eca833b08ff374e208f0cafabbf598cd0f7b5d90..2bb1be2821a8d33062bf1cfd83bb978f59884fa9 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
+++ b/hpvm/test/dnn_benchmarks/tensor-rt-src/fp32/vgg16_imagenet.cc
@@ -1,13 +1,14 @@
 
 
 #include "tensor_runtime.h"
-#include "utils.h"
+#include "tensorUtils.h"
+
 
 int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-  std::string dir_prefix = model_params_path + std::string("/vgg16_imagenet/");
+  std::string dir_prefix = std::string("@MODEL_PARAMS_DIR@") + "/vgg16_imagenet/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
diff --git a/hpvm/test/hpvm_pass/CMakeLists.txt b/hpvm/test/hpvm_pass/CMakeLists.txt
index fe929f4d6316b54e4b4b19db1221ae146534ee0b..a2be31fd4fc27dcdb83c5ed7424c6577eefb3d5d 100644
--- a/hpvm/test/hpvm_pass/CMakeLists.txt
+++ b/hpvm/test/hpvm_pass/CMakeLists.txt
@@ -7,7 +7,7 @@ configure_lit_site_cfg(
 
 set(
   HPVM_TEST_DEPENDS
-  opt hpvm-rt.bc
+  opt hpvm-rt-bc
   # Passes:
   LLVMGenHPVM LLVMBuildDFG LLVMLocalMem LLVMClearDFG
   LLVMDFG2LLVM_CPU LLVMDFG2LLVM_OpenCL
diff --git a/hpvm/tools/CMakeLists.txt b/hpvm/tools/CMakeLists.txt
index 495348404f3594d8bca46b87acd8be2d08d2c8a4..4758dde6a3f1564739135225f6f8673b5d34843d 100644
--- a/hpvm/tools/CMakeLists.txt
+++ b/hpvm/tools/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_llvm_tool_subdirectory(hpvm-config)
-add_llvm_tool_subdirectory(py-approxhpvm)
\ No newline at end of file
+add_llvm_tool_subdirectory(hpvm-clang)
\ No newline at end of file
diff --git a/hpvm/tools/hpvm-clang/CMakeLists.txt b/hpvm/tools/hpvm-clang/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3564b00d0aa480996c152c1f027fa6fc39cd144c
--- /dev/null
+++ b/hpvm/tools/hpvm-clang/CMakeLists.txt
@@ -0,0 +1,87 @@
+# This file is very tightly coupled with main.py.in.
+# Watch out and keep them in sync.
+
+# CMake fills in some variables in main.py.in and generates it into a python package:
+# `hpvmpy`, which is the main entry point and Python API for HPVM.
+
+# ---[ Define variables for main.py.in
+# main.py.in requires the following variables:
+# LLVM_PROJECT_DIR, LLVM_BUILD_DIR
+# TRT_PATH, TRT_INCLUDE_DIRS, TRT_LINK_DIRS, TRT_LINK_LIBS
+# DIRECT_LINK_LIBS, HPVM_USE_OPENCL (defined globally)
+# AVAILABLE_PASSES, HPVM_RT_PATH
+
+set(LLVM_PROJECT_DIR ${CMAKE_SOURCE_DIR})
+set(LLVM_BUILD_DIR ${CMAKE_BINARY_DIR})
+
+get_target_property(TRT_INCLUDE_DIRS tensor_runtime INCLUDE_DIRECTORIES)
+get_target_property(TRT_LINK_DIRS tensor_runtime TRT_LINK_DIRS)
+get_target_property(TRT_LINK_LIBS tensor_runtime TRT_LINK_LIBS)
+
+set(DIRECT_LINK_LIBS "$<TARGET_FILE:tensor_runtime>")
+if(${HPVM_USE_OPENCL})
+  # We need to link to OpenCL libs when hpvm uses opencl
+  # because OpenCL functions may be injected by the OpenCL pass.
+  list(APPEND DIRECT_LINK_LIBS ${OpenCL_LIBRARY})
+else()
+  message(STATUS "hpvm-clang is not using OpenCL (--opencl flag not available).")
+endif()
+
+# The hpvm-rt runtime
+# This has to be explicitly set as hpvm-rt.bc is created in a custom_target
+# and does not export its file location.
+# Keep this in sync with hpvm/projects/hpvm-rt/CMakeLists.txt.
+set(HPVM_RT_PATH ${LLVM_BUILD_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc)
+# And this must be kept manually in sync with `hpvm/lib/Transforms/*`,
+# which is fine for now because we don't have many passes.
+set(
+    AVAILABLE_PASSES
+    LLVMBuildDFG
+    LLVMClearDFG
+    LLVMDFG2LLVM_CPU
+    LLVMDFG2LLVM_CUDNN
+    LLVMDFG2LLVM_OpenCL
+    LLVMDFG2LLVM_WrapperAPI
+    LLVMFuseHPVMTensorNodes
+    LLVMGenHPVM
+    LLVMInPlaceDFGAnalysis
+    LLVMLocalMem
+)
+
+# ---[ Create package folder structure
+# This sounds roundabout, but since main.py.in is generated into a file under the
+# build/ dir, making a python package around it means we have to generate the
+# whole package structure in build/ as well.
+# Target dir structure:
+# ${CMAKE_CURRENT_BINARY_DIR}
+#   hpvmpy/
+#     __init__.py  <- generated from main.py.in
+#   setup.py       <- copied from setup.py
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/hpvmpy)
+file(COPY setup.py DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+
+# ---[ Generate main.py.in to hpvmpy/__init__.py
+set(init_path ${CMAKE_CURRENT_BINARY_DIR}/hpvmpy/__init__.py)
+# First resolve all `@symbol@` by configuring the file
+configure_file(main.py.in ${CMAKE_CURRENT_BINARY_DIR}/main.py.conf)
+# Then resolve all generator expressions we configured into the previous file
+file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/main.py INPUT ${CMAKE_CURRENT_BINARY_DIR}/main.py.conf)
+# Deliberately create an extra step of moving the file,
+# which is carried out at build time (as a target)
+# so we can set these dependencies on it.
+set(DEPS tensor_runtime hpvm-rt-bc clang opt llvm-link ${AVAILABLE_PASSES})
+add_custom_command(
+    OUTPUT ${init_path}
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/main.py ${init_path}
+    DEPENDS ${DEPS} ${CMAKE_CURRENT_BINARY_DIR}/main.py
+)
+
+# ---[ Call python3 -m pip to install this package.
+add_custom_target(
+    hpvmpy
+    COMMAND python3 -m pip install ./
+    DEPENDS ${init_path} setup.py
+)
+# hpvm-clang is the name of the compiler binary (see setup.py)
+# We provide this alias for convenience
+add_custom_target(hpvm-clang DEPENDS hpvmpy)
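
The two-pass generation above exists because configure_file only resolves `@VAR@` tokens, while values such as `$<TARGET_FILE:tensor_runtime>` are generator expressions that CMake can only evaluate later, during file(GENERATE). Conceptually, a line of main.py.in goes through both passes before landing in hpvmpy/__init__.py (all paths below are hypothetical):

```python
# Conceptual result of the two generation passes (hypothetical paths).
#
# main.py.in:             DIRECT_LINK_LIBS = "@DIRECT_LINK_LIBS@".split(";")
# after configure_file:   DIRECT_LINK_LIBS = "$<TARGET_FILE:tensor_runtime>".split(";")
# after file(GENERATE):
DIRECT_LINK_LIBS = "/opt/hpvm/build/lib/libtensor_runtime.so".split(";")

# HPVM_RT_PATH is a plain @VAR@ substitution, done entirely in the first pass:
HPVM_RT_PATH = "/opt/hpvm/build/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc"
```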
diff --git a/hpvm/tools/hpvm-clang/main.py.in b/hpvm/tools/hpvm-clang/main.py.in
new file mode 100644
index 0000000000000000000000000000000000000000..e2bc5cbafa23bd64094a3198ad8466f682f6bbdc
--- /dev/null
+++ b/hpvm/tools/hpvm-clang/main.py.in
@@ -0,0 +1,275 @@
+#!/usr/bin/env python3
+import argparse
+import os
+from pathlib import Path
+from typing import List, Union, Optional
+
+PathLike = Union[Path, str]
+
+HPVM_PROJECT_DIR = Path("@LLVM_PROJECT_DIR@") / "tools/hpvm"
+LLVM_BUILD_BIN = Path("@LLVM_BUILD_DIR@") / "bin"
+
+# Directories to include
+TRT_INCLUDE_DIRS = "@TRT_INCLUDE_DIRS@".split(";")
+TRT_LINK_DIRS = [Path(s) for s in "@TRT_LINK_DIRS@".split(";")]
+TRT_LINK_LIBS = "@TRT_LINK_LIBS@".split(";")
+DIRECT_LINK_LIBS = "@DIRECT_LINK_LIBS@".split(";")
+HPVM_USE_OPENCL = int("@HPVM_USE_OPENCL@")
+
+AVAILABLE_PASSES = "@AVAILABLE_PASSES@".split(";")
+HPVM_RT_PATH = "@HPVM_RT_PATH@"
+
+
+def compile_hpvm_c(
+    hpvm_src: PathLike,
+    output_file: PathLike,
+    tensor_target: Optional[str],
+    opencl: bool,
+    link_bitcode: List[PathLike] = None,
+    include: List[PathLike] = None,
+    macro: List[str] = None,
+    flags: List[str] = None,
+    optim_level: str = "0",  # -O0
+    is_cpp: bool = True,  # otherwise is C
+    std: str = None,  # language std (-std=c++11)
+    link_dirs: List[PathLike] = None,
+    link_libs: List[str] = None,
+    working_dir: PathLike = None,
+    conf_file: PathLike = None,
+):
+    from subprocess import check_output
+
+    passes = ["LLVMBuildDFG"]
+    pass_flags = ["buildDFG"]
+    if tensor_target == "tensor":
+        if conf_file is None:
+            raise ValueError("conf_file must be defined when tensor_target=='tensor'.")
+        passes += ["LLVMInPlaceDFGAnalysis", "LLVMFuseHPVMTensorNodes", "LLVMDFG2LLVM_WrapperAPI"]
+        pass_flags += [
+            "inplace", "hpvm-fuse", "dfg2llvm-wrapperapi",
+            f"configuration-inputs-filename={conf_file}"
+        ]
+    elif tensor_target == "cudnn":
+        passes += ["LLVMInPlaceDFGAnalysis", "LLVMDFG2LLVM_CUDNN"]
+        pass_flags += ["inplace", "dfg2llvm-cudnn"]
+    elif tensor_target is None:
+        passes += ["LLVMLocalMem"]
+        pass_flags += ["localmem"]
+    else:
+        raise ValueError(f"Tensor target {tensor_target} not recognized")
+    if opencl:
+        passes += ["LLVMDFG2LLVM_OpenCL"]
+        pass_flags += ["dfg2llvm-opencl"]
+    passes += ["LLVMDFG2LLVM_CPU", "LLVMClearDFG"]
+    pass_flags += ["dfg2llvm-cpu", "clearDFG"]
+
+    working_dir = Path(working_dir or ".")
+    if not working_dir.is_dir():
+        os.makedirs(working_dir)
+
+    # All commands for compiling the main hpvm_c file
+    name_stem = Path(hpvm_src).stem
+    ll_file = working_dir / f"{name_stem}.ll"
+    hpvm_ll_file = working_dir / f"{name_stem}.hpvm.ll"
+    llvm_ll_file = working_dir / f"{name_stem}.llvm.ll"
+    hpvm_rt_linked_file = working_dir / f"{name_stem}.linked.bc"
+    link_bitcode_ = [Path(bc) for bc in (link_bitcode or [])]
+    commands = [
+        hpvm_c_to_ll(hpvm_src, ll_file, include, macro, flags, optim_level, is_cpp, std),
+        opt_codegen_hpvm(ll_file, hpvm_ll_file),
+        _run_opt(hpvm_ll_file, llvm_ll_file, passes, pass_flags),
+        link_hpvm_rt(link_bitcode_ + [llvm_ll_file], hpvm_rt_linked_file),
+    ]
+    commands.append(
+        link_binary(hpvm_rt_linked_file, output_file, link_dirs, link_libs)
+    )
+    for command in commands:
+        print(" ".join(command))
+        check_output(command)
+
+
+def hpvm_c_to_ll(
+    src_file: PathLike,
+    target_file: PathLike,
+    extra_includes: List[PathLike] = None,
+    macros: List[str] = None,
+    flags: List[str] = None,
+    optim_level: str = "0",  # -O0
+    is_cpp: bool = True,  # otherwise is C
+    std: str = None,  # --std=c++11
+) -> List[str]:
+    includes = (extra_includes or []) + TRT_INCLUDE_DIRS
+    includes_s = [f"-I{path}" for path in includes]
+    macros = [f"-D{macro}" for macro in (macros or [])]
+    flags = [f"-f{flg}" for flg in (flags or [])]
+    if std:
+        flags.append(f"-std={std}")
+    clang = "clang++" if is_cpp else "clang"
+    return [
+        str(LLVM_BUILD_BIN / clang), *includes_s, *flags, *macros,
+        f"-O{optim_level}", "-emit-llvm", "-S",
+        str(src_file), "-o", str(target_file)
+    ]
+
+
+def opt_codegen_hpvm(src_file: PathLike, target_file: PathLike) -> List[str]:
+    return _run_opt(src_file, target_file, ["LLVMGenHPVM"], ["genhpvm", "globaldce"])
+
+
+def link_hpvm_rt(bitcodes: List[PathLike], target_file: PathLike) -> List[str]:
+    bitcodes_s = [str(bc) for bc in bitcodes]
+    return [str(LLVM_BUILD_BIN / "llvm-link"), *bitcodes_s, HPVM_RT_PATH, "-S", "-o", str(target_file)]
+
+
+def link_binary(
+    src_file: PathLike,
+    target_file: PathLike,
+    extra_link_dirs: List[PathLike] = None,
+    extra_link_libs: List[str] = None
+) -> List[str]:
+    link_dirs, link_libs = _link_args(extra_link_dirs or [], extra_link_libs or [])
+    linker_dir_flags = []
+    for path in link_dirs:
+        linker_dir_flags.extend([f"-L{path}", f"-Wl,-rpath={path}"])
+    linker_lib_flags = [f"-l{lib}" for lib in link_libs]
+    return [
+        str(LLVM_BUILD_BIN / "clang++"), str(src_file),
+        "-o", str(target_file), *linker_dir_flags, *linker_lib_flags
+    ]
+
+
+def _link_args(extra_link_dirs: List[PathLike], extra_link_libs: List[str]):
+    def drop_suffix(libname: str):
+        import re
+
+        match = re.match(r"lib(.*)\.so", libname)
+        return libname if match is None else match.group(1)
+
+    link_dirs, link_libs = extra_link_dirs.copy(), extra_link_libs.copy()
+    for lib in DIRECT_LINK_LIBS:
+        lib = Path(lib)
+        link_dirs.append(lib.parent)
+        link_libs.append(lib.name)
+    link_dirs += TRT_LINK_DIRS
+    link_libs += TRT_LINK_LIBS
+    link_libnames = [drop_suffix(s) for s in link_libs]
+    return link_dirs, link_libnames
+
+
+def _run_opt(
+    src_file: PathLike,
+    target_file: PathLike,
+    pass_names: List[str],
+    pass_flags: List[str],
+) -> List[str]:
+    unavailable = set(pass_names) - set(AVAILABLE_PASSES)
+    if unavailable:
+        raise ValueError(f"Passes {unavailable} are unavailable for this compilation.")
+    load_passes_strs = [s for pass_ in pass_names for s in ["-load", f"{pass_}.so"]]
+    pass_flags_strs = [f"-{flag}" for flag in pass_flags]
+    return [
+        str(LLVM_BUILD_BIN / "opt"), *load_passes_strs, *pass_flags_strs,
+        "-S", str(src_file), "-o", str(target_file)
+    ]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("hpvm-clang")
+    parser.add_argument(
+        "hpvm_src", type=Path,
+        help="""HPVM-C code to compile.
+HPVM-C code must be a single file, but additional bitcode files can be linked in.
+See option -b for that."""
+    )
+    parser.add_argument("output_file", type=Path, help="Path to generate binary to")
+    parser.add_argument(
+        "-x", type=str, metavar="language", default="c++",
+        help="Treat input file as having type <language>",
+    )
+    parser.add_argument(
+        "-b",
+        "--link-bitcode",
+        type=Path,
+        nargs="+",
+        help="Additional bitcode (.ll/.bc) files to link to",
+    )
+    parser.add_argument(
+        "-t",
+        "--tensor-target",
+        type=str,
+        choices=["tensor", "cudnn"],
+        help="Backend to use for tensor operators",
+    )
+    parser.add_argument(
+        "--conf-file", type=Path,
+        help="File to approximation configurations; required for tensor target 'tensor'"
+    )
+    parser.add_argument(
+        "--opencl",
+        action="store_true",
+        help="Use OpenCL support. Requires HPVM built with OpenCL",
+    )
+    parser.add_argument(
+        "-d", "--working-dir", type=Path, help="Directory to generate temp files in"
+    )
+
+    # Relaying arguments for clang++ (source -> bitcode stage)
+    parser.add_argument(
+        "-I", "--include", type=Path, action="append", metavar="dir",
+        help="[clang emit-llvm] Add directory to include search path"
+    )
+    parser.add_argument(
+        "-D", type=str, action="append", metavar="<macro>=<value>",
+        help="[clang emit-llvm] Define macro"
+    )
+    parser.add_argument(
+        "-f", type=str, action="append", metavar="flag",
+        help="[clang emit-llvm] clang++ flags (such as -ffastmath)"
+    )
+    parser.add_argument(
+        "-O", type=str, default="0", metavar="level",
+        help="[clang emit-llvm] Optimization level"
+    )
+    parser.add_argument(
+        "--std", type=str,
+        help="[clang emit-llvm] Language standard to compile for. Double dashes (--std, not -std)."
+    )
+
+    # Relaying arguments for clang++ (linking stage)
+    parser.add_argument(
+        "-L", type=Path, action="append", metavar="dir",
+        help="[clang linker] Add directory to library search path"
+    )
+    parser.add_argument(
+        "-l", type=str, action="append", metavar="name",
+        help="[clang linker] Link library (such as -lpthread)"
+    )
+
+    args = parser.parse_args()
+    if args.tensor_target == "tensor":
+        if args.conf_file is None:
+            parser.error('Tensor target "tensor" requires --conf-file argument')
+    if args.x == "c":
+        args.is_cpp = False
+    elif args.x == "c++":
+        args.is_cpp = True
+    else:
+        parser.error(f"Language mode {args.x} not supported yet -- only c or c++")
+    if not HPVM_USE_OPENCL and args.opencl:
+        parser.error(f"OpenCL is disabled for this build of HPVM.")
+    return args
+
+
+def main():
+    args = vars(parse_args())
+    args["macro"] = args.pop("D")
+    args["flags"] = args.pop("f")
+    args["optim_level"] = args.pop("O")
+    args["link_dirs"] = args.pop("L")
+    args["link_libs"] = args.pop("l")
+    args.pop("x")
+    compile_hpvm_c(**args)
+
+
+if __name__ == "__main__":
+    main()
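
Since main.py.in above is generated into the hpvmpy package, the same pipeline (clang++ -emit-llvm, opt with the HPVM passes, llvm-link against hpvm-rt.bc, then a final clang++ link) can also be driven from Python without the CLI. A usage sketch with hypothetical file names:

```python
# Driving the compilation pipeline through the hpvmpy API directly.
# File names are hypothetical; arguments mirror the hpvm-clang CLI options.
import hpvmpy

hpvmpy.compile_hpvm_c(
    hpvm_src="lenet.cpp",
    output_file="lenet_binary",
    tensor_target="tensor",       # or "cudnn", or None for plain HPVM codegen
    opencl=False,                 # --opencl needs an OpenCL-enabled HPVM build
    conf_file="tuner_confs.txt",  # mandatory when tensor_target == "tensor"
    working_dir="tmp",            # .ll / .bc intermediates are written here
)
```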
diff --git a/hpvm/tools/hpvm-clang/setup.py b/hpvm/tools/hpvm-clang/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..75262a069de32e6f5de4bff89738e0011dd91326
--- /dev/null
+++ b/hpvm/tools/hpvm-clang/setup.py
@@ -0,0 +1,15 @@
+import setuptools
+
+setuptools.setup(
+    name="hpvmpy",
+    version="1.0",
+    author="Yifan Zhao",
+    author_email="yifanz16@illinois.edu",
+    description="HPVM Python API",
+    packages=["hpvmpy"],
+    entry_points={
+        "console_scripts": [
+            "hpvm-clang = hpvmpy:main",
+        ],
+    },
+)
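
The console_scripts entry point is what turns `pip install` of this package into an hpvm-clang executable on PATH: setuptools generates a small wrapper that is conceptually equivalent to the following.

```python
# Conceptual content of the generated "hpvm-clang" wrapper script.
import sys
from hpvmpy import main

if __name__ == "__main__":
    sys.exit(main())
```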
diff --git a/hpvm/tools/py-approxhpvm/CMakeLists.txt b/hpvm/tools/py-approxhpvm/CMakeLists.txt
deleted file mode 100644
index f93f96dd0578a67630cc859bba2e24e071b39299..0000000000000000000000000000000000000000
--- a/hpvm/tools/py-approxhpvm/CMakeLists.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-# This file is very tightly coupled with main.py.in.
-# Watch out and keep them in sync.
-
-set(LLVM_PROJECT_DIR ${CMAKE_SOURCE_DIR})
-set(LLVM_BUILD_DIR ${CMAKE_BINARY_DIR})
-set(LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
-# The hpvm-rt runtime
-# This has to be explicitly set as hpvm-rt.bc is created in a custom_target
-# and does not export its file location.
-# Keep this in sync with hpvm/projects/hpvm-rt/CMakeLists.txt.
-set(HPVM_RT_PATH ${LLVM_BUILD_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc)
-set(TENSOR_RUNTIME_LIB "$<TARGET_FILE:tensor_runtime>")
-set(
-    AVAILABLE_PASSES
-    LLVMBuildDFG
-    LLVMInPlaceDFGAnalysis
-    LLVMDFG2LLVM_CPU
-    LLVMDFG2LLVM_CUDNN
-    LLVMDFG2LLVM_WrapperAPI
-    LLVMFuseHPVMTensorNodes
-    LLVMClearDFG
-    LLVMGenHPVM
-)
-# CUDA_TOOLKIT_ROOT_DIR and CUDNN_LIBRARY_PATH has been defined globally
-set(CUDNN_DIR ${CUDNN_LIBRARY_PATH})
-# First resolve all `@symbol@` by configuring the file
-configure_file(main.py.in ${CMAKE_CURRENT_BINARY_DIR}/main.py.conf)
-# Then resolve all generator expressions we configured into the previous file
-file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/main.py INPUT ${CMAKE_CURRENT_BINARY_DIR}/main.py.conf)
-# Delibrately create an extra step of moving file
-# which is carried out at build time (as a target)
-# so we can set these dependencies on it
-set(
-    DEPS
-    tensor_runtime
-    LLVMBuildDFG
-    LLVMInPlaceDFGAnalysis
-    LLVMDFG2LLVM_CPU
-    LLVMDFG2LLVM_CUDNN
-    LLVMDFG2LLVM_WrapperAPI
-    LLVMFuseHPVMTensorNodes
-    LLVMClearDFG
-    LLVMGenHPVM
-    hpvm-rt.bc
-    clang opt llvm-link
-)
-add_custom_command(
-    OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/main.py ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py
-    COMMAND chmod +x ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py
-    DEPENDS ${DEPS} ${CMAKE_CURRENT_BINARY_DIR}/main.py
-)
-add_custom_target(approxhpvm.py ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py)
diff --git a/hpvm/tools/py-approxhpvm/main.py.in b/hpvm/tools/py-approxhpvm/main.py.in
deleted file mode 100644
index af706a1eaa7a53879e525d87dd5034caf814db38..0000000000000000000000000000000000000000
--- a/hpvm/tools/py-approxhpvm/main.py.in
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import os
-from pathlib import Path
-from typing import List, Union, Optional
-
-PathLike = Union[Path, str]
-
-HPVM_PROJECT_DIR = Path("@LLVM_PROJECT_DIR@") / "tools/hpvm"
-LLVM_BUILD_DIR = Path("@LLVM_BUILD_DIR@") / "bin"
-CUDA_TOOLKIT_ROOT_DIR = Path("@CUDA_TOOLKIT_ROOT_DIR@")
-TENSOR_RUNTIME_LIB = Path("@TENSOR_RUNTIME_LIB@")
-AVAILABLE_PASSES = "@AVAILABLE_PASSES@".split(";")
-HPVM_RT_PATH = "@HPVM_RT_PATH@"
-CUDNN_DIR = "@CUDNN_DIR@"
-
-# Directories to include
-INCLUDE_DIRS = [
-    HPVM_PROJECT_DIR / "include",  # HPVM include dir
-    # Tensor runtime include dir
-    HPVM_PROJECT_DIR / "projects/hpvm-tensor-rt/tensor_runtime/include",
-    HPVM_PROJECT_DIR / "test/dnn_benchmarks/hpvm-c/include",  # hpvm-c intrinsics decl dir
-    CUDA_TOOLKIT_ROOT_DIR / "include",  # CUDA include dir
-]
-LINK_DIRS = [CUDA_TOOLKIT_ROOT_DIR / "lib64", CUDNN_DIR, TENSOR_RUNTIME_LIB.parent]
-LINK_LIBS = [
-    "pthread", "cudart", "curand", "cudnn", "cublas", "cufft", "OpenCL", "stdc++fs", "omp", "m"
-]
-COMPILE_FLAGS = ["fno-exceptions", "std=c++11", "O3"]
-
-
-def compile_hpvm_c(
-    input_file: PathLike,
-    output_file: PathLike,
-    codegen_target: str = "tensor",
-    include: List[PathLike] = None,
-    working_dir: PathLike = None,
-    conf_file: PathLike = None,
-):
-    from subprocess import check_output
-
-    codegen_functions = {
-        "tensor": lambda i, o: opt_codegen_tensor(i, o, conf_file),
-        "cudnn": opt_codegen_cudnn
-    }
-    codegen_f = codegen_functions[codegen_target]
-    working_dir = Path(working_dir or ".")
-    if not working_dir.is_dir():
-        os.makedirs(working_dir)
-    name_stem = Path(input_file).stem
-
-    ll_file = working_dir / f"{name_stem}.ll"
-    hpvm_ll_file = working_dir / f"{name_stem}.hpvm.ll"
-    llvm_ll_file = working_dir / f"{name_stem}.llvm.ll"
-    hpvm_rt_linked_file = working_dir / f"{name_stem}.linked.bc"
-    commands = [
-        hpvm_c_to_ll(input_file, ll_file, extra_includes=include),
-        opt_codegen_hpvm(ll_file, hpvm_ll_file),
-        codegen_f(hpvm_ll_file, llvm_ll_file),
-        link_hpvm_rt(llvm_ll_file, hpvm_rt_linked_file),
-        link_binary(hpvm_rt_linked_file, output_file),
-    ]
-    for command in commands:
-        print(" ".join(command))
-        check_output(command)
-
-
-def hpvm_c_to_ll(
-    src_file: PathLike,
-    target_file: PathLike,
-    extra_includes: Optional[List[PathLike]] = None,
-    flags: List[str] = None,
-) -> List[str]:
-    extra_includes = extra_includes or []
-    includes = [f"-I{path}" for path in INCLUDE_DIRS + extra_includes]
-    flags = [f"-{flg}" for flg in (flags or []) + COMPILE_FLAGS]
-    return [
-        str(LLVM_BUILD_DIR / "clang++"), *includes, *flags, "-emit-llvm", "-S",
-        str(src_file), "-o", str(target_file)
-    ]
-
-
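-# Runs the GenHPVM pass (-genhpvm) followed by LLVM's -globaldce cleanup.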
-def opt_codegen_hpvm(src_file: PathLike, target_file: PathLike) -> List[str]:
-    return _run_opt(src_file, target_file, ["LLVMGenHPVM"], ["genhpvm", "globaldce"])
-
-
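-# "cudnn" backend pipeline: DFG construction and in-place analysis, tensor
-# node fusion (-hpvm-fuse), then cuDNN and CPU code generation.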
-def opt_codegen_cudnn(src_file: PathLike, target_file: PathLike) -> List[str]:
-    passes = [
-        "LLVMBuildDFG", "LLVMInPlaceDFGAnalysis",
-        "LLVMDFG2LLVM_CUDNN", "LLVMDFG2LLVM_CPU",
-        "LLVMFuseHPVMTensorNodes", "LLVMClearDFG", "LLVMGenHPVM"
-    ]
-    flags = [
-        "buildDFG", "inplace", "hpvm-fuse",
-        "dfg2llvm-cudnn", "dfg2llvm-cpu", "clearDFG"
-    ]
-    return _run_opt(src_file, target_file, passes, flags)
-
-
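-# "tensor" backend pipeline: same shape as the cudnn one, but lowers through
-# the wrapper API and reads approximation settings from conf_file.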
-def opt_codegen_tensor(
-    src_file: PathLike, target_file: PathLike, conf_file: PathLike
-):
-    passes = [
-        "LLVMBuildDFG", "LLVMInPlaceDFGAnalysis",
-        "LLVMDFG2LLVM_WrapperAPI", "LLVMDFG2LLVM_CPU",
-        "LLVMFuseHPVMTensorNodes", "LLVMClearDFG", "LLVMGenHPVM"
-    ]
-    flags = [
-        "buildDFG", "inplace", "hpvm-fuse",
-        "dfg2llvm-wrapperapi",
-        f"configuration-inputs-filename={conf_file}",
-        "dfg2llvm-cpu", "clearDFG",
-    ]
-    return _run_opt(src_file, target_file, passes, flags)
-
-
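-# Links the backend-generated module with the hpvm-rt runtime bitcode.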
-def link_hpvm_rt(src_file: PathLike, target_file: PathLike) -> List[str]:
-    return [str(LLVM_BUILD_DIR / "llvm-link"), str(src_file), HPVM_RT_PATH, "-o", str(target_file)]
-
-
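-# Final native link: adds -L/-Wl,-rpath for each LINK_DIRS entry and -l for
-# each LINK_LIBS entry, alongside the tensor runtime library.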
-def link_binary(src_file: PathLike, target_file: PathLike) -> List[str]:
-    linker_dir_flags = []
-    for path in LINK_DIRS:
-        linker_dir_flags.extend([f"-L{path}", f"-Wl,-rpath={path}"])
-    linker_lib_flags = [f"-l{lib}" for lib in LINK_LIBS]
-    return [
-        str(LLVM_BUILD_DIR / "clang++"), str(src_file), str(TENSOR_RUNTIME_LIB), "-o", str(target_file),
-        *linker_dir_flags, *linker_lib_flags
-    ]
-
-
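-# Shared opt invocation: loads each pass plugin via "-load <name>.so",
-# enables it with its flag, and rejects passes CMake did not build.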
-def _run_opt(
-    src_file: PathLike,
-    target_file: PathLike,
-    pass_names: List[str],
-    pass_flags: List[str],
-) -> List[str]:
-    unavailable = set(pass_names) - set(AVAILABLE_PASSES)
-    if unavailable:
-        raise ValueError(f"Passes {unavailable} are unavailable from CMake")
-    load_passes_strs = [s for pass_ in pass_names for s in ["-load", f"{pass_}.so"]]
-    pass_flags_strs = [f"-{flag}" for flag in pass_flags]
-    return [
-        str(LLVM_BUILD_DIR / "opt"), *load_passes_strs, *pass_flags_strs,
-        "-S", str(src_file), "-o", str(target_file)
-    ]
-
-
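-# CLI wrapper over compile_hpvm_c; example (file names are illustrative):
-#   approxhpvm.py net.cpp net -t tensor --conf-file conf.txt -d tmp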
-def parse_args():
-    parser = argparse.ArgumentParser("approxhpvm")
-    parser.add_argument("input_file", type=Path, help="HPVM-C code to compile")
-    parser.add_argument("output_file", type=Path, help="Path to generate binary to")
-    parser.add_argument(
-        "-t",
-        "--codegen-target",
-        type=str,
-        choices=["tensor", "cudnn"],
-        default="tensor",
-        help="Backend to use (default: tensor)",
-    )
-    parser.add_argument(
-        "-d", "--working-dir", type=Path, help="Directory to generate temp files in"
-    )
-    parser.add_argument(
-        "--conf-file", type=Path,
-        help="Path to the approximation configurations file; required for the 'tensor' target"
-    )
-    parser.add_argument(
-        "-I", "--include", type=Path, nargs="+",
-        help="Additional include directories to use"
-    )
-
-    args = parser.parse_args()
-    if args.codegen_target == "tensor":
-        if args.conf_file is None:
-            parser.error('Codegen target "tensor" requires --conf-file argument')
-    return args
-
-
-if __name__ == "__main__":
-    compile_hpvm_c(**vars(parse_args()))