diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..f7d3d37cde7f947d3d7d7f4f9d8d7879b60e33e6
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "hpvm/projects/predtuner"]
+	path = hpvm/projects/predtuner
+	url = ../predtuner.git
+	branch = hpvm
diff --git a/README.md b/README.md
index 4280373aa5f5c7d239ee18989edf1f6219c360a1..6b185aab532fb00e94dcad3d735954e61b0a883f 100644
--- a/README.md
+++ b/README.md
@@ -15,8 +15,8 @@ HPVM is currently at **version 1.0**. For more about what HPVM is, see [our webs
 
 [PPoPP'21 paper](https://dl.acm.org/doi/10.1145/3437801.3446108)
 
+## Resources
 
-## Docs
 [HPVM IR Specification](/hpvm/docs/hpvm-specification.md)
 
 [HPVM-C Language Specification](/hpvm/docs/hpvm-c.md)
@@ -24,6 +24,7 @@ HPVM is currently at **version 1.0**. For more about what HPVM is, see [our webs
 [HPVM Compilation Process](/hpvm/docs/compilation.md)
 
 ## Dependencies
+
 The following components are required to be installed on your machine to build HPVM.
 
 * GCC (>=5.1)
@@ -36,29 +37,39 @@ The following components are required to be installed on your machine to build H
 * CUDA (>=9.1)
 
 ## Supported Targets
+
 Supported/tested CPU architectures:
+
 * Intel Xeon E5-2640
 * Intel Xeon W-2135
 * ARM Cortex A-57
 
 Supported/tested GPU architectures for OpenCL backend:
+
 * Nvidia Quadro P1000
 * Nvidia GeForce GTX 1080
 
 Supported/tested GPU architectures for Tensor Backend:
-* Nvidia Jetson TX2 
+
+* Nvidia Jetson TX2
 * Nvidia GeForce GTX 1080
 
 HPVM has not been tested but might work on other CPUs supported by LLVM Backend, and GPUs supported by OpenCL such as Intel, AMD, etc.
 
-**NOTE: Approximations are tuned for Jetson TX2 and same speedups may not exist for other architectures **
+**NOTE**: Approximations are tuned for the Jetson TX2, and the same speedups may not be achievable on other architectures.
+
+## Getting Started
+
+### Getting source code and setting up environment
 
-## Getting source code and building HPVM
+Check out HPVM and go to the directory `./hpvm` under the project root:
 
-Checkout HPVM:
 ```shell
-git clone --recursive https://gitlab.engr.illinois.edu/llvm/hpvm-release.git/
-cd hpvm-release/hpvm
+git clone --recursive https://gitlab.engr.illinois.edu/llvm/hpvm.git
+cd hpvm/
+git checkout approx_hpvm_reorg_keras
+git submodule update --init --recursive
+cd hpvm/
 ```
 
 HPVM needs to be able to find CUDA.
@@ -68,58 +79,77 @@ Otherwise, some environment variables are required:
 
 * `CUDA_TOOLKIT_PATH` --- Path to the CUDA toolkit
 * `CUDA_INCLUDE_PATH` --- Path to the CUDA headers
-* `CUDA_LIB_PATH` --- Path to CUDA libraries 
+* `CUDA_LIB_PATH` --- Path to CUDA libraries
+
+`set_paths.sh` can be used for this.
+Modify the values of these variables in `set_paths.sh` according to your system, and source the script:
 
-`hpvm/set_paths.sh` can be used for this. Modify the values of these variables in set_paths.sh and source the script:
 ```shell
 source set_paths.sh
 ```
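+
+For example, if CUDA 9.1 were installed under `/usr/local/cuda-9.1` (a hypothetical location; yours will differ), the three variables would point to something like the following:
+
+```shell
+# Illustrative values only -- edit set_paths.sh to match your system
+export CUDA_TOOLKIT_PATH=/usr/local/cuda-9.1
+export CUDA_INCLUDE_PATH=$CUDA_TOOLKIT_PATH/include
+export CUDA_LIB_PATH=$CUDA_TOOLKIT_PATH/lib64
+```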
 
-HPVM installer script can be used to download, configure and build HPVM along with LLVM and Clang. 
+The HPVM installer script can be used to download, configure, and build HPVM along with LLVM and Clang.
+
 ```shell
 bash install.sh
 ```
-Specifically, the HPVM installer downloads LLVM, and Clang, copies HPVM source into 
-llvm/tools and builds the entire tree. It also builds a modified LLVM C-Backend, based on the one maintained by [Julia Computing](https://github.com/JuliaComputing/llvm-cbe), as a part of HPVM and is currently used 
-to generate OpenCL kernels for GPUs.
 
-In the beginning of the building process, the installer provides users the choice of automatically or manually building HPVM. 
-If HPVM is selected to be built automatically, the installer allows users to type in the number of threads they want to use. 
-The default number of threads used to build HPVM is two.
+On launch, the installer asks whether it should also build HPVM.
+If so, it asks for the number of threads to use for the build; the default is two.
+
+If you use this automatic build, skip the next section.
+
+* Specifically, the HPVM installer downloads LLVM and Clang, copies the HPVM source into
+  llvm/tools, and builds the entire tree. It also builds a modified LLVM C-Backend,
+  based on the one maintained by [Julia Computing](https://github.com/JuliaComputing/llvm-cbe),
+  as part of HPVM; this backend is currently used to generate OpenCL kernels for GPUs.
+
+### Manually Build HPVM
+
+Alternatively, you can manually build HPVM with CMake.
+Please note that even in this case,
+the installer script still *must* be executed to obtain some required components;
+simply decline the build step when prompted.
+
+In the current directory (`hpvm/`), run:
 
-Alternatively, CMake can be run manually using the following steps in ./hpvm-release/hpvm directory.
 ```shell
 mkdir build
 cd build
 cmake ../llvm [options]
+export PATH=$(realpath ./bin):$PATH
 ```
-**Note** that if the installer script was not used,
-you must _manually add `build/bin` directory to your $PATH variable_ (as absolute path).
 
 Some common options that can be used with CMake are:
 
 * -DCMAKE_INSTALL_PREFIX=directory --- Specify for directory the full pathname of where you want the HPVM tools and libraries to be installed.
-
 * -DCMAKE_BUILD_TYPE=type --- Valid options for type are Debug, Release, RelWithDebInfo, and MinSizeRel. Default is Debug.
-
 * -DLLVM_ENABLE_ASSERTIONS=On --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types).
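+
+For instance, using the options above, a Release build could be configured as follows (the install prefix is purely illustrative):
+
+```shell
+# Illustrative invocation -- adjust the install prefix for your system
+cmake ../llvm \
+  -DCMAKE_INSTALL_PREFIX=/opt/hpvm \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DLLVM_ENABLE_ASSERTIONS=On
+```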
 
-In order to manually build and install HPVM, GNU Make can be run using the following in the build directory.
+**Note** that if the installer script was not used,
+you must _manually add the `build/bin` directory to your `$PATH` variable_ as an absolute path (as shown above).
+
+Now, compile the HPVM Compilation Tool `approxhpvm.py` using:
+
 ```shell
-make -j<number of threads>
-make install
+make -j<number of threads> approxhpvm.py
 ```
 
-In the end of the installation process, the installer automatically runs all the regression tests to ensure that the installation is
-successful. If HPVM is built and installed manually, the tests can be automatically run by executing the following step from the ./hpvm-release/hpvm directory.
+With all the aforementioned steps, HPVM should be built, installed, and ready to use.
+In particular, `approxhpvm.py` should be available as an executable command on your command line.
+
+When not using the installer, you may want to run the regression tests using this script (from outside the build directory):
+
 ```shell
+cd ..
 bash scripts/automate_tests.sh
 ```
 
-With all the aforementioned steps, HPVM should be built, installed, tested and ready to use.
-
 ## Benchmarks and Tests
+
 We are providing the following [HPVM benchmarks](/hpvm/test/benchmarks):
+
 * Select benchmarks from the [Parboil](http://impact.crhc.illinois.edu/parboil/parboil.aspx) benchmark suite, located under [test/benchmarks/parboil](/hpvm/test/benchmarks/parboil).
 * An edge detection pipeline benchmark, located under [test/benchmarks/pipeline](/hpvm/test/benchmarks/pipeline).
 * A Camera ISP pipeline, located under [test/benchmarks/hpvm-cava](/hpvm/test/benchmarks/hpvm-cava), adapted from C code provided from our collaborators at [Harvard](http://vlsiarch.eecs.harvard.edu).
@@ -129,4 +159,5 @@ Benchmark descriptions and instructions on how to compile and run them are [here
 We are also providing [unit tests](/hpvm/test/unitTests) and [regression tests](/hpvm/test/regressionTests).
 
 ## Support
+
 All questions can be directed to [hpvm-dev@lists.cs.illinois.edu](mailto:hpvm-dev@lists.cs.illinois.edu).
diff --git a/hpvm/CMakeLists.txt b/hpvm/CMakeLists.txt
index d63675b34275c3f83c10ca83005bbfe563777554..b6985d0a100f38a7712580a30d3ba91e59dd248c 100644
--- a/hpvm/CMakeLists.txt
+++ b/hpvm/CMakeLists.txt
@@ -1,3 +1,13 @@
+cmake_minimum_required(VERSION 3.17)
+project(hpvm CUDA CXX)
+get_filename_component(
+  CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}/../.." ABSOLUTE
+)  # Set CUDA_TOOLKIT_ROOT_DIR ourselves, to the parent folder of the CUDA nvcc binary
+
+# find_package will use the auxiliary cmake/Find*.cmake modules we provide
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
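+# If cuDNN is installed in a non-default location, the bundled cmake/FindCUDNN.cmake
+# honors the CUDNN_ROOT_DIR, CUDNN_INCLUDE_DIR and CUDNN_LIBRARY environment variables as hints.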
+find_package(CUDNN 7 EXACT REQUIRED)  # CUDNN_INCLUDE_PATH, CUDNN_LIBRARY_PATH
+
 include_directories(./include/)
 
 # Generate TENSOR_RT_PREFIX into config.h
diff --git a/hpvm/cmake/FindCUDNN.cmake b/hpvm/cmake/FindCUDNN.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e5a427f0317a6f3b8f7e7b2cc89fd176fd4362dc
--- /dev/null
+++ b/hpvm/cmake/FindCUDNN.cmake
@@ -0,0 +1,83 @@
+# Obtained from PyTorch repo: https://github.com/pytorch/pytorch/blob/master/cmake/Modules_CUDA_fix/FindCUDNN.cmake
+# Find the CUDNN libraries
+#
+# The following variables are optionally searched for defaults
+#  CUDNN_ROOT: Base directory where CUDNN is found
+#  CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for
+#  CUDNN_LIBRARY: Directory where CUDNN library is searched for
+#  CUDNN_STATIC: Are we looking for a static library? (default: no)
+#
+# The following are set after configuration is done:
+#  CUDNN_FOUND
+#  CUDNN_INCLUDE_PATH
+#  CUDNN_LIBRARY_PATH
+#
+
+include(FindPackageHandleStandardArgs)
+
+set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN")
+if (DEFINED ENV{CUDNN_ROOT_DIR})
+  message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.")
+endif()
+list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
+
+# Compatible layer for CMake <3.12. CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
+list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT})
+
+set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files")
+
+find_path(CUDNN_INCLUDE_PATH cudnn.h
+  HINTS ${CUDNN_INCLUDE_DIR}
+  PATH_SUFFIXES cuda/include cuda include)
+
+option(CUDNN_STATIC "Look for static CUDNN" OFF)
+if (CUDNN_STATIC)
+  set(CUDNN_LIBNAME "libcudnn_static.a")
+else()
+  set(CUDNN_LIBNAME "cudnn")
+endif()
+
+set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)")
+if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC)
+  message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.")
+endif()
+
+find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME}
+  PATHS ${CUDNN_LIBRARY}
+  PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
+# Get the directory from the filename ${CUDNN_LIBRARY_PATH}
+get_filename_component(
+  CUDNN_LIBRARY_PATH
+  "${CUDNN_LIBRARY_PATH}/.." ABSOLUTE
+)
+
+# This version check is from OpenCV repo: https://github.com/opencv/opencv/blob/master/cmake/FindCUDNN.cmake
+# extract version from the include
+if(CUDNN_INCLUDE_PATH)
+  if(EXISTS "${CUDNN_INCLUDE_PATH}/cudnn_version.h")
+    file(READ "${CUDNN_INCLUDE_PATH}/cudnn_version.h" CUDNN_H_CONTENTS)
+  else()
+    file(READ "${CUDNN_INCLUDE_PATH}/cudnn.h" CUDNN_H_CONTENTS)
+  endif()
+
+  string(REGEX MATCH "define CUDNN_MAJOR ([0-9]+)" _ "${CUDNN_H_CONTENTS}")
+  set(CUDNN_VERSION_MAJOR ${CMAKE_MATCH_1})
+  string(REGEX MATCH "define CUDNN_MINOR ([0-9]+)" _ "${CUDNN_H_CONTENTS}")
+  set(CUDNN_VERSION_MINOR ${CMAKE_MATCH_1})
+  string(REGEX MATCH "define CUDNN_PATCHLEVEL ([0-9]+)" _ "${CUDNN_H_CONTENTS}")
+  set(CUDNN_VERSION_PATCH ${CMAKE_MATCH_1})
+
+  set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
+  unset(CUDNN_H_CONTENTS)
+endif()
+
+find_package_handle_standard_args(
+  CUDNN
+  FOUND_VAR CUDNN_FOUND
+  REQUIRED_VARS
+    CUDNN_LIBRARY_PATH
+    CUDNN_INCLUDE_PATH
+  VERSION_VAR CUDNN_VERSION
+)
+
+mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY)
diff --git a/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt
index b4e129ba01837cf328912f7787b861f843f4f581..83ec877b0675f0b2a841e24d15126932c812bbd9 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt
+++ b/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt
@@ -2,7 +2,7 @@ if(WIN32 OR CYGWIN)
   set(LLVM_LINK_COMPONENTS Core Support)
 endif()
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${PROJECT_BINARY_DIR}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}")
 
 add_llvm_library( LLVMDFG2LLVM_CPU
   MODULE
diff --git a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt
index 00c651eaa250fc114f229f30e0cb7c121154ff96..4041df11ce8d79e39d6f72bdf0a1068eae449300 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt
+++ b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt
@@ -2,7 +2,7 @@ if(WIN32 OR CYGWIN)
   set(LLVM_LINK_COMPONENTS Core Support)
 endif()
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${PROJECT_BINARY_DIR}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}")
 
 add_llvm_library( LLVMDFG2LLVM_OpenCL
   MODULE
diff --git a/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
index fc4c9fc5a98007dd700973c598b6731edcd61e14..fbf5881480ce11745b0d4de00b90c0812a6db356 100644
--- a/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
+++ b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt
@@ -2,7 +2,7 @@ if(WIN32 OR CYGWIN)
   set(LLVM_LINK_COMPONENTS Core Support)
 endif()
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${PROJECT_BINARY_DIR}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}")
 
 add_llvm_library( LLVMGenHPVM
   MODULE
diff --git a/hpvm/projects/CMakeLists.txt b/hpvm/projects/CMakeLists.txt
index b46164b8d07de77ba9feb570b976e19ae9fdf4b2..2a51c0b09e672e8508a8a13d189d05eb3ccc2e48 100644
--- a/hpvm/projects/CMakeLists.txt
+++ b/hpvm/projects/CMakeLists.txt
@@ -10,7 +10,6 @@ foreach(entry ${entries})
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/parallel-libs) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND
        (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests))
-      set(LLVM_BUILD_DIR ${PROJECT_BINARY_DIR})
       get_filename_component(entry_name "${entry}" NAME)
       add_llvm_external_project(${entry_name})
     endif()
diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt
index 02ab62fca57f66155ffafff0686634b3efe4f861..6efd8d3d0a9d86236adc87657fb68b782f3daaa0 100644
--- a/hpvm/projects/hpvm-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-rt/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_definitions(-DNUM_CORES=8)
 
-SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang)
-SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++)
+SET(CMAKE_C_COMPILER ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang)
+SET(CMAKE_CXX_COMPILER ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang++)
 SET(CMAKE_CXX_STANDARD 11)
 # Defines ${OpenCL_INCLUDE_DIRS} and ${OpenCL_LIBRARY} if found
 find_package(OpenCL REQUIRED)
diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
index 2f8cfc27e5280e7d18a830cc6083841a2cc3590b..5c04604406eb81571c0a87539fb0568aad3c4e4d 100644
--- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -1,50 +1,15 @@
-cmake_minimum_required(VERSION 3.17)
-project(hpvm-tensor-rt)
-find_package(CUDA 9.1 REQUIRED)
-set(CUDA_SEPARABLE_COMPILATION ON CACHE BOOL "")
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-  # gcc > 8 are not supported
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8)
-    message(FATAL_ERROR "GCC versions later than 8 are not supported")
-  endif()
-elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-  # clang < 3.2 || clang >= 9 unsupported
-  set(clang_v ${CMAKE_CXX_COMPILER_VERSION})
-  if (clang_v VERSION_GREATER_EQUAL 9 OR clang_v VERSION_LESS_EQUAL 3.2)
-    message(FATAL_ERROR "Clang<3.2 or clang>=9 are not supported")
-  endif()
-endif()
-# Addresses a bug where code is not compiled as C++11 in non-CUDA code and older g++ versions
-# Edit: using c++14 now
+project(hpvm-tensor-rt CUDA CXX)
 set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -I/")
-set(
-  CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
-  -gencode;arch=compute_60,code=sm_60;
-  -gencode;arch=compute_60,code=compute_60;
-  -std=c++14 --expt-relaxed-constexpr -maxrregcount 32 # These are for image ops
-)
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-  message("Debug mode")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-g;-lineinfo;-Xcompiler;-ggdb)
-else()
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNDEBUG;-Xcompiler;-DNDEBUG)
-endif()
 
-# Default options
-if(USE_GFLAGS)
-  add_definitions(-DUSE_GFLAGS)
-endif()
-if(USE_AUTOTUNER)
-  remove_definitions(-DNO_INJECTION)
+if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)  # This means we're NOT compiling in HPVM
+  set(INDEP_BUILD True)
+  message(STATUS "Compiling hpvm-tensor-rt independently")
+else()
+  set(INDEP_BUILD False)
+  message(STATUS "Compiling hpvm-tensor-rt inside HPVM")
 endif()
-add_definitions(-DNO_INJECTION)
-add_definitions(-DPROMISE_TUNER_ENABLED)
-add_definitions(-DSIMULATION_MODE=true)
 
-# Config path configuration file
+# -- Configure path configuration file
 if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/global_knobs.txt)
   message(FATAL_ERROR "global_knobs.txt not found")
 endif()
@@ -56,202 +21,176 @@ configure_file(
   ${CMAKE_CURRENT_BINARY_DIR}/tensor_runtime/include/config.h
 )
 
-# Default include/link directories
+# -- Default include directories
 set(
   INCLUDES
-  $ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/include
-  ${CUDA_INCLUDE_DIRS}
+  ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+  ${CUDNN_INCLUDE_PATH}
   ./tensor_runtime/include ${CMAKE_CURRENT_BINARY_DIR}/tensor_runtime/include
   ./dnn_sources/include
   ../gpu_profiler/include ../soc_simulator/include
 )
-set(
-  LINK_DIRS
-  ${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH}
-  $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64
-)
-include_directories(${INCLUDES})
-link_directories(${LINK_DIRS})
 
-# Source files of runtime
+# -- Link libraries
+find_package(OpenMP REQUIRED)  # Provides ${OpenMP_CXX_FLAGS}
+# Configure gpu_profiler and soc_simulator, and setup all libs to link to
+# Conditionally add gpu_profiler project if we're building independently
+# (not building the whole hpvm)
+if(INDEP_BUILD)
+  message(STATUS "Also compiling gpu_profiler and soc_simulator")
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../gpu_profiler ${CMAKE_CURRENT_BINARY_DIR}/gpu_profiler)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../soc_simulator ${CMAKE_CURRENT_BINARY_DIR}/soc_simulator)
+endif()
+set(LINK_DIR ${CUDNN_LIBRARY_PATH})
+set(LINK_LIBS gpu_profiler promise_profiler stdc++fs cudnn curand cublas)
+if(USE_GFLAGS)
+  list(APPEND LINK_LIBS gflags)
+endif()
+
+# -- Definitions
+set(DEFS -DPROMISE_TUNER_ENABLED -DSIMULATION_MODE=true)
+if(USE_GFLAGS)
+  list(APPEND DEFS -DUSE_GFLAGS)
+endif()
+
+# -- Sources of runtime
 set(
   RUNTIME_SRCS_FILENAME
-  approx_simulation.cu
-  group_conv.cu
-  approx_techniques.cu
-  common.cpp
+  approx_knobs_utils.cc approx_simulation.cu approx_techniques.cu
   configuration.cpp
-  debug.cc
-  debug.cpp
-  device_math.cu
+  debug.cpp device_math.cu
   error.cu
-  tensor_cpu_runtime.cc
-  fp16_gemm.cu
-  global_data.cc
-  half_precision_api.cu
-  hpvm-rt-controller.cpp
-  img_tensor_runtime.cu
-  img_tensor_utils.cpp
+  fp16_gemm.cu freq_utils.cc
+  global_data.cc group_conv.cu
+  half_precision_api.cu hpvm-rt-controller.cpp
+  init_api.cc
   op_overheads.cc
   profiling.cc
-  tensor_runtime.cu
-  tensor_utils.cu
+  tensor_cpu_runtime.cc tensor_runtime.cu tensor_utils.cu
   wrapper_runtime.cu
-  approx_knobs_utils.cc
-  init_api.cc
 )
 foreach(FILE ${RUNTIME_SRCS_FILENAME})
   list(APPEND RUNTIME_SRCS "tensor_runtime/src/${FILE}")
+  # Some files don't end in .cu or .cuh, but we know they are still CUDA files
+  set_source_files_properties("tensor_runtime/src/${FILE}" PROPERTIES LANGUAGE CUDA)
 endforeach()
 
-# Compile gpu_profiler and soc_simulator
-# Conditionally add gpu_profiler project if we're building independently
-# (not building the whole hpvm)
-get_filename_component(root_dir ${CMAKE_SOURCE_DIR} REALPATH)
-get_filename_component(our_dir ${CMAKE_CURRENT_SOURCE_DIR} REALPATH)
-if(${root_dir} STREQUAL ${our_dir})
-  message(STATUS "Compiling hpvm-tensor-rt independently")
-  message(STATUS "Also compiling gpu_profiler and soc_simulator")
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../gpu_profiler ${CMAKE_CURRENT_BINARY_DIR}/gpu_profiler)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../soc_simulator ${CMAKE_CURRENT_BINARY_DIR}/soc_simulator)
-endif()
-set(LINK_LIBS gpu_profiler promise_profiler cudnn cufft stdc++fs curand)
-if(USE_GFLAGS)
-  list(APPEND LINK_LIBS gflags)
-endif()
+# -- Adding tensor_runtime targets
+function(add_tensor_runtime target_name)
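+  # Any extra arguments after the target name (${ARGN}) are forwarded as
+  # compile definitions via target_compile_definitions below.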
+  add_library(${target_name} ${RUNTIME_SRCS})
+  set_property(TARGET ${target_name} PROPERTY CUDA_ARCHITECTURES 60)
+  target_compile_options(
+    ${target_name} PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr -maxrregcount 32>
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:DEBUG>>:-lineinfo -Xcompiler -ggdb>
+    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}>
+  )
+  target_include_directories(${target_name} PUBLIC ${INCLUDES})
+  target_link_directories(${target_name} PUBLIC ${LINK_DIR})
+  target_link_libraries(${target_name} PUBLIC ${LINK_LIBS})
+  target_compile_definitions(${target_name} PRIVATE ${DEFS} ${ARGN})
+endfunction(add_tensor_runtime)
 
 # Adding new rule for building a cuDNN runtime library
 # Offline version
-find_package(OpenMP REQUIRED)
-cuda_add_library(tensor_runtime ${RUNTIME_SRCS})
-cuda_add_cublas_to_target(tensor_runtime ${OpenMP_CXX_FLAGS})
-target_compile_options(tensor_runtime PRIVATE ${OpenMP_CXX_FLAGS})
-target_link_libraries(tensor_runtime ${LINK_LIBS} ${OpenMP_CXX_FLAGS})
-target_compile_definitions(tensor_runtime PRIVATE -DONLINE_PROFILING=false -DFP16_tuning=true)
+add_tensor_runtime(tensor_runtime -DONLINE_PROFILING=false -DFP16_tuning=true)
 
-if(LLVM_BUILD_DIR)  # Defined in ../CMakeLists.txt. This means we're compiling in LLVM
-  get_filename_component(LLVM_CLANG_XX ${LLVM_BUILD_DIR}/bin/clang++ REALPATH)
-  # It's important that tensor_runtime.ll goes here if we're compiling with LLVM
-  # Some HPVM passes look for tensor_runtime.ll in this folder (which is usually build/lib)
-  set(TENSOR_RT_LL_PREFIX ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
-  add_dependencies(tensor_runtime clang)
-else()
-  # Surely if we're compiling outside of hpvm, then we need the system-wide clang.
-  # Use it but check version 9 first
-  execute_process(COMMAND clang++ --version OUTPUT_VARIABLE clang_full_version_string ERROR_QUIET)
-  string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION_STRING ${clang_full_version_string})
-  if(CLANG_VERSION_STRING VERSION_EQUAL 9)
-    set(LLVM_CLANG_XX clang++)
+# Online version
+add_tensor_runtime(tensor_runtime_online -DONLINE_PROFILING=true -DFP16_tuning=false)
+# tensor_runtime_online is built AFTER tensor_runtime because of an nvcc limitation
+# that doesn't allow compiling the same file from multiple targets at once.
+add_dependencies(tensor_runtime_online tensor_runtime)
+
+# Adding rule for the debugging source
+add_executable(unit_tests dnn_sources/src/unit_tests.cc)
+target_link_libraries(unit_tests tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+# -- Compile tensor_runtime.ll if possible
+if(INDEP_BUILD)
+  # If we're compiling outside of HPVM, we need a system-wide clang, specifically clang 9.
+  execute_process(COMMAND clang-9 --version OUTPUT_VARIABLE clang_stdout ERROR_QUIET)
+  if(clang_stdout)
     set(TENSOR_RT_LL_PREFIX ${CMAKE_CURRENT_SOURCE_DIR}/lib)
   else()
     message(WARNING "System clang++ of version 9 not found; skipping tensor_runtime.ll generation")
   endif()
+  # Only set CLANG_NAME if clang-9 was found, so that .ll generation is skipped otherwise
+  if(clang_stdout)
+    set(CLANG_NAME clang-9)
+  endif()
+else()
+  # It's important that tensor_runtime.ll goes here if we're compiling with LLVM
+  # Some HPVM passes look for tensor_runtime.ll in this folder (which is usually build/lib)
+  set(TENSOR_RT_LL_PREFIX ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
+  # Per cmake documentation, if we're building in LLVM, then in add_custom_command
+  # the command "clang" will be auto resolved to the path to clang we're building
+  set(CLANG_NAME clang)
+  add_dependencies(tensor_runtime clang)
 endif()
 # If some clang-9 is found, create a tensor_runtime.ll from tensor_signatures.cc
-if(LLVM_CLANG_XX)
+if(CLANG_NAME)
   message(STATUS "Creating tensor_runtime.ll in ${TENSOR_RT_LL_PREFIX}")
   foreach(dir ${INCLUDES})
     list(APPEND INCLUDE_COMPILER_STRINGS "-I${dir}")
   endforeach()
   add_custom_command(
     TARGET tensor_runtime POST_BUILD
-    COMMAND ${LLVM_CLANG_XX} ${INCLUDE_COMPILER_STRINGS} -S -emit-llvm
+    COMMAND ${CLANG_NAME} -x c++ ${INCLUDE_COMPILER_STRINGS} -S -emit-llvm
     ${CMAKE_CURRENT_SOURCE_DIR}/tensor_runtime/include/tensor_signatures.cc
     -o ${TENSOR_RT_LL_PREFIX}/tensor_runtime.ll
   )
 endif()
 
-# Install version (also offline)
-cuda_add_library(tensor_runtime_install ${RUNTIME_SRCS})
-cuda_add_cublas_to_target(tensor_runtime_install)
-# tensor_runtime_install is built AFTER tensor_runtime because of a nvcc bug (bug?)
-# that doesn't allow compiling the same file from multiple targets at once.
-# Same for tensor_runtime_online.
-add_dependencies(tensor_runtime_install tensor_runtime)
-target_link_libraries(tensor_runtime_install ${LINK_LIBS})
-target_compile_definitions(tensor_runtime_install PRIVATE -DONLINE_PROFILING=false -DFP16_tuning=true)
-
-# Online version
-cuda_add_library(tensor_runtime_online ${RUNTIME_SRCS})
-cuda_add_cublas_to_target(tensor_runtime_online ${OpenMP_CXX_FLAGS})
-target_compile_options(tensor_runtime_online PRIVATE ${OpenMP_CXX_FLAGS})
-add_dependencies(tensor_runtime_online tensor_runtime)
-target_link_libraries(tensor_runtime_online ${LINK_LIBS} ${OpenMP_CXX_FLAGS})
-target_compile_definitions(tensor_runtime_online PRIVATE -DONLINE_PROFILING=true -DFP16_tuning=false)
-
-
-
-# --------------  Unit Test Source ----------------
-
-add_executable(unit_tests   dnn_sources/src/unit_tests.cc)
-target_link_libraries(unit_tests  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
 
 #**************** FP32 TensorRT Source Builds *********** 
 
-add_executable(lenet_mnist_fp32  dnn_sources/src/fp32/lenet_mnist.cc)
-target_link_libraries(lenet_mnist_fp32  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(lenet_mnist_fp32 dnn_sources/src/fp32/lenet_mnist.cc)
+target_link_libraries(lenet_mnist_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(alexnet_cifar10_fp32  dnn_sources/src/fp32/alexnet_cifar10.cc)
-target_link_libraries(alexnet_cifar10_fp32  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(alexnet_cifar10_fp32 dnn_sources/src/fp32/alexnet_cifar10.cc)
+target_link_libraries(alexnet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(alexnet2_cifar10_fp32  dnn_sources/src/fp32/alexnet2_cifar10.cc)
-target_link_libraries(alexnet2_cifar10_fp32  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(alexnet2_cifar10_fp32 dnn_sources/src/fp32/alexnet2_cifar10.cc)
+target_link_libraries(alexnet2_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(vgg16_cifar10_fp32  dnn_sources/src/fp32/vgg16_cifar10.cc)
-target_link_libraries(vgg16_cifar10_fp32  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(vgg16_cifar10_fp32 dnn_sources/src/fp32/vgg16_cifar10.cc)
+target_link_libraries(vgg16_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(resnet18_cifar10_fp32  dnn_sources/src/fp32/resnet18_cifar10.cc)
-target_link_libraries(resnet18_cifar10_fp32  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(resnet18_cifar10_fp32 dnn_sources/src/fp32/resnet18_cifar10.cc)
+target_link_libraries(resnet18_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(vgg16_cifar100_fp32  dnn_sources/src/fp32/vgg16_cifar100.cc)
-target_link_libraries(vgg16_cifar100_fp32  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(vgg16_cifar100_fp32 dnn_sources/src/fp32/vgg16_cifar100.cc)
+target_link_libraries(vgg16_cifar100_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(mobilenet_cifar10_fp32  dnn_sources/src/fp32/mobilenet.cc)
-target_link_libraries(mobilenet_cifar10_fp32  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet_imagenet_fp32  dnn_sources/src/fp32/alexnet_imagenet.cc)
-target_link_libraries(alexnet_imagenet_fp32  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(vgg16_imagenet_fp32  dnn_sources/src/fp32/vgg16_imagenet.cc)
-target_link_libraries(vgg16_imagenet_fp32  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(resnet50_imagenet_fp32  dnn_sources/src/fp32/resnet50_imagenet.cc)
-target_link_libraries(resnet50_imagenet_fp32  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(mobilenet_cifar10_fp32 dnn_sources/src/fp32/mobilenet.cc)
+target_link_libraries(mobilenet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
+add_executable(alexnet_imagenet_fp32 dnn_sources/src/fp32/alexnet_imagenet.cc)
+target_link_libraries(alexnet_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
+add_executable(vgg16_imagenet_fp32 dnn_sources/src/fp32/vgg16_imagenet.cc)
+target_link_libraries(vgg16_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
+add_executable(resnet50_imagenet_fp32 dnn_sources/src/fp32/resnet50_imagenet.cc)
+target_link_libraries(resnet50_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 #********* FP16 TensorRT Source Builds ****** 
 
-add_executable(lenet_mnist_fp16   dnn_sources/src/fp16/lenet_mnist_half.cc)
-target_link_libraries(lenet_mnist_fp16  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet_cifar10_fp16   dnn_sources/src/fp16/alexnet_cifar10_half.cc)
-target_link_libraries(alexnet_cifar10_fp16  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-
-add_executable(alexnet2_cifar10_fp16  dnn_sources/src/fp16/alexnet2_cifar10_half.cc)
-target_link_libraries(alexnet2_cifar10_fp16  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(lenet_mnist_fp16 dnn_sources/src/fp16/lenet_mnist_half.cc)
+target_link_libraries(lenet_mnist_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(resnet18_cifar10_fp16  dnn_sources/src/fp16/resnet18_cifar10_half.cc)
-target_link_libraries(resnet18_cifar10_fp16  tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(alexnet_cifar10_fp16 dnn_sources/src/fp16/alexnet_cifar10_half.cc)
+target_link_libraries(alexnet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(vgg16_cifar10_fp16  dnn_sources/src/fp16/vgg16_cifar10_half.cc)
-target_link_libraries(vgg16_cifar10_fp16  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(alexnet2_cifar10_fp16 dnn_sources/src/fp16/alexnet2_cifar10_half.cc)
+target_link_libraries(alexnet2_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(vgg16_cifar100_fp16  dnn_sources/src/fp16/vgg16_cifar100_half.cc)
-target_link_libraries(vgg16_cifar100_fp16  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(resnet18_cifar10_fp16 dnn_sources/src/fp16/resnet18_cifar10_half.cc)
+target_link_libraries(resnet18_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(mobilenet_cifar10_fp16  dnn_sources/src/fp16/mobilenet_half.cc)
-target_link_libraries(mobilenet_cifar10_fp16  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+add_executable(vgg16_cifar10_fp16 dnn_sources/src/fp16/vgg16_cifar10_half.cc)
+target_link_libraries(vgg16_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
+add_executable(vgg16_cifar100_fp16 dnn_sources/src/fp16/vgg16_cifar100_half.cc)
+target_link_libraries(vgg16_cifar100_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-
-
-file(GLOB files "dnn_sources/src/dynamic/*.cpp")
-foreach(file ${files})
-  get_filename_component(stem ${file} NAME_WE) 
-  add_executable(${stem} ${file})
-  target_link_libraries(${stem} tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
-endforeach()
-
+add_executable(mobilenet_cifar10_fp16 dnn_sources/src/fp16/mobilenet_half.cc)
+target_link_libraries(mobilenet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h
deleted file mode 100644
index 4eaf88e6d613c51a5a75ef8ce73b55a3410f1dbd..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h
+++ /dev/null
@@ -1,148 +0,0 @@
-
-
-#ifndef OP_OVERHEADS_HEADER
-#define OP_OVERHEADS_HEADER
-
-
-#include <sstream>
-#include "../../tensor_runtime/include/tensor.h"
-#include "types.h"
-
-
-float scale_down_factor = 10000.0;
-float error_factor = 0.1;
-std::string result_str = "";
-
-
-// TODO: Every routine needs testing
-
-
-// private function
-static float getScaledComps(double total_comps, int error_scale){
-
-  total_comps = total_comps / scale_down_factor;
-  float comp_scale = 1.0 + (error_factor * error_scale);
-  total_comps = total_comps / comp_scale;
-
-  return total_comps;
-}
-
-
-static void addNormToResult(float comps){
-
-  std::ostringstream ss;
-  ss << std::fixed << comps;
-  
-  result_str.append( std::string(ss.str()) );
-  result_str.append("\t");
-}
-
-
-
-static void addCompsToResult(float comps){
-
-  std::ostringstream ss;
-  ss << std::fixed << comps;
-  
-  result_str.append( std::string(ss.str()) );
-  result_str.append("\n");
-}
-
-
-void add_conv_overheads(void* input_ptr, void* filter_ptr,
-			int strideA, int strideB, int error_scale){
-
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
-  
-}
-
-
-void add_gemm_overheads(void* lhs_ptr, void* rhs_ptr, int error_scale){
-
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
-    
-  int m = lhs->dims.dim_sizes[0];
-  // The rhs last dimension must contain the neurons
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
-  int k = 1;
-  
-  // Flattening the dimensions after the batch dimension
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
-    k = k * lhs->dims.dim_sizes[j]; // input neurons
-  }
-
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
-  // Dimension-note: Check if k is same across the two tensors
-  printf("m = %d, n = %d, k = %d \n", m, n, k);
-  
-  if(rhs_k != k){
-    printf("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
-    abort();
-  }
-  
-  double total_comps = m * n * rhs_k * 1.0;
-  float scaled_comps = getScaledComps(total_comps, error_scale);
-  
-  printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
-	 error_scale, total_comps, scaled_comps);
-
-  addCompsToResult(scaled_comps);
-  
-}
-
-
-void add_bias_overheads(void* input_ptr, int error_scale){
-
-  Tensor* input = (Tensor*) input_ptr;
-  
-  double total_comps = input->num_elems;
-  float scaled_comps = getScaledComps(total_comps, error_scale);
-
-  printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
-	 error_scale, total_comps, scaled_comps);
-
-  addCompsToResult(scaled_comps);
-
-}
-
-
-void add_relu_overheads(void* input_ptr, int error_scale){
-  
-  Tensor* input = (Tensor*) input_ptr;
-  
-  double total_comps = input->num_elems;
-  float scaled_comps = getScaledComps(total_comps, error_scale);
-
-  printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
-	 error_scale, total_comps, scaled_comps);				     
-
-  addCompsToResult(scaled_comps);
-
-}
-
-float add_pool_overheads(void* input_ptr, int kernel_size,
-			 int stride_size, int error_scale){
-
-}
-
-
-void add_norms(void* norms_ptr){
-
-  Norm_t* norms = (Norm_t*) norms_ptr;
-
-  addNormToResult(norms->l1_norm);
-  addNormToResult(norms->l2_norm);
-  addNormToResult(norms->inf_norm);
- 
-}
-
-void dump_result(char* file_name){
-
-  FILE* fp = fopen(file_name, "w+");
-  fwrite(result_str.c_str(), 1, result_str.length(), fp);
-  fclose(fp); 
-}
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h
deleted file mode 100644
index 3e4f64610da64fb04b6270035da8557e940eb7e2..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h
+++ /dev/null
@@ -1,39 +0,0 @@
-
-#ifndef TYPES_HEADER
-#define TYPES_HEADER
-
-/*
-struct Dimension_t{
-  int num_dims;
-  size_t* dim_sizes;
-};
-
-
-struct Tensor_t{
-  int tensor_id; // used for indexing (in the tensor runtime)
-  int data_type; // {float_type, double_type, half_type, int_type}
-  int data_format; // {nchw, nhwc}
-  void* host_data;
-  size_t num_elems; // Total elements
-  size_t size_in_bytes; // Total size in bytes
-  struct Dimension_t dims;
-};
-
-
-
-enum Tensor_type_t{
-  float_type,
-  double_type,
-  half_type,
-  int_type
-};
-
-
-// NOTE: Currently only NCHW is supported due to limited cuDNN support
-enum Tensor_format_t{
-  nchw,
-  nhwc 
-};
-*/
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
index 5d1e0e66ad1a3402981682ed97e664ddcc173787..7bcfda70080688387e9bb74e8d25a1174a3e7337 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
@@ -4,9 +4,9 @@
 #define UTILS_HEADER
 
 #include <stdio.h>
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
 #include <sstream>
 #include <vector>
 #include <bits/stdc++.h>
@@ -15,737 +15,341 @@
 #include <cmath>
 #include <string.h>
 
-
 std::vector<float> run_accuracies;
 std::string model_params_path = "../../../build/model_params/";
 
-
-void printTensorInfo(void* tensor_ptr){
-
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-
-  if(tensor->gpu_data != NULL){
-    printf("Successful cudaMalloc \n");
-  }
-
-  printf("tensor dims = %d \n", tensor->dims.num_dims);
-  printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]);
-  printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]);
-  printf("num_elems = %lu \n", tensor->num_elems);
-}
-
-
 // FIXIT: Move this to debug.h and include in all files
-void dumpWeightsToFile(const char* file_name, void* weights_ptr){
+void dumpWeightsToFile(const char *file_name, void *weights_ptr) {
 
-  struct Tensor* weights = (Tensor*) weights_ptr;
+  struct Tensor *weights = (Tensor *)weights_ptr;
   // Move data back to host
   hpvm_request_tensor(weights, 0);
-  
-  FILE* fp = fopen(file_name, "wb");
-  if(fp == NULL){
-    printf("File %s could not be created. Check if directory exists \n", file_name);
+
+  FILE *fp = fopen(file_name, "wb");
+  if (fp == NULL) {
+    printf("File %s could not be created. Check if directory exists \n",
+           file_name);
     abort();
   }
 
-  //printf("size_in_bytes = %lu \n", weights->size_in_bytes);
-  size_t bytes_written = fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
-  //printf("bytes_written = %lu \n", bytes_written);
   fclose(fp);
 }
 
+void fillTensorWithOnes(void *tensor_ptr) {
 
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-void fillTensorWithOnes(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = 1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = 1.0;
     }
   }
 }
 
+void fillWithOnesAndTwos(void *tensor_ptr) {
 
-void fillWithOnesAndTwos(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-  
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
 
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
       if (i % 2 == 0)
         data_arr[i] = 1.0;
       else
-	data_arr[i] = 2.0;
+        data_arr[i] = 2.0;
     }
 
     /*for(unsigned int i = 0; i < tensor->num_elems/2; i++){
-      data_arr[i] = 1.0;    
+      data_arr[i] = 1.0;
     }
 
     for(unsigned int i = tensor->num_elems/2; i < tensor->num_elems; i++){
-      data_arr[i] = 2.0;    
+      data_arr[i] = 2.0;
     }*/
- 
   }
 }
 
+void fillTensorWithVal(void *tensor_ptr, float target_value) {
 
-void fillTensorWithVal(void* tensor_ptr, float target_value){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = target_value;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = target_value;
     }
   }
 }
 
+void fillTensorWithNegOnes(void *tensor_ptr) {
 
-void fillTensorWithNegOnes(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
-  // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = -1.0;    
-    }
-  }
-}
-
 
-void fillTensorVals(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = i + 1;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = -1.0;
     }
   }
 }
 
+void printTensorValues(void *tensor_ptr) {
 
-void printTensorValues(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   hpvm_request_tensor(tensor, 0);
-  
+
   // printing is specific to the floating point type
-  if(tensor->data_type != CUDNN_DATA_FLOAT){
-    //printf("\n WARNING: The tensor is non-float type tensor \n\n");
-  }  
+  if (tensor->data_type != CUDNN_DATA_FLOAT) {
+    // printf("\n WARNING: The tensor is non-float type tensor \n\n");
+  }
 
-  float* data_arr = (float*) tensor->host_data;
+  float *data_arr = (float *)tensor->host_data;
 
-  for(unsigned int i = 0; i < tensor->num_elems; i++){
-      printf("%f,", data_arr[i]);    
+  for (unsigned int i = 0; i < tensor->num_elems; i++) {
+    printf("%f,", data_arr[i]);
   }
-   
 
   printf("\n");
 }
 
+void printTensorDims(void *tensor_ptr) {
 
-void printTensorDims(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   printf("Num_elems = %lu \n", tensor->num_elems);
-  for (int i = 0; i < tensor->dims.num_dims; i++){
+  for (int i = 0; i < tensor->dims.num_dims; i++) {
     printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
   }
 }
 
-
-
-void compareTensors(void* tensor1_ptr, void* tensor2_ptr){
-
-  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
-  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;
-
-  hpvm_request_tensor(tensor1, 0);
-  hpvm_request_tensor(tensor2, 0);
-
-  float* tensor_data1 = (float*) tensor1->host_data;
-  float* tensor_data2 = (float*) tensor2->host_data;
-  
-  for(unsigned int i = 0; i < tensor1->num_elems; i++){
-    if(tensor_data1[i] != tensor_data2[i]){
-      printf("Tensor data mismatch at index %d \n", i);
-      abort();
-    }
-  }
-}
-
-
-
-void compareValues(void* tensor_ptr, float* data, size_t num_elems){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
-  hpvm_request_tensor(tensor, 0);
-  
-  float* tensor_data = (float*) tensor->host_data;
-  for(unsigned int i = 0; i < num_elems; i++){
-    if(tensor_data[i] != data[i]){
-      printf("Tensor data mismatch");
-      abort();
-    }
-  }
-}
-
-
-void* readInputTensor(const char* file_name, int data_type, int dim1_size, int dim2_size,
-		      int dim3_size, int dim4_size){
-
-  int type_size = 4; // NOTE: Assuming floating point tensors
-  int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  uint8_t* file_data = (uint8_t*) malloc(sizeof(char) * num_elems);
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  int file_header_size = 16;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting... \n", file_name);
-    abort();
-  }
-
- 
-  fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
-  size_t bytes_read = fread(file_data, 1, sizeof(uint8_t) * num_elems, file);
-
-  fclose(file);
-  
-  for (size_t i = 0; i < num_elems; ++i){
-    tensor_data[i] = (float) file_data[i] / 255.0f;
-  }
-
-  // NOTE: Using NCHW format
-  struct Tensor* input = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					dim3_size, dim4_size);
-  
-  initTensorData(input, tensor_data, size_in_bytes);
-  //  compareValues(input, tensor_data, num_elems);
-  
-  return input;  
-}
-
-
-//*** FIXIT: Move this to CPU-only
-struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
+struct Tensor *readTrainedWeights(const char *file_name, int data_type,
+                                  long int dim1_size, long int dim2_size,
+                                  long int dim3_size, long int dim4_size) {
 
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting... \n", file_name);
-    abort();
-  }
-    
-  fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
-  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
-
-  //printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
-
-  fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
-  initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
-  free(tensor_data);
-
-  return weights;
-}
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  // printf("size_in_bytes  = %lu \n", size_in_bytes);
 
-
-struct Tensor* readTrainedWeights(const char* file_name, int data_type,
-				  long int dim1_size, long int dim2_size,
-				  long int dim3_size, long int dim4_size){
-
-  // FIXIT: Don't assume floating point types
-  int type_size = 4; // NOTE: Assuming floating point tensors
-  long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  //printf("size_in_bytes  = %lu \n", size_in_bytes);
-  
   int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
-  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+  fread(tensor_data, 1, size_in_bytes, file);
+  fclose(file);
 
-  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
-  fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   free(tensor_data);
 
   return weights;
 }
 
-
-
-
-struct Tensor* readInputBatch(const char* file_name, int data_type,
-			      long int start, long int end,
-			      long int dim2_size, long int dim3_size, long int dim4_size){
+struct Tensor *readInputBatch(const char *file_name, int data_type,
+                              long int start, long int end, long int dim2_size,
+                              long int dim3_size, long int dim4_size) {
 
   long int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
   long int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  long int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  long int file_header_size =
+      type_size * start * dim2_size * dim3_size * dim4_size;
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+  fread(tensor_data, 1, size_in_bytes, file);
+  fclose(file);
 
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
-  fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
   initTensorData(weights, tensor_data, size_in_bytes);
   free(tensor_data);
 
   return weights;
 }
 
-
-
-void* copyInputBatch(const char* file_name, 
-		    int start, int end,
-		    long int dim2_size, long int dim3_size, long int dim4_size,
-		    void* inputTensor_ptr){
-
-  struct Tensor* inputTensor = (struct Tensor*) inputTensor_ptr;
-  
-  long int dim1_size = end - start;
-  // FIXIT: Don't assume floating point types
-  int type_size = 4; // NOTE: Assuming floating point tensors
-  long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting... \n", file_name);
-    abort();
-  }
-    
-  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
-
-  fclose(file);
-  
-    
-  initTensorData(inputTensor, tensor_data, size_in_bytes);
-  free(tensor_data);
-
-  printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims);
-  if(inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
-    printf("ERROR: NULL data pointers \n");
-
-
-  // Chaning Tensor Placement to HOST 
-  changeTensorPlacement(inputTensor, HOST);
-
-
-  return inputTensor;
-}
-
-
-
-uint8_t* readLabels(const char* labels_file, int num_labels){
-
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting...\n", labels_file);
-    abort();
-  }
-
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
-
-  fclose(file);
-  
-  return labels;
-}
-
-
-
-uint32_t* readLabels3(const char* labels_file, int num_labels){
-
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting...\n", labels_file);
-    abort();
-  }
-
-  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
-
-  fclose(file);
-  
-  return labels;
-}
-
-
-uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+uint8_t *readLabelsBatch(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
   int file_header_size = sizeof(uint8_t) * start;
-  
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
-  
-  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
-
 
+  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
+  fread(labels, 1, sizeof(uint8_t) * num_labels, file);
   fclose(file);
-  
+
   // printf("--labels bytes_read = %lu \n", bytes_read);
   return labels;
 }
 
-
-uint32_t* readLabelsBatch3(const char* labels_file, int start, int end){
+uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
   int file_header_size = sizeof(uint32_t) * start;
-  
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
-  
-  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
-
 
+  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
+  fread(labels, 1, sizeof(uint32_t) * num_labels, file);
   fclose(file);
-  
-  return labels;
-}
-
-
-
-void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  uint8_t* labels = readLabels(labels_file, num_labels);
-  size_t batch_dim = result->dims.dim_sizes[0];
-  size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
-  int num_errors = 0;
-  
-  for(int i = 0; i < batch_dim; i++){
-    int chosen = 0;
-    for (int id = 1; id < 10; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
-    }
-    
-    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
-    if(chosen != labels[i])
-      num_errors++;
-  }
-
-  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
-  printf("****** Accuracy = %f \n\n", accuracy);
-
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
-
-    std::ostringstream ss;
-    ss << std::fixed << accuracy;
-    std::string print_str = ss.str();
-  
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-    fclose(fp);
-  }
-  
+  return labels;
 }
 
+// NOTE: batch_size is unused and num_classes is taken from the result tensor
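+// Returns top-1 accuracy (%) of result_ptr against labels and writes it to
+// the "final_accuracy" file.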
+float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr,
+                       size_t num_classes = 10) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-
-// NOTE: batch_size and num_classes are Unused arguments 
-float computeAccuracy2(uint8_t* labels, int batch_size,
-		       void* result_ptr, size_t num_classes = 10){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes);
-  
-  for(unsigned int i = 0; i < batch_dim; i++){ 
-      
-    int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
-    }
-    
-    if(chosen != labels[i])
-      num_errors++;
-
-  }
-
-  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
-  printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  for (unsigned int i = 0; i < batch_dim; i++) {
 
-    std::ostringstream ss;
-    ss << std::fixed << accuracy;
-    std::string print_str = ss.str();
-  
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  }
-
-  fclose(fp);
-
-  return accuracy;    
-}
-
-
-
-float computeAccuracy3(uint32_t* labels, void* result_ptr){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  size_t batch_dim = result->dims.dim_sizes[0];
-  size_t num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
-  int num_errors = 0;
-
-  printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
-  
-  for(int i = 0; i < batch_dim; i++){
-  
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (size_t id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
     }
-    
-    if(chosen != labels[i])
+
+    if (chosen != labels[i])
       num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
+float computeAccuracy3(uint32_t *labels, void *result_ptr) {
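+  // Same as computeAccuracy2, but with 32-bit labels; num_classes is read
+  // from the result tensor's second dimension.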
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-struct ClassProb{
-  float prob;
-  int index;
-};
-
-
-bool descendFloatComp(ClassProb obj1, ClassProb obj2){
-  return obj1.prob > obj2.prob;
-}
-
-
-float computeTop5Accuracy(uint8_t* labels, int num_labels,
-			  void* result_ptr, unsigned num_classes = 10){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
-  size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  size_t num_classes = result->dims.dim_sizes[1];
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
-  printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  
-  for(int i = 0; i < num_labels; i++){
+  printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
 
-    std::vector<ClassProb> elem_probs;
-    for (int id = 0; id < num_classes; ++id){
-      ClassProb cProb;
-      cProb.prob = data[i * channels + id];
-      cProb.index = id;
-      elem_probs.push_back(cProb);   
-    }
+  for (size_t i = 0; i < batch_dim; i++) {
 
-    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
-    // Check if any of top-5 predictions matches
-    bool matched = false;
-    for(int j = 0; j < 5; j++){
-      ClassProb cProb = elem_probs[j];
-      if(cProb.index == labels[i])
-        matched = true;
+    uint32_t chosen = 0;
+    for (size_t id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
     }
 
-    if(!matched)
-      num_errors +=1; 
+    if (chosen != labels[i])
+      num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
-
-
-
-void dumpFinalAccuracy(float accuracy){
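+// Prints the final accuracy, writes it to the "final_accuracy" file, and
+// records it in run_accuracies for dumpExecutionAccuracies().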
+void dumpFinalAccuracy(float accuracy) {
 
   printf("\n\n **** Final Accuracy = %f \n", accuracy);
-  
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  }
-
-  fclose(fp);
-
-  run_accuracies.push_back(accuracy);
-}
-
-
 
-void dumpAvgPSNR(float avg_psnr){
-
-  FILE* fp = fopen("avg_psnr", "w+");
-  if(fp != NULL){
-    std::ostringstream ss;
-    ss << std::fixed << avg_psnr;
-    std::string print_str = ss.str(); 
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
-}
-
-
-void dumpPSNRStd(float psnr_std){
 
-  FILE* fp = fopen("psnr_std.txt", "w+");
-  if(fp != NULL){
-    std::ostringstream ss;
-    ss << std::fixed << psnr_std;
-    std::string print_str = ss.str(); 
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  }
-
-  fclose(fp);
+  run_accuracies.push_back(accuracy);
 }
 
+void dumpExecutionAccuracies() {
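+  // Writes one accuracy value per line (one per run) to run_accuracies.txt.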
 
-
-
-
-void dumpExecutionAccuracies(){
-
-  FILE* fp = fopen("run_accuracies.txt", "w+");
-  if(fp != NULL){  
-    for (int i = 0; i < run_accuracies.size(); i++){
+  FILE *fp = fopen("run_accuracies.txt", "w+");
+  if (fp != NULL) {
+    for (size_t i = 0; i < run_accuracies.size(); i++) {
       float accuracy = run_accuracies[i];
       std::ostringstream ss;
       ss << std::fixed << accuracy;
@@ -753,242 +357,8 @@ void dumpExecutionAccuracies(){
       fwrite(print_str.c_str(), 1, print_str.length(), fp);
       fwrite("\n", 1, 1, fp);
     }
-
-  }
-
-  fclose(fp);
-}
-
-
-float readPSNRFromFile(const char* file_name){
-
-  float psnr;
-  FILE* pFile = fopen(file_name, "r");
-  if(pFile == NULL){
-    printf("ERROR: psnr.txt not found! \n");
-    abort();
-  }
-  
-  fscanf(pFile, "%f", &psnr);
-  printf("**** PSNR read = %f \n\n", psnr);
-  return psnr; 
-}
-
-
-float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){
-
-  
-  PSNR_threshold = readPSNRFromFile("psnr.txt");
-  std::vector<float> psnr_list;
-  
-  struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
-  struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;
-
-  size_t* dim_sizes = gold_tensor->dims.dim_sizes;
-  size_t batch_dim = dim_sizes[0];
-  size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
-  
-  printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
-	 
-  float* gold_data = (float*) gold_tensor->host_data;
-  float* approx_data = (float*) approx_tensor->host_data;
-
-  FILE* fp = fopen("img_psnr.txt", "w+");
-
-  float sum_psnr = 0.0;
-  int num_errors = 0;  
-  for(size_t i = 0; i < batch_dim; i++){
-    float mse_sum = 0.0;
-    float max_val = -999999;     
-    size_t offset = i * image_size;
-    
-    for(size_t j = 0; j < image_size; j++){
-      float diff = gold_data[offset + j] - approx_data[offset + j];
-      float diff_square = diff * diff;
-      mse_sum += diff_square;
-
-      if(max_val < gold_data[offset + j]){
-	max_val = gold_data[offset + j];
-      }   
-    }
-
-    mse_sum = mse_sum / image_size;
-    float psnr = 20 * log10(255 / sqrt(mse_sum));
-
-    sum_psnr += psnr;
-    if (psnr < PSNR_threshold)
-      num_errors += 1;    
-
-    printf("PSNR value = %f \n", psnr);
-    psnr_list.push_back(psnr);
-
-    std::ostringstream ss;
-    ss << std::fixed << psnr;
-    std::string print_str = ss.str();
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-    fwrite("\n", 1, 1, fp);
   }
 
-  float violation_rate = (num_errors * 1.0) / batch_dim * 100.0;
-  printf("*** violation_rate= %f \n\n", violation_rate);
-
-  float avg_psnr = sum_psnr / batch_dim;
-  printf("*** avg_psnr =  %f \n\n", avg_psnr);
-  dumpAvgPSNR(avg_psnr);
- 
-  float success_rate = 100.0 - violation_rate;
-  dumpFinalAccuracy(success_rate);
-
   fclose(fp);
-
-
-  float var = 0.0;
-  for(size_t i = 0; i < batch_dim; i++){
-    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); 
-  }
-
-  var /= batch_dim;
-  float std = sqrt(var);
-
-  dumpPSNRStd(std);
-  
-  return violation_rate;  
-}
-
-
-void dumpOutput(void* output_ptr, const char* file_name){
-
-  struct Tensor* out_tensor = (struct Tensor*) output_ptr;  
-  size_t size_in_bytes = out_tensor->size_in_bytes;
-  printf ("** Output size = %lu \n", size_in_bytes);
-  
-  float* host_data = (float*) out_tensor->host_data; 
-  FILE* fd = fopen(file_name, "w+");
-  fwrite(host_data, 1, size_in_bytes, fd);
-  fclose(fd);
-}
-
-
-
-
-
-void copyClassConfsAndLabels(void* result_ptr,
-			     float* classConfs,
-			     int* predictedLabels,
-			     int start, int end){
-
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
- 
-  size_t num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
-
-  
-  int it_count = end - start;  
-  for(int i = 0; i < it_count; i++){
-  
-    int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
-    }
-
-    predictedLabels[start + i] = chosen;
-    classConfs[start + i] = data[i * num_classes + chosen];
-  }
-  
-
-}
-
-
-void dumpClassConfsAndLabels(float* classConfs,
-			     int* predictedLabels,
-			     uint32_t* goldLabels, 
-			     int test_input_size){
-
-  FILE* labels_fp = fopen("predicted_confs_labels.txt", "w+");
-  
-  for (int i = 0; i < test_input_size; i++){
-
-    int label = predictedLabels[i];
-    int gold_label = (int) goldLabels[i];
-    float conf = classConfs[i];
-    
-    std::ostringstream ss;
-    ss << std::fixed << conf;
-    std::string print_str = ss.str(); 
-    fwrite(print_str.c_str(), 1, print_str.length(), labels_fp);
-    fwrite(" ", 1, 1, labels_fp);
-
-
-    std::ostringstream label_ss;
-    label_ss << label;
-    std::string label_str = label_ss.str(); 
-    fwrite(label_str.c_str(), 1, label_str.length(), labels_fp);
-    fwrite(" ", 1, 1, labels_fp);
-
-
-    std::ostringstream gold_ss;
-    gold_ss << gold_label;
-    std::string gold_str = gold_ss.str(); 
-    fwrite(gold_str.c_str(), 1, gold_str.length(), labels_fp);
-    fwrite("\n", 1, 1, labels_fp);
- 
- 
-  }
-
-  fclose(labels_fp);
 }
-
-
-
-
-
-/**** Routines for Handling Piped Execution ***/
-void stallOnOpenTunerSignal(){
-
-  const char* myfifo = "/tmp/opentuner_fifo";
-  int fd = open(myfifo, O_RDONLY);
-  if (fd == -1){
-    printf("OpenTuner pipe could not be opened \n");
-    abort();
-  }
-    
-  int ret_val = fcntl(fd, F_GETFD);
-  if(ret_val == -1){
-    printf("Invalid descriptor \n");
-    abort();
-  }
-
-  char str[100];
-  read(fd, str, 100);
-  readOpenTunerFlags("promise_flags");
-
-   
-  if(strcmp(str, "stop_run") == 0){
-    abort();
-  }
-
-  close(fd);
-}
-
-
-
-void signalPipeToOpenTuner(){
-
-  const char* myfifo = "/tmp/opentuner_fifo";
-  int fd_out = open(myfifo, O_WRONLY);
-  int ret_val = fcntl(fd_out, F_GETFD);
-  if(ret_val == -1){
-    printf("Invalid descriptor \n");
-    abort();
-  }
-      
-  const char* str = "completed***!\n\0";
-  write(fd_out, str, 80);
-  close(fd_out);
-}
-
-
-
-
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h
deleted file mode 100644
index 45ef7211a4c04f15d1763fde729b4ca550851008..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h
+++ /dev/null
@@ -1,467 +0,0 @@
-
-// Header guards
-#ifndef UTILS_HEADER
-#define UTILS_HEADER
-
-
-#include <sstream>
-#include <vector>
-#include <bits/stdc++.h>
-#include "../../tensor_runtime/include/tensor_cpu.h"
-#include "../../tensor_runtime/include/tensor_cpu_runtime.h"
-//#include "types.h"
-#include <cmath>
-#include <stdint.h>
-
-
-std::vector<float> run_accuracies;
-
-
-void printTensorInfo(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-
-  if(tensor->gpu_data != NULL){
-    printf("Successful cudaMalloc \n");
-  }
-
-  printf("tensor dims = %d \n", tensor->dims.num_dims);
-  printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]);
-  printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]);
-  printf("num_elems = %lu \n", tensor->num_elems);
-}
-
-
-
-void printTensorDims(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-
-  printf("Num_elems = %lu \n", tensor->num_elems);
-  for (int i = 0; i < tensor->dims.num_dims; i++){
-    printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
-  }
-}
-
-
-
-void compareTensors(void* tensor1_ptr, void* tensor2_ptr){
-
-  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
-  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;
-
-  //hpvm_request_tensor(tensor1, 0);
-  //hpvm_request_tensor(tensor2, 0);
-
-  float* tensor_data1 = (float*) tensor1->host_data;
-  float* tensor_data2 = (float*) tensor2->host_data;
-  
-  for(unsigned int i = 0; i < tensor1->num_elems; i++){
-    if(tensor_data1[i] != tensor_data2[i]){
-      printf("Tensor data mismatch at index %d \n", i);
-      abort();
-    }
-  }
-}
-
-
-
-//*** FIXIT: Move this to CPU-only
-struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
-
-  // FIXIT: Don't assume floating point types
-  int type_size = 4; // NOTE: Assuming floating point tensors
-  long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting... \n", file_name);
-    abort();
-  }
-    
-  fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
-  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
-
-  printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
-
-  fclose(file);
-
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-							   dim3_size, dim4_size);
-  
-  initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
-  free(tensor_data);
-
-  return weights;
-}
-
-
-struct Tensor* readTrainedWeights(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
-
-  return readTrainedWeightsCPU(file_name, data_type, dim1_size, dim2_size, dim3_size, dim4_size);
-}
-
-
-
-uint8_t* readLabels(const char* labels_file, int num_labels){
-
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting...\n", labels_file);
-    abort();
-  }
-
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
-
-  fclose(file);
-  
-  return labels;
-}
-
-
-uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
-
-  int num_labels = end - start;
-  int file_header_size = sizeof(uint8_t) * start;
-  
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
-    printf("Data file %s is not found. Aborting...\n", labels_file);
-    abort();
-  }
-  
-  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
-
-
-  fclose(file);
-  
-  return labels;
-}
-
-
-
-void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  uint8_t* labels = readLabels(labels_file, num_labels);
-  size_t batch_dim = result->dims.dim_sizes[0];
-  size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
-  int num_errors = 0;
-  
-  for(int i = 0; i < batch_dim; i++){
-    int chosen = 0;
-    for (int id = 1; id < 10; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
-    }
-    
-    if(chosen != labels[i])
-      num_errors++;
-  }
-
-  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
-  printf("****** Accuracy = %f \n\n", accuracy);
-
-
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
-    fprintf(fp, "%f", accuracy);
-    fclose(fp);
-  }
-  
-}
-
-
-
-float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){
-
-  unsigned num_zeros = 0;
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  size_t batch_dim = result->dims.dim_sizes[0];
-  size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
-  int num_errors = 0;
-
-  printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  
-  for(int i = 0; i < num_labels; i++){  
-    int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
-    }
-    
-    if(labels[i] == 0)
-      num_zeros++;
-      
-    if(chosen != labels[i])
-      num_errors++;
-  }
-
-  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
-  printf("****** Accuracy = %f \n\n", accuracy);
-
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
-    fprintf(fp, "%f", accuracy);
-  }
-
-  fclose(fp);
-
-  return accuracy;    
-}
-
-
-struct ClassProb{
-  float prob;
-  int index;
-};
-
-
-bool descendFloatComp(ClassProb obj1, ClassProb obj2){
-  return obj1.prob > obj2.prob;
-}
-
-
-float computeTop5Accuracy(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  size_t batch_dim = result->dims.dim_sizes[0];
-  size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
-  int num_errors = 0;
-
-  printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  
-  for(int i = 0; i < num_labels; i++){
-
-    std::vector<ClassProb> elem_probs;
-    for (int id = 0; id < num_classes; ++id){
-      ClassProb cProb;
-      cProb.prob = data[i * channels + id];
-      cProb.index = id;
-      elem_probs.push_back(cProb);   
-    }
-
-    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
-    // Check if any of top-5 predictions matches
-    bool matched = false;
-    for(int j = 0; j < 5; j++){
-      ClassProb cProb = elem_probs[j];
-      if(cProb.index == labels[i])
-        matched = true;
-    }
-
-    if(!matched)
-      num_errors +=1; 
-  }
-
-  float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
-  printf("****** Accuracy = %f \n\n", accuracy);
-
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
-    fprintf(fp, "%f", accuracy);
-  }
-
-  fclose(fp);
-
-  return accuracy;    
-}
-
-
-
-
-void dumpFinalAccuracy(float accuracy){
-
-  printf("\n\n **** Final Accuracy = %f \n", accuracy);
-  
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
-    fprintf(fp, "%f", accuracy);
-  }
-
-  fclose(fp);
-
-  run_accuracies.push_back(accuracy);
-}
-
-
-
-/*void dumpAvgPSNR(float avg_psnr){
-
-  FILE* fp = fopen("avg_psnr", "w+");
-  if(fp != NULL){
-    std::ostringstream ss;
-    ss << std::fixed << avg_psnr;
-    std::string print_str = ss.str(); 
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  }
-
-  fclose(fp);
-}
-*/
-
-/*void dumpPSNRStd(float psnr_std){
-
-  FILE* fp = fopen("psnr_std.txt", "w+");
-  if(fp != NULL){
-    std::ostringstream ss;
-    ss << std::fixed << psnr_std;
-    std::string print_str = ss.str(); 
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-  }
-
-  fclose(fp);
-}*/
-
-
-
-
-/*
-void dumpExecutionAccuracies(){
-
-  FILE* fp = fopen("run_accuracies.txt", "w+");
-  if(fp != NULL){  
-    for (int i = 0; i < run_accuracies.size(); i++){
-      float accuracy = run_accuracies[i];
-      std::ostringstream ss;
-      ss << std::fixed << accuracy;
-      std::string print_str = ss.str();
-      fwrite(print_str.c_str(), 1, print_str.length(), fp);
-      fwrite("\n", 1, 1, fp);
-    }
-
-  }
-
-  fclose(fp);
-}
-*/
-
-float readPSNRFromFile(const char* file_name){
-
-  float psnr;
-  FILE* pFile = fopen(file_name, "r");
-  if(pFile == NULL){
-    printf("ERROR: psnr.txt not found! \n");
-    abort();
-  }
-  
-  fscanf(pFile, "%f", &psnr);
-  printf("**** PSNR read = %f \n\n", psnr);
-  return psnr; 
-}
-
-
-/*float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){
-
-  
-  PSNR_threshold = readPSNRFromFile("psnr.txt");
-  std::vector<float> psnr_list;
-  
-  struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
-  struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;
-
-  size_t* dim_sizes = gold_tensor->dims.dim_sizes;
-  size_t batch_dim = dim_sizes[0];
-  size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
-  
-  printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
-	 
-  float* gold_data = (float*) gold_tensor->host_data;
-  float* approx_data = (float*) approx_tensor->host_data;
-
-  FILE* fp = fopen("img_psnr.txt", "w+");
-
-  float sum_psnr = 0.0;
-  int num_errors = 0;  
-  for(size_t i = 0; i < batch_dim; i++){
-    float mse_sum = 0.0;
-    float max_val = -999999;     
-    size_t offset = i * image_size;
-    
-    for(size_t j = 0; j < image_size; j++){
-      float diff = gold_data[offset + j] - approx_data[offset + j];
-      float diff_square = diff * diff;
-      mse_sum += diff_square;
-
-      if(max_val < gold_data[offset + j]){
-	max_val = gold_data[offset + j];
-      }   
-    }
-
-    mse_sum = mse_sum / image_size;
-    float psnr = 20 * log10(255 / sqrt(mse_sum));
-
-    sum_psnr += psnr;
-    if (psnr < PSNR_threshold)
-      num_errors += 1;    
-
-    printf("PSNR value = %f \n", psnr);
-    psnr_list.push_back(psnr);
-
-    std::ostringstream ss;
-    ss << std::fixed << psnr;
-    std::string print_str = ss.str();
-    fwrite(print_str.c_str(), 1, print_str.length(), fp);
-    fwrite("\n", 1, 1, fp);
-  }
-
-  float violation_rate = (num_errors * 1.0) / batch_dim * 100.0;
-  printf("*** violation_rate= %f \n\n", violation_rate);
-
-  float avg_psnr = sum_psnr / batch_dim;
-  printf("*** avg_psnr =  %f \n\n", avg_psnr);
-  dumpAvgPSNR(avg_psnr);
- 
-  float success_rate = 100.0 - violation_rate;
-  dumpFinalAccuracy(success_rate);
-
-  fclose(fp);
-
-
-  float var = 0.0;
-  for(size_t i = 0; i < batch_dim; i++){
-    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); 
-  }
-
-  var /= batch_dim;
-  float std = sqrt(var);
-
-  //dumpPSNRStd(std);
-  
-  return violation_rate;  
-}*/
-
-
-void dumpOutput(void* output_ptr, const char* file_name){
-
-  struct Tensor* out_tensor = (struct Tensor*) output_ptr;  
-  size_t size_in_bytes = out_tensor->size_in_bytes;
-  printf ("** Output size = %lu \n", size_in_bytes);
-  
-  float* host_data = (float*) out_tensor->host_data; 
-  FILE* fd = fopen(file_name, "w+");
-  fwrite(host_data, 1, size_in_bytes, fd);
-  fclose(fd);
-}
-
-
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
index d93110945b1d1a70ec29c7788d9133dc16551ee5..8133e86ef9735932607b5548cec5910a907f7b3c 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
@@ -1,60 +1,64 @@
 
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string.h>
-
 #include "../../../tensor_runtime/include/tensor_runtime.h"
 #include "../../include/utils.h"
 
-
-
 /* NOTE: Reference Architecture to use for profiling */
-void testCifarNet(){
+void testCifarNet() {
 
   printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
- 
-  std::string dir_prefix = model_params_path + std::string("/alexnet2_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-  
-  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
 
+  std::string dir_prefix =
+      model_params_path + std::string("/alexnet2_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
+
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision =
+      0; // NOTE: using Float as compute precision. FIXIT: use enum
 
   startMemTracking();
 
@@ -65,61 +69,61 @@ void testCifarNet(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
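+    // Load one input batch, run the six FP16 conv layers and the final FC
+    // classifier, then score top-1 accuracy for this batch.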
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv1out, conv2d_1_b); 
-    void* conv1_tanh = tensorHalfTanh(conv1out);
-    
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv1out, conv2d_1_b);
+    void *conv1_tanh = tensorHalfTanh(conv1out);
+
     // 2nd Layer
-    void* conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv2out, conv2d_2_b); 
-    void* conv2_tanh = tensorHalfTanh(conv2out);
-    void* pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
-     
+    void *conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv2out, conv2d_2_b);
+    void *conv2_tanh = tensorHalfTanh(conv2out);
+    void *pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 3rd Layer
-    void* conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv3out, conv2d_3_b); 
-    void* conv3_tanh = tensorHalfTanh(conv3out);
+    void *conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv3out, conv2d_3_b);
+    void *conv3_tanh = tensorHalfTanh(conv3out);
 
     // 4th Layer
-    void* conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv4out, conv2d_4_b); 
-    void* conv4_tanh = tensorHalfTanh(conv4out);
-    void* pool4out = tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv4out, conv2d_4_b);
+    void *conv4_tanh = tensorHalfTanh(conv4out);
+    void *pool4out = tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 5th Layer
-    void* conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv5out, conv2d_5_b); 
-    void* conv5_tanh = tensorHalfTanh(conv5out);
+    void *conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv5out, conv2d_5_b);
+    void *conv5_tanh = tensorHalfTanh(conv5out);
 
     // 6th Layer
-    void* conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
+    void *conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
     tensorHalfAdd(conv6out, conv2d_6_b);
-    void* conv6_tanh = tensorHalfTanh(conv6out);
-    void* pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv6_tanh = tensorHalfTanh(conv6out);
+    void *pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // final FC Layer
-    void* gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w);  
-    void* gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b);
-    void* result = tensorSoftmax(gemm1biasout);
+    void *gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w);
+    void *gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b);
+    void *result = tensorSoftmax(gemm1biasout);
 
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
-    float accuracy = computeAccuracy2(labels, batch_size, result); 
+    float accuracy = computeAccuracy2(labels, batch_size, result);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -127,11 +131,9 @@ void testCifarNet(){
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-
 }
 
-
-int main(int argc, char* argv[]){
+int main(int argc, char *argv[]) {
 
   llvm_hpvm_initTensorRt(0);
 
@@ -141,4 +143,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
index b7695bbd7a24712e335f0cf8bbd25290f3261dea..020ad6d578bea8acae8cce5373bdf37ec7df1fd9 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
@@ -1,49 +1,52 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../../tensor_runtime/include/tensor_runtime.h" 
-#include "../../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-
-  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); 
-
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -54,40 +57,40 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-   for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
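+    // Load one input batch, run the five FP16 conv layers and the final FC
+    // classifier, then score top-1 accuracy for this batch.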
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
-
-    void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
-    void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorHalfTanh(var_1); 
-    void* var_3 = tensorHalfPooling(var_2,0,2,2,0,0,2,2); 
-    void* var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
-    void* var_6 = tensorHalfAdd(var_5, conv2d_2_b); 
-    void* var_7 = tensorHalfTanh(var_6); 
-    void* var_8 = tensorHalfPooling(var_7,0,2,2,0,0,2,2); 
-    void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorHalfTanh(var_11); 
-    void* var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_14 = tensorHalfAdd(var_13, conv2d_4_b); 
-    void* var_15 = tensorHalfTanh(var_14); 
-    void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorHalfTanh(var_17); 
-    void* var_19 = tensorHalfPooling(var_18,0,2,2,0,0,2,2); 
-    void* var_22 = tensorHalfGemmGPU(var_19, dense_1_w); 
-    void* var_23 = tensorHalfAdd(var_22, dense_1_b); 
-    void* var_24 = tensorSoftmax(var_23); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0);
+    void *var_1 = tensorHalfAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorHalfTanh(var_1);
+    void *var_3 = tensorHalfPooling(var_2, 0, 2, 2, 0, 0, 2, 2);
+    void *var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0);
+    void *var_6 = tensorHalfAdd(var_5, conv2d_2_b);
+    void *var_7 = tensorHalfTanh(var_6);
+    void *var_8 = tensorHalfPooling(var_7, 0, 2, 2, 0, 0, 2, 2);
+    void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorHalfAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorHalfTanh(var_11);
+    void *var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_14 = tensorHalfAdd(var_13, conv2d_4_b);
+    void *var_15 = tensorHalfTanh(var_14);
+    void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorHalfAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorHalfTanh(var_17);
+    void *var_19 = tensorHalfPooling(var_18, 0, 2, 2, 0, 0, 2, 2);
+    void *var_22 = tensorHalfGemmGPU(var_19, dense_1_w);
+    void *var_23 = tensorHalfAdd(var_22, dense_1_b);
+    void *var_24 = tensorSoftmax(var_23);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_24);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -96,9 +99,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
+  llvm_hpvm_cleanupTensorRt();
 
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
-
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
index 29f392c630a36a6044c5f804e5d3a7b252591831..0fb39cbe84af998ad42c9c14915e272aa3dab88d 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
@@ -1,115 +1,101 @@
 
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string.h>
-
-
 #include "tensor_runtime.h"
 #include "utils.h"
 
-
 /* NOTE: Reference Architecture to use for profiling */
-void testLenetTanh(){
+void testLenetTanh() {
   int total_runs = 1;
   printf("********* Lenet-2 Architecture ********** \n");
   // FIXIT: Extend this to batch of images - currently 5 images
 
   int test_batch_size = 5000;
 
-  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");   
+  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  
   // Loading Input Batch
-  void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); 
-  uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); 
-    
-  void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin",
-					  float_type, 32, 1, 5, 5);    
-  void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin",
-					float_type, 1, 32, 1, 1);  
-  void* conv2_filter = readTrainedWeights("../model_params/lenet_mnist/conv2.bin",
-					  float_type, 64, 32, 5, 5);  
-  void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin",
-					float_type, 1, 64, 1, 1);  
-  void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
-					 float_type, 1, 1, 7*7*64, 1024);  
-  void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin",
-				      float_type, 1, 1024, 1, 1);  
-  void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
-					 float_type, 1, 1, 1024, 10);  
-  void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin",
-				      float_type, 1, 10, 1, 1);  
-
-
-  
+  void *input =
+      readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28);
+  uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size);
+
+  void *conv1_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5);
+  void *conv1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1);
+  void *conv2_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5);
+  void *conv2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1);
+  void *fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
+                                         float_type, 1, 1, 7 * 7 * 64, 1024);
+  void *fc1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1);
+  void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
+                                         float_type, 1, 1, 1024, 10);
+  void *fc2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1);
+
   clearTensorMap();
-  
-  for(int i = 0; i < total_runs; i++){
+
+  for (int i = 0; i < total_runs; i++) {
     readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
 
-    // Start power and performnce profiling 
+    // Start power and performance profiling
     startProfiling();
-  
+
     int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+    int conv_precision =
+        0; // NOTE: using Float as compute precision. FIXIT: use enum
 
     // NOTE: 'SAME' convolution
-    void* conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    void *conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1,
+                                           conv_mode, conv_precision);
 
-    // NOTE: For tensorAdd, the only dimension that MUST match is channels  
+    // NOTE: For tensorAdd, the only dimension that MUST match is channels
     tensorHalfAdd(conv1out, conv1_bias); // NOTE: In place operation
 
-    void* pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
 
-    void* conv1_tanh = tensorHalfTanh(pool1out);
+    void *conv1_tanh = tensorHalfTanh(pool1out);
 
-    // NOTE: input channels have to match between tensor op inputs and outputs 
-    void* conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    // NOTE: input channels have to match between tensor op inputs and outputs
+    void *conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+                                           conv_mode, conv_precision);
     tensorHalfAdd(conv2out, conv2_bias); // NOTE: In place operation
 
-    void* pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+
+    void *conv2_tanh = tensorHalfTanh(pool2out);
 
-    void* conv2_tanh = tensorHalfTanh(pool2out);
+    void *gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights);
 
-    void* gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights);  
+    void *gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias);
 
-    void* gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias);
+    void *tanh1out = tensorHalfTanh(gemm1biasout);
 
-    void* tanh1out = tensorHalfTanh(gemm1biasout);
-  
-    void* gemm2out = tensorHalfGemm(tanh1out, fc2_weights);  
-  
-    void* gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias);
+    void *gemm2out = tensorHalfGemm(tanh1out, fc2_weights);
 
-    void* tanh2out = tensorHalfTanh(gemm2_biasout);
-  
-    void* result = tensorSoftmax(tanh2out);
+    void *gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias);
+
+    void *tanh2out = tensorHalfTanh(gemm2_biasout);
+
+    void *result = tensorSoftmax(tanh2out);
 
     // End profiling and dump output to profile.txt
     stopProfiling();
-  
+
     computeAccuracy2(labels, test_batch_size, result);
-    
+
     dumpAccuracyNorms();
     freeOutputTensors();
   }
-
-
-  
 }
 
-
-int main(int argc, char* argv[]){
+int main(int argc, char *argv[]) {
   llvm_hpvm_initTensorRt(0);
 
   testLenetTanh();
@@ -118,4 +104,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
index d662dc1584c7810d8d3631d5ac16c427c3ff8b02..7722447047aaac6dc679fb02c16e6b2c20c2c049 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
@@ -1,411 +1,725 @@
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
+
 
 #include "../../../tensor_runtime/include/tensor_runtime.h"
 #include "../../include/utils.h"
 
-int main(){ 
-
-    llvm_hpvm_initTensorRt(0); 
-
-
-    std::string dir_prefix = model_params_path + std::string("/mobilenet/"); 
-    std::string input_path =  dir_prefix + std::string("input.bin"); 
-    std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-    std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-    void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-    std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
-    void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
-    void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
-    void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
-    void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); 
-    std::string depthwise_conv2d_1_w_path =  dir_prefix + std::string("depthwise_conv2d_1_w.bin"); 
-    void* depthwise_conv2d_1_w =  readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); 
-    std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
-    void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
-    void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
-    void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
-    void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); 
-    std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-    void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); 
-    std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
-    void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
-    void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
-    void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
-    void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
-    std::string depthwise_conv2d_2_w_path =  dir_prefix + std::string("depthwise_conv2d_2_w.bin"); 
-    void* depthwise_conv2d_2_w =  readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); 
-    std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
-    void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
-    void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
-    void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
-    void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
-    std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-    void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); 
-    std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
-    void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
-    void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
-    void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
-    void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); 
-    std::string depthwise_conv2d_3_w_path =  dir_prefix + std::string("depthwise_conv2d_3_w.bin"); 
-    void* depthwise_conv2d_3_w =  readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); 
-    std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
-    void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
-    void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
-    void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
-    void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-    void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); 
-    std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
-    void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
-    void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
-    void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
-    void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); 
-    std::string depthwise_conv2d_4_w_path =  dir_prefix + std::string("depthwise_conv2d_4_w.bin"); 
-    void* depthwise_conv2d_4_w =  readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); 
-    std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
-    void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
-    void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
-    void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
-    void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-    void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); 
-    std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
-    void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
-    void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
-    void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
-    void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); 
-    std::string depthwise_conv2d_5_w_path =  dir_prefix + std::string("depthwise_conv2d_5_w.bin"); 
-    void* depthwise_conv2d_5_w =  readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); 
-    std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
-    void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
-    void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
-    void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
-    void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-    void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); 
-    std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
-    void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
-    void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
-    void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
-    void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
-    std::string depthwise_conv2d_6_w_path =  dir_prefix + std::string("depthwise_conv2d_6_w.bin"); 
-    void* depthwise_conv2d_6_w =  readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); 
-    std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
-    void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
-    void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
-    void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
-    void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-    void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); 
-    std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
-    void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
-    void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
-    void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
-    void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_7_w_path =  dir_prefix + std::string("depthwise_conv2d_7_w.bin"); 
-    void* depthwise_conv2d_7_w =  readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_14_gamma_path =  dir_prefix + std::string("batch_normalization_14_gamma.bin"); 
-    void* batch_normalization_14_gamma =  readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_14_beta_path =  dir_prefix + std::string("batch_normalization_14_beta.bin"); 
-    void* batch_normalization_14_beta =  readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_14_mean_path =  dir_prefix + std::string("batch_normalization_14_mean.bin"); 
-    void* batch_normalization_14_mean =  readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_14_variance_path =  dir_prefix + std::string("batch_normalization_14_variance.bin"); 
-    void* batch_normalization_14_variance =  readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-    void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_15_gamma_path =  dir_prefix + std::string("batch_normalization_15_gamma.bin"); 
-    void* batch_normalization_15_gamma =  readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_15_beta_path =  dir_prefix + std::string("batch_normalization_15_beta.bin"); 
-    void* batch_normalization_15_beta =  readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_15_mean_path =  dir_prefix + std::string("batch_normalization_15_mean.bin"); 
-    void* batch_normalization_15_mean =  readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_15_variance_path =  dir_prefix + std::string("batch_normalization_15_variance.bin"); 
-    void* batch_normalization_15_variance =  readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_8_w_path =  dir_prefix + std::string("depthwise_conv2d_8_w.bin"); 
-    void* depthwise_conv2d_8_w =  readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_16_gamma_path =  dir_prefix + std::string("batch_normalization_16_gamma.bin"); 
-    void* batch_normalization_16_gamma =  readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_16_beta_path =  dir_prefix + std::string("batch_normalization_16_beta.bin"); 
-    void* batch_normalization_16_beta =  readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_16_mean_path =  dir_prefix + std::string("batch_normalization_16_mean.bin"); 
-    void* batch_normalization_16_mean =  readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_16_variance_path =  dir_prefix + std::string("batch_normalization_16_variance.bin"); 
-    void* batch_normalization_16_variance =  readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-    void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_17_gamma_path =  dir_prefix + std::string("batch_normalization_17_gamma.bin"); 
-    void* batch_normalization_17_gamma =  readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_17_beta_path =  dir_prefix + std::string("batch_normalization_17_beta.bin"); 
-    void* batch_normalization_17_beta =  readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_17_mean_path =  dir_prefix + std::string("batch_normalization_17_mean.bin"); 
-    void* batch_normalization_17_mean =  readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_17_variance_path =  dir_prefix + std::string("batch_normalization_17_variance.bin"); 
-    void* batch_normalization_17_variance =  readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_9_w_path =  dir_prefix + std::string("depthwise_conv2d_9_w.bin"); 
-    void* depthwise_conv2d_9_w =  readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_18_gamma_path =  dir_prefix + std::string("batch_normalization_18_gamma.bin"); 
-    void* batch_normalization_18_gamma =  readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_18_beta_path =  dir_prefix + std::string("batch_normalization_18_beta.bin"); 
-    void* batch_normalization_18_beta =  readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_18_mean_path =  dir_prefix + std::string("batch_normalization_18_mean.bin"); 
-    void* batch_normalization_18_mean =  readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_18_variance_path =  dir_prefix + std::string("batch_normalization_18_variance.bin"); 
-    void* batch_normalization_18_variance =  readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-    void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_19_gamma_path =  dir_prefix + std::string("batch_normalization_19_gamma.bin"); 
-    void* batch_normalization_19_gamma =  readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_19_beta_path =  dir_prefix + std::string("batch_normalization_19_beta.bin"); 
-    void* batch_normalization_19_beta =  readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_19_mean_path =  dir_prefix + std::string("batch_normalization_19_mean.bin"); 
-    void* batch_normalization_19_mean =  readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_19_variance_path =  dir_prefix + std::string("batch_normalization_19_variance.bin"); 
-    void* batch_normalization_19_variance =  readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_10_w_path =  dir_prefix + std::string("depthwise_conv2d_10_w.bin"); 
-    void* depthwise_conv2d_10_w =  readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_20_gamma_path =  dir_prefix + std::string("batch_normalization_20_gamma.bin"); 
-    void* batch_normalization_20_gamma =  readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_20_beta_path =  dir_prefix + std::string("batch_normalization_20_beta.bin"); 
-    void* batch_normalization_20_beta =  readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_20_mean_path =  dir_prefix + std::string("batch_normalization_20_mean.bin"); 
-    void* batch_normalization_20_mean =  readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_20_variance_path =  dir_prefix + std::string("batch_normalization_20_variance.bin"); 
-    void* batch_normalization_20_variance =  readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-    void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_21_gamma_path =  dir_prefix + std::string("batch_normalization_21_gamma.bin"); 
-    void* batch_normalization_21_gamma =  readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_21_beta_path =  dir_prefix + std::string("batch_normalization_21_beta.bin"); 
-    void* batch_normalization_21_beta =  readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_21_mean_path =  dir_prefix + std::string("batch_normalization_21_mean.bin"); 
-    void* batch_normalization_21_mean =  readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_21_variance_path =  dir_prefix + std::string("batch_normalization_21_variance.bin"); 
-    void* batch_normalization_21_variance =  readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_11_w_path =  dir_prefix + std::string("depthwise_conv2d_11_w.bin"); 
-    void* depthwise_conv2d_11_w =  readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_22_gamma_path =  dir_prefix + std::string("batch_normalization_22_gamma.bin"); 
-    void* batch_normalization_22_gamma =  readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_22_beta_path =  dir_prefix + std::string("batch_normalization_22_beta.bin"); 
-    void* batch_normalization_22_beta =  readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_22_mean_path =  dir_prefix + std::string("batch_normalization_22_mean.bin"); 
-    void* batch_normalization_22_mean =  readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_22_variance_path =  dir_prefix + std::string("batch_normalization_22_variance.bin"); 
-    void* batch_normalization_22_variance =  readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-    void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_23_gamma_path =  dir_prefix + std::string("batch_normalization_23_gamma.bin"); 
-    void* batch_normalization_23_gamma =  readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_23_beta_path =  dir_prefix + std::string("batch_normalization_23_beta.bin"); 
-    void* batch_normalization_23_beta =  readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_23_mean_path =  dir_prefix + std::string("batch_normalization_23_mean.bin"); 
-    void* batch_normalization_23_mean =  readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_23_variance_path =  dir_prefix + std::string("batch_normalization_23_variance.bin"); 
-    void* batch_normalization_23_variance =  readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_12_w_path =  dir_prefix + std::string("depthwise_conv2d_12_w.bin"); 
-    void* depthwise_conv2d_12_w =  readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_24_gamma_path =  dir_prefix + std::string("batch_normalization_24_gamma.bin"); 
-    void* batch_normalization_24_gamma =  readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_24_beta_path =  dir_prefix + std::string("batch_normalization_24_beta.bin"); 
-    void* batch_normalization_24_beta =  readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_24_mean_path =  dir_prefix + std::string("batch_normalization_24_mean.bin"); 
-    void* batch_normalization_24_mean =  readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_24_variance_path =  dir_prefix + std::string("batch_normalization_24_variance.bin"); 
-    void* batch_normalization_24_variance =  readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-    void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); 
-    std::string batch_normalization_25_gamma_path =  dir_prefix + std::string("batch_normalization_25_gamma.bin"); 
-    void* batch_normalization_25_gamma =  readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_25_beta_path =  dir_prefix + std::string("batch_normalization_25_beta.bin"); 
-    void* batch_normalization_25_beta =  readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_25_mean_path =  dir_prefix + std::string("batch_normalization_25_mean.bin"); 
-    void* batch_normalization_25_mean =  readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_25_variance_path =  dir_prefix + std::string("batch_normalization_25_variance.bin"); 
-    void* batch_normalization_25_variance =  readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); 
-    std::string depthwise_conv2d_13_w_path =  dir_prefix + std::string("depthwise_conv2d_13_w.bin"); 
-    void* depthwise_conv2d_13_w =  readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); 
-    std::string batch_normalization_26_gamma_path =  dir_prefix + std::string("batch_normalization_26_gamma.bin"); 
-    void* batch_normalization_26_gamma =  readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_26_beta_path =  dir_prefix + std::string("batch_normalization_26_beta.bin"); 
-    void* batch_normalization_26_beta =  readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_26_mean_path =  dir_prefix + std::string("batch_normalization_26_mean.bin"); 
-    void* batch_normalization_26_mean =  readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_26_variance_path =  dir_prefix + std::string("batch_normalization_26_variance.bin"); 
-    void* batch_normalization_26_variance =  readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); 
-    std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-    void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); 
-    std::string batch_normalization_27_gamma_path =  dir_prefix + std::string("batch_normalization_27_gamma.bin"); 
-    void* batch_normalization_27_gamma =  readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_27_beta_path =  dir_prefix + std::string("batch_normalization_27_beta.bin"); 
-    void* batch_normalization_27_beta =  readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_27_mean_path =  dir_prefix + std::string("batch_normalization_27_mean.bin"); 
-    void* batch_normalization_27_mean =  readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_27_variance_path =  dir_prefix + std::string("batch_normalization_27_variance.bin"); 
-    void* batch_normalization_27_variance =  readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); 
-    std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-    void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); 
-    std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-    void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
+int main() {
 
+  llvm_hpvm_initTensorRt(0);
 
-    startMemTracking(); 
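+  // Load the pretrained MobileNet weights and batch-norm parameters from
+  // model_params_path; the last four arguments of readTrainedWeights give the
+  // 4-D shape of each tensor.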
+  std::string dir_prefix = model_params_path + std::string("/mobilenet/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string batch_normalization_1_gamma_path =
+      dir_prefix + std::string("batch_normalization_1_gamma.bin");
+  void *batch_normalization_1_gamma = readTrainedWeights(
+      batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_beta_path =
+      dir_prefix + std::string("batch_normalization_1_beta.bin");
+  void *batch_normalization_1_beta = readTrainedWeights(
+      batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_mean_path =
+      dir_prefix + std::string("batch_normalization_1_mean.bin");
+  void *batch_normalization_1_mean = readTrainedWeights(
+      batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_variance_path =
+      dir_prefix + std::string("batch_normalization_1_variance.bin");
+  void *batch_normalization_1_variance = readTrainedWeights(
+      batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1);
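+  // Each depthwise-separable block below loads a depthwise 3x3 filter and a
+  // pointwise 1x1 filter, each followed by its batch-norm
+  // gamma/beta/mean/variance tensors.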
+  std::string depthwise_conv2d_1_w_path =
+      dir_prefix + std::string("depthwise_conv2d_1_w.bin");
+  void *depthwise_conv2d_1_w =
+      readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3);
+  std::string batch_normalization_2_gamma_path =
+      dir_prefix + std::string("batch_normalization_2_gamma.bin");
+  void *batch_normalization_2_gamma = readTrainedWeights(
+      batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_beta_path =
+      dir_prefix + std::string("batch_normalization_2_beta.bin");
+  void *batch_normalization_2_beta = readTrainedWeights(
+      batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_mean_path =
+      dir_prefix + std::string("batch_normalization_2_mean.bin");
+  void *batch_normalization_2_mean = readTrainedWeights(
+      batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_variance_path =
+      dir_prefix + std::string("batch_normalization_2_variance.bin");
+  void *batch_normalization_2_variance = readTrainedWeights(
+      batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string batch_normalization_3_gamma_path =
+      dir_prefix + std::string("batch_normalization_3_gamma.bin");
+  void *batch_normalization_3_gamma = readTrainedWeights(
+      batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_beta_path =
+      dir_prefix + std::string("batch_normalization_3_beta.bin");
+  void *batch_normalization_3_beta = readTrainedWeights(
+      batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_mean_path =
+      dir_prefix + std::string("batch_normalization_3_mean.bin");
+  void *batch_normalization_3_mean = readTrainedWeights(
+      batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_variance_path =
+      dir_prefix + std::string("batch_normalization_3_variance.bin");
+  void *batch_normalization_3_variance = readTrainedWeights(
+      batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string depthwise_conv2d_2_w_path =
+      dir_prefix + std::string("depthwise_conv2d_2_w.bin");
+  void *depthwise_conv2d_2_w =
+      readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3);
+  std::string batch_normalization_4_gamma_path =
+      dir_prefix + std::string("batch_normalization_4_gamma.bin");
+  void *batch_normalization_4_gamma = readTrainedWeights(
+      batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_beta_path =
+      dir_prefix + std::string("batch_normalization_4_beta.bin");
+  void *batch_normalization_4_beta = readTrainedWeights(
+      batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_mean_path =
+      dir_prefix + std::string("batch_normalization_4_mean.bin");
+  void *batch_normalization_4_mean = readTrainedWeights(
+      batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_variance_path =
+      dir_prefix + std::string("batch_normalization_4_variance.bin");
+  void *batch_normalization_4_variance = readTrainedWeights(
+      batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1);
+  std::string batch_normalization_5_gamma_path =
+      dir_prefix + std::string("batch_normalization_5_gamma.bin");
+  void *batch_normalization_5_gamma = readTrainedWeights(
+      batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_beta_path =
+      dir_prefix + std::string("batch_normalization_5_beta.bin");
+  void *batch_normalization_5_beta = readTrainedWeights(
+      batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_mean_path =
+      dir_prefix + std::string("batch_normalization_5_mean.bin");
+  void *batch_normalization_5_mean = readTrainedWeights(
+      batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_variance_path =
+      dir_prefix + std::string("batch_normalization_5_variance.bin");
+  void *batch_normalization_5_variance = readTrainedWeights(
+      batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_3_w_path =
+      dir_prefix + std::string("depthwise_conv2d_3_w.bin");
+  void *depthwise_conv2d_3_w =
+      readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_6_gamma_path =
+      dir_prefix + std::string("batch_normalization_6_gamma.bin");
+  void *batch_normalization_6_gamma = readTrainedWeights(
+      batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_beta_path =
+      dir_prefix + std::string("batch_normalization_6_beta.bin");
+  void *batch_normalization_6_beta = readTrainedWeights(
+      batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_mean_path =
+      dir_prefix + std::string("batch_normalization_6_mean.bin");
+  void *batch_normalization_6_mean = readTrainedWeights(
+      batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_variance_path =
+      dir_prefix + std::string("batch_normalization_6_variance.bin");
+  void *batch_normalization_6_variance = readTrainedWeights(
+      batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1);
+  std::string batch_normalization_7_gamma_path =
+      dir_prefix + std::string("batch_normalization_7_gamma.bin");
+  void *batch_normalization_7_gamma = readTrainedWeights(
+      batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_beta_path =
+      dir_prefix + std::string("batch_normalization_7_beta.bin");
+  void *batch_normalization_7_beta = readTrainedWeights(
+      batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_mean_path =
+      dir_prefix + std::string("batch_normalization_7_mean.bin");
+  void *batch_normalization_7_mean = readTrainedWeights(
+      batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_variance_path =
+      dir_prefix + std::string("batch_normalization_7_variance.bin");
+  void *batch_normalization_7_variance = readTrainedWeights(
+      batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_4_w_path =
+      dir_prefix + std::string("depthwise_conv2d_4_w.bin");
+  void *depthwise_conv2d_4_w =
+      readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_8_gamma_path =
+      dir_prefix + std::string("batch_normalization_8_gamma.bin");
+  void *batch_normalization_8_gamma = readTrainedWeights(
+      batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_beta_path =
+      dir_prefix + std::string("batch_normalization_8_beta.bin");
+  void *batch_normalization_8_beta = readTrainedWeights(
+      batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_mean_path =
+      dir_prefix + std::string("batch_normalization_8_mean.bin");
+  void *batch_normalization_8_mean = readTrainedWeights(
+      batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_variance_path =
+      dir_prefix + std::string("batch_normalization_8_variance.bin");
+  void *batch_normalization_8_variance = readTrainedWeights(
+      batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1);
+  std::string batch_normalization_9_gamma_path =
+      dir_prefix + std::string("batch_normalization_9_gamma.bin");
+  void *batch_normalization_9_gamma = readTrainedWeights(
+      batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_beta_path =
+      dir_prefix + std::string("batch_normalization_9_beta.bin");
+  void *batch_normalization_9_beta = readTrainedWeights(
+      batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_mean_path =
+      dir_prefix + std::string("batch_normalization_9_mean.bin");
+  void *batch_normalization_9_mean = readTrainedWeights(
+      batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_variance_path =
+      dir_prefix + std::string("batch_normalization_9_variance.bin");
+  void *batch_normalization_9_variance = readTrainedWeights(
+      batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_5_w_path =
+      dir_prefix + std::string("depthwise_conv2d_5_w.bin");
+  void *depthwise_conv2d_5_w =
+      readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_10_gamma_path =
+      dir_prefix + std::string("batch_normalization_10_gamma.bin");
+  void *batch_normalization_10_gamma = readTrainedWeights(
+      batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_beta_path =
+      dir_prefix + std::string("batch_normalization_10_beta.bin");
+  void *batch_normalization_10_beta = readTrainedWeights(
+      batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_mean_path =
+      dir_prefix + std::string("batch_normalization_10_mean.bin");
+  void *batch_normalization_10_mean = readTrainedWeights(
+      batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_variance_path =
+      dir_prefix + std::string("batch_normalization_10_variance.bin");
+  void *batch_normalization_10_variance = readTrainedWeights(
+      batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 1, 1);
+  std::string batch_normalization_11_gamma_path =
+      dir_prefix + std::string("batch_normalization_11_gamma.bin");
+  void *batch_normalization_11_gamma = readTrainedWeights(
+      batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_beta_path =
+      dir_prefix + std::string("batch_normalization_11_beta.bin");
+  void *batch_normalization_11_beta = readTrainedWeights(
+      batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_mean_path =
+      dir_prefix + std::string("batch_normalization_11_mean.bin");
+  void *batch_normalization_11_mean = readTrainedWeights(
+      batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_variance_path =
+      dir_prefix + std::string("batch_normalization_11_variance.bin");
+  void *batch_normalization_11_variance = readTrainedWeights(
+      batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_6_w_path =
+      dir_prefix + std::string("depthwise_conv2d_6_w.bin");
+  void *depthwise_conv2d_6_w =
+      readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_12_gamma_path =
+      dir_prefix + std::string("batch_normalization_12_gamma.bin");
+  void *batch_normalization_12_gamma = readTrainedWeights(
+      batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_beta_path =
+      dir_prefix + std::string("batch_normalization_12_beta.bin");
+  void *batch_normalization_12_beta = readTrainedWeights(
+      batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_mean_path =
+      dir_prefix + std::string("batch_normalization_12_mean.bin");
+  void *batch_normalization_12_mean = readTrainedWeights(
+      batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_variance_path =
+      dir_prefix + std::string("batch_normalization_12_variance.bin");
+  void *batch_normalization_12_variance = readTrainedWeights(
+      batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1);
+  std::string batch_normalization_13_gamma_path =
+      dir_prefix + std::string("batch_normalization_13_gamma.bin");
+  void *batch_normalization_13_gamma = readTrainedWeights(
+      batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_beta_path =
+      dir_prefix + std::string("batch_normalization_13_beta.bin");
+  void *batch_normalization_13_beta = readTrainedWeights(
+      batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_mean_path =
+      dir_prefix + std::string("batch_normalization_13_mean.bin");
+  void *batch_normalization_13_mean = readTrainedWeights(
+      batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_variance_path =
+      dir_prefix + std::string("batch_normalization_13_variance.bin");
+  void *batch_normalization_13_variance = readTrainedWeights(
+      batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_7_w_path =
+      dir_prefix + std::string("depthwise_conv2d_7_w.bin");
+  void *depthwise_conv2d_7_w =
+      readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_14_gamma_path =
+      dir_prefix + std::string("batch_normalization_14_gamma.bin");
+  void *batch_normalization_14_gamma = readTrainedWeights(
+      batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_beta_path =
+      dir_prefix + std::string("batch_normalization_14_beta.bin");
+  void *batch_normalization_14_beta = readTrainedWeights(
+      batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_mean_path =
+      dir_prefix + std::string("batch_normalization_14_mean.bin");
+  void *batch_normalization_14_mean = readTrainedWeights(
+      batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_variance_path =
+      dir_prefix + std::string("batch_normalization_14_variance.bin");
+  void *batch_normalization_14_variance = readTrainedWeights(
+      batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_15_gamma_path =
+      dir_prefix + std::string("batch_normalization_15_gamma.bin");
+  void *batch_normalization_15_gamma = readTrainedWeights(
+      batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_beta_path =
+      dir_prefix + std::string("batch_normalization_15_beta.bin");
+  void *batch_normalization_15_beta = readTrainedWeights(
+      batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_mean_path =
+      dir_prefix + std::string("batch_normalization_15_mean.bin");
+  void *batch_normalization_15_mean = readTrainedWeights(
+      batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_variance_path =
+      dir_prefix + std::string("batch_normalization_15_variance.bin");
+  void *batch_normalization_15_variance = readTrainedWeights(
+      batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_8_w_path =
+      dir_prefix + std::string("depthwise_conv2d_8_w.bin");
+  void *depthwise_conv2d_8_w =
+      readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_16_gamma_path =
+      dir_prefix + std::string("batch_normalization_16_gamma.bin");
+  void *batch_normalization_16_gamma = readTrainedWeights(
+      batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_beta_path =
+      dir_prefix + std::string("batch_normalization_16_beta.bin");
+  void *batch_normalization_16_beta = readTrainedWeights(
+      batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_mean_path =
+      dir_prefix + std::string("batch_normalization_16_mean.bin");
+  void *batch_normalization_16_mean = readTrainedWeights(
+      batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_variance_path =
+      dir_prefix + std::string("batch_normalization_16_variance.bin");
+  void *batch_normalization_16_variance = readTrainedWeights(
+      batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_17_gamma_path =
+      dir_prefix + std::string("batch_normalization_17_gamma.bin");
+  void *batch_normalization_17_gamma = readTrainedWeights(
+      batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_beta_path =
+      dir_prefix + std::string("batch_normalization_17_beta.bin");
+  void *batch_normalization_17_beta = readTrainedWeights(
+      batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_mean_path =
+      dir_prefix + std::string("batch_normalization_17_mean.bin");
+  void *batch_normalization_17_mean = readTrainedWeights(
+      batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_variance_path =
+      dir_prefix + std::string("batch_normalization_17_variance.bin");
+  void *batch_normalization_17_variance = readTrainedWeights(
+      batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_9_w_path =
+      dir_prefix + std::string("depthwise_conv2d_9_w.bin");
+  void *depthwise_conv2d_9_w =
+      readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_18_gamma_path =
+      dir_prefix + std::string("batch_normalization_18_gamma.bin");
+  void *batch_normalization_18_gamma = readTrainedWeights(
+      batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_beta_path =
+      dir_prefix + std::string("batch_normalization_18_beta.bin");
+  void *batch_normalization_18_beta = readTrainedWeights(
+      batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_mean_path =
+      dir_prefix + std::string("batch_normalization_18_mean.bin");
+  void *batch_normalization_18_mean = readTrainedWeights(
+      batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_variance_path =
+      dir_prefix + std::string("batch_normalization_18_variance.bin");
+  void *batch_normalization_18_variance = readTrainedWeights(
+      batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_19_gamma_path =
+      dir_prefix + std::string("batch_normalization_19_gamma.bin");
+  void *batch_normalization_19_gamma = readTrainedWeights(
+      batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_beta_path =
+      dir_prefix + std::string("batch_normalization_19_beta.bin");
+  void *batch_normalization_19_beta = readTrainedWeights(
+      batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_mean_path =
+      dir_prefix + std::string("batch_normalization_19_mean.bin");
+  void *batch_normalization_19_mean = readTrainedWeights(
+      batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_variance_path =
+      dir_prefix + std::string("batch_normalization_19_variance.bin");
+  void *batch_normalization_19_variance = readTrainedWeights(
+      batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_10_w_path =
+      dir_prefix + std::string("depthwise_conv2d_10_w.bin");
+  void *depthwise_conv2d_10_w =
+      readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_20_gamma_path =
+      dir_prefix + std::string("batch_normalization_20_gamma.bin");
+  void *batch_normalization_20_gamma = readTrainedWeights(
+      batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_beta_path =
+      dir_prefix + std::string("batch_normalization_20_beta.bin");
+  void *batch_normalization_20_beta = readTrainedWeights(
+      batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_mean_path =
+      dir_prefix + std::string("batch_normalization_20_mean.bin");
+  void *batch_normalization_20_mean = readTrainedWeights(
+      batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_variance_path =
+      dir_prefix + std::string("batch_normalization_20_variance.bin");
+  void *batch_normalization_20_variance = readTrainedWeights(
+      batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_21_gamma_path =
+      dir_prefix + std::string("batch_normalization_21_gamma.bin");
+  void *batch_normalization_21_gamma = readTrainedWeights(
+      batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_beta_path =
+      dir_prefix + std::string("batch_normalization_21_beta.bin");
+  void *batch_normalization_21_beta = readTrainedWeights(
+      batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_mean_path =
+      dir_prefix + std::string("batch_normalization_21_mean.bin");
+  void *batch_normalization_21_mean = readTrainedWeights(
+      batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_variance_path =
+      dir_prefix + std::string("batch_normalization_21_variance.bin");
+  void *batch_normalization_21_variance = readTrainedWeights(
+      batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_11_w_path =
+      dir_prefix + std::string("depthwise_conv2d_11_w.bin");
+  void *depthwise_conv2d_11_w =
+      readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_22_gamma_path =
+      dir_prefix + std::string("batch_normalization_22_gamma.bin");
+  void *batch_normalization_22_gamma = readTrainedWeights(
+      batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_beta_path =
+      dir_prefix + std::string("batch_normalization_22_beta.bin");
+  void *batch_normalization_22_beta = readTrainedWeights(
+      batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_mean_path =
+      dir_prefix + std::string("batch_normalization_22_mean.bin");
+  void *batch_normalization_22_mean = readTrainedWeights(
+      batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_variance_path =
+      dir_prefix + std::string("batch_normalization_22_variance.bin");
+  void *batch_normalization_22_variance = readTrainedWeights(
+      batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_23_gamma_path =
+      dir_prefix + std::string("batch_normalization_23_gamma.bin");
+  void *batch_normalization_23_gamma = readTrainedWeights(
+      batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_beta_path =
+      dir_prefix + std::string("batch_normalization_23_beta.bin");
+  void *batch_normalization_23_beta = readTrainedWeights(
+      batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_mean_path =
+      dir_prefix + std::string("batch_normalization_23_mean.bin");
+  void *batch_normalization_23_mean = readTrainedWeights(
+      batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_variance_path =
+      dir_prefix + std::string("batch_normalization_23_variance.bin");
+  void *batch_normalization_23_variance = readTrainedWeights(
+      batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_12_w_path =
+      dir_prefix + std::string("depthwise_conv2d_12_w.bin");
+  void *depthwise_conv2d_12_w =
+      readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_24_gamma_path =
+      dir_prefix + std::string("batch_normalization_24_gamma.bin");
+  void *batch_normalization_24_gamma = readTrainedWeights(
+      batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_beta_path =
+      dir_prefix + std::string("batch_normalization_24_beta.bin");
+  void *batch_normalization_24_beta = readTrainedWeights(
+      batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_mean_path =
+      dir_prefix + std::string("batch_normalization_24_mean.bin");
+  void *batch_normalization_24_mean = readTrainedWeights(
+      batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_variance_path =
+      dir_prefix + std::string("batch_normalization_24_variance.bin");
+  void *batch_normalization_24_variance = readTrainedWeights(
+      batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1);
+  std::string batch_normalization_25_gamma_path =
+      dir_prefix + std::string("batch_normalization_25_gamma.bin");
+  void *batch_normalization_25_gamma = readTrainedWeights(
+      batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_beta_path =
+      dir_prefix + std::string("batch_normalization_25_beta.bin");
+  void *batch_normalization_25_beta = readTrainedWeights(
+      batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_mean_path =
+      dir_prefix + std::string("batch_normalization_25_mean.bin");
+  void *batch_normalization_25_mean = readTrainedWeights(
+      batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_variance_path =
+      dir_prefix + std::string("batch_normalization_25_variance.bin");
+  void *batch_normalization_25_variance = readTrainedWeights(
+      batch_normalization_25_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string depthwise_conv2d_13_w_path =
+      dir_prefix + std::string("depthwise_conv2d_13_w.bin");
+  void *depthwise_conv2d_13_w =
+      readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3);
+  std::string batch_normalization_26_gamma_path =
+      dir_prefix + std::string("batch_normalization_26_gamma.bin");
+  void *batch_normalization_26_gamma = readTrainedWeights(
+      batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_beta_path =
+      dir_prefix + std::string("batch_normalization_26_beta.bin");
+  void *batch_normalization_26_beta = readTrainedWeights(
+      batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_mean_path =
+      dir_prefix + std::string("batch_normalization_26_mean.bin");
+  void *batch_normalization_26_mean = readTrainedWeights(
+      batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_variance_path =
+      dir_prefix + std::string("batch_normalization_26_variance.bin");
+  void *batch_normalization_26_variance = readTrainedWeights(
+      batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1);
+  std::string batch_normalization_27_gamma_path =
+      dir_prefix + std::string("batch_normalization_27_gamma.bin");
+  void *batch_normalization_27_gamma = readTrainedWeights(
+      batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_beta_path =
+      dir_prefix + std::string("batch_normalization_27_beta.bin");
+  void *batch_normalization_27_beta = readTrainedWeights(
+      batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_mean_path =
+      dir_prefix + std::string("batch_normalization_27_mean.bin");
+  void *batch_normalization_27_mean = readTrainedWeights(
+      batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_variance_path =
+      dir_prefix + std::string("batch_normalization_27_variance.bin");
+  void *batch_normalization_27_variance = readTrainedWeights(
+      batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-    int test_input_size = 2000; 
-    int batch_size = 1000;  
-    int batch_count = test_input_size / batch_size; 
+  startMemTracking();
 
-    float final_accuracy = 0.0; 
+  int test_input_size = 2000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
 
-    for(int i = 0; i < batch_count; i++){ 
+  float final_accuracy = 0.0;
 
-        int start = i * batch_size; 
-        int end = (i + 1) * batch_size; 
+  for (int i = 0; i < batch_count; i++) {
 
-        void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
-        void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
-        void* var_1 = tensorHalfBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
-        void* var_2 = tensorHalfRelu(var_1); 
-        void* var_4 = tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-        void* var_5 = tensorHalfBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-        void* var_6 = tensorHalfRelu(var_5); 
-        void* var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-        void* var_8 = tensorHalfBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-        void* var_9 = tensorHalfRelu(var_8); 
-        void* var_11 = tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-        void* var_12 = tensorHalfBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-        void* var_13 = tensorHalfRelu(var_12); 
-        void* var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
-        void* var_15 = tensorHalfBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-        void* var_16 = tensorHalfRelu(var_15); 
-        void* var_18 = tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-        void* var_19 = tensorHalfBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
-        void* var_20 = tensorHalfRelu(var_19); 
-        void* var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-        void* var_22 = tensorHalfBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
-        void* var_23 = tensorHalfRelu(var_22); 
-        void* var_26 = tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-        void* var_27 = tensorHalfBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-        void* var_28 = tensorHalfRelu(var_27); 
-        void* var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-        void* var_30 = tensorHalfBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-        void* var_31 = tensorHalfRelu(var_30); 
-        void* var_33 = tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-        void* var_34 = tensorHalfBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-        void* var_35 = tensorHalfRelu(var_34); 
-        void* var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-        void* var_37 = tensorHalfBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-        void* var_38 = tensorHalfRelu(var_37); 
-        void* var_41 = tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-        void* var_42 = tensorHalfBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
-        void* var_43 = tensorHalfRelu(var_42); 
-        void* var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
-        void* var_45 = tensorHalfBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
-        void* var_46 = tensorHalfRelu(var_45); 
-        void* var_48 = tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-        void* var_49 = tensorHalfBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
-        void* var_50 = tensorHalfRelu(var_49); 
-        void* var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-        void* var_52 = tensorHalfBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
-        void* var_53 = tensorHalfRelu(var_52); 
-        void* var_55 = tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-        void* var_56 = tensorHalfBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-        void* var_57 = tensorHalfRelu(var_56); 
-        void* var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-        void* var_59 = tensorHalfBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-        void* var_60 = tensorHalfRelu(var_59); 
-        void* var_63 = tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-        void* var_64 = tensorHalfBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
-        void* var_65 = tensorHalfRelu(var_64); 
-        void* var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
-        void* var_67 = tensorHalfBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
-        void* var_68 = tensorHalfRelu(var_67); 
-        void* var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-        void* var_71 = tensorHalfBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-        void* var_72 = tensorHalfRelu(var_71); 
-        void* var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-        void* var_74 = tensorHalfBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
-        void* var_75 = tensorHalfRelu(var_74); 
-        void* var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-        void* var_78 = tensorHalfBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
-        void* var_79 = tensorHalfRelu(var_78); 
-        void* var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
-        void* var_81 = tensorHalfBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-        void* var_82 = tensorHalfRelu(var_81); 
-        void* var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-        void* var_86 = tensorHalfBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-        void* var_87 = tensorHalfRelu(var_86); 
-        void* var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
-        void* var_89 = tensorHalfBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-        void* var_90 = tensorHalfRelu(var_89); 
-        void* var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-        void* var_93 = tensorHalfBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-        void* var_94 = tensorHalfRelu(var_93); 
-        void* var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-        void* var_96 = tensorHalfBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-        void* var_97 = tensorHalfRelu(var_96); 
-        void* var_99 = tensorHalfPooling(var_97,1,2,2,0,0,2,2); 
-        void* var_101 = tensorHalfGemmGPU(var_99, dense_1_w); 
-        void* var_102 = tensorHalfAdd(var_101, dense_1_b); 
-        void* var_103 = tensorSoftmax(var_102); 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
 
-        uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1);
+    void *var_1 = tensorHalfBatchNorm(
+        var_0, batch_normalization_1_gamma, batch_normalization_1_beta,
+        batch_normalization_1_mean, batch_normalization_1_variance, 0.001);
+    void *var_2 = tensorHalfRelu(var_1);
+    void *var_4 =
+        tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32);
+    void *var_5 = tensorHalfBatchNorm(
+        var_4, batch_normalization_2_gamma, batch_normalization_2_beta,
+        batch_normalization_2_mean, batch_normalization_2_variance, 0.001);
+    void *var_6 = tensorHalfRelu(var_5);
+    void *var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1);
+    void *var_8 = tensorHalfBatchNorm(
+        var_7, batch_normalization_3_gamma, batch_normalization_3_beta,
+        batch_normalization_3_mean, batch_normalization_3_variance, 0.001);
+    void *var_9 = tensorHalfRelu(var_8);
+    void *var_11 =
+        tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64);
+    void *var_12 = tensorHalfBatchNorm(
+        var_11, batch_normalization_4_gamma, batch_normalization_4_beta,
+        batch_normalization_4_mean, batch_normalization_4_variance, 0.001);
+    void *var_13 = tensorHalfRelu(var_12);
+    void *var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1);
+    void *var_15 = tensorHalfBatchNorm(
+        var_14, batch_normalization_5_gamma, batch_normalization_5_beta,
+        batch_normalization_5_mean, batch_normalization_5_variance, 0.001);
+    void *var_16 = tensorHalfRelu(var_15);
+    void *var_18 =
+        tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128);
+    void *var_19 = tensorHalfBatchNorm(
+        var_18, batch_normalization_6_gamma, batch_normalization_6_beta,
+        batch_normalization_6_mean, batch_normalization_6_variance, 0.001);
+    void *var_20 = tensorHalfRelu(var_19);
+    void *var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1);
+    void *var_22 = tensorHalfBatchNorm(
+        var_21, batch_normalization_7_gamma, batch_normalization_7_beta,
+        batch_normalization_7_mean, batch_normalization_7_variance, 0.001);
+    void *var_23 = tensorHalfRelu(var_22);
+    void *var_26 =
+        tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128);
+    void *var_27 = tensorHalfBatchNorm(
+        var_26, batch_normalization_8_gamma, batch_normalization_8_beta,
+        batch_normalization_8_mean, batch_normalization_8_variance, 0.001);
+    void *var_28 = tensorHalfRelu(var_27);
+    void *var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1);
+    void *var_30 = tensorHalfBatchNorm(
+        var_29, batch_normalization_9_gamma, batch_normalization_9_beta,
+        batch_normalization_9_mean, batch_normalization_9_variance, 0.001);
+    void *var_31 = tensorHalfRelu(var_30);
+    void *var_33 =
+        tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256);
+    void *var_34 = tensorHalfBatchNorm(
+        var_33, batch_normalization_10_gamma, batch_normalization_10_beta,
+        batch_normalization_10_mean, batch_normalization_10_variance, 0.001);
+    void *var_35 = tensorHalfRelu(var_34);
+    void *var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1);
+    void *var_37 = tensorHalfBatchNorm(
+        var_36, batch_normalization_11_gamma, batch_normalization_11_beta,
+        batch_normalization_11_mean, batch_normalization_11_variance, 0.001);
+    void *var_38 = tensorHalfRelu(var_37);
+    void *var_41 =
+        tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256);
+    void *var_42 = tensorHalfBatchNorm(
+        var_41, batch_normalization_12_gamma, batch_normalization_12_beta,
+        batch_normalization_12_mean, batch_normalization_12_variance, 0.001);
+    void *var_43 = tensorHalfRelu(var_42);
+    void *var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1);
+    void *var_45 = tensorHalfBatchNorm(
+        var_44, batch_normalization_13_gamma, batch_normalization_13_beta,
+        batch_normalization_13_mean, batch_normalization_13_variance, 0.001);
+    void *var_46 = tensorHalfRelu(var_45);
+    void *var_48 =
+        tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512);
+    void *var_49 = tensorHalfBatchNorm(
+        var_48, batch_normalization_14_gamma, batch_normalization_14_beta,
+        batch_normalization_14_mean, batch_normalization_14_variance, 0.001);
+    void *var_50 = tensorHalfRelu(var_49);
+    void *var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1);
+    void *var_52 = tensorHalfBatchNorm(
+        var_51, batch_normalization_15_gamma, batch_normalization_15_beta,
+        batch_normalization_15_mean, batch_normalization_15_variance, 0.001);
+    void *var_53 = tensorHalfRelu(var_52);
+    void *var_55 =
+        tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512);
+    void *var_56 = tensorHalfBatchNorm(
+        var_55, batch_normalization_16_gamma, batch_normalization_16_beta,
+        batch_normalization_16_mean, batch_normalization_16_variance, 0.001);
+    void *var_57 = tensorHalfRelu(var_56);
+    void *var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1);
+    void *var_59 = tensorHalfBatchNorm(
+        var_58, batch_normalization_17_gamma, batch_normalization_17_beta,
+        batch_normalization_17_mean, batch_normalization_17_variance, 0.001);
+    void *var_60 = tensorHalfRelu(var_59);
+    void *var_63 =
+        tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512);
+    void *var_64 = tensorHalfBatchNorm(
+        var_63, batch_normalization_18_gamma, batch_normalization_18_beta,
+        batch_normalization_18_mean, batch_normalization_18_variance, 0.001);
+    void *var_65 = tensorHalfRelu(var_64);
+    void *var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1);
+    void *var_67 = tensorHalfBatchNorm(
+        var_66, batch_normalization_19_gamma, batch_normalization_19_beta,
+        batch_normalization_19_mean, batch_normalization_19_variance, 0.001);
+    void *var_68 = tensorHalfRelu(var_67);
+    void *var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1,
+                                         1, 1, 512);
+    void *var_71 = tensorHalfBatchNorm(
+        var_70, batch_normalization_20_gamma, batch_normalization_20_beta,
+        batch_normalization_20_mean, batch_normalization_20_variance, 0.001);
+    void *var_72 = tensorHalfRelu(var_71);
+    void *var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1);
+    void *var_74 = tensorHalfBatchNorm(
+        var_73, batch_normalization_21_gamma, batch_normalization_21_beta,
+        batch_normalization_21_mean, batch_normalization_21_variance, 0.001);
+    void *var_75 = tensorHalfRelu(var_74);
+    void *var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1,
+                                         1, 1, 512);
+    void *var_78 = tensorHalfBatchNorm(
+        var_77, batch_normalization_22_gamma, batch_normalization_22_beta,
+        batch_normalization_22_mean, batch_normalization_22_variance, 0.001);
+    void *var_79 = tensorHalfRelu(var_78);
+    void *var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1);
+    void *var_81 = tensorHalfBatchNorm(
+        var_80, batch_normalization_23_gamma, batch_normalization_23_beta,
+        batch_normalization_23_mean, batch_normalization_23_variance, 0.001);
+    void *var_82 = tensorHalfRelu(var_81);
+    void *var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2,
+                                         2, 1, 512);
+    void *var_86 = tensorHalfBatchNorm(
+        var_85, batch_normalization_24_gamma, batch_normalization_24_beta,
+        batch_normalization_24_mean, batch_normalization_24_variance, 0.001);
+    void *var_87 = tensorHalfRelu(var_86);
+    void *var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1);
+    void *var_89 = tensorHalfBatchNorm(
+        var_88, batch_normalization_25_gamma, batch_normalization_25_beta,
+        batch_normalization_25_mean, batch_normalization_25_variance, 0.001);
+    void *var_90 = tensorHalfRelu(var_89);
+    void *var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1,
+                                         1, 1, 1024);
+    void *var_93 = tensorHalfBatchNorm(
+        var_92, batch_normalization_26_gamma, batch_normalization_26_beta,
+        batch_normalization_26_mean, batch_normalization_26_variance, 0.001);
+    void *var_94 = tensorHalfRelu(var_93);
+    void *var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1);
+    void *var_96 = tensorHalfBatchNorm(
+        var_95, batch_normalization_27_gamma, batch_normalization_27_beta,
+        batch_normalization_27_mean, batch_normalization_27_variance, 0.001);
+    void *var_97 = tensorHalfRelu(var_96);
+    void *var_99 = tensorHalfPooling(var_97, 1, 2, 2, 0, 0, 2, 2);
+    void *var_101 = tensorHalfGemmGPU(var_99, dense_1_w);
+    void *var_102 = tensorHalfAdd(var_101, dense_1_b);
+    void *var_103 = tensorSoftmax(var_102);
 
-        float accuracy = computeAccuracy2(labels, batch_size, var_103); 
-        final_accuracy += accuracy; 
-        freeBatchMemory(); 
-    }
-    final_accuracy = final_accuracy / batch_count; 
-    dumpFinalAccuracy(final_accuracy); 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
-    llvm_hpvm_cleanupTensorRt(); 
+    float accuracy = computeAccuracy2(labels, batch_size, var_103);
+    final_accuracy += accuracy;
+    freeBatchMemory();
+  }
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-    return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
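
For reference, every generated FP16 benchmark touched by this diff follows the same batched-inference harness: initialize the tensor runtime, load trained weights with `readTrainedWeights`, loop over input batches with `readInputBatch`, run the `tensorHalf*` layer calls, score the batch with `computeAccuracy2`, and release per-batch tensors before the next iteration. The sketch below condenses that pattern into a hypothetical one-conv-layer model; the weight file names, tensor shapes, and the meaning read into the convolution/pooling arguments are assumptions inferred from the surrounding generated code, not documented API, and the sketch is not part of this patch.

```cpp
// Illustrative sketch only (assumptions noted above), condensing the harness
// pattern used by the generated FP16 sources in this directory.
#include "../../../tensor_runtime/include/tensor_runtime.h"
#include "../../include/utils.h"

int main() {
  llvm_hpvm_initTensorRt(0); // same runtime setup as the generated sources

  // Hypothetical parameter directory and layer shapes.
  std::string dir_prefix = model_params_path + std::string("/my_model/");
  std::string input_path = dir_prefix + std::string("input.bin");
  std::string labels_path = dir_prefix + std::string("labels.bin");
  void *conv_w = readTrainedWeights((dir_prefix + "conv2d_1_w.bin").c_str(),
                                    0, 32, 3, 3, 3); // 32 filters of 3x3x3
  void *fc_w = readTrainedWeights((dir_prefix + "dense_1_w.bin").c_str(),
                                  0, 1, 1, 32, 10);
  void *fc_b = readTrainedWeights((dir_prefix + "dense_1_b.bin").c_str(),
                                  0, 1, 10, 1, 1);

  startMemTracking(); // lets freeBatchMemory() release per-batch tensors

  int test_input_size = 2000;
  int batch_size = 1000;
  int batch_count = test_input_size / batch_size;
  float final_accuracy = 0.0;

  for (int i = 0; i < batch_count; i++) {
    int start = i * batch_size;
    int end = (i + 1) * batch_size;
    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);

    // Conv (pad 1, stride 2; the trailing "1, 0" is copied from the ResNet
    // calls), ReLU, pooling down to 1x1, then GEMM + bias + softmax.
    void *c = tensorHalfConvolution(input, conv_w, 1, 1, 2, 2, 1, 0);
    void *r = tensorHalfRelu(c);
    void *p = tensorHalfPooling(r, 1, 16, 16, 0, 0, 16, 16);
    void *g = tensorHalfGemmGPU(p, fc_w);
    void *a = tensorHalfAdd(g, fc_b);
    void *out = tensorSoftmax(a);

    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
    final_accuracy += computeAccuracy2(labels, batch_size, out);
    freeBatchMemory();
  }

  final_accuracy = final_accuracy / batch_count;
  dumpFinalAccuracy(final_accuracy);

  llvm_hpvm_cleanupTensorRt();
  return 0;
}
```
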
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
index 741c4a443cc9a56c443ec5858aaed5a7d5705268..db8081c6b06e3529d76b13d64f3d25691184024c 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
@@ -1,112 +1,149 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../../tensor_runtime/include/tensor_runtime.h" 
-#include "../../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-  
-  std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
-  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
-  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
-  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
-  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); 
-  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
-  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
-  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
-  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
-  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
-  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
-  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
-  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
-  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
-  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
-  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
-  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix =
+      model_params_path + std::string("/resnet18_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  // void* input = readTrainedWeights(input_path.c_str(), 0,
+  // batch_size,3,32,32);
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  // uint8_t* labels = readLabels(labels_path.c_str(), batch_size);
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin");
+  void *conv2d_14_b =
+      readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin");
+  void *conv2d_15_w =
+      readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin");
+  void *conv2d_15_b =
+      readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin");
+  void *conv2d_17_w =
+      readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin");
+  void *conv2d_17_b =
+      readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin");
+  void *conv2d_16_w =
+      readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin");
+  void *conv2d_16_b =
+      readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin");
+  void *conv2d_18_w =
+      readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin");
+  void *conv2d_18_b =
+      readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin");
+  void *conv2d_19_w =
+      readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin");
+  void *conv2d_19_b =
+      readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin");
+  void *conv2d_20_w =
+      readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin");
+  void *conv2d_20_b =
+      readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin");
+  void *conv2d_21_w =
+      readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin");
+  void *conv2d_21_b =
+      readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -117,94 +154,94 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_3 = tensorHalfAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorHalfRelu(var_3); 
-    void* var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_7 = tensorHalfAdd(var_6, conv2d_2_b); 
-    void* var_8 = tensorHalfRelu(var_7); 
-    void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorHalfAdd(var_4, var_11); 
-    void* var_13 = tensorHalfRelu(var_12); 
-    void* var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_16 = tensorHalfAdd(var_15, conv2d_4_b); 
-    void* var_17 = tensorHalfRelu(var_16); 
-    void* var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_20 = tensorHalfAdd(var_19, conv2d_5_b); 
-    void* var_21 = tensorHalfAdd(var_13, var_20); 
-    void* var_22 = tensorHalfRelu(var_21); 
-    void* var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorHalfAdd(var_24, conv2d_6_b); 
-    void* var_26 = tensorHalfRelu(var_25); 
-    void* var_28 = tensorHalfConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorHalfAdd(var_28, conv2d_7_b); 
-    void* var_30 = tensorHalfAdd(var_22, var_29); 
-    void* var_31 = tensorHalfRelu(var_30); 
-    void* var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); 
-    void* var_34 = tensorHalfAdd(var_33, conv2d_8_b); 
-    void* var_35 = tensorHalfRelu(var_34); 
-    void* var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_38 = tensorHalfAdd(var_37, conv2d_9_b); 
-    void* var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); 
-    void* var_41 = tensorHalfAdd(var_40, conv2d_10_b); 
-    void* var_42 = tensorHalfAdd(var_41, var_38); 
-    void* var_43 = tensorHalfRelu(var_42); 
-    void* var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_46 = tensorHalfAdd(var_45, conv2d_11_b); 
-    void* var_47 = tensorHalfRelu(var_46); 
-    void* var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_50 = tensorHalfAdd(var_49, conv2d_12_b); 
-    void* var_51 = tensorHalfAdd(var_43, var_50); 
-    void* var_52 = tensorHalfRelu(var_51); 
-    void* var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_55 = tensorHalfAdd(var_54, conv2d_13_b); 
-    void* var_56 = tensorHalfRelu(var_55); 
-    void* var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); 
-    void* var_59 = tensorHalfAdd(var_58, conv2d_14_b); 
-    void* var_60 = tensorHalfAdd(var_52, var_59); 
-    void* var_61 = tensorHalfRelu(var_60); 
-    void* var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); 
-    void* var_64 = tensorHalfAdd(var_63, conv2d_15_b); 
-    void* var_65 = tensorHalfRelu(var_64); 
-    void* var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); 
-    void* var_68 = tensorHalfAdd(var_67, conv2d_16_b); 
-    void* var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); 
-    void* var_71 = tensorHalfAdd(var_70, conv2d_17_b); 
-    void* var_72 = tensorHalfAdd(var_71, var_68); 
-    void* var_73 = tensorHalfRelu(var_72); 
-    void* var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); 
-    void* var_76 = tensorHalfAdd(var_75, conv2d_18_b); 
-    void* var_77 = tensorHalfRelu(var_76); 
-    void* var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); 
-    void* var_80 = tensorHalfAdd(var_79, conv2d_19_b); 
-    void* var_81 = tensorHalfAdd(var_73, var_80); 
-    void* var_82 = tensorHalfRelu(var_81); 
-    void* var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); 
-    void* var_85 = tensorHalfAdd(var_84, conv2d_20_b); 
-    void* var_86 = tensorHalfRelu(var_85); 
-    void* var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); 
-    void* var_89 = tensorHalfAdd(var_88, conv2d_21_b); 
-    void* var_90 = tensorHalfAdd(var_82, var_89); 
-    void* var_91 = tensorHalfRelu(var_90); 
-    void* var_92 = tensorHalfPooling(var_91,1,8,8,0,0,8,8); 
-    void* var_94 = tensorHalfGemmGPU(var_92, dense_1_w); 
-    void* var_95 = tensorHalfAdd(var_94, dense_1_b); 
-    void* var_96 = tensorSoftmax(var_95); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_96); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_3 = tensorHalfAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorHalfRelu(var_3);
+    void *var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_7 = tensorHalfAdd(var_6, conv2d_2_b);
+    void *var_8 = tensorHalfRelu(var_7);
+    void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorHalfAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorHalfAdd(var_4, var_11);
+    void *var_13 = tensorHalfRelu(var_12);
+    void *var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_16 = tensorHalfAdd(var_15, conv2d_4_b);
+    void *var_17 = tensorHalfRelu(var_16);
+    void *var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_20 = tensorHalfAdd(var_19, conv2d_5_b);
+    void *var_21 = tensorHalfAdd(var_13, var_20);
+    void *var_22 = tensorHalfRelu(var_21);
+    void *var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorHalfAdd(var_24, conv2d_6_b);
+    void *var_26 = tensorHalfRelu(var_25);
+    void *var_28 = tensorHalfConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorHalfAdd(var_28, conv2d_7_b);
+    void *var_30 = tensorHalfAdd(var_22, var_29);
+    void *var_31 = tensorHalfRelu(var_30);
+    void *var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0);
+    void *var_34 = tensorHalfAdd(var_33, conv2d_8_b);
+    void *var_35 = tensorHalfRelu(var_34);
+    void *var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_38 = tensorHalfAdd(var_37, conv2d_9_b);
+    void *var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0);
+    void *var_41 = tensorHalfAdd(var_40, conv2d_10_b);
+    void *var_42 = tensorHalfAdd(var_41, var_38);
+    void *var_43 = tensorHalfRelu(var_42);
+    void *var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_46 = tensorHalfAdd(var_45, conv2d_11_b);
+    void *var_47 = tensorHalfRelu(var_46);
+    void *var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_50 = tensorHalfAdd(var_49, conv2d_12_b);
+    void *var_51 = tensorHalfAdd(var_43, var_50);
+    void *var_52 = tensorHalfRelu(var_51);
+    void *var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_55 = tensorHalfAdd(var_54, conv2d_13_b);
+    void *var_56 = tensorHalfRelu(var_55);
+    void *var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0);
+    void *var_59 = tensorHalfAdd(var_58, conv2d_14_b);
+    void *var_60 = tensorHalfAdd(var_52, var_59);
+    void *var_61 = tensorHalfRelu(var_60);
+    void *var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0);
+    void *var_64 = tensorHalfAdd(var_63, conv2d_15_b);
+    void *var_65 = tensorHalfRelu(var_64);
+    void *var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0);
+    void *var_68 = tensorHalfAdd(var_67, conv2d_16_b);
+    void *var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0);
+    void *var_71 = tensorHalfAdd(var_70, conv2d_17_b);
+    void *var_72 = tensorHalfAdd(var_71, var_68);
+    void *var_73 = tensorHalfRelu(var_72);
+    void *var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0);
+    void *var_76 = tensorHalfAdd(var_75, conv2d_18_b);
+    void *var_77 = tensorHalfRelu(var_76);
+    void *var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0);
+    void *var_80 = tensorHalfAdd(var_79, conv2d_19_b);
+    void *var_81 = tensorHalfAdd(var_73, var_80);
+    void *var_82 = tensorHalfRelu(var_81);
+    void *var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0);
+    void *var_85 = tensorHalfAdd(var_84, conv2d_20_b);
+    void *var_86 = tensorHalfRelu(var_85);
+    void *var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0);
+    void *var_89 = tensorHalfAdd(var_88, conv2d_21_b);
+    void *var_90 = tensorHalfAdd(var_82, var_89);
+    void *var_91 = tensorHalfRelu(var_90);
+    void *var_92 = tensorHalfPooling(var_91, 1, 8, 8, 0, 0, 8, 8);
+    void *var_94 = tensorHalfGemmGPU(var_92, dense_1_w);
+    void *var_95 = tensorHalfAdd(var_94, dense_1_b);
+    void *var_96 = tensorSoftmax(var_95);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_96);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -213,9 +250,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
-  
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
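
In `resnet18_cifar10_half.cc` above, each identity residual block is assembled from the same few runtime calls: a convolution, bias add, and ReLU, a second convolution and bias add, then an extra `tensorHalfAdd` that feeds the block input back in before the closing ReLU (the `var_4` through `var_13` chain, for example). A minimal sketch of that pattern, with hypothetical handles and written only for illustration, is:

```cpp
#include "../../../tensor_runtime/include/tensor_runtime.h"

// Sketch of one identity residual block as emitted in the generated ResNet
// source. All handles are hypothetical; the argument pattern (pad 1, stride 1,
// trailing "1, 0") mirrors the calls in the diff above.
static void *identity_block(void *block_in, void *w1, void *b1, void *w2,
                            void *b2) {
  void *c1 = tensorHalfConvolution(block_in, w1, 1, 1, 1, 1, 1, 0);
  void *r1 = tensorHalfRelu(tensorHalfAdd(c1, b1)); // conv + bias + ReLU
  void *c2 = tensorHalfConvolution(r1, w2, 1, 1, 1, 1, 1, 0);
  void *a2 = tensorHalfAdd(c2, b2);                 // second conv + bias
  void *skip = tensorHalfAdd(block_in, a2);         // residual connection
  return tensorHalfRelu(skip);
}
```

The strided blocks in the same file (the `conv2d_8`/`conv2d_10` and `conv2d_15`/`conv2d_17` pairs) follow the same shape but route the skip path through a 1x1, stride-2 convolution before the add.
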
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
index 9ac1deea68c693f8baf2df2d9f2b626b3597ad7f..1bd79c7fb71400edd900bceb42413cf4320005fe 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
@@ -1,160 +1,180 @@
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
+
 
 #include "../../../tensor_runtime/include/tensor_runtime.h"
 #include "../../include/utils.h"
 
-int main(){ 
-
-    llvm_hpvm_initTensorRt(0); 
-
-    std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); 
-    std::string input_path =  dir_prefix + std::string("input.bin"); 
-    std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-    std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-    void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-    std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-    void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-    std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-    void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-    std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-    void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-    std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-    void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-    std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-    void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-    void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-    std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-    void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-    void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-    std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-    void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-    void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-    std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-    void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-    void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-    std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-    void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-    void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-    std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-    void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-    void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-    void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-    void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-    void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-    void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-    void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-    void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-    void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-    void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-    void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-    std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-    void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-    std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-    void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-    std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-    void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); 
-    std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-    void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); 
-
-
-    startMemTracking(); 
-
-    int test_input_size = 2000; 
-    int batch_size = 1000; 
-    int batch_count = test_input_size / batch_size; 
-    float final_accuracy = 0.0; 
-
-    for(int i = 0; i < batch_count; i++){ 
-
-        int start = i * batch_size; 
-        int end = (i + 1) * batch_size; 
-
-        void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
-
-        void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-        void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); 
-        void* var_2 = tensorHalfRelu(var_1); 
-        void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-        void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); 
-        void* var_6 = tensorHalfRelu(var_5); 
-        void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); 
-        void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-        void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); 
-        void* var_10 = tensorHalfRelu(var_9); 
-        void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-        void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); 
-        void* var_14 = tensorHalfRelu(var_13); 
-        void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); 
-        void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-        void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); 
-        void* var_18 = tensorHalfRelu(var_17); 
-        void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-        void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); 
-        void* var_22 = tensorHalfRelu(var_21); 
-        void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-        void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); 
-        void* var_26 = tensorHalfRelu(var_25); 
-        void* var_27 = tensorHalfPooling(var_26,0,2,2,0,0,2,2); 
-        void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-        void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); 
-        void* var_30 = tensorHalfRelu(var_29); 
-        void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-        void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); 
-        void* var_34 = tensorHalfRelu(var_33); 
-        void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-        void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); 
-        void* var_38 = tensorHalfRelu(var_37); 
-        void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); 
-        void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-        void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); 
-        void* var_42 = tensorHalfRelu(var_41); 
-        void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-        void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); 
-        void* var_46 = tensorHalfRelu(var_45); 
-        void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-        void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); 
-        void* var_50 = tensorHalfRelu(var_49); 
-        void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); 
-        void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); 
-        void* var_55 = tensorHalfAdd(var_54, dense_1_b); 
-        void* var_56 = tensorHalfRelu(var_55); 
-        void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); 
-        void* var_59 = tensorHalfAdd(var_58, dense_2_b); 
-        void* var_60 = tensorSoftmax(var_59); 
-
-        uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
-
-        float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); 
-        final_accuracy += accuracy; 
-        freeBatchMemory(); 
-
-    }
-
-    final_accuracy = final_accuracy / batch_count;
-    dumpFinalAccuracy(final_accuracy); 
-
-    llvm_hpvm_cleanupTensorRt(); 
-
-    return 0; 
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 2000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorHalfAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorHalfRelu(var_1);
+    void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorHalfAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorHalfRelu(var_5);
+    void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorHalfAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorHalfRelu(var_9);
+    void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorHalfAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorHalfRelu(var_13);
+    void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorHalfAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorHalfRelu(var_17);
+    void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorHalfAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorHalfRelu(var_21);
+    void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorHalfAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorHalfRelu(var_25);
+    void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorHalfAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorHalfRelu(var_29);
+    void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorHalfAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorHalfRelu(var_33);
+    void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorHalfAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorHalfRelu(var_37);
+    void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorHalfAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorHalfRelu(var_41);
+    void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorHalfAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorHalfRelu(var_45);
+    void *var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorHalfAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorHalfRelu(var_49);
+    void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorHalfAdd(var_54, dense_1_b);
+    void *var_56 = tensorHalfRelu(var_55);
+    void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorHalfAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60, 100);
+    final_accuracy += accuracy;
+    freeBatchMemory();
+  }
+
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
index f92bac10e27162fe0bc59c07aa4f9ede542ccd6e..22d2a3614cb668a668f60c7a3941e06d92ebf4de 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
@@ -1,82 +1,103 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h>
-#include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); 
 
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -85,83 +106,82 @@ int main(){
   int batch_count = test_input_size / batch_size;
   float final_accuracy = 0.0;
 
-  // Start power and performance profiling 
+  // Start power and performance profiling
   startProfiling();
 
-  for(int i = 0; i < batch_count; i++){
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); 
- 
-    void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorHalfRelu(var_1); 
-    void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorHalfRelu(var_5); 
-    void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorHalfRelu(var_9); 
-    void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); 
-    void* var_14 = tensorHalfRelu(var_13); 
-    void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); 
-    void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorHalfRelu(var_17); 
-    void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); 
-    void* var_22 = tensorHalfRelu(var_21); 
-    void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); 
-    void* var_26 = tensorHalfRelu(var_25); 
-    void* var_27 = tensorHalfPooling(var_26,0,2,2,0,0,2,2); 
-    void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); 
-    void* var_30 = tensorHalfRelu(var_29); 
-    void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); 
-    void* var_34 = tensorHalfRelu(var_33); 
-    void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-    void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); 
-    void* var_38 = tensorHalfRelu(var_37); 
-    void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); 
-    void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); 
-    void* var_42 = tensorHalfRelu(var_41); 
-    void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); 
-    void* var_46 = tensorHalfRelu(var_45); 
-    void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); 
-    void* var_50 = tensorHalfRelu(var_49); 
-    void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); 
-    void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); 
-    void* var_55 = tensorHalfAdd(var_54, dense_1_b); 
-    void* var_56 = tensorHalfRelu(var_55); 
-    void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); 
-    void* var_59 = tensorHalfAdd(var_58, dense_2_b); 
-    void* var_60 = tensorSoftmax(var_59); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_60); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorHalfAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorHalfRelu(var_1);
+    void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorHalfAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorHalfRelu(var_5);
+    void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorHalfAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorHalfRelu(var_9);
+    void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorHalfAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorHalfRelu(var_13);
+    void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorHalfAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorHalfRelu(var_17);
+    void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorHalfAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorHalfRelu(var_21);
+    void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorHalfAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorHalfRelu(var_25);
+    void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorHalfAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorHalfRelu(var_29);
+    void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorHalfAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorHalfRelu(var_33);
+    void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorHalfAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorHalfRelu(var_37);
+    void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorHalfAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorHalfRelu(var_41);
+    void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorHalfAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorHalfRelu(var_45);
+    void *var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorHalfAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorHalfRelu(var_49);
+    void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorHalfAdd(var_54, dense_1_b);
+    void *var_56 = tensorHalfRelu(var_55);
+    void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorHalfAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
-  // Start power and performance profiling 
+  // Stop power and performance profiling
   stopProfiling();
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-  
-  llvm_hpvm_cleanupTensorRt(); 
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
index 50d9747f990d486c4543607d16d4a4ccb88b0517..7e2c4be6335e3de82b0719923554e17b74732b93 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
@@ -1,62 +1,64 @@
 
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string.h>
-
 #include "../../tensor_runtime/include/tensor_runtime.h"
 #include "../include/utils.h"
 
-
-
 /* NOTE: Reference Architecture to use for profiling */
-void testCifarNet(){
+void testCifarNet() {
 
   printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
- 
-
-  std::string dir_prefix = model_params_path +  std::string("/alexnet2_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
-  
-  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
 
+  std::string dir_prefix =
+      model_params_path + std::string("/alexnet2_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
+
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision =
+      0; // NOTE: using Float as compute precision. FIXIT: use enum
 
   startMemTracking();
 
@@ -67,62 +69,61 @@ void testCifarNet(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv1out, conv2d_1_b); 
-    void* conv1_tanh = tensorTanh(conv1out);
-    
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, conv_mode,
+                                       conv_precision);
+    tensorAdd(conv1out, conv2d_1_b);
+    void *conv1_tanh = tensorTanh(conv1out);
+
     // 2nd Layer
-    void* conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv2out, conv2d_2_b); 
-    void* conv2_tanh = tensorTanh(conv2out);
-    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
-     
+    void *conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv2out, conv2d_2_b);
+    void *conv2_tanh = tensorTanh(conv2out);
+    void *pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 3rd Layer
-    void* conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv3out, conv2d_3_b); 
-    void* conv3_tanh = tensorTanh(conv3out);
+    void *conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv3out, conv2d_3_b);
+    void *conv3_tanh = tensorTanh(conv3out);
 
     // 4th Layer
-    void* conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv4out, conv2d_4_b); 
-    void* conv4_tanh = tensorTanh(conv4out);
-    void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv4out, conv2d_4_b);
+    void *conv4_tanh = tensorTanh(conv4out);
+    void *pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 5th Layer
-    void* conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv5out, conv2d_5_b); 
-    void* conv5_tanh = tensorTanh(conv5out);
+    void *conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv5out, conv2d_5_b);
+    void *conv5_tanh = tensorTanh(conv5out);
 
     // 6th Layer
-    void* conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv6out, conv2d_6_b); 
-    void* conv6_tanh = tensorTanh(conv6out);
-    void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv6out, conv2d_6_b);
+    void *conv6_tanh = tensorTanh(conv6out);
+    void *pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // final FC Layer
-    void* gemm1out = tensorGemmGPU(pool6out, dense_1_w);  
-    void* gemm1biasout = tensorAdd(gemm1out, dense_1_b);
-    void* result = tensorSoftmax(gemm1biasout);
+    void *gemm1out = tensorGemmGPU(pool6out, dense_1_w);
+    void *gemm1biasout = tensorAdd(gemm1out, dense_1_b);
+    void *result = tensorSoftmax(gemm1biasout);
 
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
-    float accuracy = computeAccuracy2(labels, batch_size, result); 
+    float accuracy = computeAccuracy2(labels, batch_size, result);
     final_accuracy += accuracy;
 
-    
     freeBatchMemory();
   }
 
@@ -130,11 +131,9 @@ void testCifarNet(){
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-
 }
 
-
-int main(int argc, char* argv[]){
+int main(int argc, char *argv[]) {
 
   llvm_hpvm_initTensorRt(0);
 
@@ -144,4 +143,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
index 1a76f1ae8ba6059124117b82cd72e8ccd6cdeba6..1cee9b4fa5dd96bf74c4662d0d8edef34f8f2282 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
@@ -1,50 +1,53 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); 
-
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
-
-  
+
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
+
   startMemTracking();
 
   int test_input_size = 5000;
@@ -54,40 +57,40 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorTanh(var_1); 
-    void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
-    void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
-    void* var_6 = tensorAdd(var_5, conv2d_2_b); 
-    void* var_7 = tensorTanh(var_6); 
-    void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
-    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorTanh(var_11); 
-    void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_14 = tensorAdd(var_13, conv2d_4_b); 
-    void* var_15 = tensorTanh(var_14); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorTanh(var_17); 
-    void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
-    void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
-    void* var_23 = tensorAdd(var_22, dense_1_b); 
-    void* var_24 = tensorSoftmax(var_23); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0);
+    void *var_1 = tensorAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorTanh(var_1);
+    void *var_3 = tensorPooling(var_2, 0, 2, 2, 0, 0, 2, 2);
+    void *var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0);
+    void *var_6 = tensorAdd(var_5, conv2d_2_b);
+    void *var_7 = tensorTanh(var_6);
+    void *var_8 = tensorPooling(var_7, 0, 2, 2, 0, 0, 2, 2);
+    void *var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorTanh(var_11);
+    void *var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_14 = tensorAdd(var_13, conv2d_4_b);
+    void *var_15 = tensorTanh(var_14);
+    void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorTanh(var_17);
+    void *var_19 = tensorPooling(var_18, 0, 2, 2, 0, 0, 2, 2);
+    void *var_22 = tensorGemmGPU(var_19, dense_1_w);
+    void *var_23 = tensorAdd(var_22, dense_1_b);
+    void *var_24 = tensorSoftmax(var_23);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_24);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -96,9 +99,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
+  llvm_hpvm_cleanupTensorRt();
 
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
-
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
index aa518d77a1993ce5f0f47b4a29276aae6e6de0e5..0f8df1a4207502b345aa02835a4d77368a35aa92 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
@@ -1,116 +1,119 @@
 
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "tensor_runtime.h" 
-#include "utils.h" 
-
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-
-  std::string dir_prefix = std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,9216,4096); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_3_w_path =  dir_prefix + std::string("dense_3_w.bin"); 
-  void* dense_3_w =  readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); 
-  std::string dense_3_b_path =  dir_prefix + std::string("dense_3_b.bin"); 
-  void* dense_3_b =  readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); 
-
-
-
-  startMemTracking(); 
-
-  int test_input_size = 1000; 
-  int batch_size = 100; 
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
-
-  for(int i = 0; i < batch_count; i++){ 
-
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); 
-
-    void* var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1); 
-    void* var_3 = tensorAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorRelu(var_3); 
-    void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); 
-    void* var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1); 
-    void* var_8 = tensorAdd(var_7, conv2d_2_b); 
-    void* var_9 = tensorRelu(var_8); 
-    void* var_10 = tensorPooling(var_9,0,3,3,0,0,2,2); 
-    void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); 
-    void* var_12 = tensorAdd(var_11, conv2d_3_b); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1); 
-    void* var_15 = tensorAdd(var_14, conv2d_4_b); 
-    void* var_16 = tensorRelu(var_15); 
-    void* var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1); 
-    void* var_18 = tensorAdd(var_17, conv2d_5_b); 
-    void* var_19 = tensorRelu(var_18); 
-    void* var_20 = tensorPooling(var_19,0,3,3,0,0,2,2); 
-    void* var_23 = tensorGemmGPU(var_20, dense_1_w); 
-    void* var_24 = tensorAdd(var_23, dense_1_b); 
-    void* var_25 = tensorRelu(var_24); 
-    void* var_27 = tensorGemmGPU(var_25, dense_2_w); 
-    void* var_28 = tensorAdd(var_27, dense_2_b); 
-    void* var_29 = tensorRelu(var_28); 
-    void* var_30 = tensorGemmGPU(var_29, dense_3_w); 
-    void* var_31 = tensorAdd(var_30, dense_3_b); 
-    void* var_32 = tensorSoftmax(var_31); 
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy3(labels, var_32); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
+#include "tensor_runtime.h"
+#include "utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix =
+      std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 9216, 4096);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b =
+      readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b =
+      readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin");
+  void *dense_3_w =
+      readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000);
+  std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
+  void *dense_3_b =
+      readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 1000;
+  int batch_size = 100;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
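+  // Evaluate in batches: the 1000 test inputs are processed 100 at a time,
+  // and per-batch accuracy is accumulated and averaged after the loop.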
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input =
+        readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
+
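+    // Feature extractor: five convolution + bias + ReLU stages, with 3x3,
+    // stride-2 pooling after the 1st, 2nd and 5th convolutions.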
+    void *var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1);
+    void *var_3 = tensorAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorRelu(var_3);
+    void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2);
+    void *var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1);
+    void *var_8 = tensorAdd(var_7, conv2d_2_b);
+    void *var_9 = tensorRelu(var_8);
+    void *var_10 = tensorPooling(var_9, 0, 3, 3, 0, 0, 2, 2);
+    void *var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1);
+    void *var_12 = tensorAdd(var_11, conv2d_3_b);
+    void *var_13 = tensorRelu(var_12);
+    void *var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1);
+    void *var_15 = tensorAdd(var_14, conv2d_4_b);
+    void *var_16 = tensorRelu(var_15);
+    void *var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1);
+    void *var_18 = tensorAdd(var_17, conv2d_5_b);
+    void *var_19 = tensorRelu(var_18);
+    void *var_20 = tensorPooling(var_19, 0, 3, 3, 0, 0, 2, 2);
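+    // Classifier: three fully-connected (GEMM + bias) layers, ReLU on the
+    // first two, followed by softmax.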
+    void *var_23 = tensorGemmGPU(var_20, dense_1_w);
+    void *var_24 = tensorAdd(var_23, dense_1_b);
+    void *var_25 = tensorRelu(var_24);
+    void *var_27 = tensorGemmGPU(var_25, dense_2_w);
+    void *var_28 = tensorAdd(var_27, dense_2_b);
+    void *var_29 = tensorRelu(var_28);
+    void *var_30 = tensorGemmGPU(var_29, dense_3_w);
+    void *var_31 = tensorAdd(var_30, dense_3_b);
+    void *var_32 = tensorSoftmax(var_31);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy3(labels, var_32);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
+  llvm_hpvm_cleanupTensorRt();
 
-  llvm_hpvm_cleanupTensorRt(); 
-
-
-  return 0; 
-
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
index 7508f3119eeb469a164fad9741000308e3e8c031..cb6593f7d5cac872159c909c99fbde478729df29 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
@@ -1,124 +1,108 @@
 
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string.h>
-
-
 #include "tensor_runtime.h"
 #include "utils.h"
 
 int total_runs = 1;
 
-
 /* NOTE: Reference Architecture to use for profiling */
-void testLenetTanh(){
+void testLenetTanh() {
   printf("********* Lenet-2 Architecture ********** \n");
   // FIXIT: Extend this to batch of images - currently 5 images
 
   int test_batch_size = 5000;
 
-  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");   
+  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  
   // Loading Input Batch
-  void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); 
-  uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); 
-    
-
-  void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin",
-					  float_type, 32, 1, 5, 5);    
-  void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin",
-					float_type, 1, 32, 1, 1);  
-  void* conv2_filter = readTrainedWeights("../model_params/lenet_mnist/conv2.bin",
-					  float_type, 64, 32, 5, 5);  
-  void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin",
-					float_type, 1, 64, 1, 1);  
-  void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
-					 float_type, 1, 1, 7*7*64, 1024);  
-  void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin",
-				      float_type, 1, 1024, 1, 1);  
-  void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
-					 float_type, 1, 1, 1024, 10);  
-  void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin",
-				      float_type, 1, 10, 1, 1);  
-
-
-  
+  void *input =
+      readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28);
+  uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size);
+
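+  // LeNet parameters: two 5x5 convolution layers (32 and 64 filters) and two
+  // fully-connected layers (7*7*64 -> 1024 and 1024 -> 10).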
+  void *conv1_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5);
+  void *conv1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1);
+  void *conv2_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5);
+  void *conv2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1);
+  void *fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
+                                         float_type, 1, 1, 7 * 7 * 64, 1024);
+  void *fc1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1);
+  void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
+                                         float_type, 1, 1, 1024, 10);
+  void *fc2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1);
+
   clearTensorMap();
-  
-  for(int i = 0; i < total_runs; i++){
+
+  for (int i = 0; i < total_runs; i++) {
     readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
 
-    // Start power and performnce profiling 
+    // Start power and performance profiling
     startProfiling();
-  
+
     int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+    int conv_precision =
+        0; // NOTE: using Float as compute precision. FIXIT: use enum
 
     // NOTE: 'SAME' convolution
-    void* conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    void *conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1,
+                                       conv_mode, conv_precision);
 
-    // NOTE: For tensorAdd, the only dimension that MUST match is channels  
+    // NOTE: For tensorAdd, the only dimension that MUST match is channels
     tensorAdd(conv1out, conv1_bias); // NOTE: In place operation
 
-    void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
 
-    void* conv1_tanh = tensorTanh(pool1out);
+    void *conv1_tanh = tensorTanh(pool1out);
 
-    // NOTE: input channels have to match between tensor op inputs and outputs 
-    void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    // NOTE: input channels have to match between tensor op inputs and outputs
+    void *conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+                                       conv_mode, conv_precision);
     tensorAdd(conv2out, conv2_bias); // NOTE: In place operation
 
-    void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+
+    void *conv2_tanh = tensorTanh(pool2out);
 
-    void* conv2_tanh = tensorTanh(pool2out);
+    void *gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights);
 
-    void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights);  
+    void *gemm1biasout = tensorAdd(gemm1out, fc1_bias);
 
-    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+    void *tanh1out = tensorTanh(gemm1biasout);
 
-    void* tanh1out = tensorTanh(gemm1biasout);
-  
-    void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights);  
-  
-    void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias);
+    void *gemm2out = tensorGemmGPU(tanh1out, fc2_weights);
 
-    void* tanh2out = tensorTanh(gemm2_biasout);
-  
-    void* result = tensorSoftmax(tanh2out);
+    void *gemm2_biasout = tensorAdd(gemm2out, fc2_bias);
+
+    void *tanh2out = tensorTanh(gemm2_biasout);
+
+    void *result = tensorSoftmax(tanh2out);
 
     // End profiling and dump output to profile.txt
     stopProfiling();
-  
+
     float accuracy = computeAccuracy2(labels, test_batch_size, result);
-    dumpFinalAccuracy(accuracy); 
+    dumpFinalAccuracy(accuracy);
 
-    
-    //FIXME: remove the comment below to use piped autotuner
-    //dumpAccuracyNorms();
-    freeOutputTensors();  
+    // FIXME: uncomment dumpAccuracyNorms() below to use the piped autotuner
+    // dumpAccuracyNorms();
+    freeOutputTensors();
   }
 
   dumpExecutionAccuracies();
-
-  
 }
 
+int main(int argc, char *argv[]) {
 
-
-int main(int argc, char* argv[]){
-
-  if (argc > 1){
+  if (argc > 1) {
     total_runs = atoi(argv[1]);
   }
 
@@ -130,4 +114,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
index 7c311a568647caa107112bed4982fb57254dc7b3..44336b02e0297f0ecbc37d3dccea8b97e766a357 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
@@ -1,414 +1,725 @@
 
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
 #include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h" 
+#include "../include/utils.h"
 
-int main(){ 
+int main() {
 
-  llvm_hpvm_initTensorRt(0); 
+  llvm_hpvm_initTensorRt(0);
 
+  std::string dir_prefix = model_params_path + std::string("/mobilenet/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
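+  // MobileNet parameters: standard and depthwise convolution filters, each
+  // followed by its batch-normalization gamma, beta, mean and variance tensors.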
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string batch_normalization_1_gamma_path =
+      dir_prefix + std::string("batch_normalization_1_gamma.bin");
+  void *batch_normalization_1_gamma = readTrainedWeights(
+      batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_beta_path =
+      dir_prefix + std::string("batch_normalization_1_beta.bin");
+  void *batch_normalization_1_beta = readTrainedWeights(
+      batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_mean_path =
+      dir_prefix + std::string("batch_normalization_1_mean.bin");
+  void *batch_normalization_1_mean = readTrainedWeights(
+      batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_variance_path =
+      dir_prefix + std::string("batch_normalization_1_variance.bin");
+  void *batch_normalization_1_variance = readTrainedWeights(
+      batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1);
+  std::string depthwise_conv2d_1_w_path =
+      dir_prefix + std::string("depthwise_conv2d_1_w.bin");
+  void *depthwise_conv2d_1_w =
+      readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3);
+  std::string batch_normalization_2_gamma_path =
+      dir_prefix + std::string("batch_normalization_2_gamma.bin");
+  void *batch_normalization_2_gamma = readTrainedWeights(
+      batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_beta_path =
+      dir_prefix + std::string("batch_normalization_2_beta.bin");
+  void *batch_normalization_2_beta = readTrainedWeights(
+      batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_mean_path =
+      dir_prefix + std::string("batch_normalization_2_mean.bin");
+  void *batch_normalization_2_mean = readTrainedWeights(
+      batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_variance_path =
+      dir_prefix + std::string("batch_normalization_2_variance.bin");
+  void *batch_normalization_2_variance = readTrainedWeights(
+      batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string batch_normalization_3_gamma_path =
+      dir_prefix + std::string("batch_normalization_3_gamma.bin");
+  void *batch_normalization_3_gamma = readTrainedWeights(
+      batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_beta_path =
+      dir_prefix + std::string("batch_normalization_3_beta.bin");
+  void *batch_normalization_3_beta = readTrainedWeights(
+      batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_mean_path =
+      dir_prefix + std::string("batch_normalization_3_mean.bin");
+  void *batch_normalization_3_mean = readTrainedWeights(
+      batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_variance_path =
+      dir_prefix + std::string("batch_normalization_3_variance.bin");
+  void *batch_normalization_3_variance = readTrainedWeights(
+      batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string depthwise_conv2d_2_w_path =
+      dir_prefix + std::string("depthwise_conv2d_2_w.bin");
+  void *depthwise_conv2d_2_w =
+      readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3);
+  std::string batch_normalization_4_gamma_path =
+      dir_prefix + std::string("batch_normalization_4_gamma.bin");
+  void *batch_normalization_4_gamma = readTrainedWeights(
+      batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_beta_path =
+      dir_prefix + std::string("batch_normalization_4_beta.bin");
+  void *batch_normalization_4_beta = readTrainedWeights(
+      batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_mean_path =
+      dir_prefix + std::string("batch_normalization_4_mean.bin");
+  void *batch_normalization_4_mean = readTrainedWeights(
+      batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_variance_path =
+      dir_prefix + std::string("batch_normalization_4_variance.bin");
+  void *batch_normalization_4_variance = readTrainedWeights(
+      batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1);
+  std::string batch_normalization_5_gamma_path =
+      dir_prefix + std::string("batch_normalization_5_gamma.bin");
+  void *batch_normalization_5_gamma = readTrainedWeights(
+      batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_beta_path =
+      dir_prefix + std::string("batch_normalization_5_beta.bin");
+  void *batch_normalization_5_beta = readTrainedWeights(
+      batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_mean_path =
+      dir_prefix + std::string("batch_normalization_5_mean.bin");
+  void *batch_normalization_5_mean = readTrainedWeights(
+      batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_variance_path =
+      dir_prefix + std::string("batch_normalization_5_variance.bin");
+  void *batch_normalization_5_variance = readTrainedWeights(
+      batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_3_w_path =
+      dir_prefix + std::string("depthwise_conv2d_3_w.bin");
+  void *depthwise_conv2d_3_w =
+      readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_6_gamma_path =
+      dir_prefix + std::string("batch_normalization_6_gamma.bin");
+  void *batch_normalization_6_gamma = readTrainedWeights(
+      batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_beta_path =
+      dir_prefix + std::string("batch_normalization_6_beta.bin");
+  void *batch_normalization_6_beta = readTrainedWeights(
+      batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_mean_path =
+      dir_prefix + std::string("batch_normalization_6_mean.bin");
+  void *batch_normalization_6_mean = readTrainedWeights(
+      batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_variance_path =
+      dir_prefix + std::string("batch_normalization_6_variance.bin");
+  void *batch_normalization_6_variance = readTrainedWeights(
+      batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1);
+  std::string batch_normalization_7_gamma_path =
+      dir_prefix + std::string("batch_normalization_7_gamma.bin");
+  void *batch_normalization_7_gamma = readTrainedWeights(
+      batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_beta_path =
+      dir_prefix + std::string("batch_normalization_7_beta.bin");
+  void *batch_normalization_7_beta = readTrainedWeights(
+      batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_mean_path =
+      dir_prefix + std::string("batch_normalization_7_mean.bin");
+  void *batch_normalization_7_mean = readTrainedWeights(
+      batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_variance_path =
+      dir_prefix + std::string("batch_normalization_7_variance.bin");
+  void *batch_normalization_7_variance = readTrainedWeights(
+      batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_4_w_path =
+      dir_prefix + std::string("depthwise_conv2d_4_w.bin");
+  void *depthwise_conv2d_4_w =
+      readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_8_gamma_path =
+      dir_prefix + std::string("batch_normalization_8_gamma.bin");
+  void *batch_normalization_8_gamma = readTrainedWeights(
+      batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_beta_path =
+      dir_prefix + std::string("batch_normalization_8_beta.bin");
+  void *batch_normalization_8_beta = readTrainedWeights(
+      batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_mean_path =
+      dir_prefix + std::string("batch_normalization_8_mean.bin");
+  void *batch_normalization_8_mean = readTrainedWeights(
+      batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_variance_path =
+      dir_prefix + std::string("batch_normalization_8_variance.bin");
+  void *batch_normalization_8_variance = readTrainedWeights(
+      batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1);
+  std::string batch_normalization_9_gamma_path =
+      dir_prefix + std::string("batch_normalization_9_gamma.bin");
+  void *batch_normalization_9_gamma = readTrainedWeights(
+      batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_beta_path =
+      dir_prefix + std::string("batch_normalization_9_beta.bin");
+  void *batch_normalization_9_beta = readTrainedWeights(
+      batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_mean_path =
+      dir_prefix + std::string("batch_normalization_9_mean.bin");
+  void *batch_normalization_9_mean = readTrainedWeights(
+      batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_variance_path =
+      dir_prefix + std::string("batch_normalization_9_variance.bin");
+  void *batch_normalization_9_variance = readTrainedWeights(
+      batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_5_w_path =
+      dir_prefix + std::string("depthwise_conv2d_5_w.bin");
+  void *depthwise_conv2d_5_w =
+      readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_10_gamma_path =
+      dir_prefix + std::string("batch_normalization_10_gamma.bin");
+  void *batch_normalization_10_gamma = readTrainedWeights(
+      batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_beta_path =
+      dir_prefix + std::string("batch_normalization_10_beta.bin");
+  void *batch_normalization_10_beta = readTrainedWeights(
+      batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_mean_path =
+      dir_prefix + std::string("batch_normalization_10_mean.bin");
+  void *batch_normalization_10_mean = readTrainedWeights(
+      batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_variance_path =
+      dir_prefix + std::string("batch_normalization_10_variance.bin");
+  void *batch_normalization_10_variance = readTrainedWeights(
+      batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 1, 1);
+  std::string batch_normalization_11_gamma_path =
+      dir_prefix + std::string("batch_normalization_11_gamma.bin");
+  void *batch_normalization_11_gamma = readTrainedWeights(
+      batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_beta_path =
+      dir_prefix + std::string("batch_normalization_11_beta.bin");
+  void *batch_normalization_11_beta = readTrainedWeights(
+      batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_mean_path =
+      dir_prefix + std::string("batch_normalization_11_mean.bin");
+  void *batch_normalization_11_mean = readTrainedWeights(
+      batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_variance_path =
+      dir_prefix + std::string("batch_normalization_11_variance.bin");
+  void *batch_normalization_11_variance = readTrainedWeights(
+      batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_6_w_path =
+      dir_prefix + std::string("depthwise_conv2d_6_w.bin");
+  void *depthwise_conv2d_6_w =
+      readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_12_gamma_path =
+      dir_prefix + std::string("batch_normalization_12_gamma.bin");
+  void *batch_normalization_12_gamma = readTrainedWeights(
+      batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_beta_path =
+      dir_prefix + std::string("batch_normalization_12_beta.bin");
+  void *batch_normalization_12_beta = readTrainedWeights(
+      batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_mean_path =
+      dir_prefix + std::string("batch_normalization_12_mean.bin");
+  void *batch_normalization_12_mean = readTrainedWeights(
+      batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_variance_path =
+      dir_prefix + std::string("batch_normalization_12_variance.bin");
+  void *batch_normalization_12_variance = readTrainedWeights(
+      batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1);
+  std::string batch_normalization_13_gamma_path =
+      dir_prefix + std::string("batch_normalization_13_gamma.bin");
+  void *batch_normalization_13_gamma = readTrainedWeights(
+      batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_beta_path =
+      dir_prefix + std::string("batch_normalization_13_beta.bin");
+  void *batch_normalization_13_beta = readTrainedWeights(
+      batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_mean_path =
+      dir_prefix + std::string("batch_normalization_13_mean.bin");
+  void *batch_normalization_13_mean = readTrainedWeights(
+      batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_variance_path =
+      dir_prefix + std::string("batch_normalization_13_variance.bin");
+  void *batch_normalization_13_variance = readTrainedWeights(
+      batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_7_w_path =
+      dir_prefix + std::string("depthwise_conv2d_7_w.bin");
+  void *depthwise_conv2d_7_w =
+      readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_14_gamma_path =
+      dir_prefix + std::string("batch_normalization_14_gamma.bin");
+  void *batch_normalization_14_gamma = readTrainedWeights(
+      batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_beta_path =
+      dir_prefix + std::string("batch_normalization_14_beta.bin");
+  void *batch_normalization_14_beta = readTrainedWeights(
+      batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_mean_path =
+      dir_prefix + std::string("batch_normalization_14_mean.bin");
+  void *batch_normalization_14_mean = readTrainedWeights(
+      batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_variance_path =
+      dir_prefix + std::string("batch_normalization_14_variance.bin");
+  void *batch_normalization_14_variance = readTrainedWeights(
+      batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_15_gamma_path =
+      dir_prefix + std::string("batch_normalization_15_gamma.bin");
+  void *batch_normalization_15_gamma = readTrainedWeights(
+      batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_beta_path =
+      dir_prefix + std::string("batch_normalization_15_beta.bin");
+  void *batch_normalization_15_beta = readTrainedWeights(
+      batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_mean_path =
+      dir_prefix + std::string("batch_normalization_15_mean.bin");
+  void *batch_normalization_15_mean = readTrainedWeights(
+      batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_variance_path =
+      dir_prefix + std::string("batch_normalization_15_variance.bin");
+  void *batch_normalization_15_variance = readTrainedWeights(
+      batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_8_w_path =
+      dir_prefix + std::string("depthwise_conv2d_8_w.bin");
+  void *depthwise_conv2d_8_w =
+      readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_16_gamma_path =
+      dir_prefix + std::string("batch_normalization_16_gamma.bin");
+  void *batch_normalization_16_gamma = readTrainedWeights(
+      batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_beta_path =
+      dir_prefix + std::string("batch_normalization_16_beta.bin");
+  void *batch_normalization_16_beta = readTrainedWeights(
+      batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_mean_path =
+      dir_prefix + std::string("batch_normalization_16_mean.bin");
+  void *batch_normalization_16_mean = readTrainedWeights(
+      batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_variance_path =
+      dir_prefix + std::string("batch_normalization_16_variance.bin");
+  void *batch_normalization_16_variance = readTrainedWeights(
+      batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_17_gamma_path =
+      dir_prefix + std::string("batch_normalization_17_gamma.bin");
+  void *batch_normalization_17_gamma = readTrainedWeights(
+      batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_beta_path =
+      dir_prefix + std::string("batch_normalization_17_beta.bin");
+  void *batch_normalization_17_beta = readTrainedWeights(
+      batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_mean_path =
+      dir_prefix + std::string("batch_normalization_17_mean.bin");
+  void *batch_normalization_17_mean = readTrainedWeights(
+      batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_variance_path =
+      dir_prefix + std::string("batch_normalization_17_variance.bin");
+  void *batch_normalization_17_variance = readTrainedWeights(
+      batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_9_w_path =
+      dir_prefix + std::string("depthwise_conv2d_9_w.bin");
+  void *depthwise_conv2d_9_w =
+      readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_18_gamma_path =
+      dir_prefix + std::string("batch_normalization_18_gamma.bin");
+  void *batch_normalization_18_gamma = readTrainedWeights(
+      batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_beta_path =
+      dir_prefix + std::string("batch_normalization_18_beta.bin");
+  void *batch_normalization_18_beta = readTrainedWeights(
+      batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_mean_path =
+      dir_prefix + std::string("batch_normalization_18_mean.bin");
+  void *batch_normalization_18_mean = readTrainedWeights(
+      batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_variance_path =
+      dir_prefix + std::string("batch_normalization_18_variance.bin");
+  void *batch_normalization_18_variance = readTrainedWeights(
+      batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_19_gamma_path =
+      dir_prefix + std::string("batch_normalization_19_gamma.bin");
+  void *batch_normalization_19_gamma = readTrainedWeights(
+      batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_beta_path =
+      dir_prefix + std::string("batch_normalization_19_beta.bin");
+  void *batch_normalization_19_beta = readTrainedWeights(
+      batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_mean_path =
+      dir_prefix + std::string("batch_normalization_19_mean.bin");
+  void *batch_normalization_19_mean = readTrainedWeights(
+      batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_variance_path =
+      dir_prefix + std::string("batch_normalization_19_variance.bin");
+  void *batch_normalization_19_variance = readTrainedWeights(
+      batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_10_w_path =
+      dir_prefix + std::string("depthwise_conv2d_10_w.bin");
+  void *depthwise_conv2d_10_w =
+      readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_20_gamma_path =
+      dir_prefix + std::string("batch_normalization_20_gamma.bin");
+  void *batch_normalization_20_gamma = readTrainedWeights(
+      batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_beta_path =
+      dir_prefix + std::string("batch_normalization_20_beta.bin");
+  void *batch_normalization_20_beta = readTrainedWeights(
+      batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_mean_path =
+      dir_prefix + std::string("batch_normalization_20_mean.bin");
+  void *batch_normalization_20_mean = readTrainedWeights(
+      batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_variance_path =
+      dir_prefix + std::string("batch_normalization_20_variance.bin");
+  void *batch_normalization_20_variance = readTrainedWeights(
+      batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_21_gamma_path =
+      dir_prefix + std::string("batch_normalization_21_gamma.bin");
+  void *batch_normalization_21_gamma = readTrainedWeights(
+      batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_beta_path =
+      dir_prefix + std::string("batch_normalization_21_beta.bin");
+  void *batch_normalization_21_beta = readTrainedWeights(
+      batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_mean_path =
+      dir_prefix + std::string("batch_normalization_21_mean.bin");
+  void *batch_normalization_21_mean = readTrainedWeights(
+      batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_variance_path =
+      dir_prefix + std::string("batch_normalization_21_variance.bin");
+  void *batch_normalization_21_variance = readTrainedWeights(
+      batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_11_w_path =
+      dir_prefix + std::string("depthwise_conv2d_11_w.bin");
+  void *depthwise_conv2d_11_w =
+      readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_22_gamma_path =
+      dir_prefix + std::string("batch_normalization_22_gamma.bin");
+  void *batch_normalization_22_gamma = readTrainedWeights(
+      batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_beta_path =
+      dir_prefix + std::string("batch_normalization_22_beta.bin");
+  void *batch_normalization_22_beta = readTrainedWeights(
+      batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_mean_path =
+      dir_prefix + std::string("batch_normalization_22_mean.bin");
+  void *batch_normalization_22_mean = readTrainedWeights(
+      batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_variance_path =
+      dir_prefix + std::string("batch_normalization_22_variance.bin");
+  void *batch_normalization_22_variance = readTrainedWeights(
+      batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_23_gamma_path =
+      dir_prefix + std::string("batch_normalization_23_gamma.bin");
+  void *batch_normalization_23_gamma = readTrainedWeights(
+      batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_beta_path =
+      dir_prefix + std::string("batch_normalization_23_beta.bin");
+  void *batch_normalization_23_beta = readTrainedWeights(
+      batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_mean_path =
+      dir_prefix + std::string("batch_normalization_23_mean.bin");
+  void *batch_normalization_23_mean = readTrainedWeights(
+      batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_variance_path =
+      dir_prefix + std::string("batch_normalization_23_variance.bin");
+  void *batch_normalization_23_variance = readTrainedWeights(
+      batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_12_w_path =
+      dir_prefix + std::string("depthwise_conv2d_12_w.bin");
+  void *depthwise_conv2d_12_w =
+      readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_24_gamma_path =
+      dir_prefix + std::string("batch_normalization_24_gamma.bin");
+  void *batch_normalization_24_gamma = readTrainedWeights(
+      batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_beta_path =
+      dir_prefix + std::string("batch_normalization_24_beta.bin");
+  void *batch_normalization_24_beta = readTrainedWeights(
+      batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_mean_path =
+      dir_prefix + std::string("batch_normalization_24_mean.bin");
+  void *batch_normalization_24_mean = readTrainedWeights(
+      batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_variance_path =
+      dir_prefix + std::string("batch_normalization_24_variance.bin");
+  void *batch_normalization_24_variance = readTrainedWeights(
+      batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1);
+  std::string batch_normalization_25_gamma_path =
+      dir_prefix + std::string("batch_normalization_25_gamma.bin");
+  void *batch_normalization_25_gamma = readTrainedWeights(
+      batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_beta_path =
+      dir_prefix + std::string("batch_normalization_25_beta.bin");
+  void *batch_normalization_25_beta = readTrainedWeights(
+      batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_mean_path =
+      dir_prefix + std::string("batch_normalization_25_mean.bin");
+  void *batch_normalization_25_mean = readTrainedWeights(
+      batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_variance_path =
+      dir_prefix + std::string("batch_normalization_25_variance.bin");
+  void *batch_normalization_25_variance = readTrainedWeights(
+      batch_normalization_25_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string depthwise_conv2d_13_w_path =
+      dir_prefix + std::string("depthwise_conv2d_13_w.bin");
+  void *depthwise_conv2d_13_w =
+      readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3);
+  std::string batch_normalization_26_gamma_path =
+      dir_prefix + std::string("batch_normalization_26_gamma.bin");
+  void *batch_normalization_26_gamma = readTrainedWeights(
+      batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_beta_path =
+      dir_prefix + std::string("batch_normalization_26_beta.bin");
+  void *batch_normalization_26_beta = readTrainedWeights(
+      batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_mean_path =
+      dir_prefix + std::string("batch_normalization_26_mean.bin");
+  void *batch_normalization_26_mean = readTrainedWeights(
+      batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_variance_path =
+      dir_prefix + std::string("batch_normalization_26_variance.bin");
+  void *batch_normalization_26_variance = readTrainedWeights(
+      batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1);
+  std::string batch_normalization_27_gamma_path =
+      dir_prefix + std::string("batch_normalization_27_gamma.bin");
+  void *batch_normalization_27_gamma = readTrainedWeights(
+      batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_beta_path =
+      dir_prefix + std::string("batch_normalization_27_beta.bin");
+  void *batch_normalization_27_beta = readTrainedWeights(
+      batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_mean_path =
+      dir_prefix + std::string("batch_normalization_27_mean.bin");
+  void *batch_normalization_27_mean = readTrainedWeights(
+      batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_variance_path =
+      dir_prefix + std::string("batch_normalization_27_variance.bin");
+  void *batch_normalization_27_variance = readTrainedWeights(
+      batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-  std::string dir_prefix = model_params_path + std::string("/mobilenet/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-  std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
-  void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
-  void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
-  void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
-  void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); 
-  std::string depthwise_conv2d_1_w_path =  dir_prefix + std::string("depthwise_conv2d_1_w.bin"); 
-  void* depthwise_conv2d_1_w =  readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); 
-  std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
-  void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
-  void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
-  void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
-  void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); 
-  std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
-  void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
-  void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
-  void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
-  void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
-  std::string depthwise_conv2d_2_w_path =  dir_prefix + std::string("depthwise_conv2d_2_w.bin"); 
-  void* depthwise_conv2d_2_w =  readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); 
-  std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
-  void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
-  void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
-  void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
-  void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); 
-  std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
-  void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
-  void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
-  void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
-  void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); 
-  std::string depthwise_conv2d_3_w_path =  dir_prefix + std::string("depthwise_conv2d_3_w.bin"); 
-  void* depthwise_conv2d_3_w =  readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); 
-  std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
-  void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
-  void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
-  void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
-  void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); 
-  std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
-  void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
-  void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
-  void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
-  void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); 
-  std::string depthwise_conv2d_4_w_path =  dir_prefix + std::string("depthwise_conv2d_4_w.bin"); 
-  void* depthwise_conv2d_4_w =  readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); 
-  std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
-  void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
-  void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
-  void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
-  void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); 
-  std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
-  void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
-  void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
-  void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
-  void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); 
-  std::string depthwise_conv2d_5_w_path =  dir_prefix + std::string("depthwise_conv2d_5_w.bin"); 
-  void* depthwise_conv2d_5_w =  readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); 
-  std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
-  void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
-  void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
-  void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
-  void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); 
-  std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
-  void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
-  void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
-  void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
-  void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
-  std::string depthwise_conv2d_6_w_path =  dir_prefix + std::string("depthwise_conv2d_6_w.bin"); 
-  void* depthwise_conv2d_6_w =  readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); 
-  std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
-  void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
-  void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
-  void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
-  void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); 
-  std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
-  void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
-  void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
-  void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
-  void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_7_w_path =  dir_prefix + std::string("depthwise_conv2d_7_w.bin"); 
-  void* depthwise_conv2d_7_w =  readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_14_gamma_path =  dir_prefix + std::string("batch_normalization_14_gamma.bin"); 
-  void* batch_normalization_14_gamma =  readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_beta_path =  dir_prefix + std::string("batch_normalization_14_beta.bin"); 
-  void* batch_normalization_14_beta =  readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_mean_path =  dir_prefix + std::string("batch_normalization_14_mean.bin"); 
-  void* batch_normalization_14_mean =  readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_variance_path =  dir_prefix + std::string("batch_normalization_14_variance.bin"); 
-  void* batch_normalization_14_variance =  readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_15_gamma_path =  dir_prefix + std::string("batch_normalization_15_gamma.bin"); 
-  void* batch_normalization_15_gamma =  readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_beta_path =  dir_prefix + std::string("batch_normalization_15_beta.bin"); 
-  void* batch_normalization_15_beta =  readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_mean_path =  dir_prefix + std::string("batch_normalization_15_mean.bin"); 
-  void* batch_normalization_15_mean =  readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_variance_path =  dir_prefix + std::string("batch_normalization_15_variance.bin"); 
-  void* batch_normalization_15_variance =  readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_8_w_path =  dir_prefix + std::string("depthwise_conv2d_8_w.bin"); 
-  void* depthwise_conv2d_8_w =  readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_16_gamma_path =  dir_prefix + std::string("batch_normalization_16_gamma.bin"); 
-  void* batch_normalization_16_gamma =  readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_16_beta_path =  dir_prefix + std::string("batch_normalization_16_beta.bin"); 
-  void* batch_normalization_16_beta =  readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_16_mean_path =  dir_prefix + std::string("batch_normalization_16_mean.bin"); 
-  void* batch_normalization_16_mean =  readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_16_variance_path =  dir_prefix + std::string("batch_normalization_16_variance.bin"); 
-  void* batch_normalization_16_variance =  readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_17_gamma_path =  dir_prefix + std::string("batch_normalization_17_gamma.bin"); 
-  void* batch_normalization_17_gamma =  readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_17_beta_path =  dir_prefix + std::string("batch_normalization_17_beta.bin"); 
-  void* batch_normalization_17_beta =  readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_17_mean_path =  dir_prefix + std::string("batch_normalization_17_mean.bin"); 
-  void* batch_normalization_17_mean =  readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_17_variance_path =  dir_prefix + std::string("batch_normalization_17_variance.bin"); 
-  void* batch_normalization_17_variance =  readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_9_w_path =  dir_prefix + std::string("depthwise_conv2d_9_w.bin"); 
-  void* depthwise_conv2d_9_w =  readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_18_gamma_path =  dir_prefix + std::string("batch_normalization_18_gamma.bin"); 
-  void* batch_normalization_18_gamma =  readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_beta_path =  dir_prefix + std::string("batch_normalization_18_beta.bin"); 
-  void* batch_normalization_18_beta =  readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_mean_path =  dir_prefix + std::string("batch_normalization_18_mean.bin"); 
-  void* batch_normalization_18_mean =  readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_variance_path =  dir_prefix + std::string("batch_normalization_18_variance.bin"); 
-  void* batch_normalization_18_variance =  readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_19_gamma_path =  dir_prefix + std::string("batch_normalization_19_gamma.bin"); 
-  void* batch_normalization_19_gamma =  readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_19_beta_path =  dir_prefix + std::string("batch_normalization_19_beta.bin"); 
-  void* batch_normalization_19_beta =  readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_19_mean_path =  dir_prefix + std::string("batch_normalization_19_mean.bin"); 
-  void* batch_normalization_19_mean =  readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_19_variance_path =  dir_prefix + std::string("batch_normalization_19_variance.bin"); 
-  void* batch_normalization_19_variance =  readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_10_w_path =  dir_prefix + std::string("depthwise_conv2d_10_w.bin"); 
-  void* depthwise_conv2d_10_w =  readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_20_gamma_path =  dir_prefix + std::string("batch_normalization_20_gamma.bin"); 
-  void* batch_normalization_20_gamma =  readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_20_beta_path =  dir_prefix + std::string("batch_normalization_20_beta.bin"); 
-  void* batch_normalization_20_beta =  readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_20_mean_path =  dir_prefix + std::string("batch_normalization_20_mean.bin"); 
-  void* batch_normalization_20_mean =  readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_20_variance_path =  dir_prefix + std::string("batch_normalization_20_variance.bin"); 
-  void* batch_normalization_20_variance =  readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_21_gamma_path =  dir_prefix + std::string("batch_normalization_21_gamma.bin"); 
-  void* batch_normalization_21_gamma =  readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_beta_path =  dir_prefix + std::string("batch_normalization_21_beta.bin"); 
-  void* batch_normalization_21_beta =  readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_mean_path =  dir_prefix + std::string("batch_normalization_21_mean.bin"); 
-  void* batch_normalization_21_mean =  readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_variance_path =  dir_prefix + std::string("batch_normalization_21_variance.bin"); 
-  void* batch_normalization_21_variance =  readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_11_w_path =  dir_prefix + std::string("depthwise_conv2d_11_w.bin"); 
-  void* depthwise_conv2d_11_w =  readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_22_gamma_path =  dir_prefix + std::string("batch_normalization_22_gamma.bin"); 
-  void* batch_normalization_22_gamma =  readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_22_beta_path =  dir_prefix + std::string("batch_normalization_22_beta.bin"); 
-  void* batch_normalization_22_beta =  readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_22_mean_path =  dir_prefix + std::string("batch_normalization_22_mean.bin"); 
-  void* batch_normalization_22_mean =  readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_22_variance_path =  dir_prefix + std::string("batch_normalization_22_variance.bin"); 
-  void* batch_normalization_22_variance =  readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_23_gamma_path =  dir_prefix + std::string("batch_normalization_23_gamma.bin"); 
-  void* batch_normalization_23_gamma =  readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_23_beta_path =  dir_prefix + std::string("batch_normalization_23_beta.bin"); 
-  void* batch_normalization_23_beta =  readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_23_mean_path =  dir_prefix + std::string("batch_normalization_23_mean.bin"); 
-  void* batch_normalization_23_mean =  readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_23_variance_path =  dir_prefix + std::string("batch_normalization_23_variance.bin"); 
-  void* batch_normalization_23_variance =  readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_12_w_path =  dir_prefix + std::string("depthwise_conv2d_12_w.bin"); 
-  void* depthwise_conv2d_12_w =  readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_24_gamma_path =  dir_prefix + std::string("batch_normalization_24_gamma.bin"); 
-  void* batch_normalization_24_gamma =  readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_beta_path =  dir_prefix + std::string("batch_normalization_24_beta.bin"); 
-  void* batch_normalization_24_beta =  readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_mean_path =  dir_prefix + std::string("batch_normalization_24_mean.bin"); 
-  void* batch_normalization_24_mean =  readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_variance_path =  dir_prefix + std::string("batch_normalization_24_variance.bin"); 
-  void* batch_normalization_24_variance =  readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); 
-  std::string batch_normalization_25_gamma_path =  dir_prefix + std::string("batch_normalization_25_gamma.bin"); 
-  void* batch_normalization_25_gamma =  readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_25_beta_path =  dir_prefix + std::string("batch_normalization_25_beta.bin"); 
-  void* batch_normalization_25_beta =  readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_25_mean_path =  dir_prefix + std::string("batch_normalization_25_mean.bin"); 
-  void* batch_normalization_25_mean =  readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_25_variance_path =  dir_prefix + std::string("batch_normalization_25_variance.bin"); 
-  void* batch_normalization_25_variance =  readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string depthwise_conv2d_13_w_path =  dir_prefix + std::string("depthwise_conv2d_13_w.bin"); 
-  void* depthwise_conv2d_13_w =  readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); 
-  std::string batch_normalization_26_gamma_path =  dir_prefix + std::string("batch_normalization_26_gamma.bin"); 
-  void* batch_normalization_26_gamma =  readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_26_beta_path =  dir_prefix + std::string("batch_normalization_26_beta.bin"); 
-  void* batch_normalization_26_beta =  readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_26_mean_path =  dir_prefix + std::string("batch_normalization_26_mean.bin"); 
-  void* batch_normalization_26_mean =  readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_26_variance_path =  dir_prefix + std::string("batch_normalization_26_variance.bin"); 
-  void* batch_normalization_26_variance =  readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); 
-  std::string batch_normalization_27_gamma_path =  dir_prefix + std::string("batch_normalization_27_gamma.bin"); 
-  void* batch_normalization_27_gamma =  readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_beta_path =  dir_prefix + std::string("batch_normalization_27_beta.bin"); 
-  void* batch_normalization_27_beta =  readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_mean_path =  dir_prefix + std::string("batch_normalization_27_mean.bin"); 
-  void* batch_normalization_27_mean =  readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_variance_path =  dir_prefix + std::string("batch_normalization_27_variance.bin"); 
-  void* batch_normalization_27_variance =  readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+  startMemTracking();
 
+  int test_input_size = 2000;
+  int batch_size = 2000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
 
+  for (int i = 0; i < batch_count; i++) {
 
-  startMemTracking(); 
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
-  int test_input_size = 2000; 
-  int batch_size = 2000;  
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
 
-  for(int i = 0; i < batch_count; i++){ 
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1);
+    void *var_1 = tensorBatchNorm(
+        var_0, batch_normalization_1_gamma, batch_normalization_1_beta,
+        batch_normalization_1_mean, batch_normalization_1_variance, 0.001);
+    void *var_2 = tensorRelu(var_1);
+    void *var_4 =
+        tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32);
+    void *var_5 = tensorBatchNorm(
+        var_4, batch_normalization_2_gamma, batch_normalization_2_beta,
+        batch_normalization_2_mean, batch_normalization_2_variance, 0.001);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1);
+    void *var_8 = tensorBatchNorm(
+        var_7, batch_normalization_3_gamma, batch_normalization_3_beta,
+        batch_normalization_3_mean, batch_normalization_3_variance, 0.001);
+    void *var_9 = tensorRelu(var_8);
+    void *var_11 =
+        tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64);
+    void *var_12 = tensorBatchNorm(
+        var_11, batch_normalization_4_gamma, batch_normalization_4_beta,
+        batch_normalization_4_mean, batch_normalization_4_variance, 0.001);
+    void *var_13 = tensorRelu(var_12);
+    void *var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1);
+    void *var_15 = tensorBatchNorm(
+        var_14, batch_normalization_5_gamma, batch_normalization_5_beta,
+        batch_normalization_5_mean, batch_normalization_5_variance, 0.001);
+    void *var_16 = tensorRelu(var_15);
+    void *var_18 =
+        tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128);
+    void *var_19 = tensorBatchNorm(
+        var_18, batch_normalization_6_gamma, batch_normalization_6_beta,
+        batch_normalization_6_mean, batch_normalization_6_variance, 0.001);
+    void *var_20 = tensorRelu(var_19);
+    void *var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1);
+    void *var_22 = tensorBatchNorm(
+        var_21, batch_normalization_7_gamma, batch_normalization_7_beta,
+        batch_normalization_7_mean, batch_normalization_7_variance, 0.001);
+    void *var_23 = tensorRelu(var_22);
+    void *var_26 =
+        tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128);
+    void *var_27 = tensorBatchNorm(
+        var_26, batch_normalization_8_gamma, batch_normalization_8_beta,
+        batch_normalization_8_mean, batch_normalization_8_variance, 0.001);
+    void *var_28 = tensorRelu(var_27);
+    void *var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1);
+    void *var_30 = tensorBatchNorm(
+        var_29, batch_normalization_9_gamma, batch_normalization_9_beta,
+        batch_normalization_9_mean, batch_normalization_9_variance, 0.001);
+    void *var_31 = tensorRelu(var_30);
+    void *var_33 =
+        tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256);
+    void *var_34 = tensorBatchNorm(
+        var_33, batch_normalization_10_gamma, batch_normalization_10_beta,
+        batch_normalization_10_mean, batch_normalization_10_variance, 0.001);
+    void *var_35 = tensorRelu(var_34);
+    void *var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1);
+    void *var_37 = tensorBatchNorm(
+        var_36, batch_normalization_11_gamma, batch_normalization_11_beta,
+        batch_normalization_11_mean, batch_normalization_11_variance, 0.001);
+    void *var_38 = tensorRelu(var_37);
+    void *var_41 =
+        tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256);
+    void *var_42 = tensorBatchNorm(
+        var_41, batch_normalization_12_gamma, batch_normalization_12_beta,
+        batch_normalization_12_mean, batch_normalization_12_variance, 0.001);
+    void *var_43 = tensorRelu(var_42);
+    void *var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1);
+    void *var_45 = tensorBatchNorm(
+        var_44, batch_normalization_13_gamma, batch_normalization_13_beta,
+        batch_normalization_13_mean, batch_normalization_13_variance, 0.001);
+    void *var_46 = tensorRelu(var_45);
+    void *var_48 =
+        tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512);
+    void *var_49 = tensorBatchNorm(
+        var_48, batch_normalization_14_gamma, batch_normalization_14_beta,
+        batch_normalization_14_mean, batch_normalization_14_variance, 0.001);
+    void *var_50 = tensorRelu(var_49);
+    void *var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1);
+    void *var_52 = tensorBatchNorm(
+        var_51, batch_normalization_15_gamma, batch_normalization_15_beta,
+        batch_normalization_15_mean, batch_normalization_15_variance, 0.001);
+    void *var_53 = tensorRelu(var_52);
+    void *var_55 =
+        tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512);
+    void *var_56 = tensorBatchNorm(
+        var_55, batch_normalization_16_gamma, batch_normalization_16_beta,
+        batch_normalization_16_mean, batch_normalization_16_variance, 0.001);
+    void *var_57 = tensorRelu(var_56);
+    void *var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1);
+    void *var_59 = tensorBatchNorm(
+        var_58, batch_normalization_17_gamma, batch_normalization_17_beta,
+        batch_normalization_17_mean, batch_normalization_17_variance, 0.001);
+    void *var_60 = tensorRelu(var_59);
+    void *var_63 =
+        tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512);
+    void *var_64 = tensorBatchNorm(
+        var_63, batch_normalization_18_gamma, batch_normalization_18_beta,
+        batch_normalization_18_mean, batch_normalization_18_variance, 0.001);
+    void *var_65 = tensorRelu(var_64);
+    void *var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1);
+    void *var_67 = tensorBatchNorm(
+        var_66, batch_normalization_19_gamma, batch_normalization_19_beta,
+        batch_normalization_19_mean, batch_normalization_19_variance, 0.001);
+    void *var_68 = tensorRelu(var_67);
+    void *var_70 =
+        tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512);
+    void *var_71 = tensorBatchNorm(
+        var_70, batch_normalization_20_gamma, batch_normalization_20_beta,
+        batch_normalization_20_mean, batch_normalization_20_variance, 0.001);
+    void *var_72 = tensorRelu(var_71);
+    void *var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1);
+    void *var_74 = tensorBatchNorm(
+        var_73, batch_normalization_21_gamma, batch_normalization_21_beta,
+        batch_normalization_21_mean, batch_normalization_21_variance, 0.001);
+    void *var_75 = tensorRelu(var_74);
+    void *var_77 =
+        tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512);
+    void *var_78 = tensorBatchNorm(
+        var_77, batch_normalization_22_gamma, batch_normalization_22_beta,
+        batch_normalization_22_mean, batch_normalization_22_variance, 0.001);
+    void *var_79 = tensorRelu(var_78);
+    void *var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1);
+    void *var_81 = tensorBatchNorm(
+        var_80, batch_normalization_23_gamma, batch_normalization_23_beta,
+        batch_normalization_23_mean, batch_normalization_23_variance, 0.001);
+    void *var_82 = tensorRelu(var_81);
+    void *var_85 =
+        tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512);
+    void *var_86 = tensorBatchNorm(
+        var_85, batch_normalization_24_gamma, batch_normalization_24_beta,
+        batch_normalization_24_mean, batch_normalization_24_variance, 0.001);
+    void *var_87 = tensorRelu(var_86);
+    void *var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1);
+    void *var_89 = tensorBatchNorm(
+        var_88, batch_normalization_25_gamma, batch_normalization_25_beta,
+        batch_normalization_25_mean, batch_normalization_25_variance, 0.001);
+    void *var_90 = tensorRelu(var_89);
+    void *var_92 =
+        tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024);
+    void *var_93 = tensorBatchNorm(
+        var_92, batch_normalization_26_gamma, batch_normalization_26_beta,
+        batch_normalization_26_mean, batch_normalization_26_variance, 0.001);
+    void *var_94 = tensorRelu(var_93);
+    void *var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1);
+    void *var_96 = tensorBatchNorm(
+        var_95, batch_normalization_27_gamma, batch_normalization_27_beta,
+        batch_normalization_27_mean, batch_normalization_27_variance, 0.001);
+    void *var_97 = tensorRelu(var_96);
+    void *var_99 = tensorPooling(var_97, 1, 2, 2, 0, 0, 2, 2);
+    void *var_101 = tensorGemmGPU(var_99, dense_1_w);
+    void *var_102 = tensorAdd(var_101, dense_1_b);
+    void *var_103 = tensorSoftmax(var_102);
 
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
-    void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
-    void* var_2 = tensorRelu(var_1); 
-    void* var_4 = tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-    void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-    void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-    void* var_9 = tensorRelu(var_8); 
-    void* var_11 = tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-    void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
-    void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-    void* var_16 = tensorRelu(var_15); 
-    void* var_18 = tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-    void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
-    void* var_20 = tensorRelu(var_19); 
-    void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-    void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
-    void* var_23 = tensorRelu(var_22); 
-    void* var_26 = tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-    void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-    void* var_28 = tensorRelu(var_27); 
-    void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-    void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-    void* var_31 = tensorRelu(var_30); 
-    void* var_33 = tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-    void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-    void* var_35 = tensorRelu(var_34); 
-    void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-    void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-    void* var_38 = tensorRelu(var_37); 
-    void* var_41 = tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-    void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
-    void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
-    void* var_46 = tensorRelu(var_45); 
-    void* var_48 = tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-    void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
-    void* var_50 = tensorRelu(var_49); 
-    void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-    void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
-    void* var_53 = tensorRelu(var_52); 
-    void* var_55 = tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-    void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-    void* var_57 = tensorRelu(var_56); 
-    void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-    void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-    void* var_60 = tensorRelu(var_59); 
-    void* var_63 = tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-    void* var_64 = tensorBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
-    void* var_65 = tensorRelu(var_64); 
-    void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
-    void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
-    void* var_68 = tensorRelu(var_67); 
-    void* var_70 = tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-    void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-    void* var_72 = tensorRelu(var_71); 
-    void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-    void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
-    void* var_75 = tensorRelu(var_74); 
-    void* var_77 = tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-    void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
-    void* var_79 = tensorRelu(var_78); 
-    void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
-    void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-    void* var_82 = tensorRelu(var_81); 
-    void* var_85 = tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-    void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-    void* var_87 = tensorRelu(var_86); 
-    void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
-    void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-    void* var_90 = tensorRelu(var_89); 
-    void* var_92 = tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-    void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-    void* var_94 = tensorRelu(var_93); 
-    void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-    void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-    void* var_97 = tensorRelu(var_96); 
-    void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); 
-    void* var_101 = tensorGemmGPU(var_99, dense_1_w); 
-    void* var_102 = tensorAdd(var_101, dense_1_b); 
-    void* var_103 = tensorSoftmax(var_102); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy2(labels, batch_size, var_103); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
+    float accuracy = computeAccuracy2(labels, batch_size, var_103);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
-
-
-  llvm_hpvm_cleanupTensorRt(); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
index 87b8cd4156ed8d7f882ff7642420c995cd7c3a0f..a7355fb063b37a90ab04d077d1c1b32f26613857 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
@@ -1,112 +1,149 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(1); 
-  
-  std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
-  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
-  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
-  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
-  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); 
-  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
-  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
-  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
-  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
-  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
-  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
-  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
-  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
-  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
-  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
-  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
-  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(1);
+
+  std::string dir_prefix =
+      model_params_path + std::string("/resnet18_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  // void* input = readTrainedWeights(input_path.c_str(), 0,
+  // batch_size,3,32,32);
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  // uint8_t* labels = readLabels(labels_path.c_str(), batch_size);
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin");
+  void *conv2d_14_b =
+      readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin");
+  void *conv2d_15_w =
+      readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin");
+  void *conv2d_15_b =
+      readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin");
+  void *conv2d_17_w =
+      readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin");
+  void *conv2d_17_b =
+      readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin");
+  void *conv2d_16_w =
+      readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin");
+  void *conv2d_16_b =
+      readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin");
+  void *conv2d_18_w =
+      readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin");
+  void *conv2d_18_b =
+      readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin");
+  void *conv2d_19_w =
+      readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin");
+  void *conv2d_19_b =
+      readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin");
+  void *conv2d_20_w =
+      readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin");
+  void *conv2d_20_b =
+      readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin");
+  void *conv2d_21_w =
+      readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin");
+  void *conv2d_21_b =
+      readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -117,94 +154,94 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_3 = tensorAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorRelu(var_3); 
-    void* var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_7 = tensorAdd(var_6, conv2d_2_b); 
-    void* var_8 = tensorRelu(var_7); 
-    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorAdd(var_4, var_11); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_16 = tensorAdd(var_15, conv2d_4_b); 
-    void* var_17 = tensorRelu(var_16); 
-    void* var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_20 = tensorAdd(var_19, conv2d_5_b); 
-    void* var_21 = tensorAdd(var_13, var_20); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorAdd(var_24, conv2d_6_b); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorAdd(var_28, conv2d_7_b); 
-    void* var_30 = tensorAdd(var_22, var_29); 
-    void* var_31 = tensorRelu(var_30); 
-    void* var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); 
-    void* var_34 = tensorAdd(var_33, conv2d_8_b); 
-    void* var_35 = tensorRelu(var_34); 
-    void* var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_38 = tensorAdd(var_37, conv2d_9_b); 
-    void* var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); 
-    void* var_41 = tensorAdd(var_40, conv2d_10_b); 
-    void* var_42 = tensorAdd(var_41, var_38); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_46 = tensorAdd(var_45, conv2d_11_b); 
-    void* var_47 = tensorRelu(var_46); 
-    void* var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_50 = tensorAdd(var_49, conv2d_12_b); 
-    void* var_51 = tensorAdd(var_43, var_50); 
-    void* var_52 = tensorRelu(var_51); 
-    void* var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_55 = tensorAdd(var_54, conv2d_13_b); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); 
-    void* var_59 = tensorAdd(var_58, conv2d_14_b); 
-    void* var_60 = tensorAdd(var_52, var_59); 
-    void* var_61 = tensorRelu(var_60); 
-    void* var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); 
-    void* var_64 = tensorAdd(var_63, conv2d_15_b); 
-    void* var_65 = tensorRelu(var_64); 
-    void* var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); 
-    void* var_68 = tensorAdd(var_67, conv2d_16_b); 
-    void* var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); 
-    void* var_71 = tensorAdd(var_70, conv2d_17_b); 
-    void* var_72 = tensorAdd(var_71, var_68); 
-    void* var_73 = tensorRelu(var_72); 
-    void* var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); 
-    void* var_76 = tensorAdd(var_75, conv2d_18_b); 
-    void* var_77 = tensorRelu(var_76); 
-    void* var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); 
-    void* var_80 = tensorAdd(var_79, conv2d_19_b); 
-    void* var_81 = tensorAdd(var_73, var_80); 
-    void* var_82 = tensorRelu(var_81); 
-    void* var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); 
-    void* var_85 = tensorAdd(var_84, conv2d_20_b); 
-    void* var_86 = tensorRelu(var_85); 
-    void* var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); 
-    void* var_89 = tensorAdd(var_88, conv2d_21_b); 
-    void* var_90 = tensorAdd(var_82, var_89); 
-    void* var_91 = tensorRelu(var_90); 
-    void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); 
-    void* var_94 = tensorGemmGPU(var_92, dense_1_w); 
-    void* var_95 = tensorAdd(var_94, dense_1_b); 
-    void* var_96 = tensorSoftmax(var_95); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_96); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_3 = tensorAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorRelu(var_3);
+    void *var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_7 = tensorAdd(var_6, conv2d_2_b);
+    void *var_8 = tensorRelu(var_7);
+    void *var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorAdd(var_4, var_11);
+    void *var_13 = tensorRelu(var_12);
+    void *var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_16 = tensorAdd(var_15, conv2d_4_b);
+    void *var_17 = tensorRelu(var_16);
+    void *var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_20 = tensorAdd(var_19, conv2d_5_b);
+    void *var_21 = tensorAdd(var_13, var_20);
+    void *var_22 = tensorRelu(var_21);
+    void *var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorAdd(var_24, conv2d_6_b);
+    void *var_26 = tensorRelu(var_25);
+    void *var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorAdd(var_28, conv2d_7_b);
+    void *var_30 = tensorAdd(var_22, var_29);
+    void *var_31 = tensorRelu(var_30);
+    void *var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0);
+    void *var_34 = tensorAdd(var_33, conv2d_8_b);
+    void *var_35 = tensorRelu(var_34);
+    void *var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_38 = tensorAdd(var_37, conv2d_9_b);
+    void *var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0);
+    void *var_41 = tensorAdd(var_40, conv2d_10_b);
+    void *var_42 = tensorAdd(var_41, var_38);
+    void *var_43 = tensorRelu(var_42);
+    void *var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_46 = tensorAdd(var_45, conv2d_11_b);
+    void *var_47 = tensorRelu(var_46);
+    void *var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_50 = tensorAdd(var_49, conv2d_12_b);
+    void *var_51 = tensorAdd(var_43, var_50);
+    void *var_52 = tensorRelu(var_51);
+    void *var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_55 = tensorAdd(var_54, conv2d_13_b);
+    void *var_56 = tensorRelu(var_55);
+    void *var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0);
+    void *var_59 = tensorAdd(var_58, conv2d_14_b);
+    void *var_60 = tensorAdd(var_52, var_59);
+    void *var_61 = tensorRelu(var_60);
+    void *var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0);
+    void *var_64 = tensorAdd(var_63, conv2d_15_b);
+    void *var_65 = tensorRelu(var_64);
+    void *var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0);
+    void *var_68 = tensorAdd(var_67, conv2d_16_b);
+    void *var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0);
+    void *var_71 = tensorAdd(var_70, conv2d_17_b);
+    void *var_72 = tensorAdd(var_71, var_68);
+    void *var_73 = tensorRelu(var_72);
+    void *var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0);
+    void *var_76 = tensorAdd(var_75, conv2d_18_b);
+    void *var_77 = tensorRelu(var_76);
+    void *var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0);
+    void *var_80 = tensorAdd(var_79, conv2d_19_b);
+    void *var_81 = tensorAdd(var_73, var_80);
+    void *var_82 = tensorRelu(var_81);
+    void *var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0);
+    void *var_85 = tensorAdd(var_84, conv2d_20_b);
+    void *var_86 = tensorRelu(var_85);
+    void *var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0);
+    void *var_89 = tensorAdd(var_88, conv2d_21_b);
+    void *var_90 = tensorAdd(var_82, var_89);
+    void *var_91 = tensorRelu(var_90);
+    void *var_92 = tensorPooling(var_91, 1, 8, 8, 0, 0, 8, 8);
+    void *var_94 = tensorGemmGPU(var_92, dense_1_w);
+    void *var_95 = tensorAdd(var_94, dense_1_b);
+    void *var_96 = tensorSoftmax(var_95);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_96);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -213,9 +250,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
-  
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
index 0914b3f70c353ee7e56c39ccf52f21914618301e..afa3f0bcc1b08fc4a89c694e8e07e813b352ccbf 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
@@ -1,924 +1,1551 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
 
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
 
-int main(){ 
+int main() {
 
-  llvm_hpvm_initTensorRt(0); 
+  llvm_hpvm_initTensorRt(0);
 
+  std::string dir_prefix =
+      model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 7, 7);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_gamma_path =
+      dir_prefix + std::string("batch_normalization_1_gamma.bin");
+  void *batch_normalization_1_gamma = readTrainedWeights(
+      batch_normalization_1_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_beta_path =
+      dir_prefix + std::string("batch_normalization_1_beta.bin");
+  void *batch_normalization_1_beta = readTrainedWeights(
+      batch_normalization_1_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_mean_path =
+      dir_prefix + std::string("batch_normalization_1_mean.bin");
+  void *batch_normalization_1_mean = readTrainedWeights(
+      batch_normalization_1_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_variance_path =
+      dir_prefix + std::string("batch_normalization_1_variance.bin");
+  void *batch_normalization_1_variance = readTrainedWeights(
+      batch_normalization_1_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 1, 1);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_gamma_path =
+      dir_prefix + std::string("batch_normalization_2_gamma.bin");
+  void *batch_normalization_2_gamma = readTrainedWeights(
+      batch_normalization_2_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_beta_path =
+      dir_prefix + std::string("batch_normalization_2_beta.bin");
+  void *batch_normalization_2_beta = readTrainedWeights(
+      batch_normalization_2_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_mean_path =
+      dir_prefix + std::string("batch_normalization_2_mean.bin");
+  void *batch_normalization_2_mean = readTrainedWeights(
+      batch_normalization_2_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_variance_path =
+      dir_prefix + std::string("batch_normalization_2_variance.bin");
+  void *batch_normalization_2_variance = readTrainedWeights(
+      batch_normalization_2_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_gamma_path =
+      dir_prefix + std::string("batch_normalization_3_gamma.bin");
+  void *batch_normalization_3_gamma = readTrainedWeights(
+      batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_beta_path =
+      dir_prefix + std::string("batch_normalization_3_beta.bin");
+  void *batch_normalization_3_beta = readTrainedWeights(
+      batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_mean_path =
+      dir_prefix + std::string("batch_normalization_3_mean.bin");
+  void *batch_normalization_3_mean = readTrainedWeights(
+      batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_variance_path =
+      dir_prefix + std::string("batch_normalization_3_variance.bin");
+  void *batch_normalization_3_variance = readTrainedWeights(
+      batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_gamma_path =
+      dir_prefix + std::string("batch_normalization_4_gamma.bin");
+  void *batch_normalization_4_gamma = readTrainedWeights(
+      batch_normalization_4_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_beta_path =
+      dir_prefix + std::string("batch_normalization_4_beta.bin");
+  void *batch_normalization_4_beta = readTrainedWeights(
+      batch_normalization_4_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_mean_path =
+      dir_prefix + std::string("batch_normalization_4_mean.bin");
+  void *batch_normalization_4_mean = readTrainedWeights(
+      batch_normalization_4_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_variance_path =
+      dir_prefix + std::string("batch_normalization_4_variance.bin");
+  void *batch_normalization_4_variance = readTrainedWeights(
+      batch_normalization_4_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_gamma_path =
+      dir_prefix + std::string("batch_normalization_5_gamma.bin");
+  void *batch_normalization_5_gamma = readTrainedWeights(
+      batch_normalization_5_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_beta_path =
+      dir_prefix + std::string("batch_normalization_5_beta.bin");
+  void *batch_normalization_5_beta = readTrainedWeights(
+      batch_normalization_5_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_mean_path =
+      dir_prefix + std::string("batch_normalization_5_mean.bin");
+  void *batch_normalization_5_mean = readTrainedWeights(
+      batch_normalization_5_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_variance_path =
+      dir_prefix + std::string("batch_normalization_5_variance.bin");
+  void *batch_normalization_5_variance = readTrainedWeights(
+      batch_normalization_5_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 64, 256, 1, 1);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_gamma_path =
+      dir_prefix + std::string("batch_normalization_6_gamma.bin");
+  void *batch_normalization_6_gamma = readTrainedWeights(
+      batch_normalization_6_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_beta_path =
+      dir_prefix + std::string("batch_normalization_6_beta.bin");
+  void *batch_normalization_6_beta = readTrainedWeights(
+      batch_normalization_6_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_mean_path =
+      dir_prefix + std::string("batch_normalization_6_mean.bin");
+  void *batch_normalization_6_mean = readTrainedWeights(
+      batch_normalization_6_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_variance_path =
+      dir_prefix + std::string("batch_normalization_6_variance.bin");
+  void *batch_normalization_6_variance = readTrainedWeights(
+      batch_normalization_6_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_gamma_path =
+      dir_prefix + std::string("batch_normalization_7_gamma.bin");
+  void *batch_normalization_7_gamma = readTrainedWeights(
+      batch_normalization_7_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_beta_path =
+      dir_prefix + std::string("batch_normalization_7_beta.bin");
+  void *batch_normalization_7_beta = readTrainedWeights(
+      batch_normalization_7_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_mean_path =
+      dir_prefix + std::string("batch_normalization_7_mean.bin");
+  void *batch_normalization_7_mean = readTrainedWeights(
+      batch_normalization_7_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_variance_path =
+      dir_prefix + std::string("batch_normalization_7_variance.bin");
+  void *batch_normalization_7_variance = readTrainedWeights(
+      batch_normalization_7_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_gamma_path =
+      dir_prefix + std::string("batch_normalization_8_gamma.bin");
+  void *batch_normalization_8_gamma = readTrainedWeights(
+      batch_normalization_8_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_beta_path =
+      dir_prefix + std::string("batch_normalization_8_beta.bin");
+  void *batch_normalization_8_beta = readTrainedWeights(
+      batch_normalization_8_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_mean_path =
+      dir_prefix + std::string("batch_normalization_8_mean.bin");
+  void *batch_normalization_8_mean = readTrainedWeights(
+      batch_normalization_8_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_variance_path =
+      dir_prefix + std::string("batch_normalization_8_variance.bin");
+  void *batch_normalization_8_variance = readTrainedWeights(
+      batch_normalization_8_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 64, 256, 1, 1);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_gamma_path =
+      dir_prefix + std::string("batch_normalization_9_gamma.bin");
+  void *batch_normalization_9_gamma = readTrainedWeights(
+      batch_normalization_9_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_beta_path =
+      dir_prefix + std::string("batch_normalization_9_beta.bin");
+  void *batch_normalization_9_beta = readTrainedWeights(
+      batch_normalization_9_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_mean_path =
+      dir_prefix + std::string("batch_normalization_9_mean.bin");
+  void *batch_normalization_9_mean = readTrainedWeights(
+      batch_normalization_9_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_variance_path =
+      dir_prefix + std::string("batch_normalization_9_variance.bin");
+  void *batch_normalization_9_variance = readTrainedWeights(
+      batch_normalization_9_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_gamma_path =
+      dir_prefix + std::string("batch_normalization_10_gamma.bin");
+  void *batch_normalization_10_gamma = readTrainedWeights(
+      batch_normalization_10_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_beta_path =
+      dir_prefix + std::string("batch_normalization_10_beta.bin");
+  void *batch_normalization_10_beta = readTrainedWeights(
+      batch_normalization_10_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_mean_path =
+      dir_prefix + std::string("batch_normalization_10_mean.bin");
+  void *batch_normalization_10_mean = readTrainedWeights(
+      batch_normalization_10_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_variance_path =
+      dir_prefix + std::string("batch_normalization_10_variance.bin");
+  void *batch_normalization_10_variance = readTrainedWeights(
+      batch_normalization_10_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_gamma_path =
+      dir_prefix + std::string("batch_normalization_11_gamma.bin");
+  void *batch_normalization_11_gamma = readTrainedWeights(
+      batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_beta_path =
+      dir_prefix + std::string("batch_normalization_11_beta.bin");
+  void *batch_normalization_11_beta = readTrainedWeights(
+      batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_mean_path =
+      dir_prefix + std::string("batch_normalization_11_mean.bin");
+  void *batch_normalization_11_mean = readTrainedWeights(
+      batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_variance_path =
+      dir_prefix + std::string("batch_normalization_11_variance.bin");
+  void *batch_normalization_11_variance = readTrainedWeights(
+      batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 128, 256, 1, 1);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_gamma_path =
+      dir_prefix + std::string("batch_normalization_12_gamma.bin");
+  void *batch_normalization_12_gamma = readTrainedWeights(
+      batch_normalization_12_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_beta_path =
+      dir_prefix + std::string("batch_normalization_12_beta.bin");
+  void *batch_normalization_12_beta = readTrainedWeights(
+      batch_normalization_12_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_mean_path =
+      dir_prefix + std::string("batch_normalization_12_mean.bin");
+  void *batch_normalization_12_mean = readTrainedWeights(
+      batch_normalization_12_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_variance_path =
+      dir_prefix + std::string("batch_normalization_12_variance.bin");
+  void *batch_normalization_12_variance = readTrainedWeights(
+      batch_normalization_12_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_gamma_path =
+      dir_prefix + std::string("batch_normalization_13_gamma.bin");
+  void *batch_normalization_13_gamma = readTrainedWeights(
+      batch_normalization_13_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_beta_path =
+      dir_prefix + std::string("batch_normalization_13_beta.bin");
+  void *batch_normalization_13_beta = readTrainedWeights(
+      batch_normalization_13_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_mean_path =
+      dir_prefix + std::string("batch_normalization_13_mean.bin");
+  void *batch_normalization_13_mean = readTrainedWeights(
+      batch_normalization_13_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_variance_path =
+      dir_prefix + std::string("batch_normalization_13_variance.bin");
+  void *batch_normalization_13_variance = readTrainedWeights(
+      batch_normalization_13_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin");
+  void *conv2d_14_b =
+      readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin");
+  void *conv2d_15_w =
+      readTrainedWeights(conv2d_15_w_path.c_str(), 0, 512, 256, 1, 1);
+  std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin");
+  void *conv2d_15_b =
+      readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_gamma_path =
+      dir_prefix + std::string("batch_normalization_14_gamma.bin");
+  void *batch_normalization_14_gamma = readTrainedWeights(
+      batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_beta_path =
+      dir_prefix + std::string("batch_normalization_14_beta.bin");
+  void *batch_normalization_14_beta = readTrainedWeights(
+      batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_mean_path =
+      dir_prefix + std::string("batch_normalization_14_mean.bin");
+  void *batch_normalization_14_mean = readTrainedWeights(
+      batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_variance_path =
+      dir_prefix + std::string("batch_normalization_14_variance.bin");
+  void *batch_normalization_14_variance = readTrainedWeights(
+      batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_gamma_path =
+      dir_prefix + std::string("batch_normalization_15_gamma.bin");
+  void *batch_normalization_15_gamma = readTrainedWeights(
+      batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_beta_path =
+      dir_prefix + std::string("batch_normalization_15_beta.bin");
+  void *batch_normalization_15_beta = readTrainedWeights(
+      batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_mean_path =
+      dir_prefix + std::string("batch_normalization_15_mean.bin");
+  void *batch_normalization_15_mean = readTrainedWeights(
+      batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_variance_path =
+      dir_prefix + std::string("batch_normalization_15_variance.bin");
+  void *batch_normalization_15_variance = readTrainedWeights(
+      batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin");
+  void *conv2d_16_w =
+      readTrainedWeights(conv2d_16_w_path.c_str(), 0, 128, 512, 1, 1);
+  std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin");
+  void *conv2d_16_b =
+      readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_gamma_path =
+      dir_prefix + std::string("batch_normalization_16_gamma.bin");
+  void *batch_normalization_16_gamma = readTrainedWeights(
+      batch_normalization_16_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_beta_path =
+      dir_prefix + std::string("batch_normalization_16_beta.bin");
+  void *batch_normalization_16_beta = readTrainedWeights(
+      batch_normalization_16_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_mean_path =
+      dir_prefix + std::string("batch_normalization_16_mean.bin");
+  void *batch_normalization_16_mean = readTrainedWeights(
+      batch_normalization_16_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_variance_path =
+      dir_prefix + std::string("batch_normalization_16_variance.bin");
+  void *batch_normalization_16_variance = readTrainedWeights(
+      batch_normalization_16_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin");
+  void *conv2d_17_w =
+      readTrainedWeights(conv2d_17_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin");
+  void *conv2d_17_b =
+      readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_gamma_path =
+      dir_prefix + std::string("batch_normalization_17_gamma.bin");
+  void *batch_normalization_17_gamma = readTrainedWeights(
+      batch_normalization_17_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_beta_path =
+      dir_prefix + std::string("batch_normalization_17_beta.bin");
+  void *batch_normalization_17_beta = readTrainedWeights(
+      batch_normalization_17_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_mean_path =
+      dir_prefix + std::string("batch_normalization_17_mean.bin");
+  void *batch_normalization_17_mean = readTrainedWeights(
+      batch_normalization_17_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_variance_path =
+      dir_prefix + std::string("batch_normalization_17_variance.bin");
+  void *batch_normalization_17_variance = readTrainedWeights(
+      batch_normalization_17_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin");
+  void *conv2d_18_w =
+      readTrainedWeights(conv2d_18_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin");
+  void *conv2d_18_b =
+      readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_gamma_path =
+      dir_prefix + std::string("batch_normalization_18_gamma.bin");
+  void *batch_normalization_18_gamma = readTrainedWeights(
+      batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_beta_path =
+      dir_prefix + std::string("batch_normalization_18_beta.bin");
+  void *batch_normalization_18_beta = readTrainedWeights(
+      batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_mean_path =
+      dir_prefix + std::string("batch_normalization_18_mean.bin");
+  void *batch_normalization_18_mean = readTrainedWeights(
+      batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_variance_path =
+      dir_prefix + std::string("batch_normalization_18_variance.bin");
+  void *batch_normalization_18_variance = readTrainedWeights(
+      batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin");
+  void *conv2d_19_w =
+      readTrainedWeights(conv2d_19_w_path.c_str(), 0, 128, 512, 1, 1);
+  std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin");
+  void *conv2d_19_b =
+      readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_gamma_path =
+      dir_prefix + std::string("batch_normalization_19_gamma.bin");
+  void *batch_normalization_19_gamma = readTrainedWeights(
+      batch_normalization_19_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_beta_path =
+      dir_prefix + std::string("batch_normalization_19_beta.bin");
+  void *batch_normalization_19_beta = readTrainedWeights(
+      batch_normalization_19_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_mean_path =
+      dir_prefix + std::string("batch_normalization_19_mean.bin");
+  void *batch_normalization_19_mean = readTrainedWeights(
+      batch_normalization_19_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_variance_path =
+      dir_prefix + std::string("batch_normalization_19_variance.bin");
+  void *batch_normalization_19_variance = readTrainedWeights(
+      batch_normalization_19_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin");
+  void *conv2d_20_w =
+      readTrainedWeights(conv2d_20_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin");
+  void *conv2d_20_b =
+      readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_gamma_path =
+      dir_prefix + std::string("batch_normalization_20_gamma.bin");
+  void *batch_normalization_20_gamma = readTrainedWeights(
+      batch_normalization_20_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_beta_path =
+      dir_prefix + std::string("batch_normalization_20_beta.bin");
+  void *batch_normalization_20_beta = readTrainedWeights(
+      batch_normalization_20_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_mean_path =
+      dir_prefix + std::string("batch_normalization_20_mean.bin");
+  void *batch_normalization_20_mean = readTrainedWeights(
+      batch_normalization_20_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_variance_path =
+      dir_prefix + std::string("batch_normalization_20_variance.bin");
+  void *batch_normalization_20_variance = readTrainedWeights(
+      batch_normalization_20_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin");
+  void *conv2d_21_w =
+      readTrainedWeights(conv2d_21_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin");
+  void *conv2d_21_b =
+      readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_gamma_path =
+      dir_prefix + std::string("batch_normalization_21_gamma.bin");
+  void *batch_normalization_21_gamma = readTrainedWeights(
+      batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_beta_path =
+      dir_prefix + std::string("batch_normalization_21_beta.bin");
+  void *batch_normalization_21_beta = readTrainedWeights(
+      batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_mean_path =
+      dir_prefix + std::string("batch_normalization_21_mean.bin");
+  void *batch_normalization_21_mean = readTrainedWeights(
+      batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_variance_path =
+      dir_prefix + std::string("batch_normalization_21_variance.bin");
+  void *batch_normalization_21_variance = readTrainedWeights(
+      batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_22_w_path = dir_prefix + std::string("conv2d_22_w.bin");
+  void *conv2d_22_w =
+      readTrainedWeights(conv2d_22_w_path.c_str(), 0, 128, 512, 1, 1);
+  std::string conv2d_22_b_path = dir_prefix + std::string("conv2d_22_b.bin");
+  void *conv2d_22_b =
+      readTrainedWeights(conv2d_22_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_gamma_path =
+      dir_prefix + std::string("batch_normalization_22_gamma.bin");
+  void *batch_normalization_22_gamma = readTrainedWeights(
+      batch_normalization_22_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_beta_path =
+      dir_prefix + std::string("batch_normalization_22_beta.bin");
+  void *batch_normalization_22_beta = readTrainedWeights(
+      batch_normalization_22_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_mean_path =
+      dir_prefix + std::string("batch_normalization_22_mean.bin");
+  void *batch_normalization_22_mean = readTrainedWeights(
+      batch_normalization_22_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_variance_path =
+      dir_prefix + std::string("batch_normalization_22_variance.bin");
+  void *batch_normalization_22_variance = readTrainedWeights(
+      batch_normalization_22_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_23_w_path = dir_prefix + std::string("conv2d_23_w.bin");
+  void *conv2d_23_w =
+      readTrainedWeights(conv2d_23_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_23_b_path = dir_prefix + std::string("conv2d_23_b.bin");
+  void *conv2d_23_b =
+      readTrainedWeights(conv2d_23_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_gamma_path =
+      dir_prefix + std::string("batch_normalization_23_gamma.bin");
+  void *batch_normalization_23_gamma = readTrainedWeights(
+      batch_normalization_23_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_beta_path =
+      dir_prefix + std::string("batch_normalization_23_beta.bin");
+  void *batch_normalization_23_beta = readTrainedWeights(
+      batch_normalization_23_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_mean_path =
+      dir_prefix + std::string("batch_normalization_23_mean.bin");
+  void *batch_normalization_23_mean = readTrainedWeights(
+      batch_normalization_23_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_variance_path =
+      dir_prefix + std::string("batch_normalization_23_variance.bin");
+  void *batch_normalization_23_variance = readTrainedWeights(
+      batch_normalization_23_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_24_w_path = dir_prefix + std::string("conv2d_24_w.bin");
+  void *conv2d_24_w =
+      readTrainedWeights(conv2d_24_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_24_b_path = dir_prefix + std::string("conv2d_24_b.bin");
+  void *conv2d_24_b =
+      readTrainedWeights(conv2d_24_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_gamma_path =
+      dir_prefix + std::string("batch_normalization_24_gamma.bin");
+  void *batch_normalization_24_gamma = readTrainedWeights(
+      batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_beta_path =
+      dir_prefix + std::string("batch_normalization_24_beta.bin");
+  void *batch_normalization_24_beta = readTrainedWeights(
+      batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_mean_path =
+      dir_prefix + std::string("batch_normalization_24_mean.bin");
+  void *batch_normalization_24_mean = readTrainedWeights(
+      batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_variance_path =
+      dir_prefix + std::string("batch_normalization_24_variance.bin");
+  void *batch_normalization_24_variance = readTrainedWeights(
+      batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_25_w_path = dir_prefix + std::string("conv2d_25_w.bin");
+  void *conv2d_25_w =
+      readTrainedWeights(conv2d_25_w_path.c_str(), 0, 256, 512, 1, 1);
+  std::string conv2d_25_b_path = dir_prefix + std::string("conv2d_25_b.bin");
+  void *conv2d_25_b =
+      readTrainedWeights(conv2d_25_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_gamma_path =
+      dir_prefix + std::string("batch_normalization_25_gamma.bin");
+  void *batch_normalization_25_gamma = readTrainedWeights(
+      batch_normalization_25_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_beta_path =
+      dir_prefix + std::string("batch_normalization_25_beta.bin");
+  void *batch_normalization_25_beta = readTrainedWeights(
+      batch_normalization_25_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_mean_path =
+      dir_prefix + std::string("batch_normalization_25_mean.bin");
+  void *batch_normalization_25_mean = readTrainedWeights(
+      batch_normalization_25_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_variance_path =
+      dir_prefix + std::string("batch_normalization_25_variance.bin");
+  void *batch_normalization_25_variance = readTrainedWeights(
+      batch_normalization_25_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_26_w_path = dir_prefix + std::string("conv2d_26_w.bin");
+  void *conv2d_26_w =
+      readTrainedWeights(conv2d_26_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_26_b_path = dir_prefix + std::string("conv2d_26_b.bin");
+  void *conv2d_26_b =
+      readTrainedWeights(conv2d_26_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_gamma_path =
+      dir_prefix + std::string("batch_normalization_26_gamma.bin");
+  void *batch_normalization_26_gamma = readTrainedWeights(
+      batch_normalization_26_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_beta_path =
+      dir_prefix + std::string("batch_normalization_26_beta.bin");
+  void *batch_normalization_26_beta = readTrainedWeights(
+      batch_normalization_26_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_mean_path =
+      dir_prefix + std::string("batch_normalization_26_mean.bin");
+  void *batch_normalization_26_mean = readTrainedWeights(
+      batch_normalization_26_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_variance_path =
+      dir_prefix + std::string("batch_normalization_26_variance.bin");
+  void *batch_normalization_26_variance = readTrainedWeights(
+      batch_normalization_26_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_27_w_path = dir_prefix + std::string("conv2d_27_w.bin");
+  void *conv2d_27_w =
+      readTrainedWeights(conv2d_27_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_27_b_path = dir_prefix + std::string("conv2d_27_b.bin");
+  void *conv2d_27_b =
+      readTrainedWeights(conv2d_27_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_28_w_path = dir_prefix + std::string("conv2d_28_w.bin");
+  void *conv2d_28_w =
+      readTrainedWeights(conv2d_28_w_path.c_str(), 0, 1024, 512, 1, 1);
+  std::string conv2d_28_b_path = dir_prefix + std::string("conv2d_28_b.bin");
+  void *conv2d_28_b =
+      readTrainedWeights(conv2d_28_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_gamma_path =
+      dir_prefix + std::string("batch_normalization_27_gamma.bin");
+  void *batch_normalization_27_gamma = readTrainedWeights(
+      batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_beta_path =
+      dir_prefix + std::string("batch_normalization_27_beta.bin");
+  void *batch_normalization_27_beta = readTrainedWeights(
+      batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_mean_path =
+      dir_prefix + std::string("batch_normalization_27_mean.bin");
+  void *batch_normalization_27_mean = readTrainedWeights(
+      batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_variance_path =
+      dir_prefix + std::string("batch_normalization_27_variance.bin");
+  void *batch_normalization_27_variance = readTrainedWeights(
+      batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_gamma_path =
+      dir_prefix + std::string("batch_normalization_28_gamma.bin");
+  void *batch_normalization_28_gamma = readTrainedWeights(
+      batch_normalization_28_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_beta_path =
+      dir_prefix + std::string("batch_normalization_28_beta.bin");
+  void *batch_normalization_28_beta = readTrainedWeights(
+      batch_normalization_28_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_mean_path =
+      dir_prefix + std::string("batch_normalization_28_mean.bin");
+  void *batch_normalization_28_mean = readTrainedWeights(
+      batch_normalization_28_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_variance_path =
+      dir_prefix + std::string("batch_normalization_28_variance.bin");
+  void *batch_normalization_28_variance = readTrainedWeights(
+      batch_normalization_28_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_29_w_path = dir_prefix + std::string("conv2d_29_w.bin");
+  void *conv2d_29_w =
+      readTrainedWeights(conv2d_29_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_29_b_path = dir_prefix + std::string("conv2d_29_b.bin");
+  void *conv2d_29_b =
+      readTrainedWeights(conv2d_29_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_gamma_path =
+      dir_prefix + std::string("batch_normalization_29_gamma.bin");
+  void *batch_normalization_29_gamma = readTrainedWeights(
+      batch_normalization_29_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_beta_path =
+      dir_prefix + std::string("batch_normalization_29_beta.bin");
+  void *batch_normalization_29_beta = readTrainedWeights(
+      batch_normalization_29_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_mean_path =
+      dir_prefix + std::string("batch_normalization_29_mean.bin");
+  void *batch_normalization_29_mean = readTrainedWeights(
+      batch_normalization_29_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_variance_path =
+      dir_prefix + std::string("batch_normalization_29_variance.bin");
+  void *batch_normalization_29_variance = readTrainedWeights(
+      batch_normalization_29_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_30_w_path = dir_prefix + std::string("conv2d_30_w.bin");
+  void *conv2d_30_w =
+      readTrainedWeights(conv2d_30_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_30_b_path = dir_prefix + std::string("conv2d_30_b.bin");
+  void *conv2d_30_b =
+      readTrainedWeights(conv2d_30_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_gamma_path =
+      dir_prefix + std::string("batch_normalization_30_gamma.bin");
+  void *batch_normalization_30_gamma = readTrainedWeights(
+      batch_normalization_30_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_beta_path =
+      dir_prefix + std::string("batch_normalization_30_beta.bin");
+  void *batch_normalization_30_beta = readTrainedWeights(
+      batch_normalization_30_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_mean_path =
+      dir_prefix + std::string("batch_normalization_30_mean.bin");
+  void *batch_normalization_30_mean = readTrainedWeights(
+      batch_normalization_30_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_variance_path =
+      dir_prefix + std::string("batch_normalization_30_variance.bin");
+  void *batch_normalization_30_variance = readTrainedWeights(
+      batch_normalization_30_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_31_w_path = dir_prefix + std::string("conv2d_31_w.bin");
+  void *conv2d_31_w =
+      readTrainedWeights(conv2d_31_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_31_b_path = dir_prefix + std::string("conv2d_31_b.bin");
+  void *conv2d_31_b =
+      readTrainedWeights(conv2d_31_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_gamma_path =
+      dir_prefix + std::string("batch_normalization_31_gamma.bin");
+  void *batch_normalization_31_gamma = readTrainedWeights(
+      batch_normalization_31_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_beta_path =
+      dir_prefix + std::string("batch_normalization_31_beta.bin");
+  void *batch_normalization_31_beta = readTrainedWeights(
+      batch_normalization_31_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_mean_path =
+      dir_prefix + std::string("batch_normalization_31_mean.bin");
+  void *batch_normalization_31_mean = readTrainedWeights(
+      batch_normalization_31_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_variance_path =
+      dir_prefix + std::string("batch_normalization_31_variance.bin");
+  void *batch_normalization_31_variance = readTrainedWeights(
+      batch_normalization_31_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_32_w_path = dir_prefix + std::string("conv2d_32_w.bin");
+  void *conv2d_32_w =
+      readTrainedWeights(conv2d_32_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_32_b_path = dir_prefix + std::string("conv2d_32_b.bin");
+  void *conv2d_32_b =
+      readTrainedWeights(conv2d_32_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_gamma_path =
+      dir_prefix + std::string("batch_normalization_32_gamma.bin");
+  void *batch_normalization_32_gamma = readTrainedWeights(
+      batch_normalization_32_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_beta_path =
+      dir_prefix + std::string("batch_normalization_32_beta.bin");
+  void *batch_normalization_32_beta = readTrainedWeights(
+      batch_normalization_32_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_mean_path =
+      dir_prefix + std::string("batch_normalization_32_mean.bin");
+  void *batch_normalization_32_mean = readTrainedWeights(
+      batch_normalization_32_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_variance_path =
+      dir_prefix + std::string("batch_normalization_32_variance.bin");
+  void *batch_normalization_32_variance = readTrainedWeights(
+      batch_normalization_32_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_33_w_path = dir_prefix + std::string("conv2d_33_w.bin");
+  void *conv2d_33_w =
+      readTrainedWeights(conv2d_33_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_33_b_path = dir_prefix + std::string("conv2d_33_b.bin");
+  void *conv2d_33_b =
+      readTrainedWeights(conv2d_33_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_gamma_path =
+      dir_prefix + std::string("batch_normalization_33_gamma.bin");
+  void *batch_normalization_33_gamma = readTrainedWeights(
+      batch_normalization_33_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_beta_path =
+      dir_prefix + std::string("batch_normalization_33_beta.bin");
+  void *batch_normalization_33_beta = readTrainedWeights(
+      batch_normalization_33_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_mean_path =
+      dir_prefix + std::string("batch_normalization_33_mean.bin");
+  void *batch_normalization_33_mean = readTrainedWeights(
+      batch_normalization_33_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_variance_path =
+      dir_prefix + std::string("batch_normalization_33_variance.bin");
+  void *batch_normalization_33_variance = readTrainedWeights(
+      batch_normalization_33_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_34_w_path = dir_prefix + std::string("conv2d_34_w.bin");
+  void *conv2d_34_w =
+      readTrainedWeights(conv2d_34_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_34_b_path = dir_prefix + std::string("conv2d_34_b.bin");
+  void *conv2d_34_b =
+      readTrainedWeights(conv2d_34_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_gamma_path =
+      dir_prefix + std::string("batch_normalization_34_gamma.bin");
+  void *batch_normalization_34_gamma = readTrainedWeights(
+      batch_normalization_34_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_beta_path =
+      dir_prefix + std::string("batch_normalization_34_beta.bin");
+  void *batch_normalization_34_beta = readTrainedWeights(
+      batch_normalization_34_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_mean_path =
+      dir_prefix + std::string("batch_normalization_34_mean.bin");
+  void *batch_normalization_34_mean = readTrainedWeights(
+      batch_normalization_34_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_variance_path =
+      dir_prefix + std::string("batch_normalization_34_variance.bin");
+  void *batch_normalization_34_variance = readTrainedWeights(
+      batch_normalization_34_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_35_w_path = dir_prefix + std::string("conv2d_35_w.bin");
+  void *conv2d_35_w =
+      readTrainedWeights(conv2d_35_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_35_b_path = dir_prefix + std::string("conv2d_35_b.bin");
+  void *conv2d_35_b =
+      readTrainedWeights(conv2d_35_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_gamma_path =
+      dir_prefix + std::string("batch_normalization_35_gamma.bin");
+  void *batch_normalization_35_gamma = readTrainedWeights(
+      batch_normalization_35_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_beta_path =
+      dir_prefix + std::string("batch_normalization_35_beta.bin");
+  void *batch_normalization_35_beta = readTrainedWeights(
+      batch_normalization_35_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_mean_path =
+      dir_prefix + std::string("batch_normalization_35_mean.bin");
+  void *batch_normalization_35_mean = readTrainedWeights(
+      batch_normalization_35_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_variance_path =
+      dir_prefix + std::string("batch_normalization_35_variance.bin");
+  void *batch_normalization_35_variance = readTrainedWeights(
+      batch_normalization_35_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_36_w_path = dir_prefix + std::string("conv2d_36_w.bin");
+  void *conv2d_36_w =
+      readTrainedWeights(conv2d_36_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_36_b_path = dir_prefix + std::string("conv2d_36_b.bin");
+  void *conv2d_36_b =
+      readTrainedWeights(conv2d_36_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_gamma_path =
+      dir_prefix + std::string("batch_normalization_36_gamma.bin");
+  void *batch_normalization_36_gamma = readTrainedWeights(
+      batch_normalization_36_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_beta_path =
+      dir_prefix + std::string("batch_normalization_36_beta.bin");
+  void *batch_normalization_36_beta = readTrainedWeights(
+      batch_normalization_36_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_mean_path =
+      dir_prefix + std::string("batch_normalization_36_mean.bin");
+  void *batch_normalization_36_mean = readTrainedWeights(
+      batch_normalization_36_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_variance_path =
+      dir_prefix + std::string("batch_normalization_36_variance.bin");
+  void *batch_normalization_36_variance = readTrainedWeights(
+      batch_normalization_36_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_37_w_path = dir_prefix + std::string("conv2d_37_w.bin");
+  void *conv2d_37_w =
+      readTrainedWeights(conv2d_37_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_37_b_path = dir_prefix + std::string("conv2d_37_b.bin");
+  void *conv2d_37_b =
+      readTrainedWeights(conv2d_37_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_gamma_path =
+      dir_prefix + std::string("batch_normalization_37_gamma.bin");
+  void *batch_normalization_37_gamma = readTrainedWeights(
+      batch_normalization_37_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_beta_path =
+      dir_prefix + std::string("batch_normalization_37_beta.bin");
+  void *batch_normalization_37_beta = readTrainedWeights(
+      batch_normalization_37_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_mean_path =
+      dir_prefix + std::string("batch_normalization_37_mean.bin");
+  void *batch_normalization_37_mean = readTrainedWeights(
+      batch_normalization_37_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_variance_path =
+      dir_prefix + std::string("batch_normalization_37_variance.bin");
+  void *batch_normalization_37_variance = readTrainedWeights(
+      batch_normalization_37_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_38_w_path = dir_prefix + std::string("conv2d_38_w.bin");
+  void *conv2d_38_w =
+      readTrainedWeights(conv2d_38_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_38_b_path = dir_prefix + std::string("conv2d_38_b.bin");
+  void *conv2d_38_b =
+      readTrainedWeights(conv2d_38_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_gamma_path =
+      dir_prefix + std::string("batch_normalization_38_gamma.bin");
+  void *batch_normalization_38_gamma = readTrainedWeights(
+      batch_normalization_38_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_beta_path =
+      dir_prefix + std::string("batch_normalization_38_beta.bin");
+  void *batch_normalization_38_beta = readTrainedWeights(
+      batch_normalization_38_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_mean_path =
+      dir_prefix + std::string("batch_normalization_38_mean.bin");
+  void *batch_normalization_38_mean = readTrainedWeights(
+      batch_normalization_38_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_variance_path =
+      dir_prefix + std::string("batch_normalization_38_variance.bin");
+  void *batch_normalization_38_variance = readTrainedWeights(
+      batch_normalization_38_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_39_w_path = dir_prefix + std::string("conv2d_39_w.bin");
+  void *conv2d_39_w =
+      readTrainedWeights(conv2d_39_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_39_b_path = dir_prefix + std::string("conv2d_39_b.bin");
+  void *conv2d_39_b =
+      readTrainedWeights(conv2d_39_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_gamma_path =
+      dir_prefix + std::string("batch_normalization_39_gamma.bin");
+  void *batch_normalization_39_gamma = readTrainedWeights(
+      batch_normalization_39_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_beta_path =
+      dir_prefix + std::string("batch_normalization_39_beta.bin");
+  void *batch_normalization_39_beta = readTrainedWeights(
+      batch_normalization_39_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_mean_path =
+      dir_prefix + std::string("batch_normalization_39_mean.bin");
+  void *batch_normalization_39_mean = readTrainedWeights(
+      batch_normalization_39_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_variance_path =
+      dir_prefix + std::string("batch_normalization_39_variance.bin");
+  void *batch_normalization_39_variance = readTrainedWeights(
+      batch_normalization_39_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_40_w_path = dir_prefix + std::string("conv2d_40_w.bin");
+  void *conv2d_40_w =
+      readTrainedWeights(conv2d_40_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_40_b_path = dir_prefix + std::string("conv2d_40_b.bin");
+  void *conv2d_40_b =
+      readTrainedWeights(conv2d_40_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_gamma_path =
+      dir_prefix + std::string("batch_normalization_40_gamma.bin");
+  void *batch_normalization_40_gamma = readTrainedWeights(
+      batch_normalization_40_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_beta_path =
+      dir_prefix + std::string("batch_normalization_40_beta.bin");
+  void *batch_normalization_40_beta = readTrainedWeights(
+      batch_normalization_40_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_mean_path =
+      dir_prefix + std::string("batch_normalization_40_mean.bin");
+  void *batch_normalization_40_mean = readTrainedWeights(
+      batch_normalization_40_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_variance_path =
+      dir_prefix + std::string("batch_normalization_40_variance.bin");
+  void *batch_normalization_40_variance = readTrainedWeights(
+      batch_normalization_40_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_41_w_path = dir_prefix + std::string("conv2d_41_w.bin");
+  void *conv2d_41_w =
+      readTrainedWeights(conv2d_41_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_41_b_path = dir_prefix + std::string("conv2d_41_b.bin");
+  void *conv2d_41_b =
+      readTrainedWeights(conv2d_41_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_gamma_path =
+      dir_prefix + std::string("batch_normalization_41_gamma.bin");
+  void *batch_normalization_41_gamma = readTrainedWeights(
+      batch_normalization_41_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_beta_path =
+      dir_prefix + std::string("batch_normalization_41_beta.bin");
+  void *batch_normalization_41_beta = readTrainedWeights(
+      batch_normalization_41_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_mean_path =
+      dir_prefix + std::string("batch_normalization_41_mean.bin");
+  void *batch_normalization_41_mean = readTrainedWeights(
+      batch_normalization_41_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_variance_path =
+      dir_prefix + std::string("batch_normalization_41_variance.bin");
+  void *batch_normalization_41_variance = readTrainedWeights(
+      batch_normalization_41_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_42_w_path = dir_prefix + std::string("conv2d_42_w.bin");
+  void *conv2d_42_w =
+      readTrainedWeights(conv2d_42_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_42_b_path = dir_prefix + std::string("conv2d_42_b.bin");
+  void *conv2d_42_b =
+      readTrainedWeights(conv2d_42_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_gamma_path =
+      dir_prefix + std::string("batch_normalization_42_gamma.bin");
+  void *batch_normalization_42_gamma = readTrainedWeights(
+      batch_normalization_42_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_beta_path =
+      dir_prefix + std::string("batch_normalization_42_beta.bin");
+  void *batch_normalization_42_beta = readTrainedWeights(
+      batch_normalization_42_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_mean_path =
+      dir_prefix + std::string("batch_normalization_42_mean.bin");
+  void *batch_normalization_42_mean = readTrainedWeights(
+      batch_normalization_42_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_variance_path =
+      dir_prefix + std::string("batch_normalization_42_variance.bin");
+  void *batch_normalization_42_variance = readTrainedWeights(
+      batch_normalization_42_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_43_w_path = dir_prefix + std::string("conv2d_43_w.bin");
+  void *conv2d_43_w =
+      readTrainedWeights(conv2d_43_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_43_b_path = dir_prefix + std::string("conv2d_43_b.bin");
+  void *conv2d_43_b =
+      readTrainedWeights(conv2d_43_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_gamma_path =
+      dir_prefix + std::string("batch_normalization_43_gamma.bin");
+  void *batch_normalization_43_gamma = readTrainedWeights(
+      batch_normalization_43_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_beta_path =
+      dir_prefix + std::string("batch_normalization_43_beta.bin");
+  void *batch_normalization_43_beta = readTrainedWeights(
+      batch_normalization_43_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_mean_path =
+      dir_prefix + std::string("batch_normalization_43_mean.bin");
+  void *batch_normalization_43_mean = readTrainedWeights(
+      batch_normalization_43_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_variance_path =
+      dir_prefix + std::string("batch_normalization_43_variance.bin");
+  void *batch_normalization_43_variance = readTrainedWeights(
+      batch_normalization_43_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_44_w_path = dir_prefix + std::string("conv2d_44_w.bin");
+  void *conv2d_44_w =
+      readTrainedWeights(conv2d_44_w_path.c_str(), 0, 512, 1024, 1, 1);
+  std::string conv2d_44_b_path = dir_prefix + std::string("conv2d_44_b.bin");
+  void *conv2d_44_b =
+      readTrainedWeights(conv2d_44_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_gamma_path =
+      dir_prefix + std::string("batch_normalization_44_gamma.bin");
+  void *batch_normalization_44_gamma = readTrainedWeights(
+      batch_normalization_44_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_beta_path =
+      dir_prefix + std::string("batch_normalization_44_beta.bin");
+  void *batch_normalization_44_beta = readTrainedWeights(
+      batch_normalization_44_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_mean_path =
+      dir_prefix + std::string("batch_normalization_44_mean.bin");
+  void *batch_normalization_44_mean = readTrainedWeights(
+      batch_normalization_44_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_variance_path =
+      dir_prefix + std::string("batch_normalization_44_variance.bin");
+  void *batch_normalization_44_variance = readTrainedWeights(
+      batch_normalization_44_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_45_w_path = dir_prefix + std::string("conv2d_45_w.bin");
+  void *conv2d_45_w =
+      readTrainedWeights(conv2d_45_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_45_b_path = dir_prefix + std::string("conv2d_45_b.bin");
+  void *conv2d_45_b =
+      readTrainedWeights(conv2d_45_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_gamma_path =
+      dir_prefix + std::string("batch_normalization_45_gamma.bin");
+  void *batch_normalization_45_gamma = readTrainedWeights(
+      batch_normalization_45_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_beta_path =
+      dir_prefix + std::string("batch_normalization_45_beta.bin");
+  void *batch_normalization_45_beta = readTrainedWeights(
+      batch_normalization_45_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_mean_path =
+      dir_prefix + std::string("batch_normalization_45_mean.bin");
+  void *batch_normalization_45_mean = readTrainedWeights(
+      batch_normalization_45_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_variance_path =
+      dir_prefix + std::string("batch_normalization_45_variance.bin");
+  void *batch_normalization_45_variance = readTrainedWeights(
+      batch_normalization_45_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_46_w_path = dir_prefix + std::string("conv2d_46_w.bin");
+  void *conv2d_46_w =
+      readTrainedWeights(conv2d_46_w_path.c_str(), 0, 2048, 512, 1, 1);
+  std::string conv2d_46_b_path = dir_prefix + std::string("conv2d_46_b.bin");
+  void *conv2d_46_b =
+      readTrainedWeights(conv2d_46_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string conv2d_47_w_path = dir_prefix + std::string("conv2d_47_w.bin");
+  void *conv2d_47_w =
+      readTrainedWeights(conv2d_47_w_path.c_str(), 0, 2048, 1024, 1, 1);
+  std::string conv2d_47_b_path = dir_prefix + std::string("conv2d_47_b.bin");
+  void *conv2d_47_b =
+      readTrainedWeights(conv2d_47_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_gamma_path =
+      dir_prefix + std::string("batch_normalization_46_gamma.bin");
+  void *batch_normalization_46_gamma = readTrainedWeights(
+      batch_normalization_46_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_beta_path =
+      dir_prefix + std::string("batch_normalization_46_beta.bin");
+  void *batch_normalization_46_beta = readTrainedWeights(
+      batch_normalization_46_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_mean_path =
+      dir_prefix + std::string("batch_normalization_46_mean.bin");
+  void *batch_normalization_46_mean = readTrainedWeights(
+      batch_normalization_46_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_variance_path =
+      dir_prefix + std::string("batch_normalization_46_variance.bin");
+  void *batch_normalization_46_variance = readTrainedWeights(
+      batch_normalization_46_variance_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_gamma_path =
+      dir_prefix + std::string("batch_normalization_47_gamma.bin");
+  void *batch_normalization_47_gamma = readTrainedWeights(
+      batch_normalization_47_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_beta_path =
+      dir_prefix + std::string("batch_normalization_47_beta.bin");
+  void *batch_normalization_47_beta = readTrainedWeights(
+      batch_normalization_47_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_mean_path =
+      dir_prefix + std::string("batch_normalization_47_mean.bin");
+  void *batch_normalization_47_mean = readTrainedWeights(
+      batch_normalization_47_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_variance_path =
+      dir_prefix + std::string("batch_normalization_47_variance.bin");
+  void *batch_normalization_47_variance = readTrainedWeights(
+      batch_normalization_47_variance_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string conv2d_48_w_path = dir_prefix + std::string("conv2d_48_w.bin");
+  void *conv2d_48_w =
+      readTrainedWeights(conv2d_48_w_path.c_str(), 0, 512, 2048, 1, 1);
+  std::string conv2d_48_b_path = dir_prefix + std::string("conv2d_48_b.bin");
+  void *conv2d_48_b =
+      readTrainedWeights(conv2d_48_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_gamma_path =
+      dir_prefix + std::string("batch_normalization_48_gamma.bin");
+  void *batch_normalization_48_gamma = readTrainedWeights(
+      batch_normalization_48_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_beta_path =
+      dir_prefix + std::string("batch_normalization_48_beta.bin");
+  void *batch_normalization_48_beta = readTrainedWeights(
+      batch_normalization_48_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_mean_path =
+      dir_prefix + std::string("batch_normalization_48_mean.bin");
+  void *batch_normalization_48_mean = readTrainedWeights(
+      batch_normalization_48_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_variance_path =
+      dir_prefix + std::string("batch_normalization_48_variance.bin");
+  void *batch_normalization_48_variance = readTrainedWeights(
+      batch_normalization_48_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_49_w_path = dir_prefix + std::string("conv2d_49_w.bin");
+  void *conv2d_49_w =
+      readTrainedWeights(conv2d_49_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_49_b_path = dir_prefix + std::string("conv2d_49_b.bin");
+  void *conv2d_49_b =
+      readTrainedWeights(conv2d_49_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_gamma_path =
+      dir_prefix + std::string("batch_normalization_49_gamma.bin");
+  void *batch_normalization_49_gamma = readTrainedWeights(
+      batch_normalization_49_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_beta_path =
+      dir_prefix + std::string("batch_normalization_49_beta.bin");
+  void *batch_normalization_49_beta = readTrainedWeights(
+      batch_normalization_49_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_mean_path =
+      dir_prefix + std::string("batch_normalization_49_mean.bin");
+  void *batch_normalization_49_mean = readTrainedWeights(
+      batch_normalization_49_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_variance_path =
+      dir_prefix + std::string("batch_normalization_49_variance.bin");
+  void *batch_normalization_49_variance = readTrainedWeights(
+      batch_normalization_49_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_50_w_path = dir_prefix + std::string("conv2d_50_w.bin");
+  void *conv2d_50_w =
+      readTrainedWeights(conv2d_50_w_path.c_str(), 0, 2048, 512, 1, 1);
+  std::string conv2d_50_b_path = dir_prefix + std::string("conv2d_50_b.bin");
+  void *conv2d_50_b =
+      readTrainedWeights(conv2d_50_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_gamma_path =
+      dir_prefix + std::string("batch_normalization_50_gamma.bin");
+  void *batch_normalization_50_gamma = readTrainedWeights(
+      batch_normalization_50_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_beta_path =
+      dir_prefix + std::string("batch_normalization_50_beta.bin");
+  void *batch_normalization_50_beta = readTrainedWeights(
+      batch_normalization_50_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_mean_path =
+      dir_prefix + std::string("batch_normalization_50_mean.bin");
+  void *batch_normalization_50_mean = readTrainedWeights(
+      batch_normalization_50_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_variance_path =
+      dir_prefix + std::string("batch_normalization_50_variance.bin");
+  void *batch_normalization_50_variance = readTrainedWeights(
+      batch_normalization_50_variance_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string conv2d_51_w_path = dir_prefix + std::string("conv2d_51_w.bin");
+  void *conv2d_51_w =
+      readTrainedWeights(conv2d_51_w_path.c_str(), 0, 512, 2048, 1, 1);
+  std::string conv2d_51_b_path = dir_prefix + std::string("conv2d_51_b.bin");
+  void *conv2d_51_b =
+      readTrainedWeights(conv2d_51_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_gamma_path =
+      dir_prefix + std::string("batch_normalization_51_gamma.bin");
+  void *batch_normalization_51_gamma = readTrainedWeights(
+      batch_normalization_51_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_beta_path =
+      dir_prefix + std::string("batch_normalization_51_beta.bin");
+  void *batch_normalization_51_beta = readTrainedWeights(
+      batch_normalization_51_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_mean_path =
+      dir_prefix + std::string("batch_normalization_51_mean.bin");
+  void *batch_normalization_51_mean = readTrainedWeights(
+      batch_normalization_51_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_variance_path =
+      dir_prefix + std::string("batch_normalization_51_variance.bin");
+  void *batch_normalization_51_variance = readTrainedWeights(
+      batch_normalization_51_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_52_w_path = dir_prefix + std::string("conv2d_52_w.bin");
+  void *conv2d_52_w =
+      readTrainedWeights(conv2d_52_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_52_b_path = dir_prefix + std::string("conv2d_52_b.bin");
+  void *conv2d_52_b =
+      readTrainedWeights(conv2d_52_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_gamma_path =
+      dir_prefix + std::string("batch_normalization_52_gamma.bin");
+  void *batch_normalization_52_gamma = readTrainedWeights(
+      batch_normalization_52_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_beta_path =
+      dir_prefix + std::string("batch_normalization_52_beta.bin");
+  void *batch_normalization_52_beta = readTrainedWeights(
+      batch_normalization_52_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_mean_path =
+      dir_prefix + std::string("batch_normalization_52_mean.bin");
+  void *batch_normalization_52_mean = readTrainedWeights(
+      batch_normalization_52_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_variance_path =
+      dir_prefix + std::string("batch_normalization_52_variance.bin");
+  void *batch_normalization_52_variance = readTrainedWeights(
+      batch_normalization_52_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_53_w_path = dir_prefix + std::string("conv2d_53_w.bin");
+  void *conv2d_53_w =
+      readTrainedWeights(conv2d_53_w_path.c_str(), 0, 2048, 512, 1, 1);
+  std::string conv2d_53_b_path = dir_prefix + std::string("conv2d_53_b.bin");
+  void *conv2d_53_b =
+      readTrainedWeights(conv2d_53_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_gamma_path =
+      dir_prefix + std::string("batch_normalization_53_gamma.bin");
+  void *batch_normalization_53_gamma = readTrainedWeights(
+      batch_normalization_53_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_beta_path =
+      dir_prefix + std::string("batch_normalization_53_beta.bin");
+  void *batch_normalization_53_beta = readTrainedWeights(
+      batch_normalization_53_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_mean_path =
+      dir_prefix + std::string("batch_normalization_53_mean.bin");
+  void *batch_normalization_53_mean = readTrainedWeights(
+      batch_normalization_53_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_variance_path =
+      dir_prefix + std::string("batch_normalization_53_variance.bin");
+  void *batch_normalization_53_variance = readTrainedWeights(
+      batch_normalization_53_variance_path.c_str(), 0, 1, 2048, 1, 1);
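+  // Classifier head: dense_1_w maps the 2048-dimensional features to the 1000
+  // output classes; dense_1_b is the corresponding 1000-element bias vector.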
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 1000);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b =
+      readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1);
 
-  std::string dir_prefix = model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,7,7); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
-  void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
-  void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
-  void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
-  void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,1,1); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
-  void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
-  void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
-  void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
-  void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
-  void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
-  void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
-  void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
-  void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
-  void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
-  void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
-  void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
-  void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
-  void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
-  void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
-  void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
-  void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,64,256,1,1); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
-  void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
-  void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
-  void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
-  void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
-  void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
-  void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
-  void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
-  void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
-  void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
-  void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
-  void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
-  void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,64,256,1,1); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
-  void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
-  void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
-  void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
-  void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
-  void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
-  void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
-  void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
-  void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
-  void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
-  void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
-  void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
-  void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,128,256,1,1); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
-  void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
-  void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
-  void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
-  void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
-  void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
-  void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
-  void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
-  void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
-  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
-  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,512,256,1,1); 
-  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
-  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_gamma_path =  dir_prefix + std::string("batch_normalization_14_gamma.bin"); 
-  void* batch_normalization_14_gamma =  readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_beta_path =  dir_prefix + std::string("batch_normalization_14_beta.bin"); 
-  void* batch_normalization_14_beta =  readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_mean_path =  dir_prefix + std::string("batch_normalization_14_mean.bin"); 
-  void* batch_normalization_14_mean =  readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_variance_path =  dir_prefix + std::string("batch_normalization_14_variance.bin"); 
-  void* batch_normalization_14_variance =  readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_gamma_path =  dir_prefix + std::string("batch_normalization_15_gamma.bin"); 
-  void* batch_normalization_15_gamma =  readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_beta_path =  dir_prefix + std::string("batch_normalization_15_beta.bin"); 
-  void* batch_normalization_15_beta =  readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_mean_path =  dir_prefix + std::string("batch_normalization_15_mean.bin"); 
-  void* batch_normalization_15_mean =  readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_variance_path =  dir_prefix + std::string("batch_normalization_15_variance.bin"); 
-  void* batch_normalization_15_variance =  readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
-  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,128,512,1,1); 
-  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
-  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_gamma_path =  dir_prefix + std::string("batch_normalization_16_gamma.bin"); 
-  void* batch_normalization_16_gamma =  readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_beta_path =  dir_prefix + std::string("batch_normalization_16_beta.bin"); 
-  void* batch_normalization_16_beta =  readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_mean_path =  dir_prefix + std::string("batch_normalization_16_mean.bin"); 
-  void* batch_normalization_16_mean =  readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_variance_path =  dir_prefix + std::string("batch_normalization_16_variance.bin"); 
-  void* batch_normalization_16_variance =  readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
-  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
-  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_gamma_path =  dir_prefix + std::string("batch_normalization_17_gamma.bin"); 
-  void* batch_normalization_17_gamma =  readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_beta_path =  dir_prefix + std::string("batch_normalization_17_beta.bin"); 
-  void* batch_normalization_17_beta =  readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_mean_path =  dir_prefix + std::string("batch_normalization_17_mean.bin"); 
-  void* batch_normalization_17_mean =  readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_variance_path =  dir_prefix + std::string("batch_normalization_17_variance.bin"); 
-  void* batch_normalization_17_variance =  readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
-  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
-  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_gamma_path =  dir_prefix + std::string("batch_normalization_18_gamma.bin"); 
-  void* batch_normalization_18_gamma =  readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_beta_path =  dir_prefix + std::string("batch_normalization_18_beta.bin"); 
-  void* batch_normalization_18_beta =  readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_mean_path =  dir_prefix + std::string("batch_normalization_18_mean.bin"); 
-  void* batch_normalization_18_mean =  readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_variance_path =  dir_prefix + std::string("batch_normalization_18_variance.bin"); 
-  void* batch_normalization_18_variance =  readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
-  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,128,512,1,1); 
-  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
-  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_gamma_path =  dir_prefix + std::string("batch_normalization_19_gamma.bin"); 
-  void* batch_normalization_19_gamma =  readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_beta_path =  dir_prefix + std::string("batch_normalization_19_beta.bin"); 
-  void* batch_normalization_19_beta =  readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_mean_path =  dir_prefix + std::string("batch_normalization_19_mean.bin"); 
-  void* batch_normalization_19_mean =  readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_variance_path =  dir_prefix + std::string("batch_normalization_19_variance.bin"); 
-  void* batch_normalization_19_variance =  readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
-  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
-  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_gamma_path =  dir_prefix + std::string("batch_normalization_20_gamma.bin"); 
-  void* batch_normalization_20_gamma =  readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_beta_path =  dir_prefix + std::string("batch_normalization_20_beta.bin"); 
-  void* batch_normalization_20_beta =  readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_mean_path =  dir_prefix + std::string("batch_normalization_20_mean.bin"); 
-  void* batch_normalization_20_mean =  readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_variance_path =  dir_prefix + std::string("batch_normalization_20_variance.bin"); 
-  void* batch_normalization_20_variance =  readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
-  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
-  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_gamma_path =  dir_prefix + std::string("batch_normalization_21_gamma.bin"); 
-  void* batch_normalization_21_gamma =  readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_beta_path =  dir_prefix + std::string("batch_normalization_21_beta.bin"); 
-  void* batch_normalization_21_beta =  readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_mean_path =  dir_prefix + std::string("batch_normalization_21_mean.bin"); 
-  void* batch_normalization_21_mean =  readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_variance_path =  dir_prefix + std::string("batch_normalization_21_variance.bin"); 
-  void* batch_normalization_21_variance =  readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_22_w_path =  dir_prefix + std::string("conv2d_22_w.bin"); 
-  void* conv2d_22_w =  readTrainedWeights(conv2d_22_w_path.c_str(), 0,128,512,1,1); 
-  std::string conv2d_22_b_path =  dir_prefix + std::string("conv2d_22_b.bin"); 
-  void* conv2d_22_b =  readTrainedWeights(conv2d_22_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_gamma_path =  dir_prefix + std::string("batch_normalization_22_gamma.bin"); 
-  void* batch_normalization_22_gamma =  readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_beta_path =  dir_prefix + std::string("batch_normalization_22_beta.bin"); 
-  void* batch_normalization_22_beta =  readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_mean_path =  dir_prefix + std::string("batch_normalization_22_mean.bin"); 
-  void* batch_normalization_22_mean =  readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_variance_path =  dir_prefix + std::string("batch_normalization_22_variance.bin"); 
-  void* batch_normalization_22_variance =  readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_23_w_path =  dir_prefix + std::string("conv2d_23_w.bin"); 
-  void* conv2d_23_w =  readTrainedWeights(conv2d_23_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_23_b_path =  dir_prefix + std::string("conv2d_23_b.bin"); 
-  void* conv2d_23_b =  readTrainedWeights(conv2d_23_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_gamma_path =  dir_prefix + std::string("batch_normalization_23_gamma.bin"); 
-  void* batch_normalization_23_gamma =  readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_beta_path =  dir_prefix + std::string("batch_normalization_23_beta.bin"); 
-  void* batch_normalization_23_beta =  readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_mean_path =  dir_prefix + std::string("batch_normalization_23_mean.bin"); 
-  void* batch_normalization_23_mean =  readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_variance_path =  dir_prefix + std::string("batch_normalization_23_variance.bin"); 
-  void* batch_normalization_23_variance =  readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_24_w_path =  dir_prefix + std::string("conv2d_24_w.bin"); 
-  void* conv2d_24_w =  readTrainedWeights(conv2d_24_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_24_b_path =  dir_prefix + std::string("conv2d_24_b.bin"); 
-  void* conv2d_24_b =  readTrainedWeights(conv2d_24_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_gamma_path =  dir_prefix + std::string("batch_normalization_24_gamma.bin"); 
-  void* batch_normalization_24_gamma =  readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_beta_path =  dir_prefix + std::string("batch_normalization_24_beta.bin"); 
-  void* batch_normalization_24_beta =  readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_mean_path =  dir_prefix + std::string("batch_normalization_24_mean.bin"); 
-  void* batch_normalization_24_mean =  readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_variance_path =  dir_prefix + std::string("batch_normalization_24_variance.bin"); 
-  void* batch_normalization_24_variance =  readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_25_w_path =  dir_prefix + std::string("conv2d_25_w.bin"); 
-  void* conv2d_25_w =  readTrainedWeights(conv2d_25_w_path.c_str(), 0,256,512,1,1); 
-  std::string conv2d_25_b_path =  dir_prefix + std::string("conv2d_25_b.bin"); 
-  void* conv2d_25_b =  readTrainedWeights(conv2d_25_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_gamma_path =  dir_prefix + std::string("batch_normalization_25_gamma.bin"); 
-  void* batch_normalization_25_gamma =  readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_beta_path =  dir_prefix + std::string("batch_normalization_25_beta.bin"); 
-  void* batch_normalization_25_beta =  readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_mean_path =  dir_prefix + std::string("batch_normalization_25_mean.bin"); 
-  void* batch_normalization_25_mean =  readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_variance_path =  dir_prefix + std::string("batch_normalization_25_variance.bin"); 
-  void* batch_normalization_25_variance =  readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_26_w_path =  dir_prefix + std::string("conv2d_26_w.bin"); 
-  void* conv2d_26_w =  readTrainedWeights(conv2d_26_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_26_b_path =  dir_prefix + std::string("conv2d_26_b.bin"); 
-  void* conv2d_26_b =  readTrainedWeights(conv2d_26_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_gamma_path =  dir_prefix + std::string("batch_normalization_26_gamma.bin"); 
-  void* batch_normalization_26_gamma =  readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_beta_path =  dir_prefix + std::string("batch_normalization_26_beta.bin"); 
-  void* batch_normalization_26_beta =  readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_mean_path =  dir_prefix + std::string("batch_normalization_26_mean.bin"); 
-  void* batch_normalization_26_mean =  readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_variance_path =  dir_prefix + std::string("batch_normalization_26_variance.bin"); 
-  void* batch_normalization_26_variance =  readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_27_w_path =  dir_prefix + std::string("conv2d_27_w.bin"); 
-  void* conv2d_27_w =  readTrainedWeights(conv2d_27_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_27_b_path =  dir_prefix + std::string("conv2d_27_b.bin"); 
-  void* conv2d_27_b =  readTrainedWeights(conv2d_27_b_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_28_w_path =  dir_prefix + std::string("conv2d_28_w.bin"); 
-  void* conv2d_28_w =  readTrainedWeights(conv2d_28_w_path.c_str(), 0,1024,512,1,1); 
-  std::string conv2d_28_b_path =  dir_prefix + std::string("conv2d_28_b.bin"); 
-  void* conv2d_28_b =  readTrainedWeights(conv2d_28_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_gamma_path =  dir_prefix + std::string("batch_normalization_27_gamma.bin"); 
-  void* batch_normalization_27_gamma =  readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_beta_path =  dir_prefix + std::string("batch_normalization_27_beta.bin"); 
-  void* batch_normalization_27_beta =  readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_mean_path =  dir_prefix + std::string("batch_normalization_27_mean.bin"); 
-  void* batch_normalization_27_mean =  readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_variance_path =  dir_prefix + std::string("batch_normalization_27_variance.bin"); 
-  void* batch_normalization_27_variance =  readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_gamma_path =  dir_prefix + std::string("batch_normalization_28_gamma.bin"); 
-  void* batch_normalization_28_gamma =  readTrainedWeights(batch_normalization_28_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_beta_path =  dir_prefix + std::string("batch_normalization_28_beta.bin"); 
-  void* batch_normalization_28_beta =  readTrainedWeights(batch_normalization_28_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_mean_path =  dir_prefix + std::string("batch_normalization_28_mean.bin"); 
-  void* batch_normalization_28_mean =  readTrainedWeights(batch_normalization_28_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_variance_path =  dir_prefix + std::string("batch_normalization_28_variance.bin"); 
-  void* batch_normalization_28_variance =  readTrainedWeights(batch_normalization_28_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_29_w_path =  dir_prefix + std::string("conv2d_29_w.bin"); 
-  void* conv2d_29_w =  readTrainedWeights(conv2d_29_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_29_b_path =  dir_prefix + std::string("conv2d_29_b.bin"); 
-  void* conv2d_29_b =  readTrainedWeights(conv2d_29_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_gamma_path =  dir_prefix + std::string("batch_normalization_29_gamma.bin"); 
-  void* batch_normalization_29_gamma =  readTrainedWeights(batch_normalization_29_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_beta_path =  dir_prefix + std::string("batch_normalization_29_beta.bin"); 
-  void* batch_normalization_29_beta =  readTrainedWeights(batch_normalization_29_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_mean_path =  dir_prefix + std::string("batch_normalization_29_mean.bin"); 
-  void* batch_normalization_29_mean =  readTrainedWeights(batch_normalization_29_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_variance_path =  dir_prefix + std::string("batch_normalization_29_variance.bin"); 
-  void* batch_normalization_29_variance =  readTrainedWeights(batch_normalization_29_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_30_w_path =  dir_prefix + std::string("conv2d_30_w.bin"); 
-  void* conv2d_30_w =  readTrainedWeights(conv2d_30_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_30_b_path =  dir_prefix + std::string("conv2d_30_b.bin"); 
-  void* conv2d_30_b =  readTrainedWeights(conv2d_30_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_gamma_path =  dir_prefix + std::string("batch_normalization_30_gamma.bin"); 
-  void* batch_normalization_30_gamma =  readTrainedWeights(batch_normalization_30_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_beta_path =  dir_prefix + std::string("batch_normalization_30_beta.bin"); 
-  void* batch_normalization_30_beta =  readTrainedWeights(batch_normalization_30_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_mean_path =  dir_prefix + std::string("batch_normalization_30_mean.bin"); 
-  void* batch_normalization_30_mean =  readTrainedWeights(batch_normalization_30_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_variance_path =  dir_prefix + std::string("batch_normalization_30_variance.bin"); 
-  void* batch_normalization_30_variance =  readTrainedWeights(batch_normalization_30_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_31_w_path =  dir_prefix + std::string("conv2d_31_w.bin"); 
-  void* conv2d_31_w =  readTrainedWeights(conv2d_31_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_31_b_path =  dir_prefix + std::string("conv2d_31_b.bin"); 
-  void* conv2d_31_b =  readTrainedWeights(conv2d_31_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_gamma_path =  dir_prefix + std::string("batch_normalization_31_gamma.bin"); 
-  void* batch_normalization_31_gamma =  readTrainedWeights(batch_normalization_31_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_beta_path =  dir_prefix + std::string("batch_normalization_31_beta.bin"); 
-  void* batch_normalization_31_beta =  readTrainedWeights(batch_normalization_31_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_mean_path =  dir_prefix + std::string("batch_normalization_31_mean.bin"); 
-  void* batch_normalization_31_mean =  readTrainedWeights(batch_normalization_31_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_variance_path =  dir_prefix + std::string("batch_normalization_31_variance.bin"); 
-  void* batch_normalization_31_variance =  readTrainedWeights(batch_normalization_31_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_32_w_path =  dir_prefix + std::string("conv2d_32_w.bin"); 
-  void* conv2d_32_w =  readTrainedWeights(conv2d_32_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_32_b_path =  dir_prefix + std::string("conv2d_32_b.bin"); 
-  void* conv2d_32_b =  readTrainedWeights(conv2d_32_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_gamma_path =  dir_prefix + std::string("batch_normalization_32_gamma.bin"); 
-  void* batch_normalization_32_gamma =  readTrainedWeights(batch_normalization_32_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_beta_path =  dir_prefix + std::string("batch_normalization_32_beta.bin"); 
-  void* batch_normalization_32_beta =  readTrainedWeights(batch_normalization_32_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_mean_path =  dir_prefix + std::string("batch_normalization_32_mean.bin"); 
-  void* batch_normalization_32_mean =  readTrainedWeights(batch_normalization_32_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_variance_path =  dir_prefix + std::string("batch_normalization_32_variance.bin"); 
-  void* batch_normalization_32_variance =  readTrainedWeights(batch_normalization_32_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_33_w_path =  dir_prefix + std::string("conv2d_33_w.bin"); 
-  void* conv2d_33_w =  readTrainedWeights(conv2d_33_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_33_b_path =  dir_prefix + std::string("conv2d_33_b.bin"); 
-  void* conv2d_33_b =  readTrainedWeights(conv2d_33_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_gamma_path =  dir_prefix + std::string("batch_normalization_33_gamma.bin"); 
-  void* batch_normalization_33_gamma =  readTrainedWeights(batch_normalization_33_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_beta_path =  dir_prefix + std::string("batch_normalization_33_beta.bin"); 
-  void* batch_normalization_33_beta =  readTrainedWeights(batch_normalization_33_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_mean_path =  dir_prefix + std::string("batch_normalization_33_mean.bin"); 
-  void* batch_normalization_33_mean =  readTrainedWeights(batch_normalization_33_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_variance_path =  dir_prefix + std::string("batch_normalization_33_variance.bin"); 
-  void* batch_normalization_33_variance =  readTrainedWeights(batch_normalization_33_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_34_w_path =  dir_prefix + std::string("conv2d_34_w.bin"); 
-  void* conv2d_34_w =  readTrainedWeights(conv2d_34_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_34_b_path =  dir_prefix + std::string("conv2d_34_b.bin"); 
-  void* conv2d_34_b =  readTrainedWeights(conv2d_34_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_gamma_path =  dir_prefix + std::string("batch_normalization_34_gamma.bin"); 
-  void* batch_normalization_34_gamma =  readTrainedWeights(batch_normalization_34_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_beta_path =  dir_prefix + std::string("batch_normalization_34_beta.bin"); 
-  void* batch_normalization_34_beta =  readTrainedWeights(batch_normalization_34_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_mean_path =  dir_prefix + std::string("batch_normalization_34_mean.bin"); 
-  void* batch_normalization_34_mean =  readTrainedWeights(batch_normalization_34_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_variance_path =  dir_prefix + std::string("batch_normalization_34_variance.bin"); 
-  void* batch_normalization_34_variance =  readTrainedWeights(batch_normalization_34_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_35_w_path =  dir_prefix + std::string("conv2d_35_w.bin"); 
-  void* conv2d_35_w =  readTrainedWeights(conv2d_35_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_35_b_path =  dir_prefix + std::string("conv2d_35_b.bin"); 
-  void* conv2d_35_b =  readTrainedWeights(conv2d_35_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_gamma_path =  dir_prefix + std::string("batch_normalization_35_gamma.bin"); 
-  void* batch_normalization_35_gamma =  readTrainedWeights(batch_normalization_35_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_beta_path =  dir_prefix + std::string("batch_normalization_35_beta.bin"); 
-  void* batch_normalization_35_beta =  readTrainedWeights(batch_normalization_35_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_mean_path =  dir_prefix + std::string("batch_normalization_35_mean.bin"); 
-  void* batch_normalization_35_mean =  readTrainedWeights(batch_normalization_35_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_variance_path =  dir_prefix + std::string("batch_normalization_35_variance.bin"); 
-  void* batch_normalization_35_variance =  readTrainedWeights(batch_normalization_35_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_36_w_path =  dir_prefix + std::string("conv2d_36_w.bin"); 
-  void* conv2d_36_w =  readTrainedWeights(conv2d_36_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_36_b_path =  dir_prefix + std::string("conv2d_36_b.bin"); 
-  void* conv2d_36_b =  readTrainedWeights(conv2d_36_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_gamma_path =  dir_prefix + std::string("batch_normalization_36_gamma.bin"); 
-  void* batch_normalization_36_gamma =  readTrainedWeights(batch_normalization_36_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_beta_path =  dir_prefix + std::string("batch_normalization_36_beta.bin"); 
-  void* batch_normalization_36_beta =  readTrainedWeights(batch_normalization_36_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_mean_path =  dir_prefix + std::string("batch_normalization_36_mean.bin"); 
-  void* batch_normalization_36_mean =  readTrainedWeights(batch_normalization_36_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_variance_path =  dir_prefix + std::string("batch_normalization_36_variance.bin"); 
-  void* batch_normalization_36_variance =  readTrainedWeights(batch_normalization_36_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_37_w_path =  dir_prefix + std::string("conv2d_37_w.bin"); 
-  void* conv2d_37_w =  readTrainedWeights(conv2d_37_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_37_b_path =  dir_prefix + std::string("conv2d_37_b.bin"); 
-  void* conv2d_37_b =  readTrainedWeights(conv2d_37_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_gamma_path =  dir_prefix + std::string("batch_normalization_37_gamma.bin"); 
-  void* batch_normalization_37_gamma =  readTrainedWeights(batch_normalization_37_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_beta_path =  dir_prefix + std::string("batch_normalization_37_beta.bin"); 
-  void* batch_normalization_37_beta =  readTrainedWeights(batch_normalization_37_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_mean_path =  dir_prefix + std::string("batch_normalization_37_mean.bin"); 
-  void* batch_normalization_37_mean =  readTrainedWeights(batch_normalization_37_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_variance_path =  dir_prefix + std::string("batch_normalization_37_variance.bin"); 
-  void* batch_normalization_37_variance =  readTrainedWeights(batch_normalization_37_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_38_w_path =  dir_prefix + std::string("conv2d_38_w.bin"); 
-  void* conv2d_38_w =  readTrainedWeights(conv2d_38_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_38_b_path =  dir_prefix + std::string("conv2d_38_b.bin"); 
-  void* conv2d_38_b =  readTrainedWeights(conv2d_38_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_gamma_path =  dir_prefix + std::string("batch_normalization_38_gamma.bin"); 
-  void* batch_normalization_38_gamma =  readTrainedWeights(batch_normalization_38_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_beta_path =  dir_prefix + std::string("batch_normalization_38_beta.bin"); 
-  void* batch_normalization_38_beta =  readTrainedWeights(batch_normalization_38_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_mean_path =  dir_prefix + std::string("batch_normalization_38_mean.bin"); 
-  void* batch_normalization_38_mean =  readTrainedWeights(batch_normalization_38_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_variance_path =  dir_prefix + std::string("batch_normalization_38_variance.bin"); 
-  void* batch_normalization_38_variance =  readTrainedWeights(batch_normalization_38_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_39_w_path =  dir_prefix + std::string("conv2d_39_w.bin"); 
-  void* conv2d_39_w =  readTrainedWeights(conv2d_39_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_39_b_path =  dir_prefix + std::string("conv2d_39_b.bin"); 
-  void* conv2d_39_b =  readTrainedWeights(conv2d_39_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_gamma_path =  dir_prefix + std::string("batch_normalization_39_gamma.bin"); 
-  void* batch_normalization_39_gamma =  readTrainedWeights(batch_normalization_39_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_beta_path =  dir_prefix + std::string("batch_normalization_39_beta.bin"); 
-  void* batch_normalization_39_beta =  readTrainedWeights(batch_normalization_39_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_mean_path =  dir_prefix + std::string("batch_normalization_39_mean.bin"); 
-  void* batch_normalization_39_mean =  readTrainedWeights(batch_normalization_39_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_variance_path =  dir_prefix + std::string("batch_normalization_39_variance.bin"); 
-  void* batch_normalization_39_variance =  readTrainedWeights(batch_normalization_39_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_40_w_path =  dir_prefix + std::string("conv2d_40_w.bin"); 
-  void* conv2d_40_w =  readTrainedWeights(conv2d_40_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_40_b_path =  dir_prefix + std::string("conv2d_40_b.bin"); 
-  void* conv2d_40_b =  readTrainedWeights(conv2d_40_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_gamma_path =  dir_prefix + std::string("batch_normalization_40_gamma.bin"); 
-  void* batch_normalization_40_gamma =  readTrainedWeights(batch_normalization_40_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_beta_path =  dir_prefix + std::string("batch_normalization_40_beta.bin"); 
-  void* batch_normalization_40_beta =  readTrainedWeights(batch_normalization_40_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_mean_path =  dir_prefix + std::string("batch_normalization_40_mean.bin"); 
-  void* batch_normalization_40_mean =  readTrainedWeights(batch_normalization_40_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_variance_path =  dir_prefix + std::string("batch_normalization_40_variance.bin"); 
-  void* batch_normalization_40_variance =  readTrainedWeights(batch_normalization_40_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_41_w_path =  dir_prefix + std::string("conv2d_41_w.bin"); 
-  void* conv2d_41_w =  readTrainedWeights(conv2d_41_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_41_b_path =  dir_prefix + std::string("conv2d_41_b.bin"); 
-  void* conv2d_41_b =  readTrainedWeights(conv2d_41_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_gamma_path =  dir_prefix + std::string("batch_normalization_41_gamma.bin"); 
-  void* batch_normalization_41_gamma =  readTrainedWeights(batch_normalization_41_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_beta_path =  dir_prefix + std::string("batch_normalization_41_beta.bin"); 
-  void* batch_normalization_41_beta =  readTrainedWeights(batch_normalization_41_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_mean_path =  dir_prefix + std::string("batch_normalization_41_mean.bin"); 
-  void* batch_normalization_41_mean =  readTrainedWeights(batch_normalization_41_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_variance_path =  dir_prefix + std::string("batch_normalization_41_variance.bin"); 
-  void* batch_normalization_41_variance =  readTrainedWeights(batch_normalization_41_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_42_w_path =  dir_prefix + std::string("conv2d_42_w.bin"); 
-  void* conv2d_42_w =  readTrainedWeights(conv2d_42_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_42_b_path =  dir_prefix + std::string("conv2d_42_b.bin"); 
-  void* conv2d_42_b =  readTrainedWeights(conv2d_42_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_gamma_path =  dir_prefix + std::string("batch_normalization_42_gamma.bin"); 
-  void* batch_normalization_42_gamma =  readTrainedWeights(batch_normalization_42_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_beta_path =  dir_prefix + std::string("batch_normalization_42_beta.bin"); 
-  void* batch_normalization_42_beta =  readTrainedWeights(batch_normalization_42_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_mean_path =  dir_prefix + std::string("batch_normalization_42_mean.bin"); 
-  void* batch_normalization_42_mean =  readTrainedWeights(batch_normalization_42_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_variance_path =  dir_prefix + std::string("batch_normalization_42_variance.bin"); 
-  void* batch_normalization_42_variance =  readTrainedWeights(batch_normalization_42_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_43_w_path =  dir_prefix + std::string("conv2d_43_w.bin"); 
-  void* conv2d_43_w =  readTrainedWeights(conv2d_43_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_43_b_path =  dir_prefix + std::string("conv2d_43_b.bin"); 
-  void* conv2d_43_b =  readTrainedWeights(conv2d_43_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_gamma_path =  dir_prefix + std::string("batch_normalization_43_gamma.bin"); 
-  void* batch_normalization_43_gamma =  readTrainedWeights(batch_normalization_43_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_beta_path =  dir_prefix + std::string("batch_normalization_43_beta.bin"); 
-  void* batch_normalization_43_beta =  readTrainedWeights(batch_normalization_43_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_mean_path =  dir_prefix + std::string("batch_normalization_43_mean.bin"); 
-  void* batch_normalization_43_mean =  readTrainedWeights(batch_normalization_43_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_variance_path =  dir_prefix + std::string("batch_normalization_43_variance.bin"); 
-  void* batch_normalization_43_variance =  readTrainedWeights(batch_normalization_43_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_44_w_path =  dir_prefix + std::string("conv2d_44_w.bin"); 
-  void* conv2d_44_w =  readTrainedWeights(conv2d_44_w_path.c_str(), 0,512,1024,1,1); 
-  std::string conv2d_44_b_path =  dir_prefix + std::string("conv2d_44_b.bin"); 
-  void* conv2d_44_b =  readTrainedWeights(conv2d_44_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_gamma_path =  dir_prefix + std::string("batch_normalization_44_gamma.bin"); 
-  void* batch_normalization_44_gamma =  readTrainedWeights(batch_normalization_44_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_beta_path =  dir_prefix + std::string("batch_normalization_44_beta.bin"); 
-  void* batch_normalization_44_beta =  readTrainedWeights(batch_normalization_44_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_mean_path =  dir_prefix + std::string("batch_normalization_44_mean.bin"); 
-  void* batch_normalization_44_mean =  readTrainedWeights(batch_normalization_44_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_variance_path =  dir_prefix + std::string("batch_normalization_44_variance.bin"); 
-  void* batch_normalization_44_variance =  readTrainedWeights(batch_normalization_44_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_45_w_path =  dir_prefix + std::string("conv2d_45_w.bin"); 
-  void* conv2d_45_w =  readTrainedWeights(conv2d_45_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_45_b_path =  dir_prefix + std::string("conv2d_45_b.bin"); 
-  void* conv2d_45_b =  readTrainedWeights(conv2d_45_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_gamma_path =  dir_prefix + std::string("batch_normalization_45_gamma.bin"); 
-  void* batch_normalization_45_gamma =  readTrainedWeights(batch_normalization_45_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_beta_path =  dir_prefix + std::string("batch_normalization_45_beta.bin"); 
-  void* batch_normalization_45_beta =  readTrainedWeights(batch_normalization_45_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_mean_path =  dir_prefix + std::string("batch_normalization_45_mean.bin"); 
-  void* batch_normalization_45_mean =  readTrainedWeights(batch_normalization_45_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_variance_path =  dir_prefix + std::string("batch_normalization_45_variance.bin"); 
-  void* batch_normalization_45_variance =  readTrainedWeights(batch_normalization_45_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_46_w_path =  dir_prefix + std::string("conv2d_46_w.bin"); 
-  void* conv2d_46_w =  readTrainedWeights(conv2d_46_w_path.c_str(), 0,2048,512,1,1); 
-  std::string conv2d_46_b_path =  dir_prefix + std::string("conv2d_46_b.bin"); 
-  void* conv2d_46_b =  readTrainedWeights(conv2d_46_b_path.c_str(), 0,1,2048,1,1); 
-  std::string conv2d_47_w_path =  dir_prefix + std::string("conv2d_47_w.bin"); 
-  void* conv2d_47_w =  readTrainedWeights(conv2d_47_w_path.c_str(), 0,2048,1024,1,1); 
-  std::string conv2d_47_b_path =  dir_prefix + std::string("conv2d_47_b.bin"); 
-  void* conv2d_47_b =  readTrainedWeights(conv2d_47_b_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_gamma_path =  dir_prefix + std::string("batch_normalization_46_gamma.bin"); 
-  void* batch_normalization_46_gamma =  readTrainedWeights(batch_normalization_46_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_beta_path =  dir_prefix + std::string("batch_normalization_46_beta.bin"); 
-  void* batch_normalization_46_beta =  readTrainedWeights(batch_normalization_46_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_mean_path =  dir_prefix + std::string("batch_normalization_46_mean.bin"); 
-  void* batch_normalization_46_mean =  readTrainedWeights(batch_normalization_46_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_variance_path =  dir_prefix + std::string("batch_normalization_46_variance.bin"); 
-  void* batch_normalization_46_variance =  readTrainedWeights(batch_normalization_46_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_gamma_path =  dir_prefix + std::string("batch_normalization_47_gamma.bin"); 
-  void* batch_normalization_47_gamma =  readTrainedWeights(batch_normalization_47_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_beta_path =  dir_prefix + std::string("batch_normalization_47_beta.bin"); 
-  void* batch_normalization_47_beta =  readTrainedWeights(batch_normalization_47_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_mean_path =  dir_prefix + std::string("batch_normalization_47_mean.bin"); 
-  void* batch_normalization_47_mean =  readTrainedWeights(batch_normalization_47_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_variance_path =  dir_prefix + std::string("batch_normalization_47_variance.bin"); 
-  void* batch_normalization_47_variance =  readTrainedWeights(batch_normalization_47_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string conv2d_48_w_path =  dir_prefix + std::string("conv2d_48_w.bin"); 
-  void* conv2d_48_w =  readTrainedWeights(conv2d_48_w_path.c_str(), 0,512,2048,1,1); 
-  std::string conv2d_48_b_path =  dir_prefix + std::string("conv2d_48_b.bin"); 
-  void* conv2d_48_b =  readTrainedWeights(conv2d_48_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_gamma_path =  dir_prefix + std::string("batch_normalization_48_gamma.bin"); 
-  void* batch_normalization_48_gamma =  readTrainedWeights(batch_normalization_48_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_beta_path =  dir_prefix + std::string("batch_normalization_48_beta.bin"); 
-  void* batch_normalization_48_beta =  readTrainedWeights(batch_normalization_48_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_mean_path =  dir_prefix + std::string("batch_normalization_48_mean.bin"); 
-  void* batch_normalization_48_mean =  readTrainedWeights(batch_normalization_48_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_variance_path =  dir_prefix + std::string("batch_normalization_48_variance.bin"); 
-  void* batch_normalization_48_variance =  readTrainedWeights(batch_normalization_48_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_49_w_path =  dir_prefix + std::string("conv2d_49_w.bin"); 
-  void* conv2d_49_w =  readTrainedWeights(conv2d_49_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_49_b_path =  dir_prefix + std::string("conv2d_49_b.bin"); 
-  void* conv2d_49_b =  readTrainedWeights(conv2d_49_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_gamma_path =  dir_prefix + std::string("batch_normalization_49_gamma.bin"); 
-  void* batch_normalization_49_gamma =  readTrainedWeights(batch_normalization_49_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_beta_path =  dir_prefix + std::string("batch_normalization_49_beta.bin"); 
-  void* batch_normalization_49_beta =  readTrainedWeights(batch_normalization_49_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_mean_path =  dir_prefix + std::string("batch_normalization_49_mean.bin"); 
-  void* batch_normalization_49_mean =  readTrainedWeights(batch_normalization_49_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_variance_path =  dir_prefix + std::string("batch_normalization_49_variance.bin"); 
-  void* batch_normalization_49_variance =  readTrainedWeights(batch_normalization_49_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_50_w_path =  dir_prefix + std::string("conv2d_50_w.bin"); 
-  void* conv2d_50_w =  readTrainedWeights(conv2d_50_w_path.c_str(), 0,2048,512,1,1); 
-  std::string conv2d_50_b_path =  dir_prefix + std::string("conv2d_50_b.bin"); 
-  void* conv2d_50_b =  readTrainedWeights(conv2d_50_b_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_gamma_path =  dir_prefix + std::string("batch_normalization_50_gamma.bin"); 
-  void* batch_normalization_50_gamma =  readTrainedWeights(batch_normalization_50_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_beta_path =  dir_prefix + std::string("batch_normalization_50_beta.bin"); 
-  void* batch_normalization_50_beta =  readTrainedWeights(batch_normalization_50_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_mean_path =  dir_prefix + std::string("batch_normalization_50_mean.bin"); 
-  void* batch_normalization_50_mean =  readTrainedWeights(batch_normalization_50_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_variance_path =  dir_prefix + std::string("batch_normalization_50_variance.bin"); 
-  void* batch_normalization_50_variance =  readTrainedWeights(batch_normalization_50_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string conv2d_51_w_path =  dir_prefix + std::string("conv2d_51_w.bin"); 
-  void* conv2d_51_w =  readTrainedWeights(conv2d_51_w_path.c_str(), 0,512,2048,1,1); 
-  std::string conv2d_51_b_path =  dir_prefix + std::string("conv2d_51_b.bin"); 
-  void* conv2d_51_b =  readTrainedWeights(conv2d_51_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_gamma_path =  dir_prefix + std::string("batch_normalization_51_gamma.bin"); 
-  void* batch_normalization_51_gamma =  readTrainedWeights(batch_normalization_51_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_beta_path =  dir_prefix + std::string("batch_normalization_51_beta.bin"); 
-  void* batch_normalization_51_beta =  readTrainedWeights(batch_normalization_51_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_mean_path =  dir_prefix + std::string("batch_normalization_51_mean.bin"); 
-  void* batch_normalization_51_mean =  readTrainedWeights(batch_normalization_51_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_variance_path =  dir_prefix + std::string("batch_normalization_51_variance.bin"); 
-  void* batch_normalization_51_variance =  readTrainedWeights(batch_normalization_51_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_52_w_path =  dir_prefix + std::string("conv2d_52_w.bin"); 
-  void* conv2d_52_w =  readTrainedWeights(conv2d_52_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_52_b_path =  dir_prefix + std::string("conv2d_52_b.bin"); 
-  void* conv2d_52_b =  readTrainedWeights(conv2d_52_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_gamma_path =  dir_prefix + std::string("batch_normalization_52_gamma.bin"); 
-  void* batch_normalization_52_gamma =  readTrainedWeights(batch_normalization_52_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_beta_path =  dir_prefix + std::string("batch_normalization_52_beta.bin"); 
-  void* batch_normalization_52_beta =  readTrainedWeights(batch_normalization_52_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_mean_path =  dir_prefix + std::string("batch_normalization_52_mean.bin"); 
-  void* batch_normalization_52_mean =  readTrainedWeights(batch_normalization_52_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_variance_path =  dir_prefix + std::string("batch_normalization_52_variance.bin"); 
-  void* batch_normalization_52_variance =  readTrainedWeights(batch_normalization_52_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_53_w_path =  dir_prefix + std::string("conv2d_53_w.bin"); 
-  void* conv2d_53_w =  readTrainedWeights(conv2d_53_w_path.c_str(), 0,2048,512,1,1); 
-  std::string conv2d_53_b_path =  dir_prefix + std::string("conv2d_53_b.bin"); 
-  void* conv2d_53_b =  readTrainedWeights(conv2d_53_b_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_gamma_path =  dir_prefix + std::string("batch_normalization_53_gamma.bin"); 
-  void* batch_normalization_53_gamma =  readTrainedWeights(batch_normalization_53_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_beta_path =  dir_prefix + std::string("batch_normalization_53_beta.bin"); 
-  void* batch_normalization_53_beta =  readTrainedWeights(batch_normalization_53_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_mean_path =  dir_prefix + std::string("batch_normalization_53_mean.bin"); 
-  void* batch_normalization_53_mean =  readTrainedWeights(batch_normalization_53_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_variance_path =  dir_prefix + std::string("batch_normalization_53_variance.bin"); 
-  void* batch_normalization_53_variance =  readTrainedWeights(batch_normalization_53_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,1000); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,1000,1,1); 
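+  // Begin runtime memory tracking for the batched inference loop below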
+  startMemTracking();
 
+  int test_input_size = 500;
+  int batch_size = 100;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
 
+  for (int i = 0; i < batch_count; i++) {
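+    // Each iteration runs the full network on one batch of test inputs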
 
-  startMemTracking(); 
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
-  int test_input_size = 500; 
-  int batch_size = 100; 
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
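+    // Load one batch of 3x224x224 input images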
+    void *input =
+        readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
 
-  for(int i = 0; i < batch_count; i++){ 
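+    // Input stage: convolution, bias add, ReLU, pooling, and batch normalization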
+    void *var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1);
+    void *var_3 = tensorAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorRelu(var_3);
+    void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2);
+    void *var_6 = tensorBatchNorm(
+        var_5, batch_normalization_1_gamma, batch_normalization_1_beta,
+        batch_normalization_1_mean, batch_normalization_1_variance, 0.001);
+    void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1);
+    void *var_8 = tensorAdd(var_7, conv2d_2_b);
+    void *var_9 = tensorBatchNorm(
+        var_8, batch_normalization_2_gamma, batch_normalization_2_beta,
+        batch_normalization_2_mean, batch_normalization_2_variance, 0.001);
+    void *var_10 = tensorRelu(var_9);
+    void *var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1);
+    void *var_12 = tensorAdd(var_11, conv2d_3_b);
+    void *var_13 = tensorBatchNorm(
+        var_12, batch_normalization_3_gamma, batch_normalization_3_beta,
+        batch_normalization_3_mean, batch_normalization_3_variance, 0.001);
+    void *var_14 = tensorRelu(var_13);
+    void *var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1);
+    void *var_16 = tensorAdd(var_15, conv2d_4_b);
+    void *var_17 = tensorBatchNorm(
+        var_16, batch_normalization_4_gamma, batch_normalization_4_beta,
+        batch_normalization_4_mean, batch_normalization_4_variance, 0.001);
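+    // Shortcut branch: a separate convolution of var_6 is added back to the main path at var_21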
+    void *var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1);
+    void *var_19 = tensorAdd(var_18, conv2d_5_b);
+    void *var_20 = tensorBatchNorm(
+        var_19, batch_normalization_5_gamma, batch_normalization_5_beta,
+        batch_normalization_5_mean, batch_normalization_5_variance, 0.001);
+    void *var_21 = tensorAdd(var_17, var_20);
+    void *var_22 = tensorRelu(var_21);
+    void *var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1);
+    void *var_24 = tensorAdd(var_23, conv2d_6_b);
+    void *var_25 = tensorBatchNorm(
+        var_24, batch_normalization_6_gamma, batch_normalization_6_beta,
+        batch_normalization_6_mean, batch_normalization_6_variance, 0.001);
+    void *var_26 = tensorRelu(var_25);
+    void *var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1);
+    void *var_28 = tensorAdd(var_27, conv2d_7_b);
+    void *var_29 = tensorBatchNorm(
+        var_28, batch_normalization_7_gamma, batch_normalization_7_beta,
+        batch_normalization_7_mean, batch_normalization_7_variance, 0.001);
+    void *var_30 = tensorRelu(var_29);
+    void *var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1);
+    void *var_32 = tensorAdd(var_31, conv2d_8_b);
+    void *var_33 = tensorBatchNorm(
+        var_32, batch_normalization_8_gamma, batch_normalization_8_beta,
+        batch_normalization_8_mean, batch_normalization_8_variance, 0.001);
+    void *var_34 = tensorAdd(var_33, var_22);
+    void *var_35 = tensorRelu(var_34);
+    void *var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1);
+    void *var_37 = tensorAdd(var_36, conv2d_9_b);
+    void *var_38 = tensorBatchNorm(
+        var_37, batch_normalization_9_gamma, batch_normalization_9_beta,
+        batch_normalization_9_mean, batch_normalization_9_variance, 0.001);
+    void *var_39 = tensorRelu(var_38);
+    void *var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1);
+    void *var_41 = tensorAdd(var_40, conv2d_10_b);
+    void *var_42 = tensorBatchNorm(
+        var_41, batch_normalization_10_gamma, batch_normalization_10_beta,
+        batch_normalization_10_mean, batch_normalization_10_variance, 0.001);
+    void *var_43 = tensorRelu(var_42);
+    void *var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1);
+    void *var_45 = tensorAdd(var_44, conv2d_11_b);
+    void *var_46 = tensorBatchNorm(
+        var_45, batch_normalization_11_gamma, batch_normalization_11_beta,
+        batch_normalization_11_mean, batch_normalization_11_variance, 0.001);
+    void *var_47 = tensorAdd(var_46, var_35);
+    void *var_48 = tensorRelu(var_47);
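+    // Stride-2 convolution starts the next residual stage at reduced spatial resolution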
+    void *var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1);
+    void *var_50 = tensorAdd(var_49, conv2d_12_b);
+    void *var_51 = tensorBatchNorm(
+        var_50, batch_normalization_12_gamma, batch_normalization_12_beta,
+        batch_normalization_12_mean, batch_normalization_12_variance, 0.001);
+    void *var_52 = tensorRelu(var_51);
+    void *var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1);
+    void *var_54 = tensorAdd(var_53, conv2d_13_b);
+    void *var_55 = tensorBatchNorm(
+        var_54, batch_normalization_13_gamma, batch_normalization_13_beta,
+        batch_normalization_13_mean, batch_normalization_13_variance, 0.001);
+    void *var_56 = tensorRelu(var_55);
+    void *var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1);
+    void *var_58 = tensorAdd(var_57, conv2d_14_b);
+    void *var_59 = tensorBatchNorm(
+        var_58, batch_normalization_14_gamma, batch_normalization_14_beta,
+        batch_normalization_14_mean, batch_normalization_14_variance, 0.001);
+    void *var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1);
+    void *var_61 = tensorAdd(var_60, conv2d_15_b);
+    void *var_62 = tensorBatchNorm(
+        var_61, batch_normalization_15_gamma, batch_normalization_15_beta,
+        batch_normalization_15_mean, batch_normalization_15_variance, 0.001);
+    void *var_63 = tensorAdd(var_59, var_62);
+    void *var_64 = tensorRelu(var_63);
+    void *var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1);
+    void *var_66 = tensorAdd(var_65, conv2d_16_b);
+    void *var_67 = tensorBatchNorm(
+        var_66, batch_normalization_16_gamma, batch_normalization_16_beta,
+        batch_normalization_16_mean, batch_normalization_16_variance, 0.001);
+    void *var_68 = tensorRelu(var_67);
+    void *var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1);
+    void *var_70 = tensorAdd(var_69, conv2d_17_b);
+    void *var_71 = tensorBatchNorm(
+        var_70, batch_normalization_17_gamma, batch_normalization_17_beta,
+        batch_normalization_17_mean, batch_normalization_17_variance, 0.001);
+    void *var_72 = tensorRelu(var_71);
+    void *var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1);
+    void *var_74 = tensorAdd(var_73, conv2d_18_b);
+    void *var_75 = tensorBatchNorm(
+        var_74, batch_normalization_18_gamma, batch_normalization_18_beta,
+        batch_normalization_18_mean, batch_normalization_18_variance, 0.001);
+    void *var_76 = tensorAdd(var_75, var_64);
+    void *var_77 = tensorRelu(var_76);
+    void *var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1);
+    void *var_79 = tensorAdd(var_78, conv2d_19_b);
+    void *var_80 = tensorBatchNorm(
+        var_79, batch_normalization_19_gamma, batch_normalization_19_beta,
+        batch_normalization_19_mean, batch_normalization_19_variance, 0.001);
+    void *var_81 = tensorRelu(var_80);
+    void *var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1);
+    void *var_83 = tensorAdd(var_82, conv2d_20_b);
+    void *var_84 = tensorBatchNorm(
+        var_83, batch_normalization_20_gamma, batch_normalization_20_beta,
+        batch_normalization_20_mean, batch_normalization_20_variance, 0.001);
+    void *var_85 = tensorRelu(var_84);
+    void *var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1);
+    void *var_87 = tensorAdd(var_86, conv2d_21_b);
+    void *var_88 = tensorBatchNorm(
+        var_87, batch_normalization_21_gamma, batch_normalization_21_beta,
+        batch_normalization_21_mean, batch_normalization_21_variance, 0.001);
+    void *var_89 = tensorAdd(var_88, var_77);
+    void *var_90 = tensorRelu(var_89);
+    void *var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1);
+    void *var_92 = tensorAdd(var_91, conv2d_22_b);
+    void *var_93 = tensorBatchNorm(
+        var_92, batch_normalization_22_gamma, batch_normalization_22_beta,
+        batch_normalization_22_mean, batch_normalization_22_variance, 0.001);
+    void *var_94 = tensorRelu(var_93);
+    void *var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1);
+    void *var_96 = tensorAdd(var_95, conv2d_23_b);
+    void *var_97 = tensorBatchNorm(
+        var_96, batch_normalization_23_gamma, batch_normalization_23_beta,
+        batch_normalization_23_mean, batch_normalization_23_variance, 0.001);
+    void *var_98 = tensorRelu(var_97);
+    void *var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1);
+    void *var_100 = tensorAdd(var_99, conv2d_24_b);
+    void *var_101 = tensorBatchNorm(
+        var_100, batch_normalization_24_gamma, batch_normalization_24_beta,
+        batch_normalization_24_mean, batch_normalization_24_variance, 0.001);
+    void *var_102 = tensorAdd(var_101, var_90);
+    void *var_103 = tensorRelu(var_102);
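+    // Stride-2 downsampling begins another residual stage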
+    void *var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1);
+    void *var_105 = tensorAdd(var_104, conv2d_25_b);
+    void *var_106 = tensorBatchNorm(
+        var_105, batch_normalization_25_gamma, batch_normalization_25_beta,
+        batch_normalization_25_mean, batch_normalization_25_variance, 0.001);
+    void *var_107 = tensorRelu(var_106);
+    void *var_108 = tensorConvolution(var_107, conv2d_26_w, 1, 1, 1, 1, 1, 1);
+    void *var_109 = tensorAdd(var_108, conv2d_26_b);
+    void *var_110 = tensorBatchNorm(
+        var_109, batch_normalization_26_gamma, batch_normalization_26_beta,
+        batch_normalization_26_mean, batch_normalization_26_variance, 0.001);
+    void *var_111 = tensorRelu(var_110);
+    void *var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1);
+    void *var_113 = tensorAdd(var_112, conv2d_27_b);
+    void *var_114 = tensorBatchNorm(
+        var_113, batch_normalization_27_gamma, batch_normalization_27_beta,
+        batch_normalization_27_mean, batch_normalization_27_variance, 0.001);
+    void *var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1);
+    void *var_116 = tensorAdd(var_115, conv2d_28_b);
+    void *var_117 = tensorBatchNorm(
+        var_116, batch_normalization_28_gamma, batch_normalization_28_beta,
+        batch_normalization_28_mean, batch_normalization_28_variance, 0.001);
+    void *var_118 = tensorAdd(var_114, var_117);
+    void *var_119 = tensorRelu(var_118);
+    void *var_120 = tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1);
+    void *var_121 = tensorAdd(var_120, conv2d_29_b);
+    void *var_122 = tensorBatchNorm(
+        var_121, batch_normalization_29_gamma, batch_normalization_29_beta,
+        batch_normalization_29_mean, batch_normalization_29_variance, 0.001);
+    void *var_123 = tensorRelu(var_122);
+    void *var_124 = tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1);
+    void *var_125 = tensorAdd(var_124, conv2d_30_b);
+    void *var_126 = tensorBatchNorm(
+        var_125, batch_normalization_30_gamma, batch_normalization_30_beta,
+        batch_normalization_30_mean, batch_normalization_30_variance, 0.001);
+    void *var_127 = tensorRelu(var_126);
+    void *var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1);
+    void *var_129 = tensorAdd(var_128, conv2d_31_b);
+    void *var_130 = tensorBatchNorm(
+        var_129, batch_normalization_31_gamma, batch_normalization_31_beta,
+        batch_normalization_31_mean, batch_normalization_31_variance, 0.001);
+    void *var_131 = tensorAdd(var_130, var_119);
+    void *var_132 = tensorRelu(var_131);
+    void *var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1);
+    void *var_134 = tensorAdd(var_133, conv2d_32_b);
+    void *var_135 = tensorBatchNorm(
+        var_134, batch_normalization_32_gamma, batch_normalization_32_beta,
+        batch_normalization_32_mean, batch_normalization_32_variance, 0.001);
+    void *var_136 = tensorRelu(var_135);
+    void *var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1);
+    void *var_138 = tensorAdd(var_137, conv2d_33_b);
+    void *var_139 = tensorBatchNorm(
+        var_138, batch_normalization_33_gamma, batch_normalization_33_beta,
+        batch_normalization_33_mean, batch_normalization_33_variance, 0.001);
+    void *var_140 = tensorRelu(var_139);
+    void *var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1);
+    void *var_142 = tensorAdd(var_141, conv2d_34_b);
+    void *var_143 = tensorBatchNorm(
+        var_142, batch_normalization_34_gamma, batch_normalization_34_beta,
+        batch_normalization_34_mean, batch_normalization_34_variance, 0.001);
+    void *var_144 = tensorAdd(var_143, var_132);
+    void *var_145 = tensorRelu(var_144);
+    void *var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1);
+    void *var_147 = tensorAdd(var_146, conv2d_35_b);
+    void *var_148 = tensorBatchNorm(
+        var_147, batch_normalization_35_gamma, batch_normalization_35_beta,
+        batch_normalization_35_mean, batch_normalization_35_variance, 0.001);
+    void *var_149 = tensorRelu(var_148);
+    void *var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1);
+    void *var_151 = tensorAdd(var_150, conv2d_36_b);
+    void *var_152 = tensorBatchNorm(
+        var_151, batch_normalization_36_gamma, batch_normalization_36_beta,
+        batch_normalization_36_mean, batch_normalization_36_variance, 0.001);
+    void *var_153 = tensorRelu(var_152);
+    void *var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1);
+    void *var_155 = tensorAdd(var_154, conv2d_37_b);
+    void *var_156 = tensorBatchNorm(
+        var_155, batch_normalization_37_gamma, batch_normalization_37_beta,
+        batch_normalization_37_mean, batch_normalization_37_variance, 0.001);
+    void *var_157 = tensorAdd(var_156, var_145);
+    void *var_158 = tensorRelu(var_157);
+    void *var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1);
+    void *var_160 = tensorAdd(var_159, conv2d_38_b);
+    void *var_161 = tensorBatchNorm(
+        var_160, batch_normalization_38_gamma, batch_normalization_38_beta,
+        batch_normalization_38_mean, batch_normalization_38_variance, 0.001);
+    void *var_162 = tensorRelu(var_161);
+    void *var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1);
+    void *var_164 = tensorAdd(var_163, conv2d_39_b);
+    void *var_165 = tensorBatchNorm(
+        var_164, batch_normalization_39_gamma, batch_normalization_39_beta,
+        batch_normalization_39_mean, batch_normalization_39_variance, 0.001);
+    void *var_166 = tensorRelu(var_165);
+    void *var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1);
+    void *var_168 = tensorAdd(var_167, conv2d_40_b);
+    void *var_169 = tensorBatchNorm(
+        var_168, batch_normalization_40_gamma, batch_normalization_40_beta,
+        batch_normalization_40_mean, batch_normalization_40_variance, 0.001);
+    void *var_170 = tensorAdd(var_169, var_158);
+    void *var_171 = tensorRelu(var_170);
+    void *var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1);
+    void *var_173 = tensorAdd(var_172, conv2d_41_b);
+    void *var_174 = tensorBatchNorm(
+        var_173, batch_normalization_41_gamma, batch_normalization_41_beta,
+        batch_normalization_41_mean, batch_normalization_41_variance, 0.001);
+    void *var_175 = tensorRelu(var_174);
+    void *var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1);
+    void *var_177 = tensorAdd(var_176, conv2d_42_b);
+    void *var_178 = tensorBatchNorm(
+        var_177, batch_normalization_42_gamma, batch_normalization_42_beta,
+        batch_normalization_42_mean, batch_normalization_42_variance, 0.001);
+    void *var_179 = tensorRelu(var_178);
+    void *var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1);
+    void *var_181 = tensorAdd(var_180, conv2d_43_b);
+    void *var_182 = tensorBatchNorm(
+        var_181, batch_normalization_43_gamma, batch_normalization_43_beta,
+        batch_normalization_43_mean, batch_normalization_43_variance, 0.001);
+    void *var_183 = tensorAdd(var_182, var_171);
+    void *var_184 = tensorRelu(var_183);
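+    // Last stride-2 downsampling stage before the classifier head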
+    void *var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1);
+    void *var_186 = tensorAdd(var_185, conv2d_44_b);
+    void *var_187 = tensorBatchNorm(
+        var_186, batch_normalization_44_gamma, batch_normalization_44_beta,
+        batch_normalization_44_mean, batch_normalization_44_variance, 0.001);
+    void *var_188 = tensorRelu(var_187);
+    void *var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1);
+    void *var_190 = tensorAdd(var_189, conv2d_45_b);
+    void *var_191 = tensorBatchNorm(
+        var_190, batch_normalization_45_gamma, batch_normalization_45_beta,
+        batch_normalization_45_mean, batch_normalization_45_variance, 0.001);
+    void *var_192 = tensorRelu(var_191);
+    void *var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1);
+    void *var_194 = tensorAdd(var_193, conv2d_46_b);
+    void *var_195 = tensorBatchNorm(
+        var_194, batch_normalization_46_gamma, batch_normalization_46_beta,
+        batch_normalization_46_mean, batch_normalization_46_variance, 0.001);
+    void *var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1);
+    void *var_197 = tensorAdd(var_196, conv2d_47_b);
+    void *var_198 = tensorBatchNorm(
+        var_197, batch_normalization_47_gamma, batch_normalization_47_beta,
+        batch_normalization_47_mean, batch_normalization_47_variance, 0.001);
+    void *var_199 = tensorAdd(var_195, var_198);
+    void *var_200 = tensorRelu(var_199);
+    void *var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1);
+    void *var_202 = tensorAdd(var_201, conv2d_48_b);
+    void *var_203 = tensorBatchNorm(
+        var_202, batch_normalization_48_gamma, batch_normalization_48_beta,
+        batch_normalization_48_mean, batch_normalization_48_variance, 0.001);
+    void *var_204 = tensorRelu(var_203);
+    void *var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1);
+    void *var_206 = tensorAdd(var_205, conv2d_49_b);
+    void *var_207 = tensorBatchNorm(
+        var_206, batch_normalization_49_gamma, batch_normalization_49_beta,
+        batch_normalization_49_mean, batch_normalization_49_variance, 0.001);
+    void *var_208 = tensorRelu(var_207);
+    void *var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1);
+    void *var_210 = tensorAdd(var_209, conv2d_50_b);
+    void *var_211 = tensorBatchNorm(
+        var_210, batch_normalization_50_gamma, batch_normalization_50_beta,
+        batch_normalization_50_mean, batch_normalization_50_variance, 0.001);
+    void *var_212 = tensorAdd(var_211, var_200);
+    void *var_213 = tensorRelu(var_212);
+    void *var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1);
+    void *var_215 = tensorAdd(var_214, conv2d_51_b);
+    void *var_216 = tensorBatchNorm(
+        var_215, batch_normalization_51_gamma, batch_normalization_51_beta,
+        batch_normalization_51_mean, batch_normalization_51_variance, 0.001);
+    void *var_217 = tensorRelu(var_216);
+    void *var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1);
+    void *var_219 = tensorAdd(var_218, conv2d_52_b);
+    void *var_220 = tensorBatchNorm(
+        var_219, batch_normalization_52_gamma, batch_normalization_52_beta,
+        batch_normalization_52_mean, batch_normalization_52_variance, 0.001);
+    void *var_221 = tensorRelu(var_220);
+    void *var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1);
+    void *var_223 = tensorAdd(var_222, conv2d_53_b);
+    void *var_224 = tensorBatchNorm(
+        var_223, batch_normalization_53_gamma, batch_normalization_53_beta,
+        batch_normalization_53_mean, batch_normalization_53_variance, 0.001);
+    void *var_225 = tensorAdd(var_224, var_213);
+    void *var_226 = tensorRelu(var_225);
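+    // Classifier head: 7x7 pooling, a fully-connected (GEMM) layer, and softmax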
+    void *var_227 = tensorPooling(var_226, 1, 7, 7, 0, 0, 7, 7);
+    void *var_229 = tensorGemmGPU(var_227, dense_1_w);
+    void *var_230 = tensorAdd(var_229, dense_1_b);
+    void *var_231 = tensorSoftmax(var_230);
 
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
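+    // Read the ground-truth labels for this batch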
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
 
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); 
-
-    void* var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1); 
-    void* var_3 = tensorAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorRelu(var_3); 
-    void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); 
-    void* var_6 = tensorBatchNorm(var_5, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
-    void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-    void* var_8 = tensorAdd(var_7, conv2d_2_b); 
-    void* var_9 = tensorBatchNorm(var_8, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); 
-    void* var_12 = tensorAdd(var_11, conv2d_3_b); 
-    void* var_13 = tensorBatchNorm(var_12, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-    void* var_14 = tensorRelu(var_13); 
-    void* var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-    void* var_16 = tensorAdd(var_15, conv2d_4_b); 
-    void* var_17 = tensorBatchNorm(var_16, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-    void* var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-    void* var_19 = tensorAdd(var_18, conv2d_5_b); 
-    void* var_20 = tensorBatchNorm(var_19, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-    void* var_21 = tensorAdd(var_17, var_20); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-    void* var_24 = tensorAdd(var_23, conv2d_6_b); 
-    void* var_25 = tensorBatchNorm(var_24, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1); 
-    void* var_28 = tensorAdd(var_27, conv2d_7_b); 
-    void* var_29 = tensorBatchNorm(var_28, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-    void* var_32 = tensorAdd(var_31, conv2d_8_b); 
-    void* var_33 = tensorBatchNorm(var_32, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-    void* var_34 = tensorAdd(var_33, var_22); 
-    void* var_35 = tensorRelu(var_34); 
-    void* var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-    void* var_37 = tensorAdd(var_36, conv2d_9_b); 
-    void* var_38 = tensorBatchNorm(var_37, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-    void* var_39 = tensorRelu(var_38); 
-    void* var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1); 
-    void* var_41 = tensorAdd(var_40, conv2d_10_b); 
-    void* var_42 = tensorBatchNorm(var_41, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-    void* var_45 = tensorAdd(var_44, conv2d_11_b); 
-    void* var_46 = tensorBatchNorm(var_45, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-    void* var_47 = tensorAdd(var_46, var_35); 
-    void* var_48 = tensorRelu(var_47); 
-    void* var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1); 
-    void* var_50 = tensorAdd(var_49, conv2d_12_b); 
-    void* var_51 = tensorBatchNorm(var_50, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
-    void* var_52 = tensorRelu(var_51); 
-    void* var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1); 
-    void* var_54 = tensorAdd(var_53, conv2d_13_b); 
-    void* var_55 = tensorBatchNorm(var_54, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-    void* var_58 = tensorAdd(var_57, conv2d_14_b); 
-    void* var_59 = tensorBatchNorm(var_58, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
-    void* var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1); 
-    void* var_61 = tensorAdd(var_60, conv2d_15_b); 
-    void* var_62 = tensorBatchNorm(var_61, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
-    void* var_63 = tensorAdd(var_59, var_62); 
-    void* var_64 = tensorRelu(var_63); 
-    void* var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1); 
-    void* var_66 = tensorAdd(var_65, conv2d_16_b); 
-    void* var_67 = tensorBatchNorm(var_66, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-    void* var_68 = tensorRelu(var_67); 
-    void* var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1); 
-    void* var_70 = tensorAdd(var_69, conv2d_17_b); 
-    void* var_71 = tensorBatchNorm(var_70, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-    void* var_72 = tensorRelu(var_71); 
-    void* var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1); 
-    void* var_74 = tensorAdd(var_73, conv2d_18_b); 
-    void* var_75 = tensorBatchNorm(var_74, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
-    void* var_76 = tensorAdd(var_75, var_64); 
-    void* var_77 = tensorRelu(var_76); 
-    void* var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1); 
-    void* var_79 = tensorAdd(var_78, conv2d_19_b); 
-    void* var_80 = tensorBatchNorm(var_79, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
-    void* var_81 = tensorRelu(var_80); 
-    void* var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1); 
-    void* var_83 = tensorAdd(var_82, conv2d_20_b); 
-    void* var_84 = tensorBatchNorm(var_83, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-    void* var_85 = tensorRelu(var_84); 
-    void* var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1); 
-    void* var_87 = tensorAdd(var_86, conv2d_21_b); 
-    void* var_88 = tensorBatchNorm(var_87, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
-    void* var_89 = tensorAdd(var_88, var_77); 
-    void* var_90 = tensorRelu(var_89); 
-    void* var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1); 
-    void* var_92 = tensorAdd(var_91, conv2d_22_b); 
-    void* var_93 = tensorBatchNorm(var_92, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
-    void* var_94 = tensorRelu(var_93); 
-    void* var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1); 
-    void* var_96 = tensorAdd(var_95, conv2d_23_b); 
-    void* var_97 = tensorBatchNorm(var_96, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-    void* var_98 = tensorRelu(var_97); 
-    void* var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1); 
-    void* var_100 = tensorAdd(var_99, conv2d_24_b); 
-    void* var_101 = tensorBatchNorm(var_100, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-    void* var_102 = tensorAdd(var_101, var_90); 
-    void* var_103 = tensorRelu(var_102); 
-    void* var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1); 
-    void* var_105 = tensorAdd(var_104, conv2d_25_b); 
-    void* var_106 = tensorBatchNorm(var_105, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-    void* var_107 = tensorRelu(var_106); 
-    void* var_108 = tensorConvolution(var_107, conv2d_26_w, 1, 1, 1, 1, 1, 1); 
-    void* var_109 = tensorAdd(var_108, conv2d_26_b); 
-    void* var_110 = tensorBatchNorm(var_109, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-    void* var_111 = tensorRelu(var_110); 
-    void* var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1); 
-    void* var_113 = tensorAdd(var_112, conv2d_27_b); 
-    void* var_114 = tensorBatchNorm(var_113, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-    void* var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1); 
-    void* var_116 = tensorAdd(var_115, conv2d_28_b); 
-    void* var_117 = tensorBatchNorm(var_116, batch_normalization_28_gamma, batch_normalization_28_beta, batch_normalization_28_mean, batch_normalization_28_variance, 0.001); 
-    void* var_118 = tensorAdd(var_114, var_117); 
-    void* var_119 = tensorRelu(var_118); 
-    void* var_120 = tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1); 
-    void* var_121 = tensorAdd(var_120, conv2d_29_b); 
-    void* var_122 = tensorBatchNorm(var_121, batch_normalization_29_gamma, batch_normalization_29_beta, batch_normalization_29_mean, batch_normalization_29_variance, 0.001); 
-    void* var_123 = tensorRelu(var_122); 
-    void* var_124 = tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1); 
-    void* var_125 = tensorAdd(var_124, conv2d_30_b); 
-    void* var_126 = tensorBatchNorm(var_125, batch_normalization_30_gamma, batch_normalization_30_beta, batch_normalization_30_mean, batch_normalization_30_variance, 0.001); 
-    void* var_127 = tensorRelu(var_126); 
-    void* var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1); 
-    void* var_129 = tensorAdd(var_128, conv2d_31_b); 
-    void* var_130 = tensorBatchNorm(var_129, batch_normalization_31_gamma, batch_normalization_31_beta, batch_normalization_31_mean, batch_normalization_31_variance, 0.001); 
-    void* var_131 = tensorAdd(var_130, var_119); 
-    void* var_132 = tensorRelu(var_131); 
-    void* var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1); 
-    void* var_134 = tensorAdd(var_133, conv2d_32_b); 
-    void* var_135 = tensorBatchNorm(var_134, batch_normalization_32_gamma, batch_normalization_32_beta, batch_normalization_32_mean, batch_normalization_32_variance, 0.001); 
-    void* var_136 = tensorRelu(var_135); 
-    void* var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1); 
-    void* var_138 = tensorAdd(var_137, conv2d_33_b); 
-    void* var_139 = tensorBatchNorm(var_138, batch_normalization_33_gamma, batch_normalization_33_beta, batch_normalization_33_mean, batch_normalization_33_variance, 0.001); 
-    void* var_140 = tensorRelu(var_139); 
-    void* var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1); 
-    void* var_142 = tensorAdd(var_141, conv2d_34_b); 
-    void* var_143 = tensorBatchNorm(var_142, batch_normalization_34_gamma, batch_normalization_34_beta, batch_normalization_34_mean, batch_normalization_34_variance, 0.001); 
-    void* var_144 = tensorAdd(var_143, var_132); 
-    void* var_145 = tensorRelu(var_144); 
-    void* var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1); 
-    void* var_147 = tensorAdd(var_146, conv2d_35_b); 
-    void* var_148 = tensorBatchNorm(var_147, batch_normalization_35_gamma, batch_normalization_35_beta, batch_normalization_35_mean, batch_normalization_35_variance, 0.001); 
-    void* var_149 = tensorRelu(var_148); 
-    void* var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1); 
-    void* var_151 = tensorAdd(var_150, conv2d_36_b); 
-    void* var_152 = tensorBatchNorm(var_151, batch_normalization_36_gamma, batch_normalization_36_beta, batch_normalization_36_mean, batch_normalization_36_variance, 0.001); 
-    void* var_153 = tensorRelu(var_152); 
-    void* var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1); 
-    void* var_155 = tensorAdd(var_154, conv2d_37_b); 
-    void* var_156 = tensorBatchNorm(var_155, batch_normalization_37_gamma, batch_normalization_37_beta, batch_normalization_37_mean, batch_normalization_37_variance, 0.001); 
-    void* var_157 = tensorAdd(var_156, var_145); 
-    void* var_158 = tensorRelu(var_157); 
-    void* var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1); 
-    void* var_160 = tensorAdd(var_159, conv2d_38_b); 
-    void* var_161 = tensorBatchNorm(var_160, batch_normalization_38_gamma, batch_normalization_38_beta, batch_normalization_38_mean, batch_normalization_38_variance, 0.001); 
-    void* var_162 = tensorRelu(var_161); 
-    void* var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1); 
-    void* var_164 = tensorAdd(var_163, conv2d_39_b); 
-    void* var_165 = tensorBatchNorm(var_164, batch_normalization_39_gamma, batch_normalization_39_beta, batch_normalization_39_mean, batch_normalization_39_variance, 0.001); 
-    void* var_166 = tensorRelu(var_165); 
-    void* var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1); 
-    void* var_168 = tensorAdd(var_167, conv2d_40_b); 
-    void* var_169 = tensorBatchNorm(var_168, batch_normalization_40_gamma, batch_normalization_40_beta, batch_normalization_40_mean, batch_normalization_40_variance, 0.001); 
-    void* var_170 = tensorAdd(var_169, var_158); 
-    void* var_171 = tensorRelu(var_170); 
-    void* var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1); 
-    void* var_173 = tensorAdd(var_172, conv2d_41_b); 
-    void* var_174 = tensorBatchNorm(var_173, batch_normalization_41_gamma, batch_normalization_41_beta, batch_normalization_41_mean, batch_normalization_41_variance, 0.001); 
-    void* var_175 = tensorRelu(var_174); 
-    void* var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1); 
-    void* var_177 = tensorAdd(var_176, conv2d_42_b); 
-    void* var_178 = tensorBatchNorm(var_177, batch_normalization_42_gamma, batch_normalization_42_beta, batch_normalization_42_mean, batch_normalization_42_variance, 0.001); 
-    void* var_179 = tensorRelu(var_178); 
-    void* var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1); 
-    void* var_181 = tensorAdd(var_180, conv2d_43_b); 
-    void* var_182 = tensorBatchNorm(var_181, batch_normalization_43_gamma, batch_normalization_43_beta, batch_normalization_43_mean, batch_normalization_43_variance, 0.001); 
-    void* var_183 = tensorAdd(var_182, var_171); 
-    void* var_184 = tensorRelu(var_183); 
-    void* var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1); 
-    void* var_186 = tensorAdd(var_185, conv2d_44_b); 
-    void* var_187 = tensorBatchNorm(var_186, batch_normalization_44_gamma, batch_normalization_44_beta, batch_normalization_44_mean, batch_normalization_44_variance, 0.001); 
-    void* var_188 = tensorRelu(var_187); 
-    void* var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1); 
-    void* var_190 = tensorAdd(var_189, conv2d_45_b); 
-    void* var_191 = tensorBatchNorm(var_190, batch_normalization_45_gamma, batch_normalization_45_beta, batch_normalization_45_mean, batch_normalization_45_variance, 0.001); 
-    void* var_192 = tensorRelu(var_191); 
-    void* var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1); 
-    void* var_194 = tensorAdd(var_193, conv2d_46_b); 
-    void* var_195 = tensorBatchNorm(var_194, batch_normalization_46_gamma, batch_normalization_46_beta, batch_normalization_46_mean, batch_normalization_46_variance, 0.001); 
-    void* var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1); 
-    void* var_197 = tensorAdd(var_196, conv2d_47_b); 
-    void* var_198 = tensorBatchNorm(var_197, batch_normalization_47_gamma, batch_normalization_47_beta, batch_normalization_47_mean, batch_normalization_47_variance, 0.001); 
-    void* var_199 = tensorAdd(var_195, var_198); 
-    void* var_200 = tensorRelu(var_199); 
-    void* var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1); 
-    void* var_202 = tensorAdd(var_201, conv2d_48_b); 
-    void* var_203 = tensorBatchNorm(var_202, batch_normalization_48_gamma, batch_normalization_48_beta, batch_normalization_48_mean, batch_normalization_48_variance, 0.001); 
-    void* var_204 = tensorRelu(var_203); 
-    void* var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1); 
-    void* var_206 = tensorAdd(var_205, conv2d_49_b); 
-    void* var_207 = tensorBatchNorm(var_206, batch_normalization_49_gamma, batch_normalization_49_beta, batch_normalization_49_mean, batch_normalization_49_variance, 0.001); 
-    void* var_208 = tensorRelu(var_207); 
-    void* var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1); 
-    void* var_210 = tensorAdd(var_209, conv2d_50_b); 
-    void* var_211 = tensorBatchNorm(var_210, batch_normalization_50_gamma, batch_normalization_50_beta, batch_normalization_50_mean, batch_normalization_50_variance, 0.001); 
-    void* var_212 = tensorAdd(var_211, var_200); 
-    void* var_213 = tensorRelu(var_212); 
-    void* var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1); 
-    void* var_215 = tensorAdd(var_214, conv2d_51_b); 
-    void* var_216 = tensorBatchNorm(var_215, batch_normalization_51_gamma, batch_normalization_51_beta, batch_normalization_51_mean, batch_normalization_51_variance, 0.001); 
-    void* var_217 = tensorRelu(var_216); 
-    void* var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1); 
-    void* var_219 = tensorAdd(var_218, conv2d_52_b); 
-    void* var_220 = tensorBatchNorm(var_219, batch_normalization_52_gamma, batch_normalization_52_beta, batch_normalization_52_mean, batch_normalization_52_variance, 0.001); 
-    void* var_221 = tensorRelu(var_220); 
-    void* var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1); 
-    void* var_223 = tensorAdd(var_222, conv2d_53_b); 
-    void* var_224 = tensorBatchNorm(var_223, batch_normalization_53_gamma, batch_normalization_53_beta, batch_normalization_53_mean, batch_normalization_53_variance, 0.001); 
-    void* var_225 = tensorAdd(var_224, var_213); 
-    void* var_226 = tensorRelu(var_225); 
-    void* var_227 = tensorPooling(var_226,1,7,7,0,0,7,7); 
-    void* var_229 = tensorGemmGPU(var_227, dense_1_w); 
-    void* var_230 = tensorAdd(var_229, dense_1_b); 
-    void* var_231 = tensorSoftmax(var_230); 
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy3(labels, var_231); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
+    float accuracy = computeAccuracy3(labels, var_231);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
-
-
-  llvm_hpvm_cleanupTensorRt(); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
index a6dc7cbc11cf77357a749bff117489fc4b292941..034ddb0cf8d6b286544c669375a46746ad23d4d2 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
@@ -1,82 +1,103 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); 
 
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -85,77 +106,76 @@ int main(){
   int batch_count = test_input_size / batch_size;
   float final_accuracy = 0.0;
 
-  for(int i = 0; i < batch_count; i++){
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); 
- 
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorRelu(var_1); 
-    void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_5 = tensorAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_9 = tensorAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_13 = tensorAdd(var_12, conv2d_4_b); 
-    void* var_14 = tensorRelu(var_13); 
-    void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorRelu(var_17); 
-    void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_21 = tensorAdd(var_20, conv2d_6_b); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorAdd(var_24, conv2d_7_b); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); 
-    void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorAdd(var_28, conv2d_8_b); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_33 = tensorAdd(var_32, conv2d_9_b); 
-    void* var_34 = tensorRelu(var_33); 
-    void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-    void* var_37 = tensorAdd(var_36, conv2d_10_b); 
-    void* var_38 = tensorRelu(var_37); 
-    void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); 
-    void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_41 = tensorAdd(var_40, conv2d_11_b); 
-    void* var_42 = tensorRelu(var_41); 
-    void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_45 = tensorAdd(var_44, conv2d_12_b); 
-    void* var_46 = tensorRelu(var_45); 
-    void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_49 = tensorAdd(var_48, conv2d_13_b); 
-    void* var_50 = tensorRelu(var_49); 
-    void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); 
-    void* var_54 = tensorGemmGPU(var_51, dense_1_w); 
-    void* var_55 = tensorAdd(var_54, dense_1_b); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_58 = tensorGemmGPU(var_56, dense_2_w); 
-    void* var_59 = tensorAdd(var_58, dense_2_b); 
-    void* var_60 = tensorSoftmax(var_59); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_60); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorRelu(var_1);
+    void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorRelu(var_9);
+    void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorRelu(var_13);
+    void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorRelu(var_17);
+    void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorRelu(var_21);
+    void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorRelu(var_25);
+    void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorRelu(var_29);
+    void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorRelu(var_33);
+    void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorRelu(var_37);
+    void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorRelu(var_41);
+    void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorRelu(var_45);
+    void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorRelu(var_49);
+    void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorAdd(var_54, dense_1_b);
+    void *var_56 = tensorRelu(var_55);
+    void *var_58 = tensorGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-  
-  llvm_hpvm_cleanupTensorRt(); 
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
index 2539f8d8722909724a9dc2890e82f4f98853f5cd..94ca77329bc2f31d251590df3916d3cb10673fda 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
@@ -1,161 +1,181 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); 
-
-
-  startMemTracking(); 
-
-  int test_input_size = 5000; 
-  int batch_size = 5000;  
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
-
-  for(int i = 0; i < batch_count; i++){ 
-
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorRelu(var_1); 
-    void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_5 = tensorAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_9 = tensorAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_13 = tensorAdd(var_12, conv2d_4_b); 
-    void* var_14 = tensorRelu(var_13); 
-    void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorRelu(var_17); 
-    void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_21 = tensorAdd(var_20, conv2d_6_b); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorAdd(var_24, conv2d_7_b); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); 
-    void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorAdd(var_28, conv2d_8_b); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_33 = tensorAdd(var_32, conv2d_9_b); 
-    void* var_34 = tensorRelu(var_33); 
-    void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-    void* var_37 = tensorAdd(var_36, conv2d_10_b); 
-    void* var_38 = tensorRelu(var_37); 
-    void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); 
-    void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_41 = tensorAdd(var_40, conv2d_11_b); 
-    void* var_42 = tensorRelu(var_41); 
-    void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_45 = tensorAdd(var_44, conv2d_12_b); 
-    void* var_46 = tensorRelu(var_45); 
-    void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_49 = tensorAdd(var_48, conv2d_13_b); 
-    void* var_50 = tensorRelu(var_49); 
-    void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); 
-    void* var_54 = tensorGemmGPU(var_51, dense_1_w); 
-    void* var_55 = tensorAdd(var_54, dense_1_b); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_58 = tensorGemmGPU(var_56, dense_2_w); 
-    void* var_59 = tensorAdd(var_58, dense_2_b); 
-    void* var_60 = tensorSoftmax(var_59); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
+
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 5000;
+  int batch_size = 5000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorRelu(var_1);
+    void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorRelu(var_9);
+    void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorRelu(var_13);
+    void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorRelu(var_17);
+    void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorRelu(var_21);
+    void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorRelu(var_25);
+    void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorRelu(var_29);
+    void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorRelu(var_33);
+    void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorRelu(var_37);
+    void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorRelu(var_41);
+    void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorRelu(var_45);
+    void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorRelu(var_49);
+    void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorAdd(var_54, dense_1_b);
+    void *var_56 = tensorRelu(var_55);
+    void *var_58 = tensorGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60, 100);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  llvm_hpvm_cleanupTensorRt(); 
+  llvm_hpvm_cleanupTensorRt();
 
-  return 0; 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
index 1d78065c5725deae9c14fc97a699fc14f55ad8ef..c5da3faf7860df24e25293acaacc1c50bcdceb72 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
@@ -1,173 +1,193 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "tensor_runtime.h" 
-#include "utils.h" 
-
-
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-
-  std::string dir_prefix = std::string("/home/nvidia/sd_card/vgg16_imagenet_new/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,25088,4096); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_3_w_path =  dir_prefix + std::string("dense_3_w.bin"); 
-  void* dense_3_w =  readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); 
-  std::string dense_3_b_path =  dir_prefix + std::string("dense_3_b.bin"); 
-  void* dense_3_b =  readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); 
-
-
-
-  startMemTracking(); 
-
-  int test_input_size = 500; 
-  int batch_size = 100; 
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
-
-  for(int i = 0; i < batch_count; i++){ 
-
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); 
-
-    void* var_1 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
-    void* var_2 = tensorAdd(var_1, conv2d_1_b); 
-    void* var_3 = tensorRelu(var_2); 
-    void* var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1); 
-    void* var_5 = tensorAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1); 
-    void* var_9 = tensorAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_11 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 1); 
-    void* var_12 = tensorAdd(var_11, conv2d_4_b); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_14 = tensorPooling(var_13,0,2,2,0,0,2,2); 
-    void* var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1); 
-    void* var_16 = tensorAdd(var_15, conv2d_5_b); 
-    void* var_17 = tensorRelu(var_16); 
-    void* var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1); 
-    void* var_19 = tensorAdd(var_18, conv2d_6_b); 
-    void* var_20 = tensorRelu(var_19); 
-    void* var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1); 
-    void* var_22 = tensorAdd(var_21, conv2d_7_b); 
-    void* var_23 = tensorRelu(var_22); 
-    void* var_24 = tensorPooling(var_23,0,2,2,0,0,2,2); 
-    void* var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1); 
-    void* var_26 = tensorAdd(var_25, conv2d_8_b); 
-    void* var_27 = tensorRelu(var_26); 
-    void* var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 1, 1, 1); 
-    void* var_29 = tensorAdd(var_28, conv2d_9_b); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1); 
-    void* var_32 = tensorAdd(var_31, conv2d_10_b); 
-    void* var_33 = tensorRelu(var_32); 
-    void* var_34 = tensorPooling(var_33,0,2,2,0,0,2,2); 
-    void* var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 1, 1, 1, 1); 
-    void* var_36 = tensorAdd(var_35, conv2d_11_b); 
-    void* var_37 = tensorRelu(var_36); 
-    void* var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1); 
-    void* var_39 = tensorAdd(var_38, conv2d_12_b); 
-    void* var_40 = tensorRelu(var_39); 
-    void* var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1); 
-    void* var_42 = tensorAdd(var_41, conv2d_13_b); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorPooling(var_43,0,2,2,0,0,2,2); 
-    void* var_46 = tensorGemmGPU(var_44, dense_1_w); 
-    void* var_47 = tensorAdd(var_46, dense_1_b); 
-    void* var_48 = tensorRelu(var_47); 
-    void* var_49 = tensorGemmGPU(var_48, dense_2_w); 
-    void* var_50 = tensorAdd(var_49, dense_2_b); 
-    void* var_51 = tensorRelu(var_50); 
-    void* var_52 = tensorGemmGPU(var_51, dense_3_w); 
-    void* var_53 = tensorAdd(var_52, dense_3_b); 
-    void* var_54 = tensorSoftmax(var_53); 
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy3(labels, var_54); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
-  }
-
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
 
+#include "tensor_runtime.h"
+#include "utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix =
+      std::string("/home/nvidia/sd_card/vgg16_imagenet_new/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 25088, 4096);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b =
+      readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b =
+      readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin");
+  void *dense_3_w =
+      readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000);
+  std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
+  void *dense_3_b =
+      readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 500;
+  int batch_size = 100;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input =
+        readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
+
+    void *var_1 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1);
+    void *var_2 = tensorAdd(var_1, conv2d_1_b);
+    void *var_3 = tensorRelu(var_2);
+    void *var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1);
+    void *var_5 = tensorAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1);
+    void *var_9 = tensorAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorRelu(var_9);
+    void *var_11 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 1);
+    void *var_12 = tensorAdd(var_11, conv2d_4_b);
+    void *var_13 = tensorRelu(var_12);
+    void *var_14 = tensorPooling(var_13, 0, 2, 2, 0, 0, 2, 2);
+    void *var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1);
+    void *var_16 = tensorAdd(var_15, conv2d_5_b);
+    void *var_17 = tensorRelu(var_16);
+    void *var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1);
+    void *var_19 = tensorAdd(var_18, conv2d_6_b);
+    void *var_20 = tensorRelu(var_19);
+    void *var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1);
+    void *var_22 = tensorAdd(var_21, conv2d_7_b);
+    void *var_23 = tensorRelu(var_22);
+    void *var_24 = tensorPooling(var_23, 0, 2, 2, 0, 0, 2, 2);
+    void *var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1);
+    void *var_26 = tensorAdd(var_25, conv2d_8_b);
+    void *var_27 = tensorRelu(var_26);
+    void *var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 1, 1, 1);
+    void *var_29 = tensorAdd(var_28, conv2d_9_b);
+    void *var_30 = tensorRelu(var_29);
+    void *var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1);
+    void *var_32 = tensorAdd(var_31, conv2d_10_b);
+    void *var_33 = tensorRelu(var_32);
+    void *var_34 = tensorPooling(var_33, 0, 2, 2, 0, 0, 2, 2);
+    void *var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 1, 1, 1, 1);
+    void *var_36 = tensorAdd(var_35, conv2d_11_b);
+    void *var_37 = tensorRelu(var_36);
+    void *var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1);
+    void *var_39 = tensorAdd(var_38, conv2d_12_b);
+    void *var_40 = tensorRelu(var_39);
+    void *var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1);
+    void *var_42 = tensorAdd(var_41, conv2d_13_b);
+    void *var_43 = tensorRelu(var_42);
+    void *var_44 = tensorPooling(var_43, 0, 2, 2, 0, 0, 2, 2);
+    void *var_46 = tensorGemmGPU(var_44, dense_1_w);
+    void *var_47 = tensorAdd(var_46, dense_1_b);
+    void *var_48 = tensorRelu(var_47);
+    void *var_49 = tensorGemmGPU(var_48, dense_2_w);
+    void *var_50 = tensorAdd(var_49, dense_2_b);
+    void *var_51 = tensorRelu(var_50);
+    void *var_52 = tensorGemmGPU(var_51, dense_3_w);
+    void *var_53 = tensorAdd(var_52, dense_3_b);
+    void *var_54 = tensorSoftmax(var_53);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy3(labels, var_54);
+    final_accuracy += accuracy;
+    freeBatchMemory();
+  }
 
-  llvm_hpvm_cleanupTensorRt(); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
index 3b08755172973d63132bcd1c5b19d9e58ec38611..ea959342a4ac034deeba4191faa6620f2ec81037 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
@@ -10,10 +10,7 @@
 
 using namespace std;
 
-
-
-
-class UnitTestResults{
+class UnitTestResults {
 
 private:
   unsigned int total_tests;
@@ -22,48 +19,46 @@ private:
   std::vector<string> failed_test_ids;
 
 public:
-
-  UnitTestResults(){
+  UnitTestResults() {
     total_tests = 0;
     failed_tests = 0;
     passed_tests = 0;
   }
 
-  void evalTestResult(Tensor* res, const float* expected_result, size_t num_elems,
-		      float epsilon, string test_name){
+  void evalTestResult(Tensor *res, const float *expected_result,
+                      size_t num_elems, float epsilon, string test_name) {
 
-    total_tests += 1;      
-    if(res->num_elems != num_elems){
+    total_tests += 1;
+    if (res->num_elems != num_elems) {
       failed_tests += 1;
       failed_test_ids.push_back(test_name);
       return;
     }
 
-    float* data_ptr = (float*) res->host_data;
-    for (unsigned int i = 0; i < res->num_elems; i++){
-      //printf("**diff value = %f ", std::abs(data_ptr[i] - expected_result[i]));
-      if (std::abs(data_ptr[i] - expected_result[i]) > epsilon){
-	failed_tests += 1;
-	failed_test_ids.push_back(test_name);
+    float *data_ptr = (float *)res->host_data;
+    for (unsigned int i = 0; i < res->num_elems; i++) {
+      // printf("**diff value = %f ", std::abs(data_ptr[i] -
+      // expected_result[i]));
+      if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) {
+        failed_tests += 1;
+        failed_test_ids.push_back(test_name);
         return;
       }
     }
-    
-    passed_tests += 1;    
+
+    passed_tests += 1;
   }
 
-  void compareTensors(Tensor* res, Tensor* gold_res,
-		      float epsilon, string test_name){
+  void compareTensors(Tensor *res, Tensor *gold_res, float epsilon,
+                      string test_name) {
 
-    const float* expected_result = (float*) gold_res->host_data;
+    const float *expected_result = (float *)gold_res->host_data;
     unsigned int num_elems = res->num_elems;
 
     evalTestResult(res, expected_result, num_elems, epsilon, test_name);
-    
   }
 
-
-  void printSummary(){
+  void printSummary() {
 
     printf("\n\n\n ************* Printing Results Summary ********** \n\n");
     printf("-- Total tests :=  %d \n", total_tests);
@@ -71,147 +66,136 @@ public:
     printf("-- Tests Failed := %d \n", failed_tests);
 
     printf("\n\n Tests that failed : \n\n");
-    for (int i = 0; i < failed_test_ids.size(); i++){
+    for (int i = 0; i < failed_test_ids.size(); i++) {
       printf("*** Test = %s \n", failed_test_ids[i].c_str());
     }
   }
-  
 };
 
-
-
-
-void testTensorHgemm(UnitTestResults& unitTestResults){
+void testTensorHgemm(UnitTestResults &unitTestResults) {
 
   printf("***** TensorHgemm ***** \n\n");
-  void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
-  struct Tensor* lhs = (struct Tensor*) lhs_ptr;
+  void *lhs_ptr =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
+  struct Tensor *lhs = (struct Tensor *)lhs_ptr;
   fillTensorWithOnes(lhs);
-  
-  float* data_arr = (float*) lhs->host_data;
-  for(int i = 0; i < lhs->num_elems; i++){
+
+  float *data_arr = (float *)lhs->host_data;
+  for (int i = 0; i < lhs->num_elems; i++) {
     data_arr[i] = (i / 4) + 1;
   }
-  
-  void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
+
+  void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
   fillTensorWithOnes(rhs);
-  
-  void* output = tensorHalfGemm(lhs, rhs);
-  convertToFP32((struct Tensor*) output);
+
+  void *output = tensorHalfGemm(lhs, rhs);
+  convertToFP32((struct Tensor *)output);
 
   printTensorValues(output);
 
-  const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20};
+  const float expected_result[15] = {4,  4,  4,  8,  8,  8,  12, 12,
+                                     12, 16, 16, 16, 20, 20, 20};
 
-  unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Hgemm");
+  unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01,
+                                 "Hgemm");
 }
 
-
-
-void testTensorSgemm(UnitTestResults& unitTestResults){
+void testTensorSgemm(UnitTestResults &unitTestResults) {
 
   printf("***** TensorSgemm ***** \n\n");
-  void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
-  struct Tensor* lhs = (struct Tensor*) lhs_ptr;
+  void *lhs_ptr =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
+  struct Tensor *lhs = (struct Tensor *)lhs_ptr;
   fillTensorWithOnes(lhs);
- 
-  float* data_arr = (float*) lhs->host_data;
-  for(int i = 0; i < lhs->num_elems; i++){
+
+  float *data_arr = (float *)lhs->host_data;
+  for (int i = 0; i < lhs->num_elems; i++) {
     data_arr[i] = (i / 4) + 1;
   }
 
-  void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);  
+  void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
   fillTensorWithOnes(rhs);
-  
-  void* output = tensorGemmGPU(lhs, rhs);
-  printTensorValues(output);
 
-  const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20};
+  void *output = tensorGemmGPU(lhs, rhs);
+  printTensorValues(output);
 
-  unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Sgemm");
+  const float expected_result[15] = {4,  4,  4,  8,  8,  8,  12, 12,
+                                     12, 16, 16, 16, 20, 20, 20};
 
+  unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01,
+                                 "Sgemm");
 }
 
+void testTensorConcatAndSplit() {
 
+  int conv_mode = 1;         // CROSS_CORRELATION mode
+  int compute_precision = 0; // floating point precision
 
-
-
-void testTensorConcatAndSplit(){
-
-  int conv_mode = 1;  // CROSS_CORRELATION mode
-  int compute_precision = 0; // floating point precision 
-  
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
   fillWithOnesAndTwos(input);
-  void** splits = tensorSplit(input, 2, 1);
+  void **splits = tensorSplit(input, 2, 1);
 
-  void* conv2W = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
+  void *conv2W =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
   fillTensorWithOnes(conv2W);
-		     
-  void** conv2fils = tensorSplit(conv2W, 2, 0);
 
-  void* conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0,
-				       1, 1, conv_mode, compute_precision);
+  void **conv2fils = tensorSplit(conv2W, 2, 0);
+
+  void *conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, 1, 1,
+                                       conv_mode, compute_precision);
   printTensorDims(conv2a_out);
 
-  void* conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0,
-				       1, 1, conv_mode, compute_precision);
+  void *conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, 1, 1,
+                                       conv_mode, compute_precision);
   printTensorDims(conv2b_out);
- 
-  void* conv2_outs[2];
+
+  void *conv2_outs[2];
   conv2_outs[0] = conv2a_out;
   conv2_outs[1] = conv2b_out;
 
-  void* conv2_concat_out = tensorConcat(conv2_outs, 2, 1);
+  void *conv2_concat_out = tensorConcat(conv2_outs, 2, 1);
   printTensorDims(conv2_concat_out);
   printTensorValues(conv2_concat_out);
-  
 }
 
+void testLRN() {
 
-
-
-
-
-void testLRN(){
-
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20);
+  void *input =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20);
   fillTensorWithOnes(input);
 
   unsigned LRN_window = 5;
   double LRN_alpha = 2e-05;
   printf("LRN_alpha = %f \n", LRN_alpha);
-  
+
   double LRN_beta = 0.75;
   double LRN_k = 1.0;
 
   // TEST-point - Compare TF vs CUDNN
-  void* lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k);
+  void *lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k);
   printTensorDims(lrn1out);
   dumpWeightsToFile("tensors_out/lrn1_test.out", lrn1out);
 
-  void* input2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7);
+  void *input2 =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7);
   fillTensorWithOnes(input2);
 
   LRN_window = 5;
   LRN_alpha = 0.5 * LRN_window;
-  
+
   LRN_beta = 0.75;
   LRN_k = 1.0;
 
-  void* lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k);
+  void *lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k);
   printTensorDims(lrn2out);
-  dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out); 
+  dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out);
 }
 
-
-
-
-void testTensorAdd(){
+void testTensorAdd() {
 
   // Tensor add with equal dimensions
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
-  void* bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
+  void *bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
   fillTensorWithOnes(x);
   fillTensorWithOnes(bias);
 
@@ -222,8 +206,8 @@ void testTensorAdd(){
   printTensorValues(x);
 
   // Tensor add with matching channel dimension
-  void* x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2);
-  void* bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1);
+  void *x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2);
+  void *bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1);
   fillTensorWithOnes(x2);
   fillTensorWithOnes(bias2);
 
@@ -231,209 +215,181 @@ void testTensorAdd(){
   printTensorValues(x2);
 }
 
+void testTensorConv() {
 
-void testTensorError(){
-
-  // Tensor add with equal dimensions
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 128);
-  fillTensorWithOnes(x);
-
-  Tensor* x_tensor = (Tensor*) x;
-  float* data_arr = (float*) x_tensor->host_data;
-  for(int i = 0; i < x_tensor->num_elems; i++){
-    data_arr[i] = 0.2;
-  }
-  
-  tensorAddError(x, 3);
-  printTensorValues(x);
-}
-
-
-void testTensorConv(){
-
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
 
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
-  int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
+  int conv_mode = 1;         // NOTE: uses CROSS_CORRELATION
   int compute_precision = 0; // floating point precision for conv
-  
-  void* conv_out = tensorConvolution(input, filter, 0, 0,
-				  1, 1, conv_mode, compute_precision);
-  printTensorValues(conv_out);
 
+  void *conv_out = tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode,
+                                     compute_precision);
+  printTensorValues(conv_out);
 }
 
+void testTensorHalfConv() {
 
-void testTensorHalfConv(){
-
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
 
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
-  int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
+  int conv_mode = 1;         // NOTE: uses CROSS_CORRELATION
   int compute_precision = 0; // floating point precision for conv
-  
-  void* conv_out = tensorHalfConvolution(input, filter, 0, 0,
-					 1, 1, conv_mode, compute_precision);
-  printTensorValues(conv_out);
 
+  void *conv_out = tensorHalfConvolution(input, filter, 0, 0, 1, 1, conv_mode,
+                                         compute_precision);
+  printTensorValues(conv_out);
 }
 
+void testTensorGroupConv() {
 
+  // NOTE: The input channel count value (param2 to Tensor and Filter) must be
+  // the same
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
 
-
-void testTensorGroupConv(){
-
-  // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
-
-  // FIXIT: fillTensor* calls should be replaced with initTensorValue(tenosor, val)
+  // FIXIT: fillTensor* calls should be replaced with initTensorValue(tensor,
+  // val)
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
   int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
   int conv_groups = 2;
-  
-  void* conv_out = tensorConvolution(input, filter,
-	                             0, 0,
-				     1, 1,
-				     conv_mode, conv_groups);
+
+  void *conv_out =
+      tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups);
   printTensorValues(conv_out);
-  
 }
 
+void testTensorHalfGroupConv() {
 
-void testTensorHalfGroupConv(){
-
-  // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
+  // NOTE: The input channel count value (param2 to Tensor and Filter) must be
+  // the same
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
 
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
   int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
   int conv_groups = 2;
-  
-  void* conv_out = tensorConvolution(input, filter,
-	                             0, 0,
-				     1, 1,
-				     conv_mode, conv_groups);
-  
-  convertToFP32((struct Tensor*) conv_out);
+
+  void *conv_out =
+      tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups);
+
+  convertToFP32((struct Tensor *)conv_out);
 
   printTensorValues(conv_out);
 }
 
+void testTensorPooling() {
 
-void testTensorPooling(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
   fillTensorWithOnes(x);
 
-  float* data_arr = (float*) ((Tensor*) x)->host_data;
-  for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){
+  float *data_arr = (float *)((Tensor *)x)->host_data;
+  for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) {
     data_arr[i] = i;
   }
 
-  void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
+  void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
   printTensorValues(output);
 }
 
+void testTensorHalfPooling() {
 
-void testTensorHalfPooling(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
   fillTensorWithOnes(x);
 
-  float* data_arr = (float*) ((Tensor*) x)->host_data;
-  for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){
+  float *data_arr = (float *)((Tensor *)x)->host_data;
+  for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) {
     data_arr[i] = i;
   }
 
-  void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
-  convertToFP32((struct Tensor*) output);
+  void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
+  convertToFP32((struct Tensor *)output);
 
   printTensorValues(output);
 }
 
+void testTensorBatchNorm() {
 
-void testTensorBatchNorm(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
   fillTensorWithVal(x, 3);
 
-  void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(gamma, 1);
 
-  void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(beta, 0);
 
-  void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(mean, 1);
 
-  void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *variance =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(variance, 1);
 
   double epsilon = 1;
   // NOTE: result = (X - mean) / sqrt(epsilon + variance)
-  void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
+  void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
 
-  printTensorValues(output);  
+  printTensorValues(output);
 }
 
+void testTensorHalfBatchNorm() {
 
-void testTensorHalfBatchNorm(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
   fillTensorWithVal(x, 3);
 
-  void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(gamma, 1);
 
-  void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(beta, 0);
 
-  void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(mean, 1);
 
-  void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *variance =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(variance, 1);
 
-
   double epsilon = 1;
   // NOTE: result = (X - mean) / sqrt(epsilon + variance)
-  void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);  
-  convertToFP32((struct Tensor*) output);
+  void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
+  convertToFP32((struct Tensor *)output);
 
-  printTensorValues(output);  
+  printTensorValues(output);
 }
 
+void testTensorRelu() {
 
-void testTensorRelu(){
-
-  // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match 
+  // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match
   printf("***** TensorRelu ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
   fillTensorWithNegOnes(input);
 
-  void* output = tensorRelu(input);
+  void *output = tensorRelu(input);
   printTensorValues(output);
 }
 
-
-void testTensorSoftmax(){
+void testTensorSoftmax() {
 
   printf("***** TensorSoftmax ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
   host_ptr[0] = 0.1;
   host_ptr[1] = 0.2;
   host_ptr[2] = 0.3;
@@ -443,39 +399,36 @@ void testTensorSoftmax(){
   host_ptr[6] = 0.7;
   host_ptr[7] = 2.5;
 
-  void* output = tensorSoftmax(input);
+  void *output = tensorSoftmax(input);
   printTensorValues(output);
 }
 
+void testSoftmaxOutput(void *output_ptr) {
 
-void testSoftmaxOutput(void* output_ptr){
+  struct Tensor *output = (struct Tensor *)output_ptr;
 
-  struct Tensor* output = (struct Tensor*) output_ptr;
-  
   size_t batch_dim = output->dims.dim_sizes[0];
   size_t channels = output->dims.dim_sizes[1];
 
-  float* data = (float*) output->host_data;
-  for(int i = 0; i < batch_dim; i++){
+  float *data = (float *)output->host_data;
+  for (int i = 0; i < batch_dim; i++) {
     float sum = 0.0;
-    for(int j = 0; j < channels; j++){
+    for (int j = 0; j < channels; j++) {
       sum += data[i * channels + j];
     }
     printf("output_sum = %f \n", sum);
   }
-  
 }
 
-
-
-void testPromiseError(){
+void testPromiseError() {
 
   printf("***** TensorQuantize ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
 
-  void* gold_tensor = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
-  float* gold_ptr = (float*) ((struct Tensor*) gold_tensor)->host_data;
+  void *gold_tensor =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
+  float *gold_ptr = (float *)((struct Tensor *)gold_tensor)->host_data;
 
   gold_ptr[0] = -1;
   gold_ptr[1] = -2;
@@ -490,21 +443,20 @@ void testPromiseError(){
   gold_ptr[10] = 1;
   gold_ptr[11] = 1;
 
-
   int num_elems = 12;
   int num_runs = 1000;
 
-  float* result_ptr = (float*) malloc(sizeof(float) * num_elems);
+  float *result_ptr = (float *)malloc(sizeof(float) * num_elems);
 
-  for (int swing = 1; swing <= 7; swing++){
+  for (int swing = 1; swing <= 7; swing++) {
 
-    for (int j = 0; j < num_elems; j++){
-      result_ptr[j] = 0; 
+    for (int j = 0; j < num_elems; j++) {
+      result_ptr[j] = 0;
     }
 
     float error_sum = 0.0;
-    
-    for (int i = 0; i < 1000; i++){
+
+    for (int i = 0; i < 1000; i++) {
       host_ptr[0] = -1;
       host_ptr[1] = -2;
       host_ptr[2] = -3;
@@ -517,43 +469,39 @@ void testPromiseError(){
       host_ptr[9] = 2;
       host_ptr[10] = 1;
       host_ptr[11] = 1;
- 
-      void* error_out = addPromiseError(input, swing);
-      //printTensorValues(error_out);
+
+      void *error_out = addPromiseError(input, swing);
+      // printTensorValues(error_out);
 
       // Move result data back to the host
       hpvm_request_tensor(input, 0);
-      float* error_out_ptr = (float*) ((struct Tensor*) input)->host_data;
+      float *error_out_ptr = (float *)((struct Tensor *)input)->host_data;
 
-      for (int j = 0; j < num_elems; j++){
-	result_ptr[j] += error_out_ptr[j];
-	error_sum += (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]); 
+      for (int j = 0; j < num_elems; j++) {
+        result_ptr[j] += error_out_ptr[j];
+        error_sum +=
+            (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]);
       }
     }
 
-    printf ("\n\n - Swing %d results : \n", swing);
-    for (int j = 0; j < num_elems; j++){
+    printf("\n\n - Swing %d results : \n", swing);
+    for (int j = 0; j < num_elems; j++) {
       result_ptr[j] = result_ptr[j] / num_runs;
       printf(" %f ", result_ptr[j]);
     }
 
     printf("mean_error = %f \n", error_sum / num_runs);
-    
+
     printf(" \n");
   }
-  
-  
 }
 
-
-
-
-void testQuantization(){
+void testQuantization() {
 
   printf("***** TensorQuantize ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
   host_ptr[0] = -0.1;
   host_ptr[1] = -25;
   host_ptr[2] = 0.2;
@@ -566,13 +514,12 @@ void testQuantization(){
   host_ptr[9] = 7.2;
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
- 
 
-  void* quantize_result1 = quantizeTensorPromise(input, -4, 6);
+  void *quantize_result1 = quantizeTensorPromise(input, -4, 6);
 
-  printf ("\n ** quantizing with range min = %d max = %d \n", -4, 6);
+  printf("\n ** quantizing with range min = %d max = %d \n", -4, 6);
   printTensorValues(quantize_result1);
-  
+
   host_ptr[0] = -0.1;
   host_ptr[1] = -25;
   host_ptr[2] = 0.2;
@@ -586,9 +533,9 @@ void testQuantization(){
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
 
-  void* quantize_result2 = quantizeTensorPromise(input, -2, 2);
+  void *quantize_result2 = quantizeTensorPromise(input, -2, 2);
 
-  printf ("\n ** quantizing with range min = %d max = %d \n", -2, 2);
+  printf("\n ** quantizing with range min = %d max = %d \n", -2, 2);
   printTensorValues(quantize_result2);
 
   host_ptr[0] = -0.1;
@@ -604,13 +551,12 @@ void testQuantization(){
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
 
+  void *quantize_result3 = quantizeTensorPromise(input, -25, 8);
 
-  void* quantize_result3 = quantizeTensorPromise(input, -25, 8);
-
-  printf ("\n ** quantizing with range min = %d max = %d \n", -25, 8);
+  printf("\n ** quantizing with range min = %d max = %d \n", -25, 8);
   printTensorValues(quantize_result3);
 
-  printf ("\n ** quantizing with range min = %d max = %d \n", -10, 10);
+  printf("\n ** quantizing with range min = %d max = %d \n", -10, 10);
 
   host_ptr[0] = -0.1;
   host_ptr[1] = -25;
@@ -625,30 +571,26 @@ void testQuantization(){
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
 
-
-  void* quantize_result4 = quantizeTensorPromise(input, -10, 10);  
+  void *quantize_result4 = quantizeTensorPromise(input, -10, 10);
   printTensorValues(quantize_result4);
 
-
-  void* quantize_result5 = quantizeTensorPromise(input, -10, 10);
+  void *quantize_result5 = quantizeTensorPromise(input, -10, 10);
   printTensorValues(quantize_result5);
-  
-  //void* error_out = addPromiseError(quantize_result, 1);
-  //printTensorValues(error_out);
 
+  // void* error_out = addPromiseError(quantize_result, 1);
+  // printTensorValues(error_out);
 }
 
-
-
-
-void testSampleFilter(){
+void testSampleFilter() {
 
   printf("***** Tensor Sample Filter ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
-  //fillTensorWithVal(input, 3);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  // fillTensorWithVal(input, 3);
   fillWithOnesAndTwos(input);
-  
-  Tensor* input2 = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, 2, 32, 32);
+
+  Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
+                                            3, 2, 32, 32);
   fillTensorWithVal(input2, 1);
 
   /*  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
@@ -667,7 +609,7 @@ void testSampleFilter(){
   /*  printf("\n\n");
 
   hpvm_request_tensor(input, DEVICE);
-    
+
   sampleFilter(input, 2, 1);
 
   hpvm_request_tensor(input, HOST);
@@ -675,116 +617,81 @@ void testSampleFilter(){
   printTensorValues(input);
   */
 
-  void* exact_res = tensorConvolution(input2, input, 0, 0,
-				      1, 1, 1, 1);
+  void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1);
   printTensorValues(exact_res);
-  
-  void* res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0);
-
-  //void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3);
- 
-  printTensorValues(res);
-  
-}
 
+  void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0);
 
+  // void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3);
 
+  printTensorValues(res);
+}
 
-
-void testPerforationCalls(void* input, void* filter,
-			  int pad_h, int pad_w,
-			  int stride_h, int stride_w,
-			  int row, int col){
-
+void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
+                          int stride_h, int stride_w, int row, int col) {
 
   float interpolation_rate = 1.0;
-  for (int offset = 0; offset < 2; offset++){
-  
-      printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d row = %d col = %d  offset= %d \n\n",
-	     pad_h, pad_w, stride_h, stride_w, row, col, offset);
-
-    
-      void* res_exact = tensorConvolution(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1);
-
-      printf ("tensorConvolution Result :");
-      printTensorValues(res_exact);
+  for (int offset = 0; offset < 2; offset++) {
 
+    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
+           "row = %d col = %d  offset= %d \n\n",
+           pad_h, pad_w, stride_h, stride_w, row, col, offset);
 
-      void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1, 1, 1, 1, 1);
+    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1);
 
-      printf ("\nBaseline Result :");
-      printTensorValues(res_exact2);
+    printf("tensorConvolution Result :");
+    printTensorValues(res_exact);
 
+    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, 1, 1, 1, 1);
 
-      void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w,
-					       stride_h, stride_w,
-					       1, 1, 1, 1, 1, 1);
-      convertToFP32((struct Tensor*) res_exact3);
+    printf("\nBaseline Result :");
+    printTensorValues(res_exact2);
 
-      printf ("\nFP16_Baseline Result :");
-      printTensorValues(res_exact3);
+    void *res_exact3 = tensorConvApproxHalf2(
+        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
+    convertToFP32((struct Tensor *)res_exact3);
 
-    
-      void* res_sim = tensorConvPerfCuda(input, filter,
-					 pad_h, pad_w,
-					 stride_h, stride_w,
-					 1, 1,
-					 row, col,
-					 offset);
+    printf("\nFP16_Baseline Result :");
+    printTensorValues(res_exact3);
 
-      printf ("\nConvPerfCuda Result :");
-      printTensorValues(res_sim);
+    void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h,
+                                       stride_w, 1, 1, row, col, offset);
 
-  
-      void* res = tensorConvApprox(input, filter,
-				   pad_h, pad_w,
-				   stride_h, stride_w,
-				   1, 1,
-				   row, col,
-				   1, offset);
+    printf("\nConvPerfCuda Result :");
+    printTensorValues(res_sim);
 
+    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                 stride_w, 1, 1, row, col, 1, offset);
 
-      printf ("\nConvApprox Result :");
-      printTensorValues(res);
+    printf("\nConvApprox Result :");
+    printTensorValues(res);
 
+    void *res_half =
+        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
+                              1, 1, row, col, 1, offset);
 
-      void* res_half = tensorConvApproxHalf2(input, filter,
-					     pad_h, pad_w,
-					     stride_h, stride_w,
-					     1, 1,
-					     row, col,
-					     1, offset);
+    convertToFP32((struct Tensor *)res_half);
 
-      convertToFP32((struct Tensor*) res_half);
-
-      printf ("\nConvApproxHalf2 Result :");
-      printTensorValues(res_half);
+    printf("\nConvApproxHalf2 Result :");
+    printTensorValues(res_half);
+  }
 
-    }
-  
- 
-  printf ("\n\n\n--- End of Test \n\n\n");
+  printf("\n\n\n--- End of Test \n\n\n");
 }
 
-
-
-
-
 /**** Tests Perforation for a set of different inputs */
-void testPerforation(UnitTestResults& unitTestResults){
+void testPerforation(UnitTestResults &unitTestResults) {
 
-  
   printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
   fillTensorWithVal(input, 1);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
-  fillTensorWithVal(filter, 1);
 
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  fillTensorWithVal(filter, 1);
 
   /*
   float* host_ptr = (float*) ((struct Tensor*) filter)->host_data;
@@ -803,43 +710,33 @@ void testPerforation(UnitTestResults& unitTestResults){
   host_ptr[24] = 2;
   host_ptr[26] = 2;
   */
-  
 
   testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2);
 
   testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1);
 
-
   testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3);
 
   testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1);
 
-
   testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4);
 
   testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1);
-    
 }
 
-
-
-
-
-
-
-
-
-void testSampling(){
+void testSampling() {
 
   printf("***** Testing Sampling ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
   fillTensorWithVal(input, 1);
-  //fillWithOnesAndTwos(input);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  // fillWithOnesAndTwos(input);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
   fillTensorWithVal(filter, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) filter)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
   host_ptr[0] = 2;
   host_ptr[2] = 2;
   host_ptr[4] = 2;
@@ -854,144 +751,124 @@ void testSampling(){
   host_ptr[22] = 2;
   host_ptr[24] = 2;
   host_ptr[26] = 2;
-  //printTensorValues(input);
+  // printTensorValues(input);
+
+  void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
 
-  void* res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
-  
   printTensorValues(res);
 
+  void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1);
 
-  void* res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1);
-  
   printTensorValues(res2);
 
+  void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0);
 
-  void* res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0);
-  
   printTensorValues(res2_sim);
 
-  
-  void* res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0);
-  
+  void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0);
+
   printTensorValues(res3);
 
+  void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
 
-  void* res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
-  
   printTensorValues(res4);
 
+  void *res4_half =
+      tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
 
-  void* res4_half = tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
-
-  convertToFP32((struct Tensor*) res4_half);
+  convertToFP32((struct Tensor *)res4_half);
 
   printTensorValues(res4_half);
-
 }
 
-
-
-
-void testSamplingCalls(void* input, void* filter,
-		       int pad_h, int pad_w,
-		       int stride_h, int stride_w,
-		       int skip_every, UnitTestResults& unitTestResults){
-
+void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
+                       int stride_h, int stride_w, int skip_every,
+                       UnitTestResults &unitTestResults) {
 
   float interpolation_rate = 1.0;
-  for (int offset = 0; offset < 2; offset++){
-
-  
-      printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d skip_every = %d offset= %d interpolation_rate = %f \n\n",
-	     pad_h, pad_w, stride_h, stride_w, skip_every, offset, interpolation_rate);
-
-    
-      void* res_exact = tensorConvolution(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1);
+  for (int offset = 0; offset < 2; offset++) {
 
-      printf ("tensorConvolution Result :");
-      printTensorValues(res_exact);
+    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
+           "skip_every = %d offset= %d interpolation_rate = %f \n\n",
+           pad_h, pad_w, stride_h, stride_w, skip_every, offset,
+           interpolation_rate);
 
+    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1);
 
-      void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1, 1, 1, 1, 1);
+    printf("tensorConvolution Result :");
+    printTensorValues(res_exact);
 
-      printf ("\nBaseline Result :");
-      printTensorValues(res_exact2);
+    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, 1, 1, 1, 1);
 
+    printf("\nBaseline Result :");
+    printTensorValues(res_exact2);
 
-      void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w,
-					       stride_h, stride_w,
-					       1, 1, 1, 1, 1, 1);
-      convertToFP32((struct Tensor*) res_exact3);
+    void *res_exact3 = tensorConvApproxHalf2(
+        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
+    convertToFP32((struct Tensor *)res_exact3);
 
-      printf ("\nFP16_Baseline Result :");
-      printTensorValues(res_exact3);
+    printf("\nFP16_Baseline Result :");
+    printTensorValues(res_exact3);
 
-    
-      void* res_sim = tensorConvSampSim2(input, filter, pad_h, pad_w,
-					 stride_h, stride_w,
-					 1, 1, skip_every, offset, interpolation_rate);
+    void *res_sim =
+        tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
+                           1, skip_every, offset, interpolation_rate);
 
-      printf ("\nConvSampSim Result :");
-      printTensorValues(res_sim);
+    printf("\nConvSampSim Result :");
+    printTensorValues(res_sim);
 
-  
-      void* res = tensorConvApprox(input, filter, pad_h, pad_w,
-				   stride_h, stride_w,
-				   1, 1, 1, 1, skip_every, offset);
+    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                 stride_w, 1, 1, 1, 1, skip_every, offset);
 
+    printf("\nConvApprox Result :");
+    printTensorValues(res);
 
-      printf ("\nConvApprox Result :");
-      printTensorValues(res);
+    void *res_half =
+        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
+                              1, 1, 1, 1, skip_every, offset);
 
+    convertToFP32((struct Tensor *)res_half);
 
-      void* res_half = tensorConvApproxHalf2(input, filter, pad_h, pad_w,
-					     stride_h, stride_w,
-					     1, 1, 1, 1, skip_every, offset);
+    printf("\nConvApproxHalf2 Result :");
+    printTensorValues(res_half);
 
-      convertToFP32((struct Tensor*) res_half);
+    std::string suffix =
+        std::string(" pad_h = ") + std::to_string(pad_h) +
+        std::string(" pad_w = ") + std::to_string(pad_w) +
+        std::string(" stride_h = ") + std::to_string(stride_h) +
+        std::string(" stride_w = ") + std::to_string(stride_w) +
+        std::string(" skip_every = ") + std::to_string(skip_every) +
+        std::string(" offset = ") + std::to_string(offset);
 
-      printf ("\nConvApproxHalf2 Result :");
-      printTensorValues(res_half);
+    std::string test_name = std::string("SAMP_FP32 ") + suffix;
 
-      std::string suffix = std::string(" pad_h = ") + std::to_string(pad_h)
-	+ std::string(" pad_w = ") + std::to_string(pad_w)
-        + std::string(" stride_h = ") + std::to_string(stride_h)
-	+ std::string(" stride_w = ") + std::to_string(stride_w)
-        + std::string(" skip_every = ") + std::to_string(skip_every)
-	+ std::string(" offset = ") + std::to_string(offset);
+    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.01,
+                                   test_name);
 
-      std::string test_name = std::string("SAMP_FP32 ") + suffix; 
-					  
-      unitTestResults.compareTensors((Tensor*) res, (Tensor*) res_sim, 0.01, test_name);
+    std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix;
+    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.04,
+                                   fp16_test_name);
+  }
 
-      std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; 
-      unitTestResults.compareTensors((Tensor*) res_half, (Tensor*) res_sim, 0.04, fp16_test_name);
-    }
-  
- 
-  printf ("\n\n\n --- End of Test \n\n\n");
+  printf("\n\n\n --- End of Test \n\n\n");
 }
 
-
-
 /**** Tests Sample for a sample 3 * 3 Filter */
-void testSampling_3_3(UnitTestResults& unitTestResults){
+void testSampling_3_3(UnitTestResults &unitTestResults) {
 
-  
   printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
   fillTensorWithVal(input, 1);
-  //fillWithOnesAndTwos(input);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
-  fillTensorWithVal(filter, 1);
+  // fillWithOnesAndTwos(input);
 
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  fillTensorWithVal(filter, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) filter)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
   host_ptr[0] = 2;
   host_ptr[2] = 2;
   host_ptr[4] = 2;
@@ -1007,7 +884,6 @@ void testSampling_3_3(UnitTestResults& unitTestResults){
   host_ptr[24] = 2;
   host_ptr[26] = 2;
 
-  
   // Tests with padding = 0 stride = 1
   testSamplingCalls(input, filter, 0, 0, 1, 1, 2, unitTestResults);
 
@@ -1028,27 +904,19 @@ void testSampling_3_3(UnitTestResults& unitTestResults){
   testSamplingCalls(input, filter, 1, 1, 2, 2, 3, unitTestResults);
 
   testSamplingCalls(input, filter, 1, 1, 2, 2, 4, unitTestResults);
-
-    
 }
 
-
-
-
-
-
-
 /**** Tests Sample for a sample 1 * 1 Filter */
-void testSampling_1_1(UnitTestResults& unitTestResults){
+void testSampling_1_1(UnitTestResults &unitTestResults) {
 
-  
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2);
   fillTensorWithVal(input, 2);
-  //fillWithOnesAndTwos(input);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1);
+  // fillWithOnesAndTwos(input);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1);
   fillTensorWithVal(filter, 2);
-  
 
   // Tests with padding = 0 stride = 1
   testSamplingCalls(input, filter, 0, 0, 1, 1, 2, unitTestResults);
@@ -1057,25 +925,20 @@ void testSampling_1_1(UnitTestResults& unitTestResults){
 
   testSamplingCalls(input, filter, 0, 0, 1, 1, 4, unitTestResults);
 
-
   // Tests with padding = 1 stride = 1
   testSamplingCalls(input, filter, 1, 1, 1, 1, 2, unitTestResults);
 
   testSamplingCalls(input, filter, 1, 1, 1, 1, 3, unitTestResults);
 
   testSamplingCalls(input, filter, 1, 1, 1, 1, 4, unitTestResults);
-
-    
 }
 
+void *testTensorArgMax() {
 
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1);
 
-
-void* testTensorArgMax(){
-
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1);
- 
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
 
   // Input 0
   host_ptr[0] = 1;
@@ -1097,37 +960,34 @@ void* testTensorArgMax(){
   host_ptr[10] = 2;
   host_ptr[11] = 8;
 
-  void* argmax_out = tensorArgMax(input);
-  
-  // Expect Output of call below to be:  
+  void *argmax_out = tensorArgMax(input);
+
+  // Expected output of the call below:
   //   1    2    2    0
   printTensorValues(argmax_out);
 
-  return argmax_out; 
+  return argmax_out;
 }
 
+void *testTensorSelect(void *argmax_out) {
 
-
-void* testTensorSelect(void* argmax_out){
-
-  void* select_out = tensorSelect(argmax_out, 2);
-  printf ("***** tensorSelect output \n");
+  void *select_out = tensorSelect(argmax_out, 2);
+  printf("***** tensorSelect output \n");
 
   printTensorValues(select_out);
 
-  return select_out; 
-  
+  return select_out;
 }
 
+void testTensorContract(void *select_out) {
 
-void testTensorContract(void* select_out){
-
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1);
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1);
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
 
   // Input 0
   host_ptr[0] = 1;
-  host_ptr[1] = 1; 
+  host_ptr[1] = 1;
   host_ptr[2] = 1;
   host_ptr[3] = 1;
 
@@ -1136,51 +996,38 @@ void testTensorContract(void* select_out){
   host_ptr[5] = 2;
   host_ptr[6] = 2;
   host_ptr[7] = 2;
-  
+
   // Input 2
   host_ptr[8] = 3;
   host_ptr[9] = 3;
-  host_ptr[10] = 3; 
-  host_ptr[11] = 3; 
+  host_ptr[10] = 3;
+  host_ptr[11] = 3;
 
   // Input 3
-  host_ptr[12] = 4; 
+  host_ptr[12] = 4;
   host_ptr[13] = 4;
   host_ptr[14] = 4;
   host_ptr[15] = 4;
 
-
-  void* contract_out = tensorContract(input, select_out);
-  printf ("***** tensorContract output \n");
+  void *contract_out = tensorContract(input, select_out);
+  printf("***** tensorContract output \n");
 
   printTensorValues(contract_out);
-
 }
 
+void testNewTensorOps() {
 
-
-void testNewTensorOps(){
-
-  void* argmax_out = testTensorArgMax();
-  void* select_out = testTensorSelect(argmax_out);
+  void *argmax_out = testTensorArgMax();
+  void *select_out = testTensorSelect(argmax_out);
   testTensorContract(select_out);
-  
 }
 
-
-
-
-
-
-
-
-int main(){
+int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-
   UnitTestResults unitTestResults;
-  
+
   // Function call per unit test
   testTensorHgemm(unitTestResults);
   testTensorSgemm(unitTestResults);
@@ -1199,31 +1046,26 @@ int main(){
   testTensorHalfPooling();
 
   */
-  
+
   testSampling_3_3(unitTestResults);
   testSampling_1_1(unitTestResults);
 
   testPerforation(unitTestResults);
 
-  
-
   unitTestResults.printSummary();
-  
 
   // testTensorError();
-  // testQuantization(); 
+  // testQuantization();
   // testTensorGemm();
   // testTensorGemmGPU();
-  // testTensorGemmBias();  
+  // testTensorGemmBias();
   // testTensorConv2();
   // testTensorConv3();
   // testLRN();
   // testSampleFilter();
-  // testNewTensorOps(); 
+  // testNewTensorOps();
   // testQuantization();
   // testPromiseError();
-  
-    
+
   return 0;
 }
-
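
All of the sampling drivers in the unit-test file above follow the same pattern: build an input and a filter tensor, fill them with known values, and sweep padding/stride/skip-rate combinations through `testSamplingCalls`, which records a pass/fail entry in `UnitTestResults`. As a minimal sketch only, here is a hypothetical 2 × 2 filter variant written with the helpers visible in this file (`create4DTensor`, `fillTensorWithVal`, `testSamplingCalls`); the argument order (pad_h, pad_w, stride_h, stride_w, skip rate) is inferred from the existing call sites, not from a header:

```cpp
// Illustrative only: hypothetical 2 x 2 filter driver, mirroring the
// 3 x 3 and 1 x 1 drivers above. Assumes the same tensor-runtime headers
// and helper signatures used elsewhere in this file.
void testSampling_2_2(UnitTestResults &unitTestResults) {

  printf("***** Tests sampling for a 2 * 2 filter ***** \n\n");

  Tensor *input =
      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
  fillTensorWithVal(input, 1);

  Tensor *filter =
      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
  fillTensorWithVal(filter, 1);

  // Sweep padding = 0/1, unit stride, skip rate = 2..4, as the existing
  // drivers do for their filter sizes.
  for (int pad = 0; pad <= 1; pad++)
    for (int skip = 2; skip <= 4; skip++)
      testSamplingCalls(input, filter, pad, pad, 1, 1, skip, unitTestResults);
}
```

Such a driver would be called from `main()` next to `testSampling_3_3` and `testSampling_1_1`, before `unitTestResults.printSummary()`.
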
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index 98d6d63eadc44b171b54bd09a9096d072c4be10d..1ca90cf6f724b5e42f3b8c774b23c25f7d294437 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -14,10 +14,10 @@ __global__ void convToGemmApproxHalf(
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -58,7 +58,7 @@ convToGemmPerfRow(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_eff * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out; // output width index (col number)
+  const int w = tx % W_out;                   // output width index (col number)
   int past_start = (h % (x - 1) >= (x - 1 - start));
   const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride -
                   V_pad;                // input height index (row number)
@@ -135,7 +135,7 @@ convToGemmPerfCol(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_eff) / W_eff; // output height index (row
                                               // number)
-  const int w = tx % W_eff; // output width index (col number)
+  const int w = tx % W_eff;                   // output width index (col number)
   int past_start = (w % (x - 1)) >= (x - 1 - start);
   const int inH = h * V_stride - V_pad; // input height index (row number)
   const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride -
@@ -394,7 +394,7 @@ __global__ void convToGemmPerfRowHalf(
                                                             // number
   const int h = tx % (H_eff * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out; // output width index (col number)
+  const int w = tx % W_out;                   // output width index (col number)
   int past_start = (h % (x - 1) >= (x - 1 - start));
   const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride -
                   V_pad;                // input height index (row number)
@@ -469,7 +469,7 @@ __global__ void convToGemmPerfColHalf(
                                                             // number
   const int h = tx % (H_out * W_eff) / W_eff; // output height index (row
                                               // number)
-  const int w = tx % W_eff; // output width index (col number)
+  const int w = tx % W_eff;                   // output width index (col number)
   int past_start = (w % (x - 1)) >= (x - 1 - start);
   const int inH = h * V_stride - V_pad; // input height index (row number)
   const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride -
@@ -557,10 +557,10 @@ __global__ void convToGemmApproxHalfN(
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -832,10 +832,10 @@ convToGemmHalfInput(__half *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -873,10 +873,10 @@ convToGemmHalfInput2(__half *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     const int filter_elem_num = c * KH * KW;
     for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) {
       int i = l / KW;
@@ -1044,10 +1044,10 @@ convToGemmFullInput(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -1085,10 +1085,10 @@ convToGemmFullInput2(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     const int filter_elem_num = c * KH * KW;
     for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) {
       int i = l / KW;
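
The `convToGemm*` kernels in this header all recover output coordinates from the flat thread index `tx` with the same arithmetic that these hunks realign comments around. As a quick host-side check of just the `h`/`w`/`inH`/`inW` lines shown here (the real kernels also decode image and channel indices from `tx`, which this sketch omits), under assumed dimensions:

```cpp
#include <cassert>

// Illustrative only: replays the index arithmetic from the convToGemm*
// hunks above for one flat thread index, assuming H_out = W_out = 4,
// unit stride, and no padding.
int main() {
  const int H_out = 4, W_out = 4;
  const int V_stride = 1, H_stride = 1, V_pad = 0, H_pad = 0;

  const int tx = 13;                          // flat index within one image/channel slice
  const int h = tx % (H_out * W_out) / W_out; // output height index (row number)
  const int w = tx % W_out;                   // output width index (col number)
  const int inH = h * V_stride - V_pad;       // input height index (row number)
  const int inW = w * H_stride - H_pad;       // input width index (col number)

  assert(h == 3 && w == 1 && inH == 3 && inW == 1);
  return 0;
}
```
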
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
deleted file mode 100644
index 2545f07b48ddabfa6793f1d9eb01911542f4198e..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
+++ /dev/null
@@ -1,324 +0,0 @@
-#ifndef APPROXHPVM_IMG_RUNTIME_UTILS
-#define APPROXHPVM_IMG_RUNTIME_UTILS
-
-#include "configuration.h"
-#include "hpvm-rt-controller.h"
-
-#include "img_tensor_runtime.h"
-
-// Utilities header for ApproxHPVM image runtime API (wrapper runtime API)
-
-void *handleTensorFftApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input, bool inverse) {
-
-  if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
-    int param = approxTuples[0].second;
-    switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorFft(input, inverse);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorFft", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorFft", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorFftHalf(input, inverse);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorFftHalf", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorFftHalf", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
-      // TODO additional approx methods implemented here
-    }
-  } else if (approxTuples.size() == 2) {
-    ERROR("Currently unsupported case");
-    abort();
-  } else {
-    ERROR("Unsupported case");
-    abort();
-  }
-  return NULL;
-}
-
-void *handleTensorReduceApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input, size_t axis, MathOp func) {
-  if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
-    int param = approxTuples[0].second;
-    switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorReduce(input, axis, func, 0.0f);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorReduceHalf(input, axis, func, 0.0f);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorReduceHalf", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorReduceHalf", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING: {
-      void *t_out;
-      float skip_ratio;
-      bool is_half;
-      switch (param) {
-      case 41:
-        skip_ratio = 0.5f;
-        is_half = false;
-        break;
-      case 42:
-        skip_ratio = 0.5f;
-        is_half = true;
-        break;
-      case 43:
-        skip_ratio = 0.4f;
-        is_half = false;
-        break;
-      case 44:
-        skip_ratio = 0.4f;
-        is_half = true;
-        break;
-      case 45:
-        skip_ratio = 0.25f;
-        is_half = false;
-        break;
-      case 46:
-        skip_ratio = 0.25f;
-        is_half = true;
-        break;
-      default:
-        DEBUG("Unsupported Option: Select default, skip_ratio = 0.0.\n");
-        skip_ratio = 0.0f;
-        is_half = false;
-        break;
-      }
-      RC->resume_profiler();
-      if (is_half)
-        t_out = tensorReduceHalf(input, axis, func, skip_ratio);
-      else
-        t_out = tensorReduce(input, axis, func, skip_ratio);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      if (is_half) {
-        RC->addToCurrentIterationComputeTime("tensorReduceHalf", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorReduceHalf",
-                                               pinfo.second);
-      } else {
-        RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second);
-      }
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
-      // TODO additional approx methods implemented here
-    }
-  } else if (approxTuples.size() == 2) {
-    ERROR("Currently unsupported case");
-    abort();
-  } else {
-    ERROR("Unsupported case");
-    abort();
-  }
-  return NULL;
-}
-
-void *handleTensorProjectiveTApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    void *input, void *transformation) {
-  if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
-    int param = approxTuples[0].second;
-    switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorProjectiveT(input, transformation);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorProjectiveT", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorProjectiveT", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
-      // TODO additional approx methods implemented here
-    }
-  } else if (approxTuples.size() == 2) {
-    ERROR("Currently unsupported case");
-    abort();
-  } else {
-    ERROR("Unsupported case");
-    abort();
-  }
-  return NULL;
-}
-
-void *handleTensorMap1ApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    MathOp func, void *input) {
-  if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
-    int param = approxTuples[0].second;
-    switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorMap1(func, input);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorMap1", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorMap1", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorMap1Half(func, input);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorMap1Half", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorMap1Half", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
-      // TODO additional approx methods implemented here
-    }
-  } else if (approxTuples.size() == 2) {
-    ERROR("Currently unsupported case");
-    abort();
-  } else {
-    ERROR("Unsupported case");
-    abort();
-  }
-  return NULL;
-}
-
-void *handleTensorMap2ApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    MathOp func, void *input1, void *input2) {
-  if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
-    int param = approxTuples[0].second;
-    switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorMap2(func, input1, input2);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorMap2", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorMap2", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorMap2Half(func, input1, input2);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorMap2Half", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorMap2Half", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
-      // TODO additional approx methods implemented here
-    }
-  } else if (approxTuples.size() == 2) {
-    ERROR("Currently unsupported case");
-    abort();
-  } else {
-    ERROR("Unsupported case");
-    abort();
-  }
-  return NULL;
-}
-
-void *handleTensorMap3ApproximationTuples(
-    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
-    MathOp func, void *input1, void *input2, void *input3) {
-  if (approxTuples.size() == 1) {
-    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
-    int param = approxTuples[0].second;
-    switch (approx) {
-    case GPUNodeConfiguration::APPROX::FP32: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorMap3(func, input1, input2, input3);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorMap3", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorMap3", pinfo.second);
-      return t_out;
-    }
-    case GPUNodeConfiguration::APPROX::FP16: {
-      void *t_out;
-      RC->resume_profiler();
-      t_out = tensorMap3Half(func, input1, input2, input3);
-      RC->pause_profiler();
-      std::pair<double, double> pinfo = RC->get_time_energy();
-      RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime("tensorMap3Half", pinfo.first);
-      RC->addToCurrentIterationComputeEnergy("tensorMap3Half", pinfo.second);
-      return t_out;
-    }
-    default:
-      CUSTOM_ASSERT(false && "Unknown approximation type");
-      ERROR("Unknown approximation type");
-      abort();
-      // TODO additional approx methods implemented here
-    }
-  } else if (approxTuples.size() == 2) {
-    ERROR("Currently unsupported case");
-    abort();
-  } else {
-    ERROR("Unsupported case");
-    abort();
-  }
-  return NULL;
-}
-
-#endif
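
Every `handleTensor*ApproximationTuples` function in this deleted header, and in `approxhpvm_runtime_utils.h` below, shares one shape: inspect the single approximation tuple, dispatch on the `APPROX` knob, run the matching runtime call between `RC->resume_profiler()` and `RC->pause_profiler()`, and record the time/energy pair. A minimal sketch of that shape, assuming the includes used by these headers (`configuration.h`, `hpvm-rt-controller.h`) and treating `tensorOp`/`tensorOpHalf` as placeholders for whichever runtime call a given handler wraps:

```cpp
// Illustrative only: the dispatch-and-profile pattern shared by the
// handleTensor* functions in these headers. RC, CUSTOM_ASSERT, ERROR and
// GPUNodeConfiguration come from the surrounding code; tensorOp and
// tensorOpHalf are placeholder callees.
void *handleGenericApproximationTuples(
    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
    void *input) {

  if (approxTuples.size() == 1) {
    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
    switch (approx) {
    case GPUNodeConfiguration::APPROX::FP32: {
      void *t_out;
      RC->resume_profiler();
      t_out = tensorOp(input); // placeholder FP32 runtime call
      RC->pause_profiler();
      std::pair<double, double> pinfo = RC->get_time_energy();
      RC->reset_profiler();
      RC->addToCurrentIterationComputeTime("tensorOp", pinfo.first);
      RC->addToCurrentIterationComputeEnergy("tensorOp", pinfo.second);
      return t_out;
    }
    case GPUNodeConfiguration::APPROX::FP16: {
      void *t_out;
      RC->resume_profiler();
      t_out = tensorOpHalf(input); // placeholder FP16 runtime call
      RC->pause_profiler();
      std::pair<double, double> pinfo = RC->get_time_energy();
      RC->reset_profiler();
      RC->addToCurrentIterationComputeTime("tensorOpHalf", pinfo.first);
      RC->addToCurrentIterationComputeEnergy("tensorOpHalf", pinfo.second);
      return t_out;
    }
    default:
      CUSTOM_ASSERT(false && "Unknown approximation type");
      ERROR("Unknown approximation type");
      abort();
    }
  } else {
    ERROR("Unsupported case");
    abort();
  }
  return NULL;
}
```

Handlers that support extra knobs (perforation, input sampling, reduction sampling) extend the `switch` with additional cases that translate the integer `param` into the corresponding `PerfParams`/`SampParams` before invoking the runtime call, as the convolution handlers below do.
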
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
index 330d97600e6cdcf44bb93dbf28625cca8051c3ec..c318a8fb6aba604282cf709d09b6a6ef1a771f0e 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
@@ -3,7 +3,6 @@
 #ifndef APPROXHPVM_RUNTIME_UTILS
 #define APPROXHPVM_RUNTIME_UTILS
 
-
 #include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
 #include "configuration.h"
@@ -17,30 +16,29 @@
 //---                      CPU Approximation handling                      ---//
 //----------------------------------------------------------------------------//
 
-void* handleTensorAddApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, void* bias) {
+void *handleTensorAddApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *bias) {
 
-if (approxTuples.size() == 1) {
+  if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorAddCPU(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorAddCPU(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -53,32 +51,31 @@ if (approxTuples.size() == 1) {
   return NULL;
 }
 
-void* handleTensorMulApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* lhs, void* rhs) {
+void *handleTensorMulApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *lhs, void *rhs) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorGemmCPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorGemmCPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
-      }
+    }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
     abort();
@@ -89,79 +86,72 @@ void* handleTensorMulApproximationTuples_CPU(
   return NULL;
 }
 
-void* handleTensorConvApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter, 
-  int conv_pad_h, int conv_pad_w,
-  int conv_stride_h, int conv_stride_w) {
+void *handleTensorConvApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int conv_pad_h, int conv_pad_w,
+    int conv_stride_h, int conv_stride_w) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApproxCPU(input, filter,
-                               conv_pad_h, conv_pad_w,
-                               conv_stride_h, conv_stride_w,
-                               1, 1,
-                               1, 1, 1, 1);
-
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
-        return t_out;
-        }
-      case CPUNodeConfiguration::APPROX::PERFORATION :
-        {
-          PerfParams params = perfParamSet->getPerfParams(param);
-          INFO("perforation param = %i\n", param);
-          INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-                params.row, params.col, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxCPU(input, filter,
-                                 conv_pad_h, conv_pad_w,
-                                 conv_stride_h, conv_stride_w,
-                                 1, 1,
-                                 params.row, params.col, 1, params.skip_offset);
-
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", pinfo.second);
-          return t_out;
-        }
-      case CPUNodeConfiguration::APPROX::INPUT_SAMPLING :
-        {
-          SampParams params = sampParamSet->getSampParams(param);
-          INFO("sampling param = %i\n", param);
-          INFO("params.skip_rate = %i, params.skip_offset = %i\n",
-                params.skip_rate, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxCPU(input, filter,
-                                 conv_pad_h, conv_pad_w,
-                                 conv_stride_h, conv_stride_w,
-                                 1, 1,
-                                 1, 1,
-                                params.skip_rate, params.skip_offset);
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", pinfo.second);
-          return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out =
+          tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w,
+                              conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
+      return t_out;
+    }
+    case CPUNodeConfiguration::APPROX::PERFORATION: {
+      PerfParams params = perfParamSet->getPerfParams(param);
+      INFO("perforation param = %i\n", param);
+      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
+           params.row, params.col, params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxCPU(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.row, params.col, 1, params.skip_offset);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)",
+                                             pinfo.second);
+      return t_out;
+    }
+    case CPUNodeConfiguration::APPROX::INPUT_SAMPLING: {
+      SampParams params = sampParamSet->getSampParams(param);
+      INFO("sampling param = %i\n", param);
+      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
+           params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w,
+                                  conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                  params.skip_rate, params.skip_offset);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -174,75 +164,73 @@ void* handleTensorConvApproximationTuples_CPU(
   return NULL;
 }
 
-void* handleTensorGroupConvApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride,
-  int conv_mode, int conv_groups) {
+void *handleTensorGroupConvApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int conv_groups) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvCutlassCPU(input, filter,
-                                     vertical_pad, horizontal_pad,
-                                     vertical_stride, horizontal_stride,
-                                     conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvCutlassCPU(input, filter, vertical_pad, horizontal_pad,
+                                   vertical_stride, horizontal_stride,
+                                   conv_mode, conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorBatchNormApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, void* gamma_ptr, void* beta_ptr,
-  void* mean_ptr, void* variance_ptr, double epsilon) {
+void *handleTensorBatchNormApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+    void *variance_ptr, double epsilon) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr,
-                                  mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-    // TODO additional approx methods implemented here
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                                 variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
+      // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
@@ -254,161 +242,154 @@ void* handleTensorBatchNormApproximationTuples_CPU(
   return NULL;
 }
 
-void* handleTensorReluApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorReluApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorReluCPU(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorReluCPU(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorClippedReluApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, float min, float max) {
+void *handleTensorClippedReluApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, float min, float max) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu2CPU(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu2CPU(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorTanhApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorTanhApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorTanhCPU(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorTanhCPU(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorPoolingApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, int poolFunction,
-  int window_height, int window_width,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride) {
+void *handleTensorPoolingApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, int poolFunction, int window_height, int window_width,
+    int vertical_pad, int horizontal_pad, int vertical_stride,
+    int horizontal_stride) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorPoolingCPU(input_ptr,
-                                 poolFunction,
-                                 window_height, window_width,
-                                 vertical_pad, horizontal_pad,
-                                 vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorPoolingCPU(input_ptr, poolFunction, window_height,
+                               window_width, vertical_pad, horizontal_pad,
+                               vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorSoftmaxApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input_ptr) {
-  void* t_out;
+void *handleTensorSoftmaxApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr) {
+  void *t_out;
   RC->resume_profiler();
   t_out = tensorSoftmaxCPU(input_ptr);
   RC->pause_profiler();
@@ -423,42 +404,40 @@ void* handleTensorSoftmaxApproximationTuples_CPU(
 //---                      GPU Approximation handling                      ---//
 //----------------------------------------------------------------------------//
 
-void* handleTensorAddApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, void* bias) {
+void *handleTensorAddApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *bias) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorAdd(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfAdd(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorAdd(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfAdd(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -471,44 +450,42 @@ void* handleTensorAddApproximationTuples(
   return NULL;
 }
 
-void* handleTensorMulApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* lhs, void* rhs) {
+void *handleTensorMulApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *lhs, void *rhs) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorGemmGPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfGemmGPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorGemmGPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfGemmGPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
-      }
+    }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
     abort();
@@ -519,100 +496,88 @@ void* handleTensorMulApproximationTuples(
   return NULL;
 }
 
-void* handleTensorConvApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter, 
-  int conv_pad_h, int conv_pad_w,
-  int conv_stride_h, int conv_stride_w) {
+void *handleTensorConvApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int conv_pad_h, int conv_pad_w,
+    int conv_stride_h, int conv_stride_w) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApprox(input, filter,
-                                 conv_pad_h, conv_pad_w,
-                                 conv_stride_h, conv_stride_w,
-                                 1, 1,
-                                 1, 1, 1, 1);
-	
-
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApproxHalf2(input, filter,
-                                     conv_pad_h, conv_pad_w,
-                                     conv_stride_h, conv_stride_w,
-                                     1, 1,
-                                     1, 1, 1, 1);
-	
-
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::PERFORATION :
-      case GPUNodeConfiguration::APPROX::PERFORATION_HP :
-        {
-          PerfParams params = perfParamSet->getPerfParams(param);
-          INFO("perforation param = %i\n", param);
-          INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-                params.row, params.col, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxHalf2(input, filter,
-                                       conv_pad_h, conv_pad_w,
-                                       conv_stride_h, conv_stride_w,
-                                       1, 1,
-                                       params.row, params.col, 1, params.skip_offset);
-
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", pinfo.second);
-          return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING :
-      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP :
-        {
-          SampParams params = sampParamSet->getSampParams(param);
-          INFO("sampling param = %i\n", param);
-          INFO("params.skip_rate = %i, params.skip_offset = %i\n",
-                params.skip_rate, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxHalf2(input, filter,
-                                       conv_pad_h, conv_pad_w,
-                                       conv_stride_h, conv_stride_w,
-                                       1, 1,
-                                       1, 1,
-                                       params.skip_rate, params.skip_offset);
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", pinfo.second);
-          return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w,
+                               conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out =
+          tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf",
+                                             pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::PERFORATION:
+    case GPUNodeConfiguration::APPROX::PERFORATION_HP: {
+      PerfParams params = perfParamSet->getPerfParams(param);
+      INFO("perforation param = %i\n", param);
+      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
+           params.row, params.col, params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxHalf2(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.row, params.col, 1, params.skip_offset);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)",
+                                             pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING:
+    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: {
+      SampParams params = sampParamSet->getSampParams(param);
+      INFO("sampling param = %i\n", param);
+      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
+           params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                    conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                    params.skip_rate, params.skip_offset);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -625,103 +590,99 @@ void* handleTensorConvApproximationTuples(
   return NULL;
 }
 
-void* handleTensorGroupConvApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride,
-  int conv_mode, int conv_groups) {
+void *handleTensorGroupConvApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int conv_groups) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvCutlass(input, filter,
-                                  vertical_pad, horizontal_pad,
-                                  vertical_stride, horizontal_stride,
-                                  conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfConvCutlass(input, filter,
-                                      vertical_pad, horizontal_pad,
-                                      vertical_stride, horizontal_stride,
-                                      conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvCutlass(input, filter, vertical_pad, horizontal_pad,
+                                vertical_stride, horizontal_stride, conv_mode,
+                                conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfConvCutlass(input, filter, vertical_pad, horizontal_pad,
+                                    vertical_stride, horizontal_stride,
+                                    conv_mode, conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorBatchNormApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, void* gamma_ptr, void* beta_ptr,
-  void* mean_ptr, void* variance_ptr, double epsilon) {
+void *handleTensorBatchNormApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+    void *variance_ptr, double epsilon) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr,
-                               mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr,
-                                   mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-    // TODO additional approx methods implemented here
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                              variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                                  variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
+      // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
@@ -733,215 +694,202 @@ void* handleTensorBatchNormApproximationTuples(
   return NULL;
 }
 
-void* handleTensorReluApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorReluApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfRelu(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfRelu(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorClippedReluApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, float min, float max) {
+void *handleTensorClippedReluApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, float min, float max) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu2(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfRelu2(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu2(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfRelu2(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorTanhApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorTanhApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorTanh(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfTanh(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorTanh(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfTanh(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorPoolingApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, int poolFunction,
-  int window_height, int window_width,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride) {
+void *handleTensorPoolingApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, int poolFunction, int window_height, int window_width,
+    int vertical_pad, int horizontal_pad, int vertical_stride,
+    int horizontal_stride) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorPooling(input_ptr,
-                             poolFunction,
-                             window_height, window_width,
-                             vertical_pad, horizontal_pad,
-                             vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfPooling(input_ptr,
-                                 poolFunction,
-                                 window_height, window_width,
-                                 vertical_pad, horizontal_pad,
-                                 vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorPooling(input_ptr, poolFunction, window_height,
+                            window_width, vertical_pad, horizontal_pad,
+                            vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfPooling(input_ptr, poolFunction, window_height,
+                                window_width, vertical_pad, horizontal_pad,
+                                vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorSoftmaxApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input_ptr) {
-  //TODO: if approximation choices are added for softmax operation,
+void *handleTensorSoftmaxApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr) {
+  // TODO: if approximation choices are added for softmax operation,
   // implement this like the other handle* functions
-  void* t_out;
+  void *t_out;
   RC->resume_profiler();
   t_out = tensorSoftmax(input_ptr);
   RC->pause_profiler();
@@ -952,5 +900,4 @@ void* handleTensorSoftmaxApproximationTuples(
   return t_out;
 }
 
-
 #endif
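
Every case in the `handle*ApproximationTuples` functions above repeats the same profiling sequence: resume the profiler, run one tensor kernel, pause, read back time and energy, and record both against the kernel name. The sketch below is illustrative only and not part of the patch; `profiledCall` is a hypothetical name, and the code assumes the global runtime controller `RC` and the profiling methods already used in this file.

```cpp
// Hypothetical helper (not in the patch): wraps one tensor operation in the
// profiling sequence used by every case above. Assumes the runtime's global
// controller RC and its resume/pause/get_time_energy/reset methods, plus the
// includes of the surrounding runtime source.
#include <utility>

template <typename F>
void *profiledCall(const char *opName, F &&runOp) {
  RC->resume_profiler();
  void *t_out = runOp(); // run the tensor operation (e.g. tensorRelu(input))
  RC->pause_profiler();
  std::pair<double, double> pinfo = RC->get_time_energy();
  RC->reset_profiler();
  RC->addToCurrentIterationComputeTime(opName, pinfo.first);
  RC->addToCurrentIterationComputeEnergy(opName, pinfo.second);
  return t_out;
}

// Example: the FP32 ReLU case above would reduce to
//   return profiledCall("tensorRelu", [&] { return tensorRelu(input); });
```
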
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index b4f3d39fae77b214a46301ba7d6c95a5e651c44f..3b52cce9f62504753d63015a599d214194d48d98 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -144,7 +144,8 @@ public:
 // - energy
 // - accuracy (compared to golden output)
 // - accuracy loss (compared to baseline)
-// - a hardware choice and set or operations-approximation choices, described in setup
+// - a hardware choice and a set of operations-approximation choices, described
+// in setup
 struct Configuration {
   std::string name;
   float speedup;
@@ -152,7 +153,7 @@ struct Configuration {
   float accuracy;
   float accuracyLoss;
   std::map<std::string, NodeConfiguration *> setup;
-  // map for mapping visc.node.id IDs to HPVM (fused) node approx-configurations 
+  // maps visc.node.id IDs to HPVM (fused) node approx-configurations
   std::map<int, NodeConfiguration *> idConfigMap;
 
   Configuration(std::string &n, float f, float e, float a, float al);
@@ -171,8 +172,8 @@ struct Configuration {
 // Comparison operator definition, in increasing accuracy loss
 // (for std sort, used in pareto optimal computation)
 struct ConfigurationLessThan {
-  bool operator()(
-      const struct Configuration &a, const struct Configuration &b) const;
+  bool operator()(const struct Configuration &a,
+                  const struct Configuration &b) const;
 };
 
 // Comparison operator definition, in increasing accuracy loss
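
The comparator declared here is what the runtime's Pareto-frontier computation uses to order candidate configurations. A small illustrative sketch (not part of the patch, assuming `configuration.h` is included) of sorting a configuration list by increasing accuracy loss with `ConfigurationLessThan`:

```cpp
// Illustrative only: order candidate configurations by increasing accuracy
// loss, as the comment on ConfigurationLessThan describes.
#include <algorithm>
#include <vector>

void sortByAccuracyLoss(std::vector<Configuration> &configs) {
  std::sort(configs.begin(), configs.end(), ConfigurationLessThan());
  // configs.front() now has the smallest accuracy loss; the Pareto-optimal
  // computation walks the configurations in this order.
}
```
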
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
index 7724a49edf2465ee5e3d9ed5568ef2d87f943030..2c9f48203ba5d334e3c9bdd2409250cef47fa43b 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
@@ -5,6 +5,7 @@
 
 #define LOG_DEBUG 1 // Sets the debug logging to true
 #define LOG_INFO 1  // Sets the info logging to true
+#define LOG_ERROR 1 // Sets the error logging to true
 #define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro)
 
 #include "tensor.h"
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
index a3d51141acd9e45d3231689a39f43e97fbeb0a9f..8c9a711c8a8355eb7e0240cc6ed15b5c7ebd23c9 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
@@ -45,21 +45,6 @@ __global__ void vecConstDiv(float *A, float div_factor, int n);
 
 __global__ void vecMul(float *A, float *B, int n);
 
-/****  ERROR injecion routines  ******/
-void initRandValues(Tensor *bias, int error_scale);
-
-void initRandValues2(Tensor *bias, int error_scale);
-
-void *addBitError(void *x_ptr, int error_scale);
-
-void randomCeilAndFloor(float *x, size_t num_elems);
-
-// Routine for Adding RoundOff Errors
-void *addRoundError(void *x_ptr, int error_scale);
-
-// Routine for Adding Gaussian Error
-void *addGaussianError(void *x_ptr, int error_scale);
-
 void initPromiseRandValues(Tensor *bias, int error_scale);
 
 // NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16
@@ -72,8 +57,6 @@ __global__ void quantizeAndClip(float *A, int n, float mul_factor, float min,
 __global__ void quantizeElem(float *A, int n, float mul_factor, float min);
 
 void *quantizeTensorPromise(void *input_ptr, float min, float max);
-
-void *tensorAddError(void *x_ptr, int error_scale);
 }
 
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
deleted file mode 100644
index 71099a89e4ff1c47a14c4652556838e55c3850ea..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
-broadcast.h
-Calculates shape of two tensors broadcasted together, using a numpy-like (but
-weaker) rule.
-*/
-
-#ifndef FUNCTIONAL_BROADCAST_H
-#define FUNCTIONAL_BROADCAST_H
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
-#include <type_traits>
-
-#include "common.h"
-#include "debug.h"
-#include "tensor.h"
-
-// TODO: don't accept N == 1
-template <size_t N, typename std::enable_if<N >= 1, int>::type = 0>
-class BroadcastRemap {
-public:
-  BroadcastRemap(const std::array<Tensor *, N> &tensors)
-      : out_sizes(), sizes() {
-    this->in_dims = tensors[0]->dims.num_dims;
-    for (size_t i = 0; i < N; i++) {
-      Tensor *t = tensors[i];
-      this->sizes[i] = ::sizes(t);
-      if (this->in_dims != t->dims.num_dims)
-        ERROR("Broadcast tensors have different dimensions\n");
-      this->tail_stride[i] = 1;
-    }
-    fill_broadcast_dims();
-  }
-
-  std::vector<size_t> getDim() const { return this->out_sizes; }
-
-  const size_t *getStrides() const { return tail_stride; }
-
-private:
-  void fill_broadcast_dims() {
-    // Simplified broadcasting rule:
-    // 1. Tensors must have the same dimension that is greater than 1.
-    // 2. Dimension size being 1 (instead of equal) is only allowed for each
-    // tensor for a continuous N dimensions starting from the last one.
-
-    // Assume all this->in_dims are 1, and compute
-    // out_dims is reverse-constructed
-    if (this->in_dims < 1)
-      ERROR("Broadcast tensors should have at least 1 dimension\n");
-    bool broadcast_ended[N]{false};
-    this->out_sizes.resize(this->in_dims, 1);
-    for (long i = this->in_dims - 1; i >= 0; i--) {
-      // First get tensors agree on dim size
-      for (size_t j = 0; j < N; j++) {
-        size_t this_size = this->sizes[j][i];
-        if (this_size == 1)
-          continue;
-        if (this->out_sizes[i] != 1 && this->out_sizes[i] != this_size)
-          ERROR("Dimension size mismatch\n");
-        this->out_sizes[i] = this_size;
-      }
-    }
-    for (size_t j = 0; j < N; j++)
-      for (long i = this->in_dims - 1; i >= 0; i--) {
-        // Check for continuity, calculate stride size
-        size_t this_size = this->sizes[j][i];
-        if (this_size != 1) {
-          // Broadcast cannot go on anymore
-          broadcast_ended[j] = true;
-          continue;
-        }
-        if (this->out_sizes[i] != this_size && broadcast_ended[j])
-          ERROR("Broadcast dims must be continuous\n");
-        else
-          tail_stride[j] *= this->out_sizes[i];
-      }
-  }
-
-  size_t in_dims;
-  std::vector<size_t> out_sizes, sizes[N];
-  size_t tail_stride[N];
-};
-
-#endif // FUNCTIONAL_BROADCAST_H
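
For reference, the simplified numpy-like rule that the removed `broadcast.h` documented (same rank required; per-dimension sizes must match unless one of them is 1) can be summarized with a small host-side sketch. This is illustrative only and omits the header's additional requirement that size-1 dimensions be contiguous starting from the last dimension.

```cpp
// Standalone sketch of the simplified broadcast shape rule from the removed
// broadcast.h: equal rank, and each dimension must match unless one size is 1.
#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <vector>

std::vector<size_t> broadcastShape(const std::vector<size_t> &a,
                                   const std::vector<size_t> &b) {
  if (a.size() != b.size())
    throw std::runtime_error("Broadcast tensors have different dimensions");
  std::vector<size_t> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
      throw std::runtime_error("Dimension size mismatch");
    out[i] = std::max(a[i], b[i]); // the non-1 size wins
  }
  return out;
}
```
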
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
deleted file mode 100644
index 00326bef03b78d905f5923ae3ab5a79f327c2e7b..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-common.h
-Helper functions shared among functional/* header files and their corresponding
-*.cu files.
-These include util functions for CUDA, or functions on __device__, or Tensor
-methods that really should be in `struct Tensor`.
-*/
-
-#ifndef FUNCTIONAL_COMMON_H
-#define FUNCTIONAL_COMMON_H
-
-#include <cuda_fp16.h>
-#include <cudnn.h>
-#include <device_launch_parameters.h>
-#include <typeinfo>
-#include <vector>
-
-#include "debug.h"
-#include "profiling.h"
-#include "tensor.h"
-
-// Return ceil(a / b) for both host and device.
-template <typename T> __host__ __device__ __forceinline__ T ceilDiv(T a, T b) {
-  return (a + b - 1) / b;
-}
-
-// Profiles float -> half conversion, can be used like a guard.
-template <typename T> class HFProfileGuard {
-  static const char *getEventName(bool end) {
-    if (typeid(T) == typeid(half) || typeid(T) == typeid(half2))
-      return end ? "F2H_end" : "F2H_start";
-    ERROR("Type not accepted\n");
-    return ""; // For some compilers
-  }
-
-  static bool needProfiling() {
-    // Only profile when given type T is half / half2.
-    // As this guard is often used in templated, scalar-type-agnostic
-    // implementation of an operator, this `T` is often that operator's scalar
-    // type.
-    return typeid(T) == typeid(half) || typeid(T) == typeid(half2);
-  }
-
-public:
-  HFProfileGuard() {
-    if (needProfiling())
-      profileEvent(getEventName(false));
-  }
-
-  ~HFProfileGuard() {
-    if (needProfiling())
-      profileEvent(getEventName(true));
-  }
-};
-
-// Convert C++ type (given by template type T) to "tensor datatype", which is a
-// enum that `struct Tensor` recognizes.
-template <typename T> int getTensorType() {
-  if (typeid(T) == typeid(float))
-    return (int)float_type;
-  else if (typeid(T) == typeid(half))
-    return (int)half_type;
-  else if (typeid(T) == typeid(float2))
-    return (int)float2_type;
-  else if (typeid(T) == typeid(half2))
-    return (int)half2_type;
-  else {
-    ERROR("Unsupported type!\n");
-    return 0; // For some compilers
-  }
-}
-
-// Type-cast Tensor `t` to type `T` (regardless of what current type `t` has),
-// and return a pointer to its underlying data on GPU (which can be t->gpu_data
-// or t->gpu_half_data).
-// This is specialized and implemented for float, float2 (float-complex), half,
-// half2 (used for speeding up operations in half type)
-template <typename T> T *convertAndGetGPUData(Tensor *t);
-
-template <> float *convertAndGetGPUData<float>(Tensor *t);
-
-template <> float2 *convertAndGetGPUData<float2>(Tensor *t);
-
-template <> half *convertAndGetGPUData<half>(Tensor *t);
-
-template <> half2 *convertAndGetGPUData<half2>(Tensor *t);
-
-// Like convertAndGetGPUData, but calls `convertToFP32_offline` instead of
-// `convertToFP32`, which makes a difference when online / offline profiling is
-// involved.
-void convertToFloat2Offline(Tensor *t);
-
-// Return sizes of tensor with a vector.
-std::vector<size_t> sizes(Tensor *t);
-
-std::vector<size_t> sizes(const Dimension &dim);
-
-// Return total number of element in a tensor.
-size_t num_elems(const std::vector<size_t> &dim_sizes);
-
-size_t num_elems(const Dimension &dim);
-
-size_t num_elems(Tensor *t);
-
-// Checks equivalence of types t1 and t2 under the assumption that float=half
-// and float2=half2, and returns the equalized type.
-// 1. Define an equivalence operator (==):
-//  t == t          = True
-//  float == half   = True
-//  float2 == half2 = True
-//  otherwise       = False
-// and throws if t1 != t2.
-// 2. Returns the same type `t`. But as float is not _actually_ the same thing
-// as half, `get_half` determines wh which one to return. E.g. with t1 ==
-// float2, t2 == half, if get_half == true, half2 is returned, otherwise float2
-// is returned.
-Tensor_type_t getCompatibleType(int t1, int t2, bool get_half);
-
-#endif // FUNCTIONAL_COMMON_H
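
The removed `ceilDiv` helper is what the map and reduce kernels below use to size their launches. A standalone host-only sketch of that usage pattern, mirroring the grid/thread computation in the removed `map.cuh`; `launchBlocks` is a made-up name for illustration.

```cpp
// Standalone sketch: compute a block count for n_elem elements the way the
// removed map.cuh does, capping threads per block at 512 and blocks at 2048.
// Assumes n_elem > 0; launchBlocks is a hypothetical name.
#include <algorithm>

template <typename T> T ceilDiv(T a, T b) { return (a + b - 1) / b; }

unsigned launchBlocks(unsigned n_elem) {
  unsigned max_threads = 512, max_grid = 2048;
  unsigned threads = std::min(max_threads, n_elem);
  return std::min(max_grid, ceilDiv(n_elem, threads));
}
```
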
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
deleted file mode 100644
index 74568d8183a7a64f48750b4d02a6286224cac817..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-map.cuh
-Implementation of the map operator, including broadcast, the cuda kernel for
-map, and a general map_n function in host code (which calls the kernel).
-*/
-#ifndef FUNCTIONAL_MAP_H
-#define FUNCTIONAL_MAP_H
-
-#include <array>
-#include <cstddef>
-#include <device_launch_parameters.h>
-#include <type_traits>
-
-#include "broadcast.h"
-#include "common.h"
-#include "debug.h"
-#include "map_typing.h"
-#include "tensor.h"
-#include "tensor_utils.h"
-
-// Checks dimension and data order of each map argument.
-template <size_t N> void mapPrecheck(const std::array<Tensor *, N> &srcs) {
-  for (Tensor *src : srcs) {
-    if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW)
-      ERROR("Not supported\n");
-  }
-}
-
-// CUDA kernel for map_n. This is _actually_ mostly unused as specialization for
-// float / half exists for performance benefit.
-template <typename Scalar, size_t N>
-__global__ void kernelMapBroadcast(
-    Scalar *target, unsigned num_rows, void *func, Scalar **srcs,
-    size_t *tail_strides) {
-  auto *n_ary_op = (NTo1MapF<Scalar, N>)func;
-
-  unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x,
-           stride = gridDim.x * blockDim.x;
-  Scalar buf[N];
-  for (unsigned row = threadId; row < num_rows; row += stride) {
-    for (size_t j = 0; j < N; j++)
-      buf[j] = srcs[j][row / tail_strides[j]];
-    target[row] = call_on_c_array<Scalar, Scalar, N>(n_ary_op, buf);
-  }
-}
-
-// Instantiate float to compare fairly to half. Implemented for N = 1...3
-template <size_t N>
-__global__ void kernelMapBroadcast<float, N>(
-    half *target, unsigned num_rows, void *func, half **srcs,
-    size_t *tail_strides);
-
-// Half uses a different implementation. Implemented for N = 1, 2
-template <size_t N>
-__global__ void kernelMapBroadcast<half, N>(
-    half *target, unsigned num_rows, void *func, half **srcs,
-    size_t *tail_strides);
-
-// Create parameter for cuda kernel by copying pointers to device (gpu).
-// This function unwraps BroadcastRemap into a cuda array of size N -- one value
-// for the broadcast stride of each map argument, and unwraps `srcs` into their
-// gpu data pointers.
-template <typename Scalar, size_t N>
-std::tuple<size_t *, Scalar **> make_cuda_params(
-    const BroadcastRemap<N> &br, const std::array<Tensor *, N> &srcs) {
-  for (Tensor *t : srcs)
-    hostToDeviceCopy(t);
-  std::array<Scalar *, N> gpu_datas;
-  {
-    HFProfileGuard<Scalar> g;
-    std::transform(srcs.begin(), srcs.end(), gpu_datas.begin(), [](Tensor *t) {
-      return convertAndGetGPUData<Scalar>(t);
-    });
-  }
-  size_t *cuda_strides;
-  Scalar **cuda_gpu_data;
-  cudaMalloc(&cuda_strides, N * sizeof(size_t));
-  cudaMemcpy(
-      cuda_strides, br.getStrides(), N * sizeof(size_t),
-      cudaMemcpyHostToDevice);
-  cudaMalloc(&cuda_gpu_data, N * sizeof(Scalar *));
-  cudaMemcpy(
-      cuda_gpu_data, gpu_datas.data(), N * sizeof(size_t),
-      cudaMemcpyHostToDevice);
-  return std::make_tuple(cuda_strides, cuda_gpu_data);
-}
-
-// Host code for map_n that check and converts the parameters, and calls the
-// cuda kernel.
-template <
-    typename Scalar, size_t N, typename std::enable_if<N >= 1, int>::type = 0>
-__host__ Tensor *mapGeneral(MathOp mop, const std::array<Tensor *, N> &srcs) {
-  mapPrecheck(srcs);
-
-  auto br = BroadcastRemap<N>(srcs);
-  std::vector<size_t> dim_sizes = br.getDim();
-  auto *target = (Tensor *)create4DTensor(
-      getTensorType<Scalar>(), CUDNN_TENSOR_NCHW, dim_sizes[0], dim_sizes[1],
-      dim_sizes[2], dim_sizes[3]);
-  changeTensorPlacement(target, DEVICE);
-  void *func_ptr = mathOpToFunc<Scalar>(mop);
-
-  size_t *cuda_strides;
-  Scalar **gpu_data;
-  std::tie(cuda_strides, gpu_data) = make_cuda_params<Scalar, N>(br, srcs);
-
-  unsigned n_elem = num_elems(dim_sizes);
-  unsigned max_threads = 512, max_grid = 2048;
-  unsigned threads = std::min(max_threads, n_elem);
-  unsigned grids = std::min(max_grid, ceilDiv(n_elem, threads));
-  kernelMapBroadcast<Scalar, N><<<grids, threads>>>(
-      convertAndGetGPUData<Scalar>(target), n_elem, func_ptr, gpu_data,
-      cuda_strides);
-  cudaDeviceSynchronize();
-  checkCUDA(cudaGetLastError());
-  return target;
-}
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
deleted file mode 100644
index 54d919b3346047285bb0b89c2c8d97f625738183..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
-map_typing.h
-Helper for metaprogramming used by map.cuh.
-Defines some recursively templated types and functions.
-*/
-#ifndef FUNCTIONAL_MAP_TYPING_H
-#define FUNCTIONAL_MAP_TYPING_H
-
-#include <cstddef>
-#include <device_launch_parameters.h>
-#include <tuple>
-#include <utility>
-
-namespace {
-template <class T, size_t> using Type = T;
-
-template <typename, template <typename...> typename, typename> struct _RepNType;
-
-template <typename T, template <typename...> typename W, size_t... Is>
-struct _RepNType<T, W, std::index_sequence<Is...>> {
-  using type = W<Type<T, Is>...>;
-};
-
-// Constructs type W<T, T, T, T, ... (N times)> from T and N
-// RepNType T W N = W (T, T, T ... N times ..., T)
-template <typename T, template <typename...> typename W, size_t N>
-using RepNType = typename _RepNType<T, W, std::make_index_sequence<N>>::type;
-
-// Like std::function<Ret(Args...)> but denotes function raw pointer instead of
-// lambda function
-template <typename Ret, typename... Args> using FuncPtrT = Ret (*)(Args...);
-
-template <typename Ret, typename Arg, size_t N> struct _NAToBFunc {
-  template <typename... Args> using Wrapper = FuncPtrT<Ret, Args...>;
-
-  using type = RepNType<Arg, Wrapper, N>;
-};
-} // namespace
-
-// NAToBF Ret Arg N = Ret(*)(Arg, Arg, ...N times)
-template <typename Ret, typename Arg, size_t N>
-using NAToBF = typename _NAToBFunc<Ret, Arg, N>::type;
-
-// NTo1MapF Arg N = Arg(*)(Arg, Arg, ...N times)
-// This denotes n-to-1 map: Arg x Arg x Arg x ... -> Arg.
-template <typename Scalar, size_t N> using NTo1MapF = NAToBF<Scalar, Scalar, N>;
-
-// RepNTuple T N = std::tuple<Arg, Arg, ...N times>
-template <typename T, size_t N> using RepNTuple = RepNType<T, std::tuple, N>;
-
-namespace {
-template <typename TIterable, typename T, size_t... Is>
-constexpr RepNTuple<T, sizeof...(Is)> as_tuple(TIterable arr,
-                                               std::index_sequence<Is...>) {
-  return std::make_tuple(arr[Is]...);
-}
-
-template <typename Function, typename Tuple, size_t... I>
-__device__ auto call(Function f, Tuple t, std::index_sequence<I...>) {
-  return f(std::get<I>(t)...);
-}
-} // namespace
-
-// Converts Iterable of type T and length N to (same-typed) tuple
-// std::tuple<T, T, T, T, ...>
-template <typename TIterable, typename T, size_t N>
-constexpr RepNTuple<T, N> as_tuple(TIterable arr) {
-  return as_tuple<TIterable, T>(arr, std::make_index_sequence<N>{});
-}
-
-// Expands Tuple t into parameters of Function f, in python this would be
-// f(*t).
-template <typename Function, typename Tuple>
-__device__ auto call_on_tuple(Function f, Tuple t) {
-  static constexpr auto size = std::tuple_size<Tuple>::value;
-  return call(f, t, std::make_index_sequence<size>{});
-}
-
-// Expands Array of type T and size N into parameters of Function
-template <typename Ret, typename T, size_t N>
-__device__ Ret call_on_c_array(NAToBF<Ret, T, N> f, const T arr[N]) {
-  return call_on_tuple(f, as_tuple<const T *, T, N>(arr));
-}
-
-#endif // FUNCTIONAL_MAP_TYPING_H
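
To make the removed type machinery concrete: `NTo1MapF<Scalar, N>` expands to a raw function-pointer type taking N `Scalar` arguments and returning one, and `RepNTuple<T, N>` repeats `T` inside a `std::tuple` N times. The compile-time checks below are illustrative only and assume `map_typing.h` were still available to include.

```cpp
// Illustrative only: the aliases from the removed map_typing.h expand to
// ordinary function-pointer and tuple types.
#include <tuple>
#include <type_traits>

static_assert(std::is_same<NTo1MapF<float, 2>, float (*)(float, float)>::value,
              "a 2-ary float map is a float (*)(float, float)");
static_assert(
    std::is_same<RepNTuple<float, 3>, std::tuple<float, float, float>>::value,
    "RepNTuple repeats the element type N times");
```
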
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
deleted file mode 100644
index 9f4fabfb5e0b75017e901c2cb4c60d8649b04f07..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
-reduce.cuh
-Implementation for reduce operator.
-*/
-#include <device_launch_parameters.h>
-#include <functional>
-#include <numeric>
-
-#include "common.h"
-#include "debug.h"
-#include "tensor.h"
-#include "tensor_utils.h"
-
-// Between CUDA compute capability 1.0 and 7.5,
-// Least "max # threads per block" is 512, so 512 is used to be compatible;
-// at most 2048 threads per multiprocessor, where # of cores varies greatly
-// among devices. Titan X has 3072 cores, Quadro P1000 has 640. A bit of
-// over-subscription doesn't hurt. These numbers will keep us compatible even
-// for 1.0 devices.
-constexpr size_t NThreads = 512, MaxNBlocks = 2048 / NThreads * 3072;
-constexpr size_t MaxBlocksPerDim = 65535;
-
-constexpr size_t AlongDimTh = 16, CrossDimTh = 32;
-
-/*
- * Reduce along one dimension with a single thread.
- */
-template <typename K>
-__device__ void reduceAlongDim(
-    K *target, K *src, K init, K identity, void *func, size_t num_irows,
-    size_t dim_size);
-
-template <>
-__device__ void reduceAlongDim<float>(
-    float *target, float *src, float init, float identity, void *func,
-    size_t num_irows, size_t dim_size);
-
-template <>
-__device__ void reduceAlongDim<half>(
-    half *target, half *src, half init, half identity, void *func,
-    size_t num_irows, size_t dim_size);
-
-/*
- * Parallel reduce a dimension of tensor to a scalar value.
- * Use `n_along_dim_threads` threads to sweep along the dim to be reduced.
- * Intermediate values are collected in a divide-and-conquer manner,
- * with thread 0 finally writing back the result.
- */
-template <typename K>
-__device__ void parallelReduceAlongDim(
-    K *target, K *src, K init, K identity, void *func, size_t num_irows,
-    size_t dim_size, size_t along_dim_tid, size_t n_along_dim_threads);
-
-template <>
-__device__ void parallelReduceAlongDim<float>(
-    float *target, float *src, float init, float identity, void *func,
-    size_t num_irows, size_t dim_size, size_t along_dim_tid,
-    size_t n_along_dim_threads);
-
-template <>
-__device__ void parallelReduceAlongDim<half>(
-    half *target, half *src, half init, half identity, void *func,
-    size_t num_irows, size_t dim_size, size_t along_dim_tid,
-    size_t n_along_dim_threads);
-
-/*
- * Reduce the whole tensor with parallelism only on output.
- * The reduce axis is reduced sequentially.
- * Block is 2D, thread is 1D; block.y covers outer rows, block.x * thread.x
- * covers inner rows.
- */
-template <typename K>
-__global__ void kernelReduceDimSeq(
-    K *target_, K *src_, K init, K identity, void *func, size_t num_irows,
-    size_t num_orows, size_t row_size, size_t approx_row_size) {
-  size_t start_orow = blockIdx.y,
-         start_irow = blockIdx.x * blockDim.x + threadIdx.x;
-  size_t orow_stride = gridDim.y, irow_stride = gridDim.x * blockDim.x;
-  for (size_t orow = start_orow; orow < num_orows; orow += orow_stride) {
-    for (size_t irow = start_irow; irow < num_irows; irow += irow_stride) {
-      K *src = src_ + orow * row_size * num_irows + irow;
-      K *target = target_ + orow * num_irows + irow;
-      reduceAlongDim(
-          target, src, init, identity, func, num_irows, approx_row_size);
-    }
-  }
-}
-
-/*
- * Reduce the whole tensor with parallelism on output and reduce axis.
- * I.e., the reduce axis is reduced parallel.
- * Block is 2D, thread is 2D;
- * thread.x covers reduce axis,
- * block.x * thread.y covers inner rows,
- * and block.y covers outer rows.
- */
-template <typename K>
-__global__ void __launch_bounds__(NThreads) kernelReduceDimParallel(
-    K *target_, K *src_, K init, K identity, void *func, size_t num_irows,
-    size_t num_orows, size_t row_size, size_t approx_row_size) {
-  size_t start_orow = blockIdx.y,
-         start_irow = blockIdx.x * blockDim.y + threadIdx.y;
-  size_t orow_stride = gridDim.y, irow_stride = gridDim.x * blockDim.y;
-  for (size_t orow = start_orow; orow < num_orows; orow += orow_stride) {
-    for (size_t irow = start_irow; irow < num_irows; irow += irow_stride) {
-      K *src = src_ + orow * row_size * num_irows + irow;
-      K *target = target_ + orow * num_irows + irow;
-      parallelReduceAlongDim(
-          target, src, init, identity, func, num_irows, approx_row_size,
-          threadIdx.x, blockDim.x);
-    }
-  }
-}
-
-/* Entry point for `reduce` implementation. Calls the right version of reduction
- * kernel as needed. */
-template <typename Scalar>
-__host__ Tensor *reduceDim(
-    Tensor *src, const Scalar &init, MathOp op, size_t axis, float skip_rate) {
-  // Copy input over
-  hostToDeviceCopy(src);
-
-  // Prepare output
-  std::vector<size_t> in_sizes = sizes(src), out_sizes = in_sizes;
-  out_sizes[axis] = 1;
-  auto *target = (Tensor *)create4DTensor(
-      getTensorType<Scalar>(), CUDNN_TENSOR_NCHW, out_sizes[0], out_sizes[1],
-      out_sizes[2], out_sizes[3]);
-  changeTensorPlacement(target, DEVICE);
-
-  // Calculate schedule parameters
-  size_t num_orows = std::accumulate(
-      in_sizes.begin(), in_sizes.begin() + axis, 1, std::multiplies<>());
-  size_t row_size = in_sizes[axis];
-  size_t num_irows = std::accumulate(
-      in_sizes.begin() + axis + 1, in_sizes.end(), 1, std::multiplies<>());
-  size_t num_rows = num_irows * num_orows;
-
-  // Calculate approximation parameters
-  if (skip_rate != 0.0f)
-    INFO("Approximation happening...\n");
-  size_t approx_row_size = (size_t)((1 - skip_rate) * row_size);
-
-  void *func = mathOpToFunc<Scalar>(op);
-  Scalar identity = reduceOpToIdentity<Scalar>(op);
-  Scalar *src_data;
-  {
-    HFProfileGuard<Scalar> g;
-    src_data = convertAndGetGPUData<Scalar>(src);
-  }
-
-  // If # of output entries is small, and row_size is enough for 16 threads,
-  // reduce in parallel.
-  // Remember if reducing dim in parallel, threads must be (16, 32).
-  if (num_rows < NThreads * MaxNBlocks && row_size >= AlongDimTh * 8) {
-    DEBUG(
-        "Reducing in parallel, row size = %lu, actually using %lu\n", row_size,
-        approx_row_size);
-    size_t grid_x = std::min(MaxBlocksPerDim, ceilDiv(num_irows, 32ul));
-    size_t grid_y = std::min(
-        std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x));
-    dim3 threads(AlongDimTh, CrossDimTh);
-    dim3 grid(grid_x, grid_y);
-    kernelReduceDimParallel<Scalar><<<grid, threads>>>(
-        convertAndGetGPUData<Scalar>(target), src_data, init, identity, func,
-        num_irows, num_orows, row_size, approx_row_size);
-  } else {
-    DEBUG(
-        "Reducing sequentially, row size = %lu, actually using %lu\n", row_size,
-        approx_row_size);
-    // Reduce sequentially.
-    size_t threads = std::min(NThreads, num_irows);
-    size_t grid_x = std::min(MaxBlocksPerDim, ceilDiv(num_irows, threads));
-    size_t grid_y = std::min(
-        std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x));
-    dim3 grid(grid_x, grid_y);
-    kernelReduceDimSeq<Scalar><<<grid, threads>>>(
-        convertAndGetGPUData<Scalar>(target), src_data, init, identity, func,
-        num_irows, num_orows, row_size, approx_row_size);
-  }
-  cudaDeviceSynchronize();
-  checkCUDA(cudaGetLastError());
-  return target;
-}
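
The removed `reduceDim()` splits the input into `num_orows x row_size x num_irows` around the reduced axis before launching either kernel. A small worked example (illustrative only) of those schedule parameters for a 2x3x4x5 tensor reduced along axis 1:

```cpp
// Worked example of the schedule parameters computed in the removed
// reduceDim(): reducing axis 1 of a 2x3x4x5 tensor gives num_orows = 2,
// row_size = 3, num_irows = 4 * 5 = 20.
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

void reduceScheduleExample() {
  std::vector<size_t> in_sizes = {2, 3, 4, 5};
  size_t axis = 1;
  size_t num_orows = std::accumulate(in_sizes.begin(), in_sizes.begin() + axis,
                                     size_t{1}, std::multiplies<>());
  size_t row_size = in_sizes[axis];
  size_t num_irows = std::accumulate(in_sizes.begin() + axis + 1,
                                     in_sizes.end(), size_t{1},
                                     std::multiplies<>());
  (void)num_orows; // 2: product of dims before the axis
  (void)row_size;  // 3: the reduced dimension itself
  (void)num_irows; // 20: product of dims after the axis
}
```
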
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
index f859b83e94ecc2b4e103792b977279613f119d71..49c1725336ab4242ba4ed852a10ba3cde0d1c5d7 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
@@ -20,14 +20,8 @@
 #include "approx_knob_utils.h"
 #include "tensor.h"
 
-#define ERROR_INJECTION_ENABLED 0
 #define PROMISE_MODE 1
 
-#ifdef NO_INJECTION
-#undef ERROR_INJECTION_ENABLED
-#endif
-
-//#define ERROR_INJECTION_ENABLED 1
 /* Data declarations */
 extern cudnnHandle_t cudnnHandle;
 extern cublasHandle_t cublasHandle;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h
deleted file mode 100644
index da7337008d8d39d65a45ab906155ed409b35a991..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h
+++ /dev/null
@@ -1,8258 +0,0 @@
-/* stb_image - v2.23 - public domain image loader - http://nothings.org/stb
-                                  no warranty implied; use at your own risk
-
-   Do this:
-      #define STB_IMAGE_IMPLEMENTATION
-   before you include this file in *one* C or C++ file to create the
-implementation.
-
-   // i.e. it should look like this:
-   #include ...
-   #include ...
-   #include ...
-   #define STB_IMAGE_IMPLEMENTATION
-   #include "stb_image.h"
-
-   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
-   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using
-malloc,realloc,free
-
-
-   QUICK NOTES:
-      Primarily of interest to game developers and other people who can
-          avoid problematic images and only need the trivial interface
-
-      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8/16-bit-per-channel
-
-      TGA (not sure what subset, if a subset)
-      BMP non-1bpp, non-RLE
-      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
-
-      GIF (*comp always reports as 4-channel)
-      HDR (radiance rgbE format)
-      PIC (Softimage PIC)
-      PNM (PPM and PGM binary only)
-
-      Animated GIF still needs a proper API, but here's one way to do it:
-          http://gist.github.com/urraka/685d9a6340b26b830d49
-
-      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
-      - decode from arbitrary I/O callbacks
-      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
-
-   Full documentation under "DOCUMENTATION" below.
-
-
-LICENSE
-
-  See end of file for license information.
-
-RECENT REVISION HISTORY:
-
-      2.23  (2019-08-11) fix clang static analysis warning
-      2.22  (2019-03-04) gif fixes, fix warnings
-      2.21  (2019-02-25) fix typo in comment
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
-      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
-      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
-      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 RGB-format JPEG; remove white matting in PSD; allocate large structures on the stack; correct channel count for PNG & BMP
-      2.10  (2016-01-22) avoid warning introduced in 2.09
-      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
-
-   See end of file for full revision history.
-
-
- ============================    Contributors    =========================
-
- Image formats                          Extensions, features
-    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
-    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
-    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
-    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
-    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
-    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
-    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
-    github:urraka (animated gif)           Junggon Kim (PNM comments)
-    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
-                                           socks-the-fox (16-bit PNG)
-                                           Jeremy Sawicki (handle all ImageNet JPGs)
- Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
-    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
-    Arseny Kapoulkine
-    John-Mark Allen
-    Carmelo J Fdez-Aguera
-
- Bug & warning fixes
-    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
-    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
-    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
-    Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
-    the Horde3D community   Thomas Ruf         Ronny Chevalier    github:rlyeh
-    Janez Zemva             John Bartholomew   Michal Cichon      github:romigrou
-    Jonathan Blow           Ken Hamada         Tero Hanninen      github:svdijk
-    Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:snagar
-    Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:Zelex
-    Ryamond Barbiero        Paul Du Bois       Engin Manap        github:grim210
-    Aldo Culquicondor       Philipp Wiesemann  Dale Weiler        github:sammyhw
-    Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:phprus
-    Julian Raschke          Gregory Mullen     Baldur Karlsson    github:poppolopoppo
-    Christian Floisand      Kevin Schmidt      JR Smith           github:darealshinji
-    Blazej Dariusz Roszkowski                  github:Michaelangel007
-*/
-
-#ifndef STBI_INCLUDE_STB_IMAGE_H
-#define STBI_INCLUDE_STB_IMAGE_H
-
-// DOCUMENTATION
-//
-// Limitations:
-//    - no 12-bit-per-channel JPEG
-//    - no JPEGs with arithmetic coding
-//    - GIF always returns *comp=4
-//
-// Basic usage (see HDR discussion below for HDR usage):
-//    int x,y,n;
-//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
-//    // ... process data if not NULL ...
-//    // ... x = width, y = height, n = # 8-bit components per pixel ...
-//    // ... replace '0' with '1'..'4' to force that many components per pixel
-//    // ... but 'n' will always be the number that it would have been if you said 0
-//    stbi_image_free(data)
-//
-// Standard parameters:
-//    int *x                 -- outputs image width in pixels
-//    int *y                 -- outputs image height in pixels
-//    int *channels_in_file  -- outputs # of image components in image file
-//    int desired_channels   -- if non-zero, # of image components requested in result
-//
-// The return value from an image loader is an 'unsigned char *' which points
-// to the pixel data, or NULL on an allocation failure or if the image is
-// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
-// with each pixel consisting of N interleaved 8-bit components; the first
-// pixel pointed to is top-left-most in the image. There is no padding between
-// image scanlines or between pixels, regardless of format. The number of
-// components N is 'desired_channels' if desired_channels is non-zero, or
-// *channels_in_file otherwise. If desired_channels is non-zero,
-// *channels_in_file has the number of components that _would_ have been
-// output otherwise. E.g. if you set desired_channels to 4, you will always
-// get RGBA output, but you can check *channels_in_file to see if it's trivially
-// opaque because e.g. there were only 3 channels in the source image.
-//
-// An output image with N components has the following components interleaved
-// in this order in each pixel:
-//
-//     N=#comp     components
-//       1           grey
-//       2           grey, alpha
-//       3           red, green, blue
-//       4           red, green, blue, alpha
-//
-// If image loading fails for any reason, the return value will be NULL,
-// and *x, *y, *channels_in_file will be unchanged. The function
-// stbi_failure_reason() can be queried for an extremely brief, end-user
-// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
-// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get
-// slightly more user-friendly ones.
-//
-// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
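
The loading contract described above is easiest to see in a minimal program. The sketch below assumes an on-disk file `photo.png` and a translation unit that defines `STB_IMAGE_IMPLEMENTATION`; it forces RGBA output while still inspecting the file's own channel count.

```c
#include <stdio.h>
#include "stb_image.h" /* STB_IMAGE_IMPLEMENTATION must be defined in one TU */

int main(void) {
  int w, h, channels_in_file;
  unsigned char *rgba = stbi_load("photo.png", &w, &h, &channels_in_file, 4);
  if (!rgba) {
    fprintf(stderr, "load failed: %s\n", stbi_failure_reason());
    return 1;
  }
  /* rgba always has 4 components here; channels_in_file tells us whether the
     source actually carried alpha (e.g. 3 means it was an opaque RGB file). */
  printf("%dx%d, %d channels in file\n", w, h, channels_in_file);
  stbi_image_free(rgba);
  return 0;
}
```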
-//
-// ===========================================================================
-//
-// UNICODE:
-//
-//   If compiling for Windows and you wish to use Unicode filenames, compile
-//   with
-//       #define STBI_WINDOWS_UTF8
-//   and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert
-//   Windows wchar_t filenames to utf8.
-//
-// ===========================================================================
-//
-// Philosophy
-//
-// stb libraries are designed with the following priorities:
-//
-//    1. easy to use
-//    2. easy to maintain
-//    3. good performance
-//
-// Sometimes I let "good performance" creep up in priority over "easy to
-// maintain", and for best performance I may provide less-easy-to-use APIs that
-// give higher performance, in addition to the easy-to-use ones. Nevertheless,
-// it's important to keep in mind that from the standpoint of you, a client of
-// this library, all you care about is #1 and #3, and stb libraries DO NOT
-// emphasize #3 above all.
-//
-// Some secondary priorities arise directly from the first two, some of which
-// provide more explicit reasons why performance can't be emphasized.
-//
-//    - Portable ("ease of use")
-//    - Small source code footprint ("easy to maintain")
-//    - No dependencies ("ease of use")
-//
-// ===========================================================================
-//
-// I/O callbacks
-//
-// I/O callbacks allow you to read from arbitrary sources, like packaged
-// files or some other source. Data read from callbacks are processed
-// through a small internal buffer (currently 128 bytes) to try to reduce
-// overhead.
-//
-// The three functions you must define are "read" (reads some bytes of data),
-// "skip" (skips some bytes of data), "eof" (reports if the stream is at the
-// end).
-//
-// ===========================================================================
-//
-// SIMD support
-//
-// The JPEG decoder will try to automatically use SIMD kernels on x86 when
-// supported by the compiler. For ARM Neon support, you must explicitly
-// request it.
-//
-// (The old do-it-yourself SIMD API is no longer supported in the current
-// code.)
-//
-// On x86, SSE2 will automatically be used when available based on a run-time
-// test; if not, the generic C versions are used as a fall-back. On ARM targets,
-// the typical path is to have separate builds for NEON and non-NEON devices
-// (at least this is true for iOS and Android). Therefore, the NEON support is
-// toggled by a build flag: define STBI_NEON to get NEON loops.
-//
-// If for some reason you do not want to use any of SIMD code, or if
-// you have issues compiling it, you can disable it entirely by
-// defining STBI_NO_SIMD.
-//
-// ===========================================================================
-//
-// HDR image support   (disable by defining STBI_NO_HDR)
-//
-// stb_image supports loading HDR images in general, and currently the Radiance
-// .HDR file format specifically. You can still load any file through the
-// existing interface; if you attempt to load an HDR file, it will be
-// automatically remapped to LDR, assuming gamma 2.2 and an arbitrary scale
-// factor defaulting to 1; both of these constants can be reconfigured through
-// this interface:
-//
-//     stbi_hdr_to_ldr_gamma(2.2f);
-//     stbi_hdr_to_ldr_scale(1.0f);
-//
-// (note, do not use _inverse_ constants; stbi_image will invert them
-// appropriately).
-//
-// Additionally, there is a new, parallel interface for loading files as
-// (linear) floats to preserve the full dynamic range:
-//
-//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
-//
-// If you load LDR images through this interface, those images will
-// be promoted to floating point values, run through the inverse of
-// constants corresponding to the above:
-//
-//     stbi_ldr_to_hdr_scale(1.0f);
-//     stbi_ldr_to_hdr_gamma(2.2f);
-//
-// Finally, given a filename (or an open file or memory block--see header
-// file for details) containing image data, you can query for the "most
-// appropriate" interface to use (that is, whether the image is HDR or
-// not), using:
-//
-//     stbi_is_hdr(char *filename);
-//
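
A small sketch of the HDR-aware path described above, assuming the declarations in this header and an example path supplied by the caller: LDR inputs come back promoted through the `ldr_to_hdr` gamma/scale, HDR inputs keep their full range.

```c
#include <stdio.h>
#include "stb_image.h"

/* Returns linear float pixels for HDR and LDR files alike. */
static float *load_linear(const char *path, int *w, int *h, int *n) {
  if (stbi_is_hdr(path))
    printf("%s stores high-dynamic-range data\n", path);
  /* LDR inputs are promoted through the ldr_to_hdr gamma/scale above. */
  return stbi_loadf(path, w, h, n, 0);
}
```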
-// ===========================================================================
-//
-// iPhone PNG support:
-//
-// By default we convert iphone-formatted PNGs back to RGB, even though
-// they are internally encoded differently. You can disable this conversion
-// by calling stbi_convert_iphone_png_to_rgb(0), in which case
-// you will always just get the native iphone "format" through (which
-// is BGR stored in RGB).
-//
-// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
-// pixel to remove any premultiplied alpha *only* if the image file explicitly
-// says there's premultiplied data (currently only happens in iPhone images,
-// and only if iPhone convert-to-rgb processing is on).
-//
-// ===========================================================================
-//
-// ADDITIONAL CONFIGURATION
-//
-//  - You can suppress implementation of any of the decoders to reduce
-//    your code footprint by #defining one or more of the following
-//    symbols before creating the implementation.
-//
-//        STBI_NO_JPEG
-//        STBI_NO_PNG
-//        STBI_NO_BMP
-//        STBI_NO_PSD
-//        STBI_NO_TGA
-//        STBI_NO_GIF
-//        STBI_NO_HDR
-//        STBI_NO_PIC
-//        STBI_NO_PNM   (.ppm and .pgm)
-//
-//  - You can request *only* certain decoders and suppress all other ones
-//    (this will be more forward-compatible, as addition of new decoders
-//    doesn't require you to disable them explicitly):
-//
-//        STBI_ONLY_JPEG
-//        STBI_ONLY_PNG
-//        STBI_ONLY_BMP
-//        STBI_ONLY_PSD
-//        STBI_ONLY_TGA
-//        STBI_ONLY_GIF
-//        STBI_ONLY_HDR
-//        STBI_ONLY_PIC
-//        STBI_ONLY_PNM   (.ppm and .pgm)
-//
-//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
-//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
-//
-
-#ifndef STBI_NO_STDIO
-#include <stdio.h>
-#endif // STBI_NO_STDIO
-
-#define STBI_VERSION 1
-
-enum {
-  STBI_default = 0, // only used for desired_channels
-
-  STBI_grey = 1,
-  STBI_grey_alpha = 2,
-  STBI_rgb = 3,
-  STBI_rgb_alpha = 4
-};
-
-#include <stdlib.h>
-typedef unsigned char stbi_uc;
-typedef unsigned short stbi_us;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef STBIDEF
-#ifdef STB_IMAGE_STATIC
-#define STBIDEF static
-#else
-#define STBIDEF extern
-#endif
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// PRIMARY API - works on images of any type
-//
-
-//
-// load image by filename, open file, or memory buffer
-//
-
-typedef struct {
-  int (*read)(void *user, char *data,
-              int size); // fill 'data' with 'size' bytes.  return number of
-                         // bytes actually read
-  void (*skip)(void *user, int n); // skip the next 'n' bytes, or 'unget' the
-                                   // last -n bytes if negative
-  int (*eof)(void *user); // returns nonzero if we are at end of file/data
-} stbi_io_callbacks;
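
A toy callback set over an in-memory buffer, as a sketch of the read/skip/eof contract described in the I/O-callbacks notes above (purely illustrative; `stbi_load_from_memory` already covers this exact case).

```c
#include <string.h>

typedef struct { const unsigned char *data; int size, pos; } mem_stream;

static int mem_read(void *user, char *out, int size) {
  mem_stream *m = (mem_stream *)user;
  int n = m->size - m->pos;
  if (n > size) n = size;
  if (n < 0) n = 0;
  memcpy(out, m->data + m->pos, (size_t)n);
  m->pos += n;
  return n;                       /* number of bytes actually read */
}
static void mem_skip(void *user, int n) {
  ((mem_stream *)user)->pos += n; /* negative n "ungets" bytes */
}
static int mem_eof(void *user) {
  mem_stream *m = (mem_stream *)user;
  return m->pos >= m->size;
}

/* usage:
     mem_stream m = { bytes, num_bytes, 0 };
     stbi_io_callbacks cb = { mem_read, mem_skip, mem_eof };
     unsigned char *img = stbi_load_from_callbacks(&cb, &m, &w, &h, &n, 0);
*/
```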
-
-////////////////////////////////////
-//
-// 8-bits-per-channel interface
-//
-
-STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x,
-                                       int *y, int *channels_in_file,
-                                       int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk,
-                                          void *user, int *x, int *y,
-                                          int *channels_in_file,
-                                          int desired_channels);
-
-#ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y,
-                           int *channels_in_file, int desired_channels);
-STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y,
-                                     int *channels_in_file,
-                                     int desired_channels);
-// for stbi_load_from_file, file pointer is left pointing immediately after
-// image
-#endif
-
-#ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len,
-                                           int **delays, int *x, int *y, int *z,
-                                           int *comp, int req_comp);
-#endif
-
-#ifdef STBI_WINDOWS_UTF8
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
-                                       const wchar_t *input);
-#endif
-
-////////////////////////////////////
-//
-// 16-bits-per-channel interface
-//
-
-STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len,
-                                          int *x, int *y, int *channels_in_file,
-                                          int desired_channels);
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk,
-                                             void *user, int *x, int *y,
-                                             int *channels_in_file,
-                                             int desired_channels);
-
-#ifndef STBI_NO_STDIO
-STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y,
-                              int *channels_in_file, int desired_channels);
-STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y,
-                                        int *channels_in_file,
-                                        int desired_channels);
-#endif
-
-////////////////////////////////////
-//
-// float-per-channel interface
-//
-#ifndef STBI_NO_LINEAR
-STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x,
-                                      int *y, int *channels_in_file,
-                                      int desired_channels);
-STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk,
-                                         void *user, int *x, int *y,
-                                         int *channels_in_file,
-                                         int desired_channels);
-
-#ifndef STBI_NO_STDIO
-STBIDEF float *stbi_loadf(char const *filename, int *x, int *y,
-                          int *channels_in_file, int desired_channels);
-STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y,
-                                    int *channels_in_file,
-                                    int desired_channels);
-#endif
-#endif
-
-#ifndef STBI_NO_HDR
-STBIDEF void stbi_hdr_to_ldr_gamma(float gamma);
-STBIDEF void stbi_hdr_to_ldr_scale(float scale);
-#endif // STBI_NO_HDR
-
-#ifndef STBI_NO_LINEAR
-STBIDEF void stbi_ldr_to_hdr_gamma(float gamma);
-STBIDEF void stbi_ldr_to_hdr_scale(float scale);
-#endif // STBI_NO_LINEAR
-
-// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
-STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk,
-                                       void *user);
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
-#ifndef STBI_NO_STDIO
-STBIDEF int stbi_is_hdr(char const *filename);
-STBIDEF int stbi_is_hdr_from_file(FILE *f);
-#endif // STBI_NO_STDIO
-
-// get a VERY brief reason for failure
-// NOT THREADSAFE
-STBIDEF const char *stbi_failure_reason(void);
-
-// free the loaded image -- this is just free()
-STBIDEF void stbi_image_free(void *retval_from_stbi_load);
-
-// get image dimensions & components without fully decoding
-STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x,
-                                  int *y, int *comp);
-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user,
-                                     int *x, int *y, int *comp);
-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk,
-                                          void *user);
-
-#ifndef STBI_NO_STDIO
-STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp);
-STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp);
-STBIDEF int stbi_is_16_bit(char const *filename);
-STBIDEF int stbi_is_16_bit_from_file(FILE *f);
-#endif
-
-// for image formats that explicitly notate that they have premultiplied alpha,
-// we just return the colors as stored in the file. set this flag to force
-// unpremultiplication. results are undefined if the unpremultiply overflow.
-STBIDEF void
-stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
-
-// indicate whether we should process iphone images back to canonical format,
-// or just pass them through "as-is"
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
-
-// flip the image vertically, so the first pixel in the output array is the
-// bottom left
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
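
A common pattern with these flags is to request a bottom-left origin once, before any loads, when the pixels are headed for a GL-style texture. A minimal sketch, assuming the declarations above and `texture.png` as an example path:

```c
static unsigned char *load_texture_rgba(int *w, int *h) {
  int n;
  stbi_set_flip_vertically_on_load(1); /* global flag, affects later loads */
  return stbi_load("texture.png", w, h, &n, 4);
}
```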
-
-// ZLIB client - used by PNG, available for other purposes
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len,
-                                                int initial_size, int *outlen);
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer,
-                                                           int len,
-                                                           int initial_size,
-                                                           int *outlen,
-                                                           int parse_header);
-STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
-STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen,
-                                    const char *ibuffer, int ilen);
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len,
-                                               int *outlen);
-STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen,
-                                             const char *ibuffer, int ilen);
-
-#ifdef __cplusplus
-}
-#endif
-
-//
-//
-////   end header file   /////////////////////////////////////////////////////
-#endif // STBI_INCLUDE_STB_IMAGE_H
-
-#ifdef STB_IMAGE_IMPLEMENTATION
-
-#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) ||                       \
-    defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) ||                        \
-    defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) ||                        \
-    defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) ||                        \
-    defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB)
-#ifndef STBI_ONLY_JPEG
-#define STBI_NO_JPEG
-#endif
-#ifndef STBI_ONLY_PNG
-#define STBI_NO_PNG
-#endif
-#ifndef STBI_ONLY_BMP
-#define STBI_NO_BMP
-#endif
-#ifndef STBI_ONLY_PSD
-#define STBI_NO_PSD
-#endif
-#ifndef STBI_ONLY_TGA
-#define STBI_NO_TGA
-#endif
-#ifndef STBI_ONLY_GIF
-#define STBI_NO_GIF
-#endif
-#ifndef STBI_ONLY_HDR
-#define STBI_NO_HDR
-#endif
-#ifndef STBI_ONLY_PIC
-#define STBI_NO_PIC
-#endif
-#ifndef STBI_ONLY_PNM
-#define STBI_NO_PNM
-#endif
-#endif
-
-#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) &&                     \
-    !defined(STBI_NO_ZLIB)
-#define STBI_NO_ZLIB
-#endif
-
-#include <limits.h>
-#include <stdarg.h>
-#include <stddef.h> // ptrdiff_t on osx
-#include <stdlib.h>
-#include <string.h>
-
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h> // ldexp, pow
-#endif
-
-#ifndef STBI_NO_STDIO
-#include <stdio.h>
-#endif
-
-#ifndef STBI_ASSERT
-#include <assert.h>
-#define STBI_ASSERT(x) assert(x)
-#endif
-
-#ifdef __cplusplus
-#define STBI_EXTERN extern "C"
-#else
-#define STBI_EXTERN extern
-#endif
-
-#ifndef _MSC_VER
-#ifdef __cplusplus
-#define stbi_inline inline
-#else
-#define stbi_inline
-#endif
-#else
-#define stbi_inline __forceinline
-#endif
-
-#ifdef _MSC_VER
-typedef unsigned short stbi__uint16;
-typedef signed short stbi__int16;
-typedef unsigned int stbi__uint32;
-typedef signed int stbi__int32;
-#else
-#include <stdint.h>
-typedef uint16_t stbi__uint16;
-typedef int16_t stbi__int16;
-typedef uint32_t stbi__uint32;
-typedef int32_t stbi__int32;
-#endif
-
-// should produce compiler error if size is wrong
-typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 1 : -1];
-
-#ifdef _MSC_VER
-#define STBI_NOTUSED(v) (void)(v)
-#else
-#define STBI_NOTUSED(v) (void)sizeof(v)
-#endif
-
-#ifdef _MSC_VER
-#define STBI_HAS_LROTL
-#endif
-
-#ifdef STBI_HAS_LROTL
-#define stbi_lrot(x, y) _lrotl(x, y)
-#else
-#define stbi_lrot(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
-#endif
-
-#if defined(STBI_MALLOC) && defined(STBI_FREE) &&                              \
-    (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
-// ok
-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) &&                          \
-    !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
-// ok
-#else
-#error                                                                         \
-    "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
-#endif
-
-#ifndef STBI_MALLOC
-#define STBI_MALLOC(sz) malloc(sz)
-#define STBI_REALLOC(p, newsz) realloc(p, newsz)
-#define STBI_FREE(p) free(p)
-#endif
-
-#ifndef STBI_REALLOC_SIZED
-#define STBI_REALLOC_SIZED(p, oldsz, newsz) STBI_REALLOC(p, newsz)
-#endif
-
-// x86/x64 detection
-#if defined(__x86_64__) || defined(_M_X64)
-#define STBI__X64_TARGET
-#elif defined(__i386) || defined(_M_IX86)
-#define STBI__X86_TARGET
-#endif
-
-#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) &&    \
-    !defined(STBI_NO_SIMD)
-// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
-// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
-// but previous attempts to provide the SSE2 functions with runtime
-// detection caused numerous issues. The way architecture extensions are
-// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
-// New behavior: if compiled with -msse2, we use SSE2 without any
-// detection; if not, we don't use it at all.
-#define STBI_NO_SIMD
-#endif
-
-#if defined(__MINGW32__) && defined(STBI__X86_TARGET) &&                       \
-    !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
-// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid
-// STBI__X64_TARGET
-//
-// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
-// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
-// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
-// simultaneously enabling "-mstackrealign".
-//
-// See https://github.com/nothings/stb/issues/81 for more information.
-//
-// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
-// -mstackrealign to your build settings, feel free to #define
-// STBI_MINGW_ENABLE_SSE2.
-#define STBI_NO_SIMD
-#endif
-
-#if !defined(STBI_NO_SIMD) &&                                                  \
-    (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
-#define STBI_SSE2
-#include <emmintrin.h>
-
-#ifdef _MSC_VER
-
-#if _MSC_VER >= 1400 // not VC6
-#include <intrin.h>  // __cpuid
-static int stbi__cpuid3(void) {
-  int info[4];
-  __cpuid(info, 1);
-  return info[3];
-}
-#else
-static int stbi__cpuid3(void) {
-  int res;
-  __asm {
-      mov  eax,1
-      cpuid
-      mov  res,edx
-  }
-  return res;
-}
-#endif
-
-#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
-
-#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void) {
-  int info3 = stbi__cpuid3();
-  return ((info3 >> 26) & 1) != 0;
-}
-#endif
-
-#else // assume GCC-style if not VC++
-#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
-
-#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
-static int stbi__sse2_available(void) {
-  // If we're even attempting to compile this on GCC/Clang, that means
-  // -msse2 is on, which means the compiler is allowed to use SSE2
-  // instructions at will, and so are we.
-  return 1;
-}
-#endif
-
-#endif
-#endif
-
-// ARM NEON
-#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
-#undef STBI_NEON
-#endif
-
-#ifdef STBI_NEON
-#include <arm_neon.h>
-// assume GCC or Clang on ARM targets
-#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
-#endif
-
-#ifndef STBI_SIMD_ALIGN
-#define STBI_SIMD_ALIGN(type, name) type name
-#endif
-
-///////////////////////////////////////////////
-//
-//  stbi__context struct and start_xxx functions
-
-// stbi__context structure is our basic context used by all images, so it
-// contains all the IO context, plus some basic image information
-typedef struct {
-  stbi__uint32 img_x, img_y;
-  int img_n, img_out_n;
-
-  stbi_io_callbacks io;
-  void *io_user_data;
-
-  int read_from_callbacks;
-  int buflen;
-  stbi_uc buffer_start[128];
-
-  stbi_uc *img_buffer, *img_buffer_end;
-  stbi_uc *img_buffer_original, *img_buffer_original_end;
-} stbi__context;
-
-static void stbi__refill_buffer(stbi__context *s);
-
-// initialize a memory-decode context
-static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) {
-  s->io.read = NULL;
-  s->read_from_callbacks = 0;
-  s->img_buffer = s->img_buffer_original = (stbi_uc *)buffer;
-  s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *)buffer + len;
-}
-
-// initialize a callback-based context
-static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c,
-                                  void *user) {
-  s->io = *c;
-  s->io_user_data = user;
-  s->buflen = sizeof(s->buffer_start);
-  s->read_from_callbacks = 1;
-  s->img_buffer_original = s->buffer_start;
-  stbi__refill_buffer(s);
-  s->img_buffer_original_end = s->img_buffer_end;
-}
-
-#ifndef STBI_NO_STDIO
-
-static int stbi__stdio_read(void *user, char *data, int size) {
-  return (int)fread(data, 1, size, (FILE *)user);
-}
-
-static void stbi__stdio_skip(void *user, int n) {
-  fseek((FILE *)user, n, SEEK_CUR);
-}
-
-static int stbi__stdio_eof(void *user) { return feof((FILE *)user); }
-
-static stbi_io_callbacks stbi__stdio_callbacks = {
-    stbi__stdio_read,
-    stbi__stdio_skip,
-    stbi__stdio_eof,
-};
-
-static void stbi__start_file(stbi__context *s, FILE *f) {
-  stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *)f);
-}
-
-// static void stop_file(stbi__context *s) { }
-
-#endif // !STBI_NO_STDIO
-
-static void stbi__rewind(stbi__context *s) {
-  // conceptually rewind SHOULD rewind to the beginning of the stream,
-  // but we just rewind to the beginning of the initial buffer, because
-  // we only use it after doing 'test', which only ever looks at at most 92
-  // bytes
-  s->img_buffer = s->img_buffer_original;
-  s->img_buffer_end = s->img_buffer_original_end;
-}
-
-enum { STBI_ORDER_RGB, STBI_ORDER_BGR };
-
-typedef struct {
-  int bits_per_channel;
-  int num_channels;
-  int channel_order;
-} stbi__result_info;
-
-#ifndef STBI_NO_JPEG
-static int stbi__jpeg_test(stbi__context *s);
-static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp,
-                             int req_comp, stbi__result_info *ri);
-static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PNG
-static int stbi__png_test(stbi__context *s);
-static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri);
-static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
-static int stbi__png_is16(stbi__context *s);
-#endif
-
-#ifndef STBI_NO_BMP
-static int stbi__bmp_test(stbi__context *s);
-static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri);
-static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_TGA
-static int stbi__tga_test(stbi__context *s);
-static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri);
-static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PSD
-static int stbi__psd_test(stbi__context *s);
-static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri, int bpc);
-static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
-static int stbi__psd_is16(stbi__context *s);
-#endif
-
-#ifndef STBI_NO_HDR
-static int stbi__hdr_test(stbi__context *s);
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp,
-                             int req_comp, stbi__result_info *ri);
-static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PIC
-static int stbi__pic_test(stbi__context *s);
-static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri);
-static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_GIF
-static int stbi__gif_test(stbi__context *s);
-static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri);
-static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
-                                 int *z, int *comp, int req_comp);
-static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-#ifndef STBI_NO_PNM
-static int stbi__pnm_test(stbi__context *s);
-static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri);
-static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
-#endif
-
-// this is not threadsafe
-static const char *stbi__g_failure_reason;
-
-STBIDEF const char *stbi_failure_reason(void) { return stbi__g_failure_reason; }
-
-static int stbi__err(const char *str) {
-  stbi__g_failure_reason = str;
-  return 0;
-}
-
-static void *stbi__malloc(size_t size) { return STBI_MALLOC(size); }
-
-// stb_image uses ints pervasively, including for offset calculations.
-// therefore the largest decoded image size we can support with the
-// current code, even on 64-bit targets, is INT_MAX. this is not a
-// significant limitation for the intended use case.
-//
-// we do, however, need to make sure our size calculations don't
-// overflow. hence a few helper functions for size calculations that
-// multiply integers together, making sure that they're non-negative
-// and no overflow occurs.
-
-// return 1 if the sum is valid, 0 on overflow.
-// negative terms are considered invalid.
-static int stbi__addsizes_valid(int a, int b) {
-  if (b < 0)
-    return 0;
-  // now 0 <= b <= INT_MAX, hence also
-  // 0 <= INT_MAX - b <= INT_MAX.
-  // And "a + b <= INT_MAX" (which might overflow) is the
-  // same as a <= INT_MAX - b (no overflow)
-  return a <= INT_MAX - b;
-}
-
-// returns 1 if the product is valid, 0 on overflow.
-// negative factors are considered invalid.
-static int stbi__mul2sizes_valid(int a, int b) {
-  if (a < 0 || b < 0)
-    return 0;
-  if (b == 0)
-    return 1; // mul-by-0 is always safe
-  // portable way to check for no overflows in a*b
-  return a <= INT_MAX / b;
-}
-
-// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad2sizes_valid(int a, int b, int add) {
-  return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add);
-}
-
-// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
-static int stbi__mad3sizes_valid(int a, int b, int c, int add) {
-  return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
-         stbi__addsizes_valid(a * b * c, add);
-}
-
-// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't
-// overflow
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) {
-  return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
-         stbi__mul2sizes_valid(a * b * c, d) &&
-         stbi__addsizes_valid(a * b * c * d, add);
-}
-#endif
-
-// mallocs with size overflow checking
-static void *stbi__malloc_mad2(int a, int b, int add) {
-  if (!stbi__mad2sizes_valid(a, b, add))
-    return NULL;
-  return stbi__malloc(a * b + add);
-}
-
-static void *stbi__malloc_mad3(int a, int b, int c, int add) {
-  if (!stbi__mad3sizes_valid(a, b, c, add))
-    return NULL;
-  return stbi__malloc(a * b * c + add);
-}
-
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) {
-  if (!stbi__mad4sizes_valid(a, b, c, d, add))
-    return NULL;
-  return stbi__malloc(a * b * c * d + add);
-}
-#endif
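
For instance, allocating `w * h * channels` bytes through these helpers fails cleanly instead of wrapping: 25000 x 25000 x 4 is roughly 2.5e9, which exceeds INT_MAX, so the hypothetical wrapper below returns NULL for such an image rather than allocating a too-small buffer.

```c
/* Allocate w*h*channels bytes only if the product fits in an int. */
static unsigned char *alloc_image(int w, int h, int channels) {
  return (unsigned char *)stbi__malloc_mad3(w, h, channels, 0);
}
```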
-
-// stbi__err - error
-// stbi__errpf - error returning pointer to float
-// stbi__errpuc - error returning pointer to unsigned char
-
-#ifdef STBI_NO_FAILURE_STRINGS
-#define stbi__err(x, y) 0
-#elif defined(STBI_FAILURE_USERMSG)
-#define stbi__err(x, y) stbi__err(y)
-#else
-#define stbi__err(x, y) stbi__err(x)
-#endif
-
-#define stbi__errpf(x, y) ((float *)(size_t)(stbi__err(x, y) ? NULL : NULL))
-#define stbi__errpuc(x, y)                                                     \
-  ((unsigned char *)(size_t)(stbi__err(x, y) ? NULL : NULL))
-
-STBIDEF void stbi_image_free(void *retval_from_stbi_load) {
-  STBI_FREE(retval_from_stbi_load);
-}
-
-#ifndef STBI_NO_LINEAR
-static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
-#endif
-
-#ifndef STBI_NO_HDR
-static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp);
-#endif
-
-static int stbi__vertically_flip_on_load = 0;
-
-STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) {
-  stbi__vertically_flip_on_load = flag_true_if_should_flip;
-}
-
-static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp,
-                             int req_comp, stbi__result_info *ri, int bpc) {
-  memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
-  ri->bits_per_channel =
-      8; // default is 8 so most paths don't have to be changed
-  ri->channel_order =
-      STBI_ORDER_RGB; // all current input & output are this, but this is here
-                      // so we can add BGR order
-  ri->num_channels = 0;
-
-#ifndef STBI_NO_JPEG
-  if (stbi__jpeg_test(s))
-    return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_PNG
-  if (stbi__png_test(s))
-    return stbi__png_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_BMP
-  if (stbi__bmp_test(s))
-    return stbi__bmp_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_GIF
-  if (stbi__gif_test(s))
-    return stbi__gif_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_PSD
-  if (stbi__psd_test(s))
-    return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc);
-#endif
-#ifndef STBI_NO_PIC
-  if (stbi__pic_test(s))
-    return stbi__pic_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_PNM
-  if (stbi__pnm_test(s))
-    return stbi__pnm_load(s, x, y, comp, req_comp, ri);
-#endif
-
-#ifndef STBI_NO_HDR
-  if (stbi__hdr_test(s)) {
-    float *hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri);
-    return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
-  }
-#endif
-
-#ifndef STBI_NO_TGA
-  // test tga last because it's a crappy test!
-  if (stbi__tga_test(s))
-    return stbi__tga_load(s, x, y, comp, req_comp, ri);
-#endif
-
-  return stbi__errpuc("unknown image type",
-                      "Image not of any known type, or corrupt");
-}
-
-static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h,
-                                      int channels) {
-  int i;
-  int img_len = w * h * channels;
-  stbi_uc *reduced;
-
-  reduced = (stbi_uc *)stbi__malloc(img_len);
-  if (reduced == NULL)
-    return stbi__errpuc("outofmem", "Out of memory");
-
-  for (i = 0; i < img_len; ++i)
-    reduced[i] =
-        (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient
-                                          // approx of 16->8 bit scaling
-
-  STBI_FREE(orig);
-  return reduced;
-}
-
-static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h,
-                                           int channels) {
-  int i;
-  int img_len = w * h * channels;
-  stbi__uint16 *enlarged;
-
-  enlarged = (stbi__uint16 *)stbi__malloc(img_len * 2);
-  if (enlarged == NULL)
-    return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
-
-  for (i = 0; i < img_len; ++i)
-    enlarged[i] = (stbi__uint16)(
-        (orig[i] << 8) +
-        orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
-
-  STBI_FREE(orig);
-  return enlarged;
-}
-
-static void stbi__vertical_flip(void *image, int w, int h,
-                                int bytes_per_pixel) {
-  int row;
-  size_t bytes_per_row = (size_t)w * bytes_per_pixel;
-  stbi_uc temp[2048];
-  stbi_uc *bytes = (stbi_uc *)image;
-
-  for (row = 0; row < (h >> 1); row++) {
-    stbi_uc *row0 = bytes + row * bytes_per_row;
-    stbi_uc *row1 = bytes + (h - row - 1) * bytes_per_row;
-    // swap row0 with row1
-    size_t bytes_left = bytes_per_row;
-    while (bytes_left) {
-      size_t bytes_copy =
-          (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
-      memcpy(temp, row0, bytes_copy);
-      memcpy(row0, row1, bytes_copy);
-      memcpy(row1, temp, bytes_copy);
-      row0 += bytes_copy;
-      row1 += bytes_copy;
-      bytes_left -= bytes_copy;
-    }
-  }
-}
-
-#ifndef STBI_NO_GIF
-static void stbi__vertical_flip_slices(void *image, int w, int h, int z,
-                                       int bytes_per_pixel) {
-  int slice;
-  int slice_size = w * h * bytes_per_pixel;
-
-  stbi_uc *bytes = (stbi_uc *)image;
-  for (slice = 0; slice < z; ++slice) {
-    stbi__vertical_flip(bytes, w, h, bytes_per_pixel);
-    bytes += slice_size;
-  }
-}
-#endif
-
-static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x,
-                                                      int *y, int *comp,
-                                                      int req_comp) {
-  stbi__result_info ri;
-  void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
-
-  if (result == NULL)
-    return NULL;
-
-  if (ri.bits_per_channel != 8) {
-    STBI_ASSERT(ri.bits_per_channel == 16);
-    result = stbi__convert_16_to_8((stbi__uint16 *)result, *x, *y,
-                                   req_comp == 0 ? *comp : req_comp);
-    ri.bits_per_channel = 8;
-  }
-
-  // @TODO: move stbi__convert_format to here
-
-  if (stbi__vertically_flip_on_load) {
-    int channels = req_comp ? req_comp : *comp;
-    stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
-  }
-
-  return (unsigned char *)result;
-}
-
-static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x,
-                                                      int *y, int *comp,
-                                                      int req_comp) {
-  stbi__result_info ri;
-  void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
-
-  if (result == NULL)
-    return NULL;
-
-  if (ri.bits_per_channel != 16) {
-    STBI_ASSERT(ri.bits_per_channel == 8);
-    result = stbi__convert_8_to_16((stbi_uc *)result, *x, *y,
-                                   req_comp == 0 ? *comp : req_comp);
-    ri.bits_per_channel = 16;
-  }
-
-  // @TODO: move stbi__convert_format16 to here
-  // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to
-  // keep more precision
-
-  if (stbi__vertically_flip_on_load) {
-    int channels = req_comp ? req_comp : *comp;
-    stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
-  }
-
-  return (stbi__uint16 *)result;
-}
-
-#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR)
-static void stbi__float_postprocess(float *result, int *x, int *y, int *comp,
-                                    int req_comp) {
-  if (stbi__vertically_flip_on_load && result != NULL) {
-    int channels = req_comp ? req_comp : *comp;
-    stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
-  }
-}
-#endif
-
-#ifndef STBI_NO_STDIO
-
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
-STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(
-    unsigned int cp, unsigned long flags, const char *str, int cbmb,
-    wchar_t *widestr, int cchwide);
-STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(
-    unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide,
-    char *str, int cbmb, const char *defchar, int *used_default);
-#endif
-
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
-STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
-                                       const wchar_t *input) {
-  return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer,
-                             (int)bufferlen, NULL, NULL);
-}
-#endif
-
-static FILE *stbi__fopen(char const *filename, char const *mode) {
-  FILE *f;
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
-  wchar_t wMode[64];
-  wchar_t wFilename[1024];
-  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename,
-                               sizeof(wFilename)))
-    return 0;
-
-  if (0 ==
-      MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
-    return 0;
-
-#if _MSC_VER >= 1400
-  if (0 != _wfopen_s(&f, wFilename, wMode))
-    f = 0;
-#else
-  f = _wfopen(wFilename, wMode);
-#endif
-
-#elif defined(_MSC_VER) && _MSC_VER >= 1400
-  if (0 != fopen_s(&f, filename, mode))
-    f = 0;
-#else
-  f = fopen(filename, mode);
-#endif
-  return f;
-}
-
-STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp,
-                           int req_comp) {
-  FILE *f = stbi__fopen(filename, "rb");
-  unsigned char *result;
-  if (!f)
-    return stbi__errpuc("can't fopen", "Unable to open file");
-  result = stbi_load_from_file(f, x, y, comp, req_comp);
-  fclose(f);
-  return result;
-}
-
-STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp,
-                                     int req_comp) {
-  unsigned char *result;
-  stbi__context s;
-  stbi__start_file(&s, f);
-  result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
-  if (result) {
-    // need to 'unget' all the characters in the IO buffer
-    fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
-  }
-  return result;
-}
-
-STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp,
-                                             int req_comp) {
-  stbi__uint16 *result;
-  stbi__context s;
-  stbi__start_file(&s, f);
-  result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp);
-  if (result) {
-    // need to 'unget' all the characters in the IO buffer
-    fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
-  }
-  return result;
-}
-
-STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp,
-                              int req_comp) {
-  FILE *f = stbi__fopen(filename, "rb");
-  stbi__uint16 *result;
-  if (!f)
-    return (stbi_us *)stbi__errpuc("can't fopen", "Unable to open file");
-  result = stbi_load_from_file_16(f, x, y, comp, req_comp);
-  fclose(f);
-  return result;
-}
-
-#endif //! STBI_NO_STDIO
-
-STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len,
-                                          int *x, int *y, int *channels_in_file,
-                                          int desired_channels) {
-  stbi__context s;
-  stbi__start_mem(&s, buffer, len);
-  return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file,
-                                          desired_channels);
-}
-
-STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk,
-                                             void *user, int *x, int *y,
-                                             int *channels_in_file,
-                                             int desired_channels) {
-  stbi__context s;
-  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-  return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file,
-                                          desired_channels);
-}
-
-STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x,
-                                       int *y, int *comp, int req_comp) {
-  stbi__context s;
-  stbi__start_mem(&s, buffer, len);
-  return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
-}
-
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk,
-                                          void *user, int *x, int *y, int *comp,
-                                          int req_comp) {
-  stbi__context s;
-  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-  return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
-}
-
-#ifndef STBI_NO_GIF
-STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len,
-                                           int **delays, int *x, int *y, int *z,
-                                           int *comp, int req_comp) {
-  unsigned char *result;
-  stbi__context s;
-  stbi__start_mem(&s, buffer, len);
-
-  result =
-      (unsigned char *)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
-  if (stbi__vertically_flip_on_load) {
-    stbi__vertical_flip_slices(result, *x, *y, *z, *comp);
-  }
-
-  return result;
-}
-#endif
-
-#ifndef STBI_NO_LINEAR
-static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp,
-                               int req_comp) {
-  unsigned char *data;
-#ifndef STBI_NO_HDR
-  if (stbi__hdr_test(s)) {
-    stbi__result_info ri;
-    float *hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri);
-    if (hdr_data)
-      stbi__float_postprocess(hdr_data, x, y, comp, req_comp);
-    return hdr_data;
-  }
-#endif
-  data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
-  if (data)
-    return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
-  return stbi__errpf("unknown image type",
-                     "Image not of any known type, or corrupt");
-}
-
-STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x,
-                                      int *y, int *comp, int req_comp) {
-  stbi__context s;
-  stbi__start_mem(&s, buffer, len);
-  return stbi__loadf_main(&s, x, y, comp, req_comp);
-}
-
-STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk,
-                                         void *user, int *x, int *y, int *comp,
-                                         int req_comp) {
-  stbi__context s;
-  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-  return stbi__loadf_main(&s, x, y, comp, req_comp);
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp,
-                          int req_comp) {
-  float *result;
-  FILE *f = stbi__fopen(filename, "rb");
-  if (!f)
-    return stbi__errpf("can't fopen", "Unable to open file");
-  result = stbi_loadf_from_file(f, x, y, comp, req_comp);
-  fclose(f);
-  return result;
-}
-
-STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp,
-                                    int req_comp) {
-  stbi__context s;
-  stbi__start_file(&s, f);
-  return stbi__loadf_main(&s, x, y, comp, req_comp);
-}
-#endif // !STBI_NO_STDIO
-
-#endif // !STBI_NO_LINEAR
-
-// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is
-// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always
-// reports false!
-
-STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) {
-#ifndef STBI_NO_HDR
-  stbi__context s;
-  stbi__start_mem(&s, buffer, len);
-  return stbi__hdr_test(&s);
-#else
-  STBI_NOTUSED(buffer);
-  STBI_NOTUSED(len);
-  return 0;
-#endif
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF int stbi_is_hdr(char const *filename) {
-  FILE *f = stbi__fopen(filename, "rb");
-  int result = 0;
-  if (f) {
-    result = stbi_is_hdr_from_file(f);
-    fclose(f);
-  }
-  return result;
-}
-
-STBIDEF int stbi_is_hdr_from_file(FILE *f) {
-#ifndef STBI_NO_HDR
-  long pos = ftell(f);
-  int res;
-  stbi__context s;
-  stbi__start_file(&s, f);
-  res = stbi__hdr_test(&s);
-  fseek(f, pos, SEEK_SET);
-  return res;
-#else
-  STBI_NOTUSED(f);
-  return 0;
-#endif
-}
-#endif // !STBI_NO_STDIO
-
-STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk,
-                                       void *user) {
-#ifndef STBI_NO_HDR
-  stbi__context s;
-  stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
-  return stbi__hdr_test(&s);
-#else
-  STBI_NOTUSED(clbk);
-  STBI_NOTUSED(user);
-  return 0;
-#endif
-}
-
-#ifndef STBI_NO_LINEAR
-static float stbi__l2h_gamma = 2.2f, stbi__l2h_scale = 1.0f;
-
-STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
-STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
-#endif
-
-static float stbi__h2l_gamma_i = 1.0f / 2.2f, stbi__h2l_scale_i = 1.0f;
-
-STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) {
-  stbi__h2l_gamma_i = 1 / gamma;
-}
-STBIDEF void stbi_hdr_to_ldr_scale(float scale) {
-  stbi__h2l_scale_i = 1 / scale;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// Common code used by all image loaders
-//
-
-enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header };
-
-static void stbi__refill_buffer(stbi__context *s) {
-  int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen);
-  if (n == 0) {
-    // at end of file, treat same as if from memory, but need to handle case
-    // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
-    s->read_from_callbacks = 0;
-    s->img_buffer = s->buffer_start;
-    s->img_buffer_end = s->buffer_start + 1;
-    *s->img_buffer = 0;
-  } else {
-    s->img_buffer = s->buffer_start;
-    s->img_buffer_end = s->buffer_start + n;
-  }
-}
-
-stbi_inline static stbi_uc stbi__get8(stbi__context *s) {
-  if (s->img_buffer < s->img_buffer_end)
-    return *s->img_buffer++;
-  if (s->read_from_callbacks) {
-    stbi__refill_buffer(s);
-    return *s->img_buffer++;
-  }
-  return 0;
-}
-
-stbi_inline static int stbi__at_eof(stbi__context *s) {
-  if (s->io.read) {
-    if (!(s->io.eof)(s->io_user_data))
-      return 0;
-    // if feof() is true, check if buffer = end
-    // special case: we've only got the special 0 character at the end
-    if (s->read_from_callbacks == 0)
-      return 1;
-  }
-
-  return s->img_buffer >= s->img_buffer_end;
-}
-
-static void stbi__skip(stbi__context *s, int n) {
-  if (n < 0) {
-    s->img_buffer = s->img_buffer_end;
-    return;
-  }
-  if (s->io.read) {
-    int blen = (int)(s->img_buffer_end - s->img_buffer);
-    if (blen < n) {
-      s->img_buffer = s->img_buffer_end;
-      (s->io.skip)(s->io_user_data, n - blen);
-      return;
-    }
-  }
-  s->img_buffer += n;
-}
-
-static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) {
-  if (s->io.read) {
-    int blen = (int)(s->img_buffer_end - s->img_buffer);
-    if (blen < n) {
-      int res, count;
-
-      memcpy(buffer, s->img_buffer, blen);
-
-      count = (s->io.read)(s->io_user_data, (char *)buffer + blen, n - blen);
-      res = (count == (n - blen));
-      s->img_buffer = s->img_buffer_end;
-      return res;
-    }
-  }
-
-  if (s->img_buffer + n <= s->img_buffer_end) {
-    memcpy(buffer, s->img_buffer, n);
-    s->img_buffer += n;
-    return 1;
-  } else
-    return 0;
-}
-
-static int stbi__get16be(stbi__context *s) {
-  int z = stbi__get8(s);
-  return (z << 8) + stbi__get8(s);
-}
-
-static stbi__uint32 stbi__get32be(stbi__context *s) {
-  stbi__uint32 z = stbi__get16be(s);
-  return (z << 16) + stbi__get16be(s);
-}
-
-#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
-// nothing
-#else
-static int stbi__get16le(stbi__context *s) {
-  int z = stbi__get8(s);
-  return z + (stbi__get8(s) << 8);
-}
-#endif
-
-#ifndef STBI_NO_BMP
-static stbi__uint32 stbi__get32le(stbi__context *s) {
-  stbi__uint32 z = stbi__get16le(s);
-  return z + (stbi__get16le(s) << 16);
-}
-#endif
-
-#define STBI__BYTECAST(x)                                                      \
-  ((stbi_uc)((x)&255)) // truncate int to byte without warnings
-
-//////////////////////////////////////////////////////////////////////////////
-//
-//  generic converter from built-in img_n to req_comp
-//    individual types do this automatically as much as possible (e.g. jpeg
-//    does all cases internally since it needs to colorspace convert anyway,
-//    and it never has alpha, so very few cases ). png can automatically
-//    interleave an alpha=255 channel, but falls back to this for other cases
-//
-//  assume data buffer is malloced, so malloc a new one and free that one
-//  only failure mode is malloc failing
-
-static stbi_uc stbi__compute_y(int r, int g, int b) {
-  return (stbi_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8);
-}
-
-static unsigned char *stbi__convert_format(unsigned char *data, int img_n,
-                                           int req_comp, unsigned int x,
-                                           unsigned int y) {
-  int i, j;
-  unsigned char *good;
-
-  if (req_comp == img_n)
-    return data;
-  STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-  good = (unsigned char *)stbi__malloc_mad3(req_comp, x, y, 0);
-  if (good == NULL) {
-    STBI_FREE(data);
-    return stbi__errpuc("outofmem", "Out of memory");
-  }
-
-  for (j = 0; j < (int)y; ++j) {
-    unsigned char *src = data + j * x * img_n;
-    unsigned char *dest = good + j * x * req_comp;
-
-#define STBI__COMBO(a, b) ((a)*8 + (b))
-#define STBI__CASE(a, b)                                                       \
-  case STBI__COMBO(a, b):                                                      \
-    for (i = x - 1; i >= 0; --i, src += a, dest += b)
-    // convert source image with img_n components to one with req_comp
-    // components; avoid switch per pixel, so use switch per scanline and
-    // massive macros
-    switch (STBI__COMBO(img_n, req_comp)) {
-      STBI__CASE(1, 2) {
-        dest[0] = src[0];
-        dest[1] = 255;
-      }
-      break;
-      STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-      break;
-      STBI__CASE(1, 4) {
-        dest[0] = dest[1] = dest[2] = src[0];
-        dest[3] = 255;
-      }
-      break;
-      STBI__CASE(2, 1) { dest[0] = src[0]; }
-      break;
-      STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-      break;
-      STBI__CASE(2, 4) {
-        dest[0] = dest[1] = dest[2] = src[0];
-        dest[3] = src[1];
-      }
-      break;
-      STBI__CASE(3, 4) {
-        dest[0] = src[0];
-        dest[1] = src[1];
-        dest[2] = src[2];
-        dest[3] = 255;
-      }
-      break;
-      STBI__CASE(3, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); }
-      break;
-      STBI__CASE(3, 2) {
-        dest[0] = stbi__compute_y(src[0], src[1], src[2]);
-        dest[1] = 255;
-      }
-      break;
-      STBI__CASE(4, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); }
-      break;
-      STBI__CASE(4, 2) {
-        dest[0] = stbi__compute_y(src[0], src[1], src[2]);
-        dest[1] = src[3];
-      }
-      break;
-      STBI__CASE(4, 3) {
-        dest[0] = src[0];
-        dest[1] = src[1];
-        dest[2] = src[2];
-      }
-      break;
-    default:
-      STBI_ASSERT(0);
-    }
-#undef STBI__CASE
-  }
-
-  STBI_FREE(data);
-  return good;
-}
-
-static stbi__uint16 stbi__compute_y_16(int r, int g, int b) {
-  return (stbi__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8);
-}
-
-static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n,
-                                            int req_comp, unsigned int x,
-                                            unsigned int y) {
-  int i, j;
-  stbi__uint16 *good;
-
-  if (req_comp == img_n)
-    return data;
-  STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
-
-  good = (stbi__uint16 *)stbi__malloc(req_comp * x * y * 2);
-  if (good == NULL) {
-    STBI_FREE(data);
-    return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory");
-  }
-
-  for (j = 0; j < (int)y; ++j) {
-    stbi__uint16 *src = data + j * x * img_n;
-    stbi__uint16 *dest = good + j * x * req_comp;
-
-#define STBI__COMBO(a, b) ((a)*8 + (b))
-#define STBI__CASE(a, b)                                                       \
-  case STBI__COMBO(a, b):                                                      \
-    for (i = x - 1; i >= 0; --i, src += a, dest += b)
-    // convert source image with img_n components to one with req_comp
-    // components; avoid switch per pixel, so use switch per scanline and
-    // massive macros
-    switch (STBI__COMBO(img_n, req_comp)) {
-      STBI__CASE(1, 2) {
-        dest[0] = src[0];
-        dest[1] = 0xffff;
-      }
-      break;
-      STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-      break;
-      STBI__CASE(1, 4) {
-        dest[0] = dest[1] = dest[2] = src[0];
-        dest[3] = 0xffff;
-      }
-      break;
-      STBI__CASE(2, 1) { dest[0] = src[0]; }
-      break;
-      STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; }
-      break;
-      STBI__CASE(2, 4) {
-        dest[0] = dest[1] = dest[2] = src[0];
-        dest[3] = src[1];
-      }
-      break;
-      STBI__CASE(3, 4) {
-        dest[0] = src[0];
-        dest[1] = src[1];
-        dest[2] = src[2];
-        dest[3] = 0xffff;
-      }
-      break;
-      STBI__CASE(3, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); }
-      break;
-      STBI__CASE(3, 2) {
-        dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
-        dest[1] = 0xffff;
-      }
-      break;
-      STBI__CASE(4, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); }
-      break;
-      STBI__CASE(4, 2) {
-        dest[0] = stbi__compute_y_16(src[0], src[1], src[2]);
-        dest[1] = src[3];
-      }
-      break;
-      STBI__CASE(4, 3) {
-        dest[0] = src[0];
-        dest[1] = src[1];
-        dest[2] = src[2];
-      }
-      break;
-    default:
-      STBI_ASSERT(0);
-    }
-#undef STBI__CASE
-  }
-
-  STBI_FREE(data);
-  return good;
-}
-
-#ifndef STBI_NO_LINEAR
-static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) {
-  int i, k, n;
-  float *output;
-  if (!data)
-    return NULL;
-  output = (float *)stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
-  if (output == NULL) {
-    STBI_FREE(data);
-    return stbi__errpf("outofmem", "Out of memory");
-  }
-  // compute number of non-alpha components
-  if (comp & 1)
-    n = comp;
-  else
-    n = comp - 1;
-  for (i = 0; i < x * y; ++i) {
-    for (k = 0; k < n; ++k) {
-      output[i * comp + k] =
-          (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) *
-                  stbi__l2h_scale);
-    }
-  }
-  if (n < comp) {
-    for (i = 0; i < x * y; ++i) {
-      output[i * comp + n] = data[i * comp + n] / 255.0f;
-    }
-  }
-  STBI_FREE(data);
-  return output;
-}
-#endif
-
-#ifndef STBI_NO_HDR
-#define stbi__float2int(x) ((int)(x))
-static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) {
-  int i, k, n;
-  stbi_uc *output;
-  if (!data)
-    return NULL;
-  output = (stbi_uc *)stbi__malloc_mad3(x, y, comp, 0);
-  if (output == NULL) {
-    STBI_FREE(data);
-    return stbi__errpuc("outofmem", "Out of memory");
-  }
-  // compute number of non-alpha components
-  if (comp & 1)
-    n = comp;
-  else
-    n = comp - 1;
-  for (i = 0; i < x * y; ++i) {
-    for (k = 0; k < n; ++k) {
-      float z = (float)pow(data[i * comp + k] * stbi__h2l_scale_i,
-                           stbi__h2l_gamma_i) *
-                    255 +
-                0.5f;
-      if (z < 0)
-        z = 0;
-      if (z > 255)
-        z = 255;
-      output[i * comp + k] = (stbi_uc)stbi__float2int(z);
-    }
-    if (k < comp) {
-      float z = data[i * comp + k] * 255 + 0.5f;
-      if (z < 0)
-        z = 0;
-      if (z > 255)
-        z = 255;
-      output[i * comp + k] = (stbi_uc)stbi__float2int(z);
-    }
-  }
-  STBI_FREE(data);
-  return output;
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////////
-//
-//  "baseline" JPEG/JFIF decoder
-//
-//    simple implementation
-//      - doesn't support delayed output of y-dimension
-//      - simple interface (only one output format: 8-bit interleaved RGB)
-//      - doesn't try to recover corrupt jpegs
-//      - doesn't allow partial loading, loading multiple at once
-//      - still fast on x86 (copying globals into locals doesn't help x86)
-//      - allocates lots of intermediate memory (full size of all components)
-//        - non-interleaved case requires this anyway
-//        - allows good upsampling (see next)
-//    high-quality
-//      - upsampled channels are bilinearly interpolated, even across blocks
-//      - quality integer IDCT derived from IJG's 'slow'
-//    performance
-//      - fast huffman; reasonable integer IDCT
-//      - some SIMD kernels for common paths on targets with SSE2/NEON
-//      - uses a lot of intermediate memory, could cache poorly
-
-#ifndef STBI_NO_JPEG
-
-// huffman decoding acceleration
-#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache
-
-typedef struct {
-  stbi_uc fast[1 << FAST_BITS];
-  // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
-  stbi__uint16 code[256];
-  stbi_uc values[256];
-  stbi_uc size[257];
-  unsigned int maxcode[18];
-  int delta[17]; // old 'firstsymbol' - old 'firstcode'
-} stbi__huffman;
-
-typedef struct {
-  stbi__context *s;
-  stbi__huffman huff_dc[4];
-  stbi__huffman huff_ac[4];
-  stbi__uint16 dequant[4][64];
-  stbi__int16 fast_ac[4][1 << FAST_BITS];
-
-  // sizes for components, interleaved MCUs
-  int img_h_max, img_v_max;
-  int img_mcu_x, img_mcu_y;
-  int img_mcu_w, img_mcu_h;
-
-  // definition of jpeg image component
-  struct {
-    int id;
-    int h, v;
-    int tq;
-    int hd, ha;
-    int dc_pred;
-
-    int x, y, w2, h2;
-    stbi_uc *data;
-    void *raw_data, *raw_coeff;
-    stbi_uc *linebuf;
-    short *coeff;         // progressive only
-    int coeff_w, coeff_h; // number of 8x8 coefficient blocks
-  } img_comp[4];
-
-  stbi__uint32 code_buffer; // jpeg entropy-coded buffer
-  int code_bits;            // number of valid bits
-  unsigned char marker;     // marker seen while filling entropy buffer
-  int nomore;               // flag if we saw a marker so must stop
-
-  int progressive;
-  int spec_start;
-  int spec_end;
-  int succ_high;
-  int succ_low;
-  int eob_run;
-  int jfif;
-  int app14_color_transform; // Adobe APP14 tag
-  int rgb;
-
-  int scan_n, order[4];
-  int restart_interval, todo;
-
-  // kernels
-  void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
-  void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y,
-                              const stbi_uc *pcb, const stbi_uc *pcr, int count,
-                              int step);
-  stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near,
-                                       stbi_uc *in_far, int w, int hs);
-} stbi__jpeg;
-
-static int stbi__build_huffman(stbi__huffman *h, int *count) {
-  int i, j, k = 0;
-  unsigned int code;
-  // build size list for each symbol (from JPEG spec)
-  for (i = 0; i < 16; ++i)
-    for (j = 0; j < count[i]; ++j)
-      h->size[k++] = (stbi_uc)(i + 1);
-  h->size[k] = 0;
-
-  // compute actual symbols (from jpeg spec)
-  code = 0;
-  k = 0;
-  for (j = 1; j <= 16; ++j) {
-    // compute delta to add to code to compute symbol id
-    h->delta[j] = k - code;
-    if (h->size[k] == j) {
-      while (h->size[k] == j)
-        h->code[k++] = (stbi__uint16)(code++);
-      if (code - 1 >= (1u << j))
-        return stbi__err("bad code lengths", "Corrupt JPEG");
-    }
-    // compute largest code + 1 for this size, preshifted as needed later
-    h->maxcode[j] = code << (16 - j);
-    code <<= 1;
-  }
-  h->maxcode[j] = 0xffffffff;
-
-  // build non-spec acceleration table; 255 is flag for not-accelerated
-  memset(h->fast, 255, 1 << FAST_BITS);
-  for (i = 0; i < k; ++i) {
-    int s = h->size[i];
-    if (s <= FAST_BITS) {
-      int c = h->code[i] << (FAST_BITS - s);
-      int m = 1 << (FAST_BITS - s);
-      for (j = 0; j < m; ++j) {
-        h->fast[c + j] = (stbi_uc)i;
-      }
-    }
-  }
-  return 1;
-}
-
-// build a table that decodes both magnitude and value of small ACs in
-// one go.
-static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) {
-  int i;
-  for (i = 0; i < (1 << FAST_BITS); ++i) {
-    stbi_uc fast = h->fast[i];
-    fast_ac[i] = 0;
-    if (fast < 255) {
-      int rs = h->values[fast];
-      int run = (rs >> 4) & 15;
-      int magbits = rs & 15;
-      int len = h->size[fast];
-
-      if (magbits && len + magbits <= FAST_BITS) {
-        // magnitude code followed by receive_extend code
-        int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
-        int m = 1 << (magbits - 1);
-        if (k < m)
-          k += (~0U << magbits) + 1;
-        // if the result is small enough, we can fit it in fast_ac table
-        if (k >= -128 && k <= 127)
-          fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits));
-      }
-    }
-  }
-}
-
-static void stbi__grow_buffer_unsafe(stbi__jpeg *j) {
-  do {
-    unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
-    if (b == 0xff) {
-      int c = stbi__get8(j->s);
-      while (c == 0xff)
-        c = stbi__get8(j->s); // consume fill bytes
-      if (c != 0) {
-        j->marker = (unsigned char)c;
-        j->nomore = 1;
-        return;
-      }
-    }
-    j->code_buffer |= b << (24 - j->code_bits);
-    j->code_bits += 8;
-  } while (j->code_bits <= 24);
-}
-
-// (1 << n) - 1
-static const stbi__uint32 stbi__bmask[17] = {
-    0,   1,    3,    7,    15,   31,    63,    127,  255,
-    511, 1023, 2047, 4095, 8191, 16383, 32767, 65535};
-
-// decode a jpeg huffman value from the bitstream
-stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) {
-  unsigned int temp;
-  int c, k;
-
-  if (j->code_bits < 16)
-    stbi__grow_buffer_unsafe(j);
-
-  // look at the top FAST_BITS and determine what symbol ID it is,
-  // if the code is <= FAST_BITS
-  c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
-  k = h->fast[c];
-  if (k < 255) {
-    int s = h->size[k];
-    if (s > j->code_bits)
-      return -1;
-    j->code_buffer <<= s;
-    j->code_bits -= s;
-    return h->values[k];
-  }
-
-  // naive test is to shift the code_buffer down so k bits are
-  // valid, then test against maxcode. To speed this up, we've
-  // preshifted maxcode left so that it has (16-k) 0s at the
-  // end; in other words, regardless of the number of bits, it
-  // wants to be compared against something shifted to have 16;
-  // that way we don't need to shift inside the loop.
-  temp = j->code_buffer >> 16;
-  for (k = FAST_BITS + 1;; ++k)
-    if (temp < h->maxcode[k])
-      break;
-  if (k == 17) {
-    // error! code not found
-    j->code_bits -= 16;
-    return -1;
-  }
-
-  if (k > j->code_bits)
-    return -1;
-
-  // convert the huffman code to the symbol id
-  c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
-  STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) &
-               stbi__bmask[h->size[c]]) == h->code[c]);
-
-  // convert the id to a symbol
-  j->code_bits -= k;
-  j->code_buffer <<= k;
-  return h->values[c];
-}
-
-// bias[n] = (-1<<n) + 1
-static const int stbi__jbias[16] = {0,     -1,    -3,     -7,    -15,   -31,
-                                    -63,   -127,  -255,   -511,  -1023, -2047,
-                                    -4095, -8191, -16383, -32767};
-
-// combined JPEG 'receive' and JPEG 'extend', since baseline
-// always extends everything it receives.
-stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n) {
-  unsigned int k;
-  int sgn;
-  if (j->code_bits < n)
-    stbi__grow_buffer_unsafe(j);
-
-  sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
-  k = stbi_lrot(j->code_buffer, n);
-  STBI_ASSERT(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
-  j->code_buffer = k & ~stbi__bmask[n];
-  k &= stbi__bmask[n];
-  j->code_bits -= n;
-  return k + (stbi__jbias[n] & ~sgn);
-}
-
-// get some unsigned bits
-stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n) {
-  unsigned int k;
-  if (j->code_bits < n)
-    stbi__grow_buffer_unsafe(j);
-  k = stbi_lrot(j->code_buffer, n);
-  j->code_buffer = k & ~stbi__bmask[n];
-  k &= stbi__bmask[n];
-  j->code_bits -= n;
-  return k;
-}
-
-stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j) {
-  unsigned int k;
-  if (j->code_bits < 1)
-    stbi__grow_buffer_unsafe(j);
-  k = j->code_buffer;
-  j->code_buffer <<= 1;
-  --j->code_bits;
-  return k & 0x80000000;
-}
-
-// given a value that's at position X in the zigzag stream,
-// where does it appear in the 8x8 matrix coded as row-major?
-static const stbi_uc stbi__jpeg_dezigzag[64 + 15] = {
-    0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40,
-    48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36,
-    29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61,
-    54, 47, 55, 62, 63,
-    // let corrupt input sample past end
-    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63};
-
-// decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64],
-                                   stbi__huffman *hdc, stbi__huffman *hac,
-                                   stbi__int16 *fac, int b,
-                                   stbi__uint16 *dequant) {
-  int diff, dc, k;
-  int t;
-
-  if (j->code_bits < 16)
-    stbi__grow_buffer_unsafe(j);
-  t = stbi__jpeg_huff_decode(j, hdc);
-  if (t < 0)
-    return stbi__err("bad huffman code", "Corrupt JPEG");
-
-  // 0 all the ac values now so we can do it 32-bits at a time
-  memset(data, 0, 64 * sizeof(data[0]));
-
-  diff = t ? stbi__extend_receive(j, t) : 0;
-  dc = j->img_comp[b].dc_pred + diff;
-  j->img_comp[b].dc_pred = dc;
-  data[0] = (short)(dc * dequant[0]);
-
-  // decode AC components, see JPEG spec
-  k = 1;
-  do {
-    unsigned int zig;
-    int c, r, s;
-    if (j->code_bits < 16)
-      stbi__grow_buffer_unsafe(j);
-    c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
-    r = fac[c];
-    if (r) {              // fast-AC path
-      k += (r >> 4) & 15; // run
-      s = r & 15;         // combined length
-      j->code_buffer <<= s;
-      j->code_bits -= s;
-      // decode into unzigzag'd location
-      zig = stbi__jpeg_dezigzag[k++];
-      data[zig] = (short)((r >> 8) * dequant[zig]);
-    } else {
-      int rs = stbi__jpeg_huff_decode(j, hac);
-      if (rs < 0)
-        return stbi__err("bad huffman code", "Corrupt JPEG");
-      s = rs & 15;
-      r = rs >> 4;
-      if (s == 0) {
-        if (rs != 0xf0)
-          break; // end block
-        k += 16;
-      } else {
-        k += r;
-        // decode into unzigzag'd location
-        zig = stbi__jpeg_dezigzag[k++];
-        data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]);
-      }
-    }
-  } while (k < 64);
-  return 1;
-}
-
-static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64],
-                                           stbi__huffman *hdc, int b) {
-  int diff, dc;
-  int t;
-  if (j->spec_end != 0)
-    return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-
-  if (j->code_bits < 16)
-    stbi__grow_buffer_unsafe(j);
-
-  if (j->succ_high == 0) {
-    // first scan for DC coefficient, must be first
-    memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
-    t = stbi__jpeg_huff_decode(j, hdc);
-    diff = t ? stbi__extend_receive(j, t) : 0;
-
-    dc = j->img_comp[b].dc_pred + diff;
-    j->img_comp[b].dc_pred = dc;
-    data[0] = (short)(dc << j->succ_low);
-  } else {
-    // refinement scan for DC coefficient
-    if (stbi__jpeg_get_bit(j))
-      data[0] += (short)(1 << j->succ_low);
-  }
-  return 1;
-}
-
-// @OPTIMIZE: store non-zigzagged during the decode passes,
-// and only de-zigzag when dequantizing
-static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64],
-                                           stbi__huffman *hac,
-                                           stbi__int16 *fac) {
-  int k;
-  if (j->spec_start == 0)
-    return stbi__err("can't merge dc and ac", "Corrupt JPEG");
-
-  if (j->succ_high == 0) {
-    int shift = j->succ_low;
-
-    if (j->eob_run) {
-      --j->eob_run;
-      return 1;
-    }
-
-    k = j->spec_start;
-    do {
-      unsigned int zig;
-      int c, r, s;
-      if (j->code_bits < 16)
-        stbi__grow_buffer_unsafe(j);
-      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
-      r = fac[c];
-      if (r) {              // fast-AC path
-        k += (r >> 4) & 15; // run
-        s = r & 15;         // combined length
-        j->code_buffer <<= s;
-        j->code_bits -= s;
-        zig = stbi__jpeg_dezigzag[k++];
-        data[zig] = (short)((r >> 8) << shift);
-      } else {
-        int rs = stbi__jpeg_huff_decode(j, hac);
-        if (rs < 0)
-          return stbi__err("bad huffman code", "Corrupt JPEG");
-        s = rs & 15;
-        r = rs >> 4;
-        if (s == 0) {
-          if (r < 15) {
-            j->eob_run = (1 << r);
-            if (r)
-              j->eob_run += stbi__jpeg_get_bits(j, r);
-            --j->eob_run;
-            break;
-          }
-          k += 16;
-        } else {
-          k += r;
-          zig = stbi__jpeg_dezigzag[k++];
-          data[zig] = (short)(stbi__extend_receive(j, s) << shift);
-        }
-      }
-    } while (k <= j->spec_end);
-  } else {
-    // refinement scan for these AC coefficients
-
-    short bit = (short)(1 << j->succ_low);
-
-    if (j->eob_run) {
-      --j->eob_run;
-      for (k = j->spec_start; k <= j->spec_end; ++k) {
-        short *p = &data[stbi__jpeg_dezigzag[k]];
-        if (*p != 0)
-          if (stbi__jpeg_get_bit(j))
-            if ((*p & bit) == 0) {
-              if (*p > 0)
-                *p += bit;
-              else
-                *p -= bit;
-            }
-      }
-    } else {
-      k = j->spec_start;
-      do {
-        int r, s;
-        int rs = stbi__jpeg_huff_decode(
-            j, hac); // @OPTIMIZE see if we can use the fast path here,
-                     // advance-by-r is so slow, eh
-        if (rs < 0)
-          return stbi__err("bad huffman code", "Corrupt JPEG");
-        s = rs & 15;
-        r = rs >> 4;
-        if (s == 0) {
-          if (r < 15) {
-            j->eob_run = (1 << r) - 1;
-            if (r)
-              j->eob_run += stbi__jpeg_get_bits(j, r);
-            r = 64; // force end of block
-          } else {
-            // r=15 s=0 should write 16 0s, so we just do
-            // a run of 15 0s and then write s (which is 0),
-            // so we don't have to do anything special here
-          }
-        } else {
-          if (s != 1)
-            return stbi__err("bad huffman code", "Corrupt JPEG");
-          // sign bit
-          if (stbi__jpeg_get_bit(j))
-            s = bit;
-          else
-            s = -bit;
-        }
-
-        // advance by r
-        while (k <= j->spec_end) {
-          short *p = &data[stbi__jpeg_dezigzag[k++]];
-          if (*p != 0) {
-            if (stbi__jpeg_get_bit(j))
-              if ((*p & bit) == 0) {
-                if (*p > 0)
-                  *p += bit;
-                else
-                  *p -= bit;
-              }
-          } else {
-            if (r == 0) {
-              *p = (short)s;
-              break;
-            }
-            --r;
-          }
-        }
-      } while (k <= j->spec_end);
-    }
-  }
-  return 1;
-}
-
-// take a -128..127 value and stbi__clamp it and convert to 0..255
-stbi_inline static stbi_uc stbi__clamp(int x) {
-  // trick to use a single test to catch both cases
-  if ((unsigned int)x > 255) {
-    if (x < 0)
-      return 0;
-    if (x > 255)
-      return 255;
-  }
-  return (stbi_uc)x;
-}
-
-#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
-#define stbi__fsh(x) ((x)*4096)
-
-// derived from jidctint -- DCT_ISLOW
-#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7)                          \
-  int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3;                      \
-  p2 = s2;                                                                     \
-  p3 = s6;                                                                     \
-  p1 = (p2 + p3) * stbi__f2f(0.5411961f);                                      \
-  t2 = p1 + p3 * stbi__f2f(-1.847759065f);                                     \
-  t3 = p1 + p2 * stbi__f2f(0.765366865f);                                      \
-  p2 = s0;                                                                     \
-  p3 = s4;                                                                     \
-  t0 = stbi__fsh(p2 + p3);                                                     \
-  t1 = stbi__fsh(p2 - p3);                                                     \
-  x0 = t0 + t3;                                                                \
-  x3 = t0 - t3;                                                                \
-  x1 = t1 + t2;                                                                \
-  x2 = t1 - t2;                                                                \
-  t0 = s7;                                                                     \
-  t1 = s5;                                                                     \
-  t2 = s3;                                                                     \
-  t3 = s1;                                                                     \
-  p3 = t0 + t2;                                                                \
-  p4 = t1 + t3;                                                                \
-  p1 = t0 + t3;                                                                \
-  p2 = t1 + t2;                                                                \
-  p5 = (p3 + p4) * stbi__f2f(1.175875602f);                                    \
-  t0 = t0 * stbi__f2f(0.298631336f);                                           \
-  t1 = t1 * stbi__f2f(2.053119869f);                                           \
-  t2 = t2 * stbi__f2f(3.072711026f);                                           \
-  t3 = t3 * stbi__f2f(1.501321110f);                                           \
-  p1 = p5 + p1 * stbi__f2f(-0.899976223f);                                     \
-  p2 = p5 + p2 * stbi__f2f(-2.562915447f);                                     \
-  p3 = p3 * stbi__f2f(-1.961570560f);                                          \
-  p4 = p4 * stbi__f2f(-0.390180644f);                                          \
-  t3 += p1 + p4;                                                               \
-  t2 += p2 + p3;                                                               \
-  t1 += p2 + p4;                                                               \
-  t0 += p1 + p3;
-
-static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
-  int i, val[64], *v = val;
-  stbi_uc *o;
-  short *d = data;
-
-  // columns
-  for (i = 0; i < 8; ++i, ++d, ++v) {
-    // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-    if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 &&
-        d[48] == 0 && d[56] == 0) {
-      //    no shortcut                 0     seconds
-      //    (1|2|3|4|5|6|7)==0          0     seconds
-      //    all separate               -0.047 seconds
-      //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-      int dcterm = d[0] * 4;
-      v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
-    } else {
-      STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
-      // constants scaled things up by 1<<12; let's bring them back
-      // down, but keep 2 extra bits of precision
-      x0 += 512;
-      x1 += 512;
-      x2 += 512;
-      x3 += 512;
-      v[0] = (x0 + t3) >> 10;
-      v[56] = (x0 - t3) >> 10;
-      v[8] = (x1 + t2) >> 10;
-      v[48] = (x1 - t2) >> 10;
-      v[16] = (x2 + t1) >> 10;
-      v[40] = (x2 - t1) >> 10;
-      v[24] = (x3 + t0) >> 10;
-      v[32] = (x3 - t0) >> 10;
-    }
-  }
-
-  for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
-    // no fast case since the first 1D IDCT spread components out
-    STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
-    // constants scaled things up by 1<<12, plus we had 1<<2 from first
-    // loop, plus horizontal and vertical each scale by sqrt(8) so together
-    // we've got an extra 1<<3, so 1<<17 total we need to remove.
-    // so we want to round that, which means adding 0.5 * 1<<17,
-    // aka 65536. Also, we'll end up with -128 to 127 that we want
-    // to encode as 0..255 by adding 128, so we'll add that before the shift
-    x0 += 65536 + (128 << 17);
-    x1 += 65536 + (128 << 17);
-    x2 += 65536 + (128 << 17);
-    x3 += 65536 + (128 << 17);
-    // tried computing the shifts into temps, or'ing the temps to see
-    // if any were out of range, but that was slower
-    o[0] = stbi__clamp((x0 + t3) >> 17);
-    o[7] = stbi__clamp((x0 - t3) >> 17);
-    o[1] = stbi__clamp((x1 + t2) >> 17);
-    o[6] = stbi__clamp((x1 - t2) >> 17);
-    o[2] = stbi__clamp((x2 + t1) >> 17);
-    o[5] = stbi__clamp((x2 - t1) >> 17);
-    o[3] = stbi__clamp((x3 + t0) >> 17);
-    o[4] = stbi__clamp((x3 - t0) >> 17);
-  }
-}
-
-#ifdef STBI_SSE2
-// sse2 integer IDCT. not the fastest possible implementation but it
-// produces bit-identical results to the generic C version so it's
-// fully "transparent".
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
-  // This is constructed to match our regular (generic) integer IDCT exactly.
-  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
-  __m128i tmp;
-
-// dot product constant: even elems=x, odd elems=y
-#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
-
-// out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
-// out(1) = c1[even]*x + c1[odd]*y
-#define dct_rot(out0, out1, x, y, c0, c1)                                      \
-  __m128i c0##lo = _mm_unpacklo_epi16((x), (y));                               \
-  __m128i c0##hi = _mm_unpackhi_epi16((x), (y));                               \
-  __m128i out0##_l = _mm_madd_epi16(c0##lo, c0);                               \
-  __m128i out0##_h = _mm_madd_epi16(c0##hi, c0);                               \
-  __m128i out1##_l = _mm_madd_epi16(c0##lo, c1);                               \
-  __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
-
-// out = in << 12  (in 16-bit, out 32-bit)
-#define dct_widen(out, in)                                                     \
-  __m128i out##_l =                                                            \
-      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4);        \
-  __m128i out##_h =                                                            \
-      _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
-
-// wide add
-#define dct_wadd(out, a, b)                                                    \
-  __m128i out##_l = _mm_add_epi32(a##_l, b##_l);                               \
-  __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
-
-// wide sub
-#define dct_wsub(out, a, b)                                                    \
-  __m128i out##_l = _mm_sub_epi32(a##_l, b##_l);                               \
-  __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
-
-// butterfly a/b, add bias, then shift by "s" and pack
-#define dct_bfly32o(out0, out1, a, b, bias, s)                                 \
-  {                                                                            \
-    __m128i abiased_l = _mm_add_epi32(a##_l, bias);                            \
-    __m128i abiased_h = _mm_add_epi32(a##_h, bias);                            \
-    dct_wadd(sum, abiased, b);                                                 \
-    dct_wsub(dif, abiased, b);                                                 \
-    out0 =                                                                     \
-        _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s));   \
-    out1 =                                                                     \
-        _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s));   \
-  }
-
-// 8-bit interleave step (for transposes)
-#define dct_interleave8(a, b)                                                  \
-  tmp = a;                                                                     \
-  a = _mm_unpacklo_epi8(a, b);                                                 \
-  b = _mm_unpackhi_epi8(tmp, b)
-
-// 16-bit interleave step (for transposes)
-#define dct_interleave16(a, b)                                                 \
-  tmp = a;                                                                     \
-  a = _mm_unpacklo_epi16(a, b);                                                \
-  b = _mm_unpackhi_epi16(tmp, b)
-
-#define dct_pass(bias, shift)                                                  \
-  {                                                                            \
-    /* even part */                                                            \
-    dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1);                             \
-    __m128i sum04 = _mm_add_epi16(row0, row4);                                 \
-    __m128i dif04 = _mm_sub_epi16(row0, row4);                                 \
-    dct_widen(t0e, sum04);                                                     \
-    dct_widen(t1e, dif04);                                                     \
-    dct_wadd(x0, t0e, t3e);                                                    \
-    dct_wsub(x3, t0e, t3e);                                                    \
-    dct_wadd(x1, t1e, t2e);                                                    \
-    dct_wsub(x2, t1e, t2e);                                                    \
-    /* odd part */                                                             \
-    dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1);                             \
-    dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1);                             \
-    __m128i sum17 = _mm_add_epi16(row1, row7);                                 \
-    __m128i sum35 = _mm_add_epi16(row3, row5);                                 \
-    dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1);                           \
-    dct_wadd(x4, y0o, y4o);                                                    \
-    dct_wadd(x5, y1o, y5o);                                                    \
-    dct_wadd(x6, y2o, y5o);                                                    \
-    dct_wadd(x7, y3o, y4o);                                                    \
-    dct_bfly32o(row0, row7, x0, x7, bias, shift);                              \
-    dct_bfly32o(row1, row6, x1, x6, bias, shift);                              \
-    dct_bfly32o(row2, row5, x2, x5, bias, shift);                              \
-    dct_bfly32o(row3, row4, x3, x4, bias, shift);                              \
-  }
-
-  __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f),
-                             stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
-  __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f),
-                             stbi__f2f(0.5411961f));
-  __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f),
-                             stbi__f2f(1.175875602f));
-  __m128i rot1_1 =
-      dct_const(stbi__f2f(1.175875602f),
-                stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
-  __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f),
-                             stbi__f2f(-1.961570560f));
-  __m128i rot2_1 =
-      dct_const(stbi__f2f(-1.961570560f),
-                stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
-  __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f),
-                             stbi__f2f(-0.390180644f));
-  __m128i rot3_1 =
-      dct_const(stbi__f2f(-0.390180644f),
-                stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
-
-  // rounding biases in column/row passes, see stbi__idct_block for explanation.
-  __m128i bias_0 = _mm_set1_epi32(512);
-  __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
-
-  // load
-  row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
-  row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
-  row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
-  row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
-  row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
-  row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
-  row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
-  row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
-
-  // column pass
-  dct_pass(bias_0, 10);
-
-  {
-    // 16bit 8x8 transpose pass 1
-    dct_interleave16(row0, row4);
-    dct_interleave16(row1, row5);
-    dct_interleave16(row2, row6);
-    dct_interleave16(row3, row7);
-
-    // transpose pass 2
-    dct_interleave16(row0, row2);
-    dct_interleave16(row1, row3);
-    dct_interleave16(row4, row6);
-    dct_interleave16(row5, row7);
-
-    // transpose pass 3
-    dct_interleave16(row0, row1);
-    dct_interleave16(row2, row3);
-    dct_interleave16(row4, row5);
-    dct_interleave16(row6, row7);
-  }
-
-  // row pass
-  dct_pass(bias_1, 17);
-
-  {
-    // pack
-    __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
-    __m128i p1 = _mm_packus_epi16(row2, row3);
-    __m128i p2 = _mm_packus_epi16(row4, row5);
-    __m128i p3 = _mm_packus_epi16(row6, row7);
-
-    // 8bit 8x8 transpose pass 1
-    dct_interleave8(p0, p2); // a0e0a1e1...
-    dct_interleave8(p1, p3); // c0g0c1g1...
-
-    // transpose pass 2
-    dct_interleave8(p0, p1); // a0c0e0g0...
-    dct_interleave8(p2, p3); // b0d0f0h0...
-
-    // transpose pass 3
-    dct_interleave8(p0, p2); // a0b0c0d0...
-    dct_interleave8(p1, p3); // a4b4c4d4...
-
-    // store
-    _mm_storel_epi64((__m128i *)out, p0);
-    out += out_stride;
-    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
-    out += out_stride;
-    _mm_storel_epi64((__m128i *)out, p2);
-    out += out_stride;
-    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
-    out += out_stride;
-    _mm_storel_epi64((__m128i *)out, p1);
-    out += out_stride;
-    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
-    out += out_stride;
-    _mm_storel_epi64((__m128i *)out, p3);
-    out += out_stride;
-    _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
-  }
-
-#undef dct_const
-#undef dct_rot
-#undef dct_widen
-#undef dct_wadd
-#undef dct_wsub
-#undef dct_bfly32o
-#undef dct_interleave8
-#undef dct_interleave16
-#undef dct_pass
-}
-
-#endif // STBI_SSE2
-
-#ifdef STBI_NEON
-
-// NEON integer IDCT. should produce bit-identical
-// results to the generic C version.
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
-  int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
-
-  int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
-  int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
-  int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
-  int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
-  int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
-  int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
-  int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
-  int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
-  int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
-  int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
-  int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
-  int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
-
-#define dct_long_mul(out, inq, coeff)                                          \
-  int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff);                     \
-  int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
-
-#define dct_long_mac(out, acc, inq, coeff)                                     \
-  int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff);            \
-  int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
-
-#define dct_widen(out, inq)                                                    \
-  int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12);                      \
-  int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
-
-// wide add
-#define dct_wadd(out, a, b)                                                    \
-  int32x4_t out##_l = vaddq_s32(a##_l, b##_l);                                 \
-  int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
-
-// wide sub
-#define dct_wsub(out, a, b)                                                    \
-  int32x4_t out##_l = vsubq_s32(a##_l, b##_l);                                 \
-  int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
-
-// butterfly a/b, then shift using "shiftop" by "s" and pack
-#define dct_bfly32o(out0, out1, a, b, shiftop, s)                              \
-  {                                                                            \
-    dct_wadd(sum, a, b);                                                       \
-    dct_wsub(dif, a, b);                                                       \
-    out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s));                 \
-    out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s));                 \
-  }
-
-#define dct_pass(shiftop, shift)                                               \
-  {                                                                            \
-    /* even part */                                                            \
-    int16x8_t sum26 = vaddq_s16(row2, row6);                                   \
-    dct_long_mul(p1e, sum26, rot0_0);                                          \
-    dct_long_mac(t2e, p1e, row6, rot0_1);                                      \
-    dct_long_mac(t3e, p1e, row2, rot0_2);                                      \
-    int16x8_t sum04 = vaddq_s16(row0, row4);                                   \
-    int16x8_t dif04 = vsubq_s16(row0, row4);                                   \
-    dct_widen(t0e, sum04);                                                     \
-    dct_widen(t1e, dif04);                                                     \
-    dct_wadd(x0, t0e, t3e);                                                    \
-    dct_wsub(x3, t0e, t3e);                                                    \
-    dct_wadd(x1, t1e, t2e);                                                    \
-    dct_wsub(x2, t1e, t2e);                                                    \
-    /* odd part */                                                             \
-    int16x8_t sum15 = vaddq_s16(row1, row5);                                   \
-    int16x8_t sum17 = vaddq_s16(row1, row7);                                   \
-    int16x8_t sum35 = vaddq_s16(row3, row5);                                   \
-    int16x8_t sum37 = vaddq_s16(row3, row7);                                   \
-    int16x8_t sumodd = vaddq_s16(sum17, sum35);                                \
-    dct_long_mul(p5o, sumodd, rot1_0);                                         \
-    dct_long_mac(p1o, p5o, sum17, rot1_1);                                     \
-    dct_long_mac(p2o, p5o, sum35, rot1_2);                                     \
-    dct_long_mul(p3o, sum37, rot2_0);                                          \
-    dct_long_mul(p4o, sum15, rot2_1);                                          \
-    dct_wadd(sump13o, p1o, p3o);                                               \
-    dct_wadd(sump24o, p2o, p4o);                                               \
-    dct_wadd(sump23o, p2o, p3o);                                               \
-    dct_wadd(sump14o, p1o, p4o);                                               \
-    dct_long_mac(x4, sump13o, row7, rot3_0);                                   \
-    dct_long_mac(x5, sump24o, row5, rot3_1);                                   \
-    dct_long_mac(x6, sump23o, row3, rot3_2);                                   \
-    dct_long_mac(x7, sump14o, row1, rot3_3);                                   \
-    dct_bfly32o(row0, row7, x0, x7, shiftop, shift);                           \
-    dct_bfly32o(row1, row6, x1, x6, shiftop, shift);                           \
-    dct_bfly32o(row2, row5, x2, x5, shiftop, shift);                           \
-    dct_bfly32o(row3, row4, x3, x4, shiftop, shift);                           \
-  }
-
-  // load
-  row0 = vld1q_s16(data + 0 * 8);
-  row1 = vld1q_s16(data + 1 * 8);
-  row2 = vld1q_s16(data + 2 * 8);
-  row3 = vld1q_s16(data + 3 * 8);
-  row4 = vld1q_s16(data + 4 * 8);
-  row5 = vld1q_s16(data + 5 * 8);
-  row6 = vld1q_s16(data + 6 * 8);
-  row7 = vld1q_s16(data + 7 * 8);
-
-  // add DC bias
-  row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
-
-  // column pass
-  dct_pass(vrshrn_n_s32, 10);
-
-  // 16bit 8x8 transpose
-  {
-// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
-// whether compilers actually get this is another story, sadly.
-#define dct_trn16(x, y)                                                        \
-  {                                                                            \
-    int16x8x2_t t = vtrnq_s16(x, y);                                           \
-    x = t.val[0];                                                              \
-    y = t.val[1];                                                              \
-  }
-#define dct_trn32(x, y)                                                        \
-  {                                                                            \
-    int32x4x2_t t =                                                            \
-        vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y));         \
-    x = vreinterpretq_s16_s32(t.val[0]);                                       \
-    y = vreinterpretq_s16_s32(t.val[1]);                                       \
-  }
-#define dct_trn64(x, y)                                                        \
-  {                                                                            \
-    int16x8_t x0 = x;                                                          \
-    int16x8_t y0 = y;                                                          \
-    x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0));                      \
-    y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0));                    \
-  }
-
-    // pass 1
-    dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
-    dct_trn16(row2, row3);
-    dct_trn16(row4, row5);
-    dct_trn16(row6, row7);
-
-    // pass 2
-    dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
-    dct_trn32(row1, row3);
-    dct_trn32(row4, row6);
-    dct_trn32(row5, row7);
-
-    // pass 3
-    dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
-    dct_trn64(row1, row5);
-    dct_trn64(row2, row6);
-    dct_trn64(row3, row7);
-
-#undef dct_trn16
-#undef dct_trn32
-#undef dct_trn64
-  }
-
-  // row pass
-  // vrshrn_n_s32 only supports shifts up to 16, we need
-  // 17. so do a non-rounding shift of 16 first then follow
-  // up with a rounding shift by 1.
-  dct_pass(vshrn_n_s32, 16);
-
-  {
-    // pack and round
-    uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
-    uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
-    uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
-    uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
-    uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
-    uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
-    uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
-    uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
-
-    // again, these can translate into one instruction, but often don't.
-#define dct_trn8_8(x, y)                                                       \
-  {                                                                            \
-    uint8x8x2_t t = vtrn_u8(x, y);                                             \
-    x = t.val[0];                                                              \
-    y = t.val[1];                                                              \
-  }
-#define dct_trn8_16(x, y)                                                      \
-  {                                                                            \
-    uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \
-    x = vreinterpret_u8_u16(t.val[0]);                                         \
-    y = vreinterpret_u8_u16(t.val[1]);                                         \
-  }
-#define dct_trn8_32(x, y)                                                      \
-  {                                                                            \
-    uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \
-    x = vreinterpret_u8_u32(t.val[0]);                                         \
-    y = vreinterpret_u8_u32(t.val[1]);                                         \
-  }
-
-    // sadly can't use interleaved stores here since we only write
-    // 8 bytes to each scan line!
-
-    // 8x8 8-bit transpose pass 1
-    dct_trn8_8(p0, p1);
-    dct_trn8_8(p2, p3);
-    dct_trn8_8(p4, p5);
-    dct_trn8_8(p6, p7);
-
-    // pass 2
-    dct_trn8_16(p0, p2);
-    dct_trn8_16(p1, p3);
-    dct_trn8_16(p4, p6);
-    dct_trn8_16(p5, p7);
-
-    // pass 3
-    dct_trn8_32(p0, p4);
-    dct_trn8_32(p1, p5);
-    dct_trn8_32(p2, p6);
-    dct_trn8_32(p3, p7);
-
-    // store
-    vst1_u8(out, p0);
-    out += out_stride;
-    vst1_u8(out, p1);
-    out += out_stride;
-    vst1_u8(out, p2);
-    out += out_stride;
-    vst1_u8(out, p3);
-    out += out_stride;
-    vst1_u8(out, p4);
-    out += out_stride;
-    vst1_u8(out, p5);
-    out += out_stride;
-    vst1_u8(out, p6);
-    out += out_stride;
-    vst1_u8(out, p7);
-
-#undef dct_trn8_8
-#undef dct_trn8_16
-#undef dct_trn8_32
-  }
-
-#undef dct_long_mul
-#undef dct_long_mac
-#undef dct_widen
-#undef dct_wadd
-#undef dct_wsub
-#undef dct_bfly32o
-#undef dct_pass
-}
-
-#endif // STBI_NEON
-
-#define STBI__MARKER_none 0xff
-// if there's a pending marker from the entropy stream, return that
-// otherwise, fetch from the stream and get a marker. if there's no
-// marker, return 0xff, which is never a valid marker value
-static stbi_uc stbi__get_marker(stbi__jpeg *j) {
-  stbi_uc x;
-  if (j->marker != STBI__MARKER_none) {
-    x = j->marker;
-    j->marker = STBI__MARKER_none;
-    return x;
-  }
-  x = stbi__get8(j->s);
-  if (x != 0xff)
-    return STBI__MARKER_none;
-  while (x == 0xff)
-    x = stbi__get8(j->s); // consume repeated 0xff fill bytes
-  return x;
-}
-
-// in each scan, we'll have scan_n components, and the order
-// of the components is specified by order[]
-#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
-
-// after a restart interval, stbi__jpeg_reset the entropy decoder and
-// the dc prediction
-static void stbi__jpeg_reset(stbi__jpeg *j) {
-  j->code_bits = 0;
-  j->code_buffer = 0;
-  j->nomore = 0;
-  j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred =
-      j->img_comp[3].dc_pred = 0;
-  j->marker = STBI__MARKER_none;
-  j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
-  j->eob_run = 0;
-  // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
-  // since we don't even allow 1<<30 pixels
-}
-
-static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
-  stbi__jpeg_reset(z);
-  if (!z->progressive) {
-    if (z->scan_n == 1) {
-      int i, j;
-      STBI_SIMD_ALIGN(short, data[64]);
-      int n = z->order[0];
-      // non-interleaved data, we just need to process one block at a time,
-      // in trivial scanline order
-      // number of blocks to do just depends on how many actual "pixels" this
-      // component has, independent of interleaved MCU blocking and such
-      int w = (z->img_comp[n].x + 7) >> 3;
-      int h = (z->img_comp[n].y + 7) >> 3;
-      for (j = 0; j < h; ++j) {
-        for (i = 0; i < w; ++i) {
-          int ha = z->img_comp[n].ha;
-          if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd,
-                                       z->huff_ac + ha, z->fast_ac[ha], n,
-                                       z->dequant[z->img_comp[n].tq]))
-            return 0;
-          z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 +
-                                   i * 8,
-                               z->img_comp[n].w2, data);
-          // every data block is an MCU, so countdown the restart interval
-          if (--z->todo <= 0) {
-            if (z->code_bits < 24)
-              stbi__grow_buffer_unsafe(z);
-            // if it's NOT a restart, then just bail, so we get corrupt data
-            // rather than no data
-            if (!STBI__RESTART(z->marker))
-              return 1;
-            stbi__jpeg_reset(z);
-          }
-        }
-      }
-      return 1;
-    } else { // interleaved
-      int i, j, k, x, y;
-      STBI_SIMD_ALIGN(short, data[64]);
-      for (j = 0; j < z->img_mcu_y; ++j) {
-        for (i = 0; i < z->img_mcu_x; ++i) {
-          // scan an interleaved mcu... process scan_n components in order
-          for (k = 0; k < z->scan_n; ++k) {
-            int n = z->order[k];
-            // scan out an mcu's worth of this component; that's just determined
-            // by the basic H and V specified for the component
-            for (y = 0; y < z->img_comp[n].v; ++y) {
-              for (x = 0; x < z->img_comp[n].h; ++x) {
-                int x2 = (i * z->img_comp[n].h + x) * 8;
-                int y2 = (j * z->img_comp[n].v + y) * 8;
-                int ha = z->img_comp[n].ha;
-                if (!stbi__jpeg_decode_block(z, data,
-                                             z->huff_dc + z->img_comp[n].hd,
-                                             z->huff_ac + ha, z->fast_ac[ha], n,
-                                             z->dequant[z->img_comp[n].tq]))
-                  return 0;
-                z->idct_block_kernel(z->img_comp[n].data +
-                                         z->img_comp[n].w2 * y2 + x2,
-                                     z->img_comp[n].w2, data);
-              }
-            }
-          }
-          // after all interleaved components, that's an interleaved MCU,
-          // so now count down the restart interval
-          if (--z->todo <= 0) {
-            if (z->code_bits < 24)
-              stbi__grow_buffer_unsafe(z);
-            if (!STBI__RESTART(z->marker))
-              return 1;
-            stbi__jpeg_reset(z);
-          }
-        }
-      }
-      return 1;
-    }
-  } else {
-    if (z->scan_n == 1) {
-      int i, j;
-      int n = z->order[0];
-      // non-interleaved data, we just need to process one block at a time,
-      // in trivial scanline order
-      // number of blocks to do just depends on how many actual "pixels" this
-      // component has, independent of interleaved MCU blocking and such
-      int w = (z->img_comp[n].x + 7) >> 3;
-      int h = (z->img_comp[n].y + 7) >> 3;
-      for (j = 0; j < h; ++j) {
-        for (i = 0; i < w; ++i) {
-          short *data =
-              z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-          if (z->spec_start == 0) {
-            if (!stbi__jpeg_decode_block_prog_dc(
-                    z, data, &z->huff_dc[z->img_comp[n].hd], n))
-              return 0;
-          } else {
-            int ha = z->img_comp[n].ha;
-            if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha],
-                                                 z->fast_ac[ha]))
-              return 0;
-          }
-          // every data block is an MCU, so countdown the restart interval
-          if (--z->todo <= 0) {
-            if (z->code_bits < 24)
-              stbi__grow_buffer_unsafe(z);
-            if (!STBI__RESTART(z->marker))
-              return 1;
-            stbi__jpeg_reset(z);
-          }
-        }
-      }
-      return 1;
-    } else { // interleaved
-      int i, j, k, x, y;
-      for (j = 0; j < z->img_mcu_y; ++j) {
-        for (i = 0; i < z->img_mcu_x; ++i) {
-          // scan an interleaved mcu... process scan_n components in order
-          for (k = 0; k < z->scan_n; ++k) {
-            int n = z->order[k];
-            // scan out an mcu's worth of this component; that's just determined
-            // by the basic H and V specified for the component
-            for (y = 0; y < z->img_comp[n].v; ++y) {
-              for (x = 0; x < z->img_comp[n].h; ++x) {
-                int x2 = (i * z->img_comp[n].h + x);
-                int y2 = (j * z->img_comp[n].v + y);
-                short *data = z->img_comp[n].coeff +
-                              64 * (x2 + y2 * z->img_comp[n].coeff_w);
-                if (!stbi__jpeg_decode_block_prog_dc(
-                        z, data, &z->huff_dc[z->img_comp[n].hd], n))
-                  return 0;
-              }
-            }
-          }
-          // after all interleaved components, that's an interleaved MCU,
-          // so now count down the restart interval
-          if (--z->todo <= 0) {
-            if (z->code_bits < 24)
-              stbi__grow_buffer_unsafe(z);
-            if (!STBI__RESTART(z->marker))
-              return 1;
-            stbi__jpeg_reset(z);
-          }
-        }
-      }
-      return 1;
-    }
-  }
-}
-
-static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) {
-  int i;
-  for (i = 0; i < 64; ++i)
-    data[i] *= dequant[i];
-}
-
-static void stbi__jpeg_finish(stbi__jpeg *z) {
-  if (z->progressive) {
-    // dequantize and idct the data
-    int i, j, n;
-    for (n = 0; n < z->s->img_n; ++n) {
-      int w = (z->img_comp[n].x + 7) >> 3;
-      int h = (z->img_comp[n].y + 7) >> 3;
-      for (j = 0; j < h; ++j) {
-        for (i = 0; i < w; ++i) {
-          short *data =
-              z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
-          stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
-          z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 +
-                                   i * 8,
-                               z->img_comp[n].w2, data);
-        }
-      }
-    }
-  }
-}
-
-static int stbi__process_marker(stbi__jpeg *z, int m) {
-  int L;
-  switch (m) {
-  case STBI__MARKER_none: // no marker found
-    return stbi__err("expected marker", "Corrupt JPEG");
-
-  case 0xDD: // DRI - specify restart interval
-    if (stbi__get16be(z->s) != 4)
-      return stbi__err("bad DRI len", "Corrupt JPEG");
-    z->restart_interval = stbi__get16be(z->s);
-    return 1;
-
-  case 0xDB: // DQT - define quantization table
-    L = stbi__get16be(z->s) - 2;
-    while (L > 0) {
-      int q = stbi__get8(z->s);
-      int p = q >> 4, sixteen = (p != 0);
-      int t = q & 15, i;
-      if (p != 0 && p != 1)
-        return stbi__err("bad DQT type", "Corrupt JPEG");
-      if (t > 3)
-        return stbi__err("bad DQT table", "Corrupt JPEG");
-
-      for (i = 0; i < 64; ++i)
-        z->dequant[t][stbi__jpeg_dezigzag[i]] =
-            (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
-      L -= (sixteen ? 129 : 65);
-    }
-    return L == 0;
-
-  case 0xC4: // DHT - define huffman table
-    L = stbi__get16be(z->s) - 2;
-    while (L > 0) {
-      stbi_uc *v;
-      int sizes[16], i, n = 0;
-      int q = stbi__get8(z->s);
-      int tc = q >> 4;
-      int th = q & 15;
-      if (tc > 1 || th > 3)
-        return stbi__err("bad DHT header", "Corrupt JPEG");
-      for (i = 0; i < 16; ++i) {
-        sizes[i] = stbi__get8(z->s);
-        n += sizes[i];
-      }
-      L -= 17;
-      if (tc == 0) {
-        if (!stbi__build_huffman(z->huff_dc + th, sizes))
-          return 0;
-        v = z->huff_dc[th].values;
-      } else {
-        if (!stbi__build_huffman(z->huff_ac + th, sizes))
-          return 0;
-        v = z->huff_ac[th].values;
-      }
-      for (i = 0; i < n; ++i)
-        v[i] = stbi__get8(z->s);
-      if (tc != 0)
-        stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
-      L -= n;
-    }
-    return L == 0;
-  }
-
-  // check for comment block or APP blocks
-  if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-    L = stbi__get16be(z->s);
-    if (L < 2) {
-      if (m == 0xFE)
-        return stbi__err("bad COM len", "Corrupt JPEG");
-      else
-        return stbi__err("bad APP len", "Corrupt JPEG");
-    }
-    L -= 2;
-
-    if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
-      static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'};
-      int ok = 1;
-      int i;
-      for (i = 0; i < 5; ++i)
-        if (stbi__get8(z->s) != tag[i])
-          ok = 0;
-      L -= 5;
-      if (ok)
-        z->jfif = 1;
-    } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
-      static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'};
-      int ok = 1;
-      int i;
-      for (i = 0; i < 6; ++i)
-        if (stbi__get8(z->s) != tag[i])
-          ok = 0;
-      L -= 6;
-      if (ok) {
-        stbi__get8(z->s);                            // version
-        stbi__get16be(z->s);                         // flags0
-        stbi__get16be(z->s);                         // flags1
-        z->app14_color_transform = stbi__get8(z->s); // color transform
-        L -= 6;
-      }
-    }
-
-    stbi__skip(z->s, L);
-    return 1;
-  }
-
-  return stbi__err("unknown marker", "Corrupt JPEG");
-}
-
-// after we see SOS
-static int stbi__process_scan_header(stbi__jpeg *z) {
-  int i;
-  int Ls = stbi__get16be(z->s);
-  z->scan_n = stbi__get8(z->s);
-  if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n)
-    return stbi__err("bad SOS component count", "Corrupt JPEG");
-  if (Ls != 6 + 2 * z->scan_n)
-    return stbi__err("bad SOS len", "Corrupt JPEG");
-  for (i = 0; i < z->scan_n; ++i) {
-    int id = stbi__get8(z->s), which;
-    int q = stbi__get8(z->s);
-    for (which = 0; which < z->s->img_n; ++which)
-      if (z->img_comp[which].id == id)
-        break;
-    if (which == z->s->img_n)
-      return 0; // no match
-    z->img_comp[which].hd = q >> 4;
-    if (z->img_comp[which].hd > 3)
-      return stbi__err("bad DC huff", "Corrupt JPEG");
-    z->img_comp[which].ha = q & 15;
-    if (z->img_comp[which].ha > 3)
-      return stbi__err("bad AC huff", "Corrupt JPEG");
-    z->order[i] = which;
-  }
-
-  {
-    int aa;
-    z->spec_start = stbi__get8(z->s);
-    z->spec_end = stbi__get8(z->s); // should be 63, but might be 0
-    aa = stbi__get8(z->s);
-    z->succ_high = (aa >> 4);
-    z->succ_low = (aa & 15);
-    if (z->progressive) {
-      if (z->spec_start > 63 || z->spec_end > 63 ||
-          z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
-        return stbi__err("bad SOS", "Corrupt JPEG");
-    } else {
-      if (z->spec_start != 0)
-        return stbi__err("bad SOS", "Corrupt JPEG");
-      if (z->succ_high != 0 || z->succ_low != 0)
-        return stbi__err("bad SOS", "Corrupt JPEG");
-      z->spec_end = 63;
-    }
-  }
-
-  return 1;
-}
-
-static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) {
-  int i;
-  for (i = 0; i < ncomp; ++i) {
-    if (z->img_comp[i].raw_data) {
-      STBI_FREE(z->img_comp[i].raw_data);
-      z->img_comp[i].raw_data = NULL;
-      z->img_comp[i].data = NULL;
-    }
-    if (z->img_comp[i].raw_coeff) {
-      STBI_FREE(z->img_comp[i].raw_coeff);
-      z->img_comp[i].raw_coeff = 0;
-      z->img_comp[i].coeff = 0;
-    }
-    if (z->img_comp[i].linebuf) {
-      STBI_FREE(z->img_comp[i].linebuf);
-      z->img_comp[i].linebuf = NULL;
-    }
-  }
-  return why;
-}
-
-static int stbi__process_frame_header(stbi__jpeg *z, int scan) {
-  stbi__context *s = z->s;
-  int Lf, p, i, q, h_max = 1, v_max = 1, c;
-  Lf = stbi__get16be(s);
-  if (Lf < 11)
-    return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG
-  p = stbi__get8(s);
-  if (p != 8)
-    return stbi__err("only 8-bit",
-                     "JPEG format not supported: 8-bit only"); // JPEG baseline
-  s->img_y = stbi__get16be(s);
-  if (s->img_y == 0)
-    return stbi__err(
-        "no header height",
-        "JPEG format not supported: delayed height"); // Legal, but we don't
-                                                      // handle it--but neither
-                                                      // does IJG
-  s->img_x = stbi__get16be(s);
-  if (s->img_x == 0)
-    return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires
-  c = stbi__get8(s);
-  if (c != 3 && c != 1 && c != 4)
-    return stbi__err("bad component count", "Corrupt JPEG");
-  s->img_n = c;
-  for (i = 0; i < c; ++i) {
-    z->img_comp[i].data = NULL;
-    z->img_comp[i].linebuf = NULL;
-  }
-
-  if (Lf != 8 + 3 * s->img_n)
-    return stbi__err("bad SOF len", "Corrupt JPEG");
-
-  z->rgb = 0;
-  for (i = 0; i < s->img_n; ++i) {
-    static const unsigned char rgb[3] = {'R', 'G', 'B'};
-    z->img_comp[i].id = stbi__get8(s);
-    if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
-      ++z->rgb;
-    q = stbi__get8(s);
-    z->img_comp[i].h = (q >> 4);
-    if (!z->img_comp[i].h || z->img_comp[i].h > 4)
-      return stbi__err("bad H", "Corrupt JPEG");
-    z->img_comp[i].v = q & 15;
-    if (!z->img_comp[i].v || z->img_comp[i].v > 4)
-      return stbi__err("bad V", "Corrupt JPEG");
-    z->img_comp[i].tq = stbi__get8(s);
-    if (z->img_comp[i].tq > 3)
-      return stbi__err("bad TQ", "Corrupt JPEG");
-  }
-
-  if (scan != STBI__SCAN_load)
-    return 1;
-
-  if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0))
-    return stbi__err("too large", "Image too large to decode");
-
-  for (i = 0; i < s->img_n; ++i) {
-    if (z->img_comp[i].h > h_max)
-      h_max = z->img_comp[i].h;
-    if (z->img_comp[i].v > v_max)
-      v_max = z->img_comp[i].v;
-  }
-
-  // compute interleaved mcu info
-  z->img_h_max = h_max;
-  z->img_v_max = v_max;
-  z->img_mcu_w = h_max * 8;
-  z->img_mcu_h = v_max * 8;
-  // these sizes can't be more than 17 bits
-  z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
-  z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
-
-  for (i = 0; i < s->img_n; ++i) {
-    // number of effective pixels (e.g. for non-interleaved MCU)
-    z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
-    z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max;
-    // to simplify generation, we'll allocate enough memory to decode
-    // the bogus oversized data from using interleaved MCUs and their
-    // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
-    // discard the extra data until colorspace conversion
-    //
-    // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked
-    // earlier) so these muls can't overflow with 32-bit ints (which we require)
-    z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
-    z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-    z->img_comp[i].coeff = 0;
-    z->img_comp[i].raw_coeff = 0;
-    z->img_comp[i].linebuf = NULL;
-    z->img_comp[i].raw_data =
-        stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
-    if (z->img_comp[i].raw_data == NULL)
-      return stbi__free_jpeg_components(z, i + 1,
-                                        stbi__err("outofmem", "Out of memory"));
-    // align blocks for idct using mmx/sse
-    z->img_comp[i].data =
-        (stbi_uc *)(((size_t)z->img_comp[i].raw_data + 15) & ~15);
-    if (z->progressive) {
-      // w2, h2 are multiples of 8 (see above)
-      z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
-      z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
-      z->img_comp[i].raw_coeff = stbi__malloc_mad3(
-          z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
-      if (z->img_comp[i].raw_coeff == NULL)
-        return stbi__free_jpeg_components(
-            z, i + 1, stbi__err("outofmem", "Out of memory"));
-      z->img_comp[i].coeff =
-          (short *)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
-    }
-  }
-
-  return 1;
-}
-
-// use comparisons since in some cases we handle more than one case (e.g. SOF)
-#define stbi__DNL(x) ((x) == 0xdc)
-#define stbi__SOI(x) ((x) == 0xd8)
-#define stbi__EOI(x) ((x) == 0xd9)
-#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
-#define stbi__SOS(x) ((x) == 0xda)
-
-#define stbi__SOF_progressive(x) ((x) == 0xc2)
-
-static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) {
-  int m;
-  z->jfif = 0;
-  z->app14_color_transform = -1; // valid values are 0,1,2
-  z->marker = STBI__MARKER_none; // initialize cached marker to empty
-  m = stbi__get_marker(z);
-  if (!stbi__SOI(m))
-    return stbi__err("no SOI", "Corrupt JPEG");
-  if (scan == STBI__SCAN_type)
-    return 1;
-  m = stbi__get_marker(z);
-  while (!stbi__SOF(m)) {
-    if (!stbi__process_marker(z, m))
-      return 0;
-    m = stbi__get_marker(z);
-    while (m == STBI__MARKER_none) {
-      // some files have extra padding after their blocks, so ok, we'll scan
-      if (stbi__at_eof(z->s))
-        return stbi__err("no SOF", "Corrupt JPEG");
-      m = stbi__get_marker(z);
-    }
-  }
-  z->progressive = stbi__SOF_progressive(m);
-  if (!stbi__process_frame_header(z, scan))
-    return 0;
-  return 1;
-}
-
-// decode image to YCbCr format
-static int stbi__decode_jpeg_image(stbi__jpeg *j) {
-  int m;
-  for (m = 0; m < 4; m++) {
-    j->img_comp[m].raw_data = NULL;
-    j->img_comp[m].raw_coeff = NULL;
-  }
-  j->restart_interval = 0;
-  if (!stbi__decode_jpeg_header(j, STBI__SCAN_load))
-    return 0;
-  m = stbi__get_marker(j);
-  while (!stbi__EOI(m)) {
-    if (stbi__SOS(m)) {
-      if (!stbi__process_scan_header(j))
-        return 0;
-      if (!stbi__parse_entropy_coded_data(j))
-        return 0;
-      if (j->marker == STBI__MARKER_none) {
-        // handle 0s at the end of image data from IP Kamera 9060
-        while (!stbi__at_eof(j->s)) {
-          int x = stbi__get8(j->s);
-          if (x == 255) {
-            j->marker = stbi__get8(j->s);
-            break;
-          }
-        }
-        // if we reach eof without hitting a marker, stbi__get_marker() below
-        // will fail and we'll eventually return 0
-      }
-    } else if (stbi__DNL(m)) {
-      int Ld = stbi__get16be(j->s);
-      stbi__uint32 NL = stbi__get16be(j->s);
-      if (Ld != 4)
-        return stbi__err("bad DNL len", "Corrupt JPEG");
-      if (NL != j->s->img_y)
-        return stbi__err("bad DNL height", "Corrupt JPEG");
-    } else {
-      if (!stbi__process_marker(j, m))
-        return 0;
-    }
-    m = stbi__get_marker(j);
-  }
-  if (j->progressive)
-    stbi__jpeg_finish(j);
-  return 1;
-}
-
-// static jfif-centered resampling (across block boundaries)
-
-typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
-                                      int w, int hs);
-
-#define stbi__div4(x) ((stbi_uc)((x) >> 2))
-
-static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far,
-                               int w, int hs) {
-  STBI_NOTUSED(out);
-  STBI_NOTUSED(in_far);
-  STBI_NOTUSED(w);
-  STBI_NOTUSED(hs);
-  return in_near;
-}
-
-static stbi_uc *stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near,
-                                       stbi_uc *in_far, int w, int hs) {
-  // need to generate two samples vertically for every one in input
-  int i;
-  STBI_NOTUSED(hs);
-  for (i = 0; i < w; ++i)
-    out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2);
-  return out;
-}
-
-static stbi_uc *stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near,
-                                       stbi_uc *in_far, int w, int hs) {
-  // need to generate two samples horizontally for every one in input
-  int i;
-  stbi_uc *input = in_near;
-
-  if (w == 1) {
-    // if only one sample, can't do any interpolation
-    out[0] = out[1] = input[0];
-    return out;
-  }
-
-  out[0] = input[0];
-  out[1] = stbi__div4(input[0] * 3 + input[1] + 2);
-  for (i = 1; i < w - 1; ++i) {
-    int n = 3 * input[i] + 2;
-    out[i * 2 + 0] = stbi__div4(n + input[i - 1]);
-    out[i * 2 + 1] = stbi__div4(n + input[i + 1]);
-  }
-  out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2);
-  out[i * 2 + 1] = input[w - 1];
-
-  STBI_NOTUSED(in_far);
-  STBI_NOTUSED(hs);
-
-  return out;
-}
-
-#define stbi__div16(x) ((stbi_uc)((x) >> 4))
-
-static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near,
-                                        stbi_uc *in_far, int w, int hs) {
-  // need to generate 2x2 samples for every one in input
-  int i, t0, t1;
-  if (w == 1) {
-    out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
-    return out;
-  }
-
-  t1 = 3 * in_near[0] + in_far[0];
-  out[0] = stbi__div4(t1 + 2);
-  for (i = 1; i < w; ++i) {
-    t0 = t1;
-    t1 = 3 * in_near[i] + in_far[i];
-    out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
-    out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
-  }
-  out[w * 2 - 1] = stbi__div4(t1 + 2);
-
-  STBI_NOTUSED(hs);
-
-  return out;
-}
-
-#if defined(STBI_SSE2) || defined(STBI_NEON)
-static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near,
-                                             stbi_uc *in_far, int w, int hs) {
-  // need to generate 2x2 samples for every one in input
-  int i = 0, t0, t1;
-
-  if (w == 1) {
-    out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
-    return out;
-  }
-
-  t1 = 3 * in_near[0] + in_far[0];
-  // process groups of 8 pixels for as long as we can.
-  // note we can't handle the last pixel in a row in this loop
-  // because we need to handle the filter boundary conditions.
-  for (; i < ((w - 1) & ~7); i += 8) {
-#if defined(STBI_SSE2)
-    // load and perform the vertical filtering pass
-    // this uses 3*x + y = 4*x + (y - x)
-    __m128i zero = _mm_setzero_si128();
-    __m128i farb = _mm_loadl_epi64((__m128i *)(in_far + i));
-    __m128i nearb = _mm_loadl_epi64((__m128i *)(in_near + i));
-    __m128i farw = _mm_unpacklo_epi8(farb, zero);
-    __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
-    __m128i diff = _mm_sub_epi16(farw, nearw);
-    __m128i nears = _mm_slli_epi16(nearw, 2);
-    __m128i curr = _mm_add_epi16(nears, diff); // current row
-
-    // horizontal filter works the same based on shifted vers of current
-    // row. "prev" is current row shifted right by 1 pixel; we need to
-    // insert the previous pixel value (from t1).
-    // "next" is current row shifted left by 1 pixel, with first pixel
-    // of next block of 8 pixels added in.
-    __m128i prv0 = _mm_slli_si128(curr, 2);
-    __m128i nxt0 = _mm_srli_si128(curr, 2);
-    __m128i prev = _mm_insert_epi16(prv0, t1, 0);
-    __m128i next =
-        _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7);
-
-    // horizontal filter, polyphase implementation since it's convenient:
-    // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-    // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-    // note the shared term.
-    __m128i bias = _mm_set1_epi16(8);
-    __m128i curs = _mm_slli_epi16(curr, 2);
-    __m128i prvd = _mm_sub_epi16(prev, curr);
-    __m128i nxtd = _mm_sub_epi16(next, curr);
-    __m128i curb = _mm_add_epi16(curs, bias);
-    __m128i even = _mm_add_epi16(prvd, curb);
-    __m128i odd = _mm_add_epi16(nxtd, curb);
-
-    // interleave even and odd pixels, then undo scaling.
-    __m128i int0 = _mm_unpacklo_epi16(even, odd);
-    __m128i int1 = _mm_unpackhi_epi16(even, odd);
-    __m128i de0 = _mm_srli_epi16(int0, 4);
-    __m128i de1 = _mm_srli_epi16(int1, 4);
-
-    // pack and write output
-    __m128i outv = _mm_packus_epi16(de0, de1);
-    _mm_storeu_si128((__m128i *)(out + i * 2), outv);
-#elif defined(STBI_NEON)
-    // load and perform the vertical filtering pass
-    // this uses 3*x + y = 4*x + (y - x)
-    uint8x8_t farb = vld1_u8(in_far + i);
-    uint8x8_t nearb = vld1_u8(in_near + i);
-    int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
-    int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
-    int16x8_t curr = vaddq_s16(nears, diff); // current row
-
-    // horizontal filter works the same based on shifted vers of current
-    // row. "prev" is current row shifted right by 1 pixel; we need to
-    // insert the previous pixel value (from t1).
-    // "next" is current row shifted left by 1 pixel, with first pixel
-    // of next block of 8 pixels added in.
-    int16x8_t prv0 = vextq_s16(curr, curr, 7);
-    int16x8_t nxt0 = vextq_s16(curr, curr, 1);
-    int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
-    int16x8_t next =
-        vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7);
-
-    // horizontal filter, polyphase implementation since it's convenient:
-    // even pixels = 3*cur + prev = cur*4 + (prev - cur)
-    // odd  pixels = 3*cur + next = cur*4 + (next - cur)
-    // note the shared term.
-    int16x8_t curs = vshlq_n_s16(curr, 2);
-    int16x8_t prvd = vsubq_s16(prev, curr);
-    int16x8_t nxtd = vsubq_s16(next, curr);
-    int16x8_t even = vaddq_s16(curs, prvd);
-    int16x8_t odd = vaddq_s16(curs, nxtd);
-
-    // undo scaling and round, then store with even/odd phases interleaved
-    uint8x8x2_t o;
-    o.val[0] = vqrshrun_n_s16(even, 4);
-    o.val[1] = vqrshrun_n_s16(odd, 4);
-    vst2_u8(out + i * 2, o);
-#endif
-
-    // "previous" value for next iter
-    t1 = 3 * in_near[i + 7] + in_far[i + 7];
-  }
-
-  t0 = t1;
-  t1 = 3 * in_near[i] + in_far[i];
-  out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
-
-  for (++i; i < w; ++i) {
-    t0 = t1;
-    t1 = 3 * in_near[i] + in_far[i];
-    out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
-    out[i * 2] = stbi__div16(3 * t1 + t0 + 8);
-  }
-  out[w * 2 - 1] = stbi__div4(t1 + 2);
-
-  STBI_NOTUSED(hs);
-
-  return out;
-}
-#endif
-
-static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near,
-                                           stbi_uc *in_far, int w, int hs) {
-  // resample with nearest-neighbor
-  int i, j;
-  STBI_NOTUSED(in_far);
-  for (i = 0; i < w; ++i)
-    for (j = 0; j < hs; ++j)
-      out[i * hs + j] = in_near[i];
-  return out;
-}
-
-// this is a reduced-precision calculation of YCbCr-to-RGB introduced
-// to make sure the code produces the same results in both SIMD and scalar
-#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
-                                   const stbi_uc *pcb, const stbi_uc *pcr,
-                                   int count, int step) {
-  int i;
-  for (i = 0; i < count; ++i) {
-    int y_fixed = (y[i] << 20) + (1 << 19); // rounding
-    int r, g, b;
-    int cr = pcr[i] - 128;
-    int cb = pcb[i] - 128;
-    r = y_fixed + cr * stbi__float2fixed(1.40200f);
-    g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) +
-        ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
-    b = y_fixed + cb * stbi__float2fixed(1.77200f);
-    r >>= 20;
-    g >>= 20;
-    b >>= 20;
-    if ((unsigned)r > 255) {
-      if (r < 0)
-        r = 0;
-      else
-        r = 255;
-    }
-    if ((unsigned)g > 255) {
-      if (g < 0)
-        g = 0;
-      else
-        g = 255;
-    }
-    if ((unsigned)b > 255) {
-      if (b < 0)
-        b = 0;
-      else
-        b = 255;
-    }
-    out[0] = (stbi_uc)r;
-    out[1] = (stbi_uc)g;
-    out[2] = (stbi_uc)b;
-    out[3] = 255;
-    out += step;
-  }
-}
-
-#if defined(STBI_SSE2) || defined(STBI_NEON)
-static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
-                                    stbi_uc const *pcb, stbi_uc const *pcr,
-                                    int count, int step) {
-  int i = 0;
-
-#ifdef STBI_SSE2
-  // step == 3 is pretty ugly on the final interleave, and i'm not convinced
-  // it's useful in practice (you wouldn't use it for textures, for example).
-  // so just accelerate step == 4 case.
-  if (step == 4) {
-    // this is a fairly straightforward implementation and not super-optimized.
-    __m128i signflip = _mm_set1_epi8(-0x80);
-    __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
-    __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
-    __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
-    __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
-    __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
-    __m128i xw = _mm_set1_epi16(255); // alpha channel
-
-    for (; i + 7 < count; i += 8) {
-      // load
-      __m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
-      __m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
-      __m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
-      __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
-      __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
-
-      // unpack to short (and left-shift cr, cb by 8)
-      __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
-      __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
-      __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
-
-      // color transform
-      __m128i yws = _mm_srli_epi16(yw, 4);
-      __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
-      __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
-      __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
-      __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
-      __m128i rws = _mm_add_epi16(cr0, yws);
-      __m128i gwt = _mm_add_epi16(cb0, yws);
-      __m128i bws = _mm_add_epi16(yws, cb1);
-      __m128i gws = _mm_add_epi16(gwt, cr1);
-
-      // descale
-      __m128i rw = _mm_srai_epi16(rws, 4);
-      __m128i bw = _mm_srai_epi16(bws, 4);
-      __m128i gw = _mm_srai_epi16(gws, 4);
-
-      // back to byte, set up for transpose
-      __m128i brb = _mm_packus_epi16(rw, bw);
-      __m128i gxb = _mm_packus_epi16(gw, xw);
-
-      // transpose to interleave channels
-      __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
-      __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
-      __m128i o0 = _mm_unpacklo_epi16(t0, t1);
-      __m128i o1 = _mm_unpackhi_epi16(t0, t1);
-
-      // store
-      _mm_storeu_si128((__m128i *)(out + 0), o0);
-      _mm_storeu_si128((__m128i *)(out + 16), o1);
-      out += 32;
-    }
-  }
-#endif
-
-#ifdef STBI_NEON
-  // in this version, step=3 support would be easy to add. but is there demand?
-  if (step == 4) {
-    // this is a fairly straightforward implementation and not super-optimized.
-    uint8x8_t signflip = vdup_n_u8(0x80);
-    int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
-    int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
-    int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
-    int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
-
-    for (; i + 7 < count; i += 8) {
-      // load
-      uint8x8_t y_bytes = vld1_u8(y + i);
-      uint8x8_t cr_bytes = vld1_u8(pcr + i);
-      uint8x8_t cb_bytes = vld1_u8(pcb + i);
-      int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
-      int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
-
-      // expand to s16
-      int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
-      int16x8_t crw = vshll_n_s8(cr_biased, 7);
-      int16x8_t cbw = vshll_n_s8(cb_biased, 7);
-
-      // color transform
-      int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
-      int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
-      int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
-      int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
-      int16x8_t rws = vaddq_s16(yws, cr0);
-      int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
-      int16x8_t bws = vaddq_s16(yws, cb1);
-
-      // undo scaling, round, convert to byte
-      uint8x8x4_t o;
-      o.val[0] = vqrshrun_n_s16(rws, 4);
-      o.val[1] = vqrshrun_n_s16(gws, 4);
-      o.val[2] = vqrshrun_n_s16(bws, 4);
-      o.val[3] = vdup_n_u8(255);
-
-      // store, interleaving r/g/b/a
-      vst4_u8(out, o);
-      out += 8 * 4;
-    }
-  }
-#endif
-
-  for (; i < count; ++i) {
-    int y_fixed = (y[i] << 20) + (1 << 19); // rounding
-    int r, g, b;
-    int cr = pcr[i] - 128;
-    int cb = pcb[i] - 128;
-    r = y_fixed + cr * stbi__float2fixed(1.40200f);
-    g = y_fixed + cr * -stbi__float2fixed(0.71414f) +
-        ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
-    b = y_fixed + cb * stbi__float2fixed(1.77200f);
-    r >>= 20;
-    g >>= 20;
-    b >>= 20;
-    if ((unsigned)r > 255) {
-      if (r < 0)
-        r = 0;
-      else
-        r = 255;
-    }
-    if ((unsigned)g > 255) {
-      if (g < 0)
-        g = 0;
-      else
-        g = 255;
-    }
-    if ((unsigned)b > 255) {
-      if (b < 0)
-        b = 0;
-      else
-        b = 255;
-    }
-    out[0] = (stbi_uc)r;
-    out[1] = (stbi_uc)g;
-    out[2] = (stbi_uc)b;
-    out[3] = 255;
-    out += step;
-  }
-}
-#endif
-
-// set up the kernels
-static void stbi__setup_jpeg(stbi__jpeg *j) {
-  j->idct_block_kernel = stbi__idct_block;
-  j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
-  j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
-
-#ifdef STBI_SSE2
-  if (stbi__sse2_available()) {
-    j->idct_block_kernel = stbi__idct_simd;
-    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-  }
-#endif
-
-#ifdef STBI_NEON
-  j->idct_block_kernel = stbi__idct_simd;
-  j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-  j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
-#endif
-}
-
-// clean up the temporary component buffers
-static void stbi__cleanup_jpeg(stbi__jpeg *j) {
-  stbi__free_jpeg_components(j, j->s->img_n, 0);
-}
-
-typedef struct {
-  resample_row_func resample;
-  stbi_uc *line0, *line1;
-  int hs, vs;  // expansion factor in each axis
-  int w_lores; // horizontal pixels pre-expansion
-  int ystep;   // how far through vertical expansion we are
-  int ypos;    // which pre-expansion row we're on
-} stbi__resample;
-
-// fast 0..255 * 0..255 => 0..255 rounded multiplication
-static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) {
-  unsigned int t = x * y + 128;
-  return (stbi_uc)((t + (t >> 8)) >> 8);
-}
-
-static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y,
-                                int *comp, int req_comp) {
-  int n, decode_n, is_rgb;
-  z->s->img_n = 0; // make stbi__cleanup_jpeg safe
-
-  // validate req_comp
-  if (req_comp < 0 || req_comp > 4)
-    return stbi__errpuc("bad req_comp", "Internal error");
-
-  // load a jpeg image from whichever source, but leave in YCbCr format
-  if (!stbi__decode_jpeg_image(z)) {
-    stbi__cleanup_jpeg(z);
-    return NULL;
-  }
-
-  // determine actual number of components to generate
-  n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
-
-  is_rgb = z->s->img_n == 3 &&
-           (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
-
-  if (z->s->img_n == 3 && n < 3 && !is_rgb)
-    decode_n = 1;
-  else
-    decode_n = z->s->img_n;
-
-  // resample and color-convert
-  {
-    int k;
-    unsigned int i, j;
-    stbi_uc *output;
-    stbi_uc *coutput[4] = {NULL, NULL, NULL, NULL};
-
-    stbi__resample res_comp[4];
-
-    for (k = 0; k < decode_n; ++k) {
-      stbi__resample *r = &res_comp[k];
-
-      // allocate line buffer big enough for upsampling off the edges
-      // with upsample factor of 4
-      z->img_comp[k].linebuf = (stbi_uc *)stbi__malloc(z->s->img_x + 3);
-      if (!z->img_comp[k].linebuf) {
-        stbi__cleanup_jpeg(z);
-        return stbi__errpuc("outofmem", "Out of memory");
-      }
-
-      r->hs = z->img_h_max / z->img_comp[k].h;
-      r->vs = z->img_v_max / z->img_comp[k].v;
-      r->ystep = r->vs >> 1;
-      r->w_lores = (z->s->img_x + r->hs - 1) / r->hs;
-      r->ypos = 0;
-      r->line0 = r->line1 = z->img_comp[k].data;
-
-      if (r->hs == 1 && r->vs == 1)
-        r->resample = resample_row_1;
-      else if (r->hs == 1 && r->vs == 2)
-        r->resample = stbi__resample_row_v_2;
-      else if (r->hs == 2 && r->vs == 1)
-        r->resample = stbi__resample_row_h_2;
-      else if (r->hs == 2 && r->vs == 2)
-        r->resample = z->resample_row_hv_2_kernel;
-      else
-        r->resample = stbi__resample_row_generic;
-    }
-
-    // can't error after this so, this is safe
-    output = (stbi_uc *)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
-    if (!output) {
-      stbi__cleanup_jpeg(z);
-      return stbi__errpuc("outofmem", "Out of memory");
-    }
-
-    // now go ahead and resample
-    for (j = 0; j < z->s->img_y; ++j) {
-      stbi_uc *out = output + n * z->s->img_x * j;
-      for (k = 0; k < decode_n; ++k) {
-        stbi__resample *r = &res_comp[k];
-        int y_bot = r->ystep >= (r->vs >> 1);
-        coutput[k] =
-            r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0,
-                        y_bot ? r->line0 : r->line1, r->w_lores, r->hs);
-        if (++r->ystep >= r->vs) {
-          r->ystep = 0;
-          r->line0 = r->line1;
-          if (++r->ypos < z->img_comp[k].y)
-            r->line1 += z->img_comp[k].w2;
-        }
-      }
-      if (n >= 3) {
-        stbi_uc *y = coutput[0];
-        if (z->s->img_n == 3) {
-          if (is_rgb) {
-            for (i = 0; i < z->s->img_x; ++i) {
-              out[0] = y[i];
-              out[1] = coutput[1][i];
-              out[2] = coutput[2][i];
-              out[3] = 255;
-              out += n;
-            }
-          } else {
-            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x,
-                                   n);
-          }
-        } else if (z->s->img_n == 4) {
-          if (z->app14_color_transform == 0) { // CMYK
-            for (i = 0; i < z->s->img_x; ++i) {
-              stbi_uc m = coutput[3][i];
-              out[0] = stbi__blinn_8x8(coutput[0][i], m);
-              out[1] = stbi__blinn_8x8(coutput[1][i], m);
-              out[2] = stbi__blinn_8x8(coutput[2][i], m);
-              out[3] = 255;
-              out += n;
-            }
-          } else if (z->app14_color_transform == 2) { // YCCK
-            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x,
-                                   n);
-            for (i = 0; i < z->s->img_x; ++i) {
-              stbi_uc m = coutput[3][i];
-              out[0] = stbi__blinn_8x8(255 - out[0], m);
-              out[1] = stbi__blinn_8x8(255 - out[1], m);
-              out[2] = stbi__blinn_8x8(255 - out[2], m);
-              out += n;
-            }
-          } else { // YCbCr + alpha?  Ignore the fourth channel for now
-            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x,
-                                   n);
-          }
-        } else
-          for (i = 0; i < z->s->img_x; ++i) {
-            out[0] = out[1] = out[2] = y[i];
-            out[3] = 255; // not used if n==3
-            out += n;
-          }
-      } else {
-        if (is_rgb) {
-          if (n == 1)
-            for (i = 0; i < z->s->img_x; ++i)
-              *out++ =
-                  stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-          else {
-            for (i = 0; i < z->s->img_x; ++i, out += 2) {
-              out[0] =
-                  stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
-              out[1] = 255;
-            }
-          }
-        } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
-          for (i = 0; i < z->s->img_x; ++i) {
-            stbi_uc m = coutput[3][i];
-            stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
-            stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
-            stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
-            out[0] = stbi__compute_y(r, g, b);
-            out[1] = 255;
-            out += n;
-          }
-        } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
-          for (i = 0; i < z->s->img_x; ++i) {
-            out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
-            out[1] = 255;
-            out += n;
-          }
-        } else {
-          stbi_uc *y = coutput[0];
-          if (n == 1)
-            for (i = 0; i < z->s->img_x; ++i)
-              out[i] = y[i];
-          else
-            for (i = 0; i < z->s->img_x; ++i) {
-              *out++ = y[i];
-              *out++ = 255;
-            }
-        }
-      }
-    }
-    stbi__cleanup_jpeg(z);
-    *out_x = z->s->img_x;
-    *out_y = z->s->img_y;
-    if (comp)
-      *comp =
-          z->s->img_n >= 3 ? 3 : 1; // report original components, not output
-    return output;
-  }
-}
-
-static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp,
-                             int req_comp, stbi__result_info *ri) {
-  unsigned char *result;
-  stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
-  STBI_NOTUSED(ri);
-  j->s = s;
-  stbi__setup_jpeg(j);
-  result = load_jpeg_image(j, x, y, comp, req_comp);
-  STBI_FREE(j);
-  return result;
-}
-
-static int stbi__jpeg_test(stbi__context *s) {
-  int r;
-  stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg));
-  j->s = s;
-  stbi__setup_jpeg(j);
-  r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
-  stbi__rewind(s);
-  STBI_FREE(j);
-  return r;
-}
-
-static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) {
-  if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
-    stbi__rewind(j->s);
-    return 0;
-  }
-  if (x)
-    *x = j->s->img_x;
-  if (y)
-    *y = j->s->img_y;
-  if (comp)
-    *comp = j->s->img_n >= 3 ? 3 : 1;
-  return 1;
-}
-
-static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) {
-  int result;
-  stbi__jpeg *j = (stbi__jpeg *)(stbi__malloc(sizeof(stbi__jpeg)));
-  j->s = s;
-  result = stbi__jpeg_info_raw(j, x, y, comp);
-  STBI_FREE(j);
-  return result;
-}
-#endif
-
-// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
-//    simple implementation
-//      - all input must be provided in an upfront buffer
-//      - all output is written to a single output buffer (can malloc/realloc)
-//    performance
-//      - fast huffman
-
-#ifndef STBI_NO_ZLIB
-
-// fast-way is faster to check than jpeg huffman, but slow way is slower
-#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables
-#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
-
-// zlib-style huffman encoding
-// (jpegs packs from left, zlib from right, so can't share code)
-typedef struct {
-  stbi__uint16 fast[1 << STBI__ZFAST_BITS];
-  stbi__uint16 firstcode[16];
-  int maxcode[17];
-  stbi__uint16 firstsymbol[16];
-  stbi_uc size[288];
-  stbi__uint16 value[288];
-} stbi__zhuffman;
-
-stbi_inline static int stbi__bitreverse16(int n) {
-  n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
-  n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
-  n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
-  n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
-  return n;
-}
-
-stbi_inline static int stbi__bit_reverse(int v, int bits) {
-  STBI_ASSERT(bits <= 16);
-  // to bit reverse n bits, reverse 16 and shift
-  // e.g. 11 bits, bit reverse and shift away 5
-  return stbi__bitreverse16(v) >> (16 - bits);
-}
-
-static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist,
-                                int num) {
-  int i, k = 0;
-  int code, next_code[16], sizes[17];
-
-  // DEFLATE spec for generating codes
-  memset(sizes, 0, sizeof(sizes));
-  memset(z->fast, 0, sizeof(z->fast));
-  for (i = 0; i < num; ++i)
-    ++sizes[sizelist[i]];
-  sizes[0] = 0;
-  for (i = 1; i < 16; ++i)
-    if (sizes[i] > (1 << i))
-      return stbi__err("bad sizes", "Corrupt PNG");
-  code = 0;
-  for (i = 1; i < 16; ++i) {
-    next_code[i] = code;
-    z->firstcode[i] = (stbi__uint16)code;
-    z->firstsymbol[i] = (stbi__uint16)k;
-    code = (code + sizes[i]);
-    if (sizes[i])
-      if (code - 1 >= (1 << i))
-        return stbi__err("bad codelengths", "Corrupt PNG");
-    z->maxcode[i] = code << (16 - i); // preshift for inner loop
-    code <<= 1;
-    k += sizes[i];
-  }
-  z->maxcode[16] = 0x10000; // sentinel
-  for (i = 0; i < num; ++i) {
-    int s = sizelist[i];
-    if (s) {
-      int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s];
-      stbi__uint16 fastv = (stbi__uint16)((s << 9) | i);
-      z->size[c] = (stbi_uc)s;
-      z->value[c] = (stbi__uint16)i;
-      if (s <= STBI__ZFAST_BITS) {
-        int j = stbi__bit_reverse(next_code[s], s);
-        while (j < (1 << STBI__ZFAST_BITS)) {
-          z->fast[j] = fastv;
-          j += (1 << s);
-        }
-      }
-      ++next_code[s];
-    }
-  }
-  return 1;
-}
-
-// zlib-from-memory implementation for PNG reading
-//    because PNG allows splitting the zlib stream arbitrarily,
-//    and it's annoying structurally to have PNG call ZLIB call PNG,
-//    we require PNG read all the IDATs and combine them into a single
-//    memory buffer
-
-typedef struct {
-  stbi_uc *zbuffer, *zbuffer_end;
-  int num_bits;
-  stbi__uint32 code_buffer;
-
-  char *zout;
-  char *zout_start;
-  char *zout_end;
-  int z_expandable;
-
-  stbi__zhuffman z_length, z_distance;
-} stbi__zbuf;
-
-stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) {
-  if (z->zbuffer >= z->zbuffer_end)
-    return 0;
-  return *z->zbuffer++;
-}
-
-static void stbi__fill_bits(stbi__zbuf *z) {
-  do {
-    STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
-    z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits;
-    z->num_bits += 8;
-  } while (z->num_bits <= 24);
-}
-
-stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) {
-  unsigned int k;
-  if (z->num_bits < n)
-    stbi__fill_bits(z);
-  k = z->code_buffer & ((1 << n) - 1);
-  z->code_buffer >>= n;
-  z->num_bits -= n;
-  return k;
-}
-
-static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) {
-  int b, s, k;
-  // not resolved by fast table, so compute it the slow way
-  // use jpeg approach, which requires MSbits at top
-  k = stbi__bit_reverse(a->code_buffer, 16);
-  for (s = STBI__ZFAST_BITS + 1;; ++s)
-    if (k < z->maxcode[s])
-      break;
-  if (s == 16)
-    return -1; // invalid code!
-  // code size is s, so:
-  b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
-  STBI_ASSERT(z->size[b] == s);
-  a->code_buffer >>= s;
-  a->num_bits -= s;
-  return z->value[b];
-}
-
-stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) {
-  int b, s;
-  if (a->num_bits < 16)
-    stbi__fill_bits(a);
-  b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
-  if (b) {
-    s = b >> 9;
-    a->code_buffer >>= s;
-    a->num_bits -= s;
-    return b & 511;
-  }
-  return stbi__zhuffman_decode_slowpath(a, z);
-}
-
-static int stbi__zexpand(stbi__zbuf *z, char *zout,
-                         int n) // need to make room for n bytes
-{
-  char *q;
-  int cur, limit, old_limit __attribute__((unused));
-  z->zout = zout;
-  if (!z->z_expandable)
-    return stbi__err("output buffer limit", "Corrupt PNG");
-  cur = (int)(z->zout - z->zout_start);
-  limit = old_limit = (int)(z->zout_end - z->zout_start);
-  while (cur + n > limit)
-    limit *= 2;
-  q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
-  STBI_NOTUSED(old_limit);
-  if (q == NULL)
-    return stbi__err("outofmem", "Out of memory");
-  z->zout_start = q;
-  z->zout = q + cur;
-  z->zout_end = q + limit;
-  return 1;
-}
-
-static const int stbi__zlength_base[31] = {
-    3,  4,  5,  6,  7,  8,  9,  10,  11,  13,  15,  17,  19,  23, 27, 31,
-    35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0,  0};
-
-static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
-                                            1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
-                                            4, 4, 5, 5, 5, 5, 0, 0, 0};
-
-static const int stbi__zdist_base[32] = {
-    1,    2,    3,    4,    5,    7,     9,     13,    17,  25,   33,
-    49,   65,   97,   129,  193,  257,   385,   513,   769, 1025, 1537,
-    2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0,   0};
-
-static const int stbi__zdist_extra[32] = {0, 0, 0,  0,  1,  1,  2,  2,  3,  3,
-                                          4, 4, 5,  5,  6,  6,  7,  7,  8,  8,
-                                          9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
-
-static int stbi__parse_huffman_block(stbi__zbuf *a) {
-  char *zout = a->zout;
-  for (;;) {
-    int z = stbi__zhuffman_decode(a, &a->z_length);
-    if (z < 256) {
-      if (z < 0)
-        return stbi__err("bad huffman code",
-                         "Corrupt PNG"); // error in huffman codes
-      if (zout >= a->zout_end) {
-        if (!stbi__zexpand(a, zout, 1))
-          return 0;
-        zout = a->zout;
-      }
-      *zout++ = (char)z;
-    } else {
-      stbi_uc *p;
-      int len, dist;
-      if (z == 256) {
-        a->zout = zout;
-        return 1;
-      }
-      z -= 257;
-      len = stbi__zlength_base[z];
-      if (stbi__zlength_extra[z])
-        len += stbi__zreceive(a, stbi__zlength_extra[z]);
-      z = stbi__zhuffman_decode(a, &a->z_distance);
-      if (z < 0)
-        return stbi__err("bad huffman code", "Corrupt PNG");
-      dist = stbi__zdist_base[z];
-      if (stbi__zdist_extra[z])
-        dist += stbi__zreceive(a, stbi__zdist_extra[z]);
-      if (zout - a->zout_start < dist)
-        return stbi__err("bad dist", "Corrupt PNG");
-      if (zout + len > a->zout_end) {
-        if (!stbi__zexpand(a, zout, len))
-          return 0;
-        zout = a->zout;
-      }
-      p = (stbi_uc *)(zout - dist);
-      if (dist == 1) { // run of one byte; common in images.
-        stbi_uc v = *p;
-        if (len) {
-          do
-            *zout++ = v;
-          while (--len);
-        }
-      } else {
-        if (len) {
-          do
-            *zout++ = *p++;
-          while (--len);
-        }
-      }
-    }
-  }
-}
-
-static int stbi__compute_huffman_codes(stbi__zbuf *a) {
-  static const stbi_uc length_dezigzag[19] = {
-      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
-  stbi__zhuffman z_codelength;
-  stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op
-  stbi_uc codelength_sizes[19];
-  int i, n;
-
-  int hlit = stbi__zreceive(a, 5) + 257;
-  int hdist = stbi__zreceive(a, 5) + 1;
-  int hclen = stbi__zreceive(a, 4) + 4;
-  int ntot = hlit + hdist;
-
-  memset(codelength_sizes, 0, sizeof(codelength_sizes));
-  for (i = 0; i < hclen; ++i) {
-    int s = stbi__zreceive(a, 3);
-    codelength_sizes[length_dezigzag[i]] = (stbi_uc)s;
-  }
-  if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19))
-    return 0;
-
-  n = 0;
-  while (n < ntot) {
-    int c = stbi__zhuffman_decode(a, &z_codelength);
-    if (c < 0 || c >= 19)
-      return stbi__err("bad codelengths", "Corrupt PNG");
-    if (c < 16)
-      lencodes[n++] = (stbi_uc)c;
-    else {
-      stbi_uc fill = 0;
-      if (c == 16) {
-        c = stbi__zreceive(a, 2) + 3;
-        if (n == 0)
-          return stbi__err("bad codelengths", "Corrupt PNG");
-        fill = lencodes[n - 1];
-      } else if (c == 17)
-        c = stbi__zreceive(a, 3) + 3;
-      else {
-        STBI_ASSERT(c == 18);
-        c = stbi__zreceive(a, 7) + 11;
-      }
-      if (ntot - n < c)
-        return stbi__err("bad codelengths", "Corrupt PNG");
-      memset(lencodes + n, fill, c);
-      n += c;
-    }
-  }
-  if (n != ntot)
-    return stbi__err("bad codelengths", "Corrupt PNG");
-  if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit))
-    return 0;
-  if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist))
-    return 0;
-  return 1;
-}
-
-static int stbi__parse_uncompressed_block(stbi__zbuf *a) {
-  stbi_uc header[4];
-  int len, nlen, k;
-  if (a->num_bits & 7)
-    stbi__zreceive(a, a->num_bits & 7); // discard
-  // drain the bit-packed data into header
-  k = 0;
-  while (a->num_bits > 0) {
-    header[k++] =
-        (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check
-    a->code_buffer >>= 8;
-    a->num_bits -= 8;
-  }
-  STBI_ASSERT(a->num_bits == 0);
-  // now fill header the normal way
-  while (k < 4)
-    header[k++] = stbi__zget8(a);
-  len = header[1] * 256 + header[0];
-  nlen = header[3] * 256 + header[2];
-  if (nlen != (len ^ 0xffff))
-    return stbi__err("zlib corrupt", "Corrupt PNG");
-  if (a->zbuffer + len > a->zbuffer_end)
-    return stbi__err("read past buffer", "Corrupt PNG");
-  if (a->zout + len > a->zout_end)
-    if (!stbi__zexpand(a, a->zout, len))
-      return 0;
-  memcpy(a->zout, a->zbuffer, len);
-  a->zbuffer += len;
-  a->zout += len;
-  return 1;
-}
-
-static int stbi__parse_zlib_header(stbi__zbuf *a) {
-  int cmf = stbi__zget8(a);
-  int cm = cmf & 15;
-  /* int cinfo = cmf >> 4; */
-  int flg = stbi__zget8(a);
-  if ((cmf * 256 + flg) % 31 != 0)
-    return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec
-  if (flg & 32)
-    return stbi__err("no preset dict",
-                     "Corrupt PNG"); // preset dictionary not allowed in png
-  if (cm != 8)
-    return stbi__err("bad compression",
-                     "Corrupt PNG"); // DEFLATE required for png
-  // window = 1 << (8 + cinfo)... but who cares, we fully buffer output
-  return 1;
-}
-
-static const stbi_uc stbi__zdefault_length[288] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8};
-static const stbi_uc stbi__zdefault_distance[32] = {
-    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
-/*
-Init algorithm:
-{
-   int i;   // use <= to match clearly with spec
-   for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
-   for (   ; i <= 255; ++i)     stbi__zdefault_length[i]   = 9;
-   for (   ; i <= 279; ++i)     stbi__zdefault_length[i]   = 7;
-   for (   ; i <= 287; ++i)     stbi__zdefault_length[i]   = 8;
-
-   for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
-}
-*/
-
-static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) {
-  int final, type;
-  if (parse_header)
-    if (!stbi__parse_zlib_header(a))
-      return 0;
-  a->num_bits = 0;
-  a->code_buffer = 0;
-  do {
-    final = stbi__zreceive(a, 1);
-    type = stbi__zreceive(a, 2);
-    if (type == 0) {
-      if (!stbi__parse_uncompressed_block(a))
-        return 0;
-    } else if (type == 3) {
-      return 0;
-    } else {
-      if (type == 1) {
-        // use fixed code lengths
-        if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288))
-          return 0;
-        if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
-          return 0;
-      } else {
-        if (!stbi__compute_huffman_codes(a))
-          return 0;
-      }
-      if (!stbi__parse_huffman_block(a))
-        return 0;
-    }
-  } while (!final);
-  return 1;
-}
-
-static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp,
-                         int parse_header) {
-  a->zout_start = obuf;
-  a->zout = obuf;
-  a->zout_end = obuf + olen;
-  a->z_expandable = exp;
-
-  return stbi__parse_zlib(a, parse_header);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len,
-                                                int initial_size, int *outlen) {
-  stbi__zbuf a;
-  char *p = (char *)stbi__malloc(initial_size);
-  if (p == NULL)
-    return NULL;
-  a.zbuffer = (stbi_uc *)buffer;
-  a.zbuffer_end = (stbi_uc *)buffer + len;
-  if (stbi__do_zlib(&a, p, initial_size, 1, 1)) {
-    if (outlen)
-      *outlen = (int)(a.zout - a.zout_start);
-    return a.zout_start;
-  } else {
-    STBI_FREE(a.zout_start);
-    return NULL;
-  }
-}
-
-STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len,
-                                      int *outlen) {
-  return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
-}
-
-STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer,
-                                                           int len,
-                                                           int initial_size,
-                                                           int *outlen,
-                                                           int parse_header) {
-  stbi__zbuf a;
-  char *p = (char *)stbi__malloc(initial_size);
-  if (p == NULL)
-    return NULL;
-  a.zbuffer = (stbi_uc *)buffer;
-  a.zbuffer_end = (stbi_uc *)buffer + len;
-  if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) {
-    if (outlen)
-      *outlen = (int)(a.zout - a.zout_start);
-    return a.zout_start;
-  } else {
-    STBI_FREE(a.zout_start);
-    return NULL;
-  }
-}
-
-STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen,
-                                    char const *ibuffer, int ilen) {
-  stbi__zbuf a;
-  a.zbuffer = (stbi_uc *)ibuffer;
-  a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
-  if (stbi__do_zlib(&a, obuffer, olen, 0, 1))
-    return (int)(a.zout - a.zout_start);
-  else
-    return -1;
-}
-
-STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len,
-                                               int *outlen) {
-  stbi__zbuf a;
-  char *p = (char *)stbi__malloc(16384);
-  if (p == NULL)
-    return NULL;
-  a.zbuffer = (stbi_uc *)buffer;
-  a.zbuffer_end = (stbi_uc *)buffer + len;
-  if (stbi__do_zlib(&a, p, 16384, 1, 0)) {
-    if (outlen)
-      *outlen = (int)(a.zout - a.zout_start);
-    return a.zout_start;
-  } else {
-    STBI_FREE(a.zout_start);
-    return NULL;
-  }
-}
-
-STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen,
-                                             const char *ibuffer, int ilen) {
-  stbi__zbuf a;
-  a.zbuffer = (stbi_uc *)ibuffer;
-  a.zbuffer_end = (stbi_uc *)ibuffer + ilen;
-  if (stbi__do_zlib(&a, obuffer, olen, 0, 0))
-    return (int)(a.zout - a.zout_start);
-  else
-    return -1;
-}
-#endif
-
-// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
-//    simple implementation
-//      - only 8-bit samples
-//      - no CRC checking
-//      - allocates lots of intermediate memory
-//        - avoids problem of streaming data between subsystems
-//        - avoids explicit window management
-//    performance
-//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
-
-#ifndef STBI_NO_PNG
-typedef struct {
-  stbi__uint32 length;
-  stbi__uint32 type;
-} stbi__pngchunk;
-
-static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) {
-  stbi__pngchunk c;
-  c.length = stbi__get32be(s);
-  c.type = stbi__get32be(s);
-  return c;
-}
-
-static int stbi__check_png_header(stbi__context *s) {
-  static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
-  int i;
-  for (i = 0; i < 8; ++i)
-    if (stbi__get8(s) != png_sig[i])
-      return stbi__err("bad png sig", "Not a PNG");
-  return 1;
-}
-
-typedef struct {
-  stbi__context *s;
-  stbi_uc *idata, *expanded, *out;
-  int depth;
-} stbi__png;
-
-enum {
-  STBI__F_none = 0,
-  STBI__F_sub = 1,
-  STBI__F_up = 2,
-  STBI__F_avg = 3,
-  STBI__F_paeth = 4,
-  // synthetic filters used for first scanline to avoid needing a dummy row of
-  // 0s
-  STBI__F_avg_first,
-  STBI__F_paeth_first
-};
-
-static stbi_uc first_row_filter[5] = {STBI__F_none, STBI__F_sub, STBI__F_none,
-                                      STBI__F_avg_first, STBI__F_paeth_first};
-
-static int stbi__paeth(int a, int b, int c) {
-  int p = a + b - c;
-  int pa = abs(p - a);
-  int pb = abs(p - b);
-  int pc = abs(p - c);
-  if (pa <= pb && pa <= pc)
-    return a;
-  if (pb <= pc)
-    return b;
-  return c;
-}
-
-static const stbi_uc stbi__depth_scale_table[9] = {0, 0xff, 0x55, 0,   0x11,
-                                                   0, 0,    0,    0x01};
-
-// create the png data from post-deflated data
-static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw,
-                                      stbi__uint32 raw_len, int out_n,
-                                      stbi__uint32 x, stbi__uint32 y, int depth,
-                                      int color) {
-  int bytes = (depth == 16 ? 2 : 1);
-  stbi__context *s = a->s;
-  stbi__uint32 i, j, stride = x * out_n * bytes;
-  stbi__uint32 img_len, img_width_bytes;
-  int k;
-  int img_n = s->img_n; // copy it into a local for later
-
-  int output_bytes = out_n * bytes;
-  int filter_bytes = img_n * bytes;
-  int width = x;
-
-  STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1);
-  a->out = (stbi_uc *)stbi__malloc_mad3(
-      x, y, output_bytes, 0); // extra bytes to write off the end into
-  if (!a->out)
-    return stbi__err("outofmem", "Out of memory");
-
-  if (!stbi__mad3sizes_valid(img_n, x, depth, 7))
-    return stbi__err("too large", "Corrupt PNG");
-  img_width_bytes = (((img_n * x * depth) + 7) >> 3);
-  img_len = (img_width_bytes + 1) * y;
-
-  // we used to check for exact match between raw_len and img_len on
-  // non-interlaced PNGs, but issue #276 reported a PNG in the wild that had
-  // extra data at the end (all zeros), so just check for raw_len < img_len
-  // always.
-  if (raw_len < img_len)
-    return stbi__err("not enough pixels", "Corrupt PNG");
-
-  for (j = 0; j < y; ++j) {
-    stbi_uc *cur = a->out + stride * j;
-    stbi_uc *prior;
-    int filter = *raw++;
-
-    if (filter > 4)
-      return stbi__err("invalid filter", "Corrupt PNG");
-
-    if (depth < 8) {
-      STBI_ASSERT(img_width_bytes <= x);
-      cur +=
-          x * out_n - img_width_bytes; // store output to the rightmost img_len
-                                       // bytes, so we can decode in place
-      filter_bytes = 1;
-      width = img_width_bytes;
-    }
-    prior =
-        cur -
-        stride; // bugfix: need to compute this after 'cur +=' computation above
-
-    // if first row, use special filter that doesn't sample previous row
-    if (j == 0)
-      filter = first_row_filter[filter];
-
-    // handle first byte explicitly
-    for (k = 0; k < filter_bytes; ++k) {
-      switch (filter) {
-      case STBI__F_none:
-        cur[k] = raw[k];
-        break;
-      case STBI__F_sub:
-        cur[k] = raw[k];
-        break;
-      case STBI__F_up:
-        cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-        break;
-      case STBI__F_avg:
-        cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
-        break;
-      case STBI__F_paeth:
-        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
-        break;
-      case STBI__F_avg_first:
-        cur[k] = raw[k];
-        break;
-      case STBI__F_paeth_first:
-        cur[k] = raw[k];
-        break;
-      }
-    }
-
-    if (depth == 8) {
-      if (img_n != out_n)
-        cur[img_n] = 255; // first pixel
-      raw += img_n;
-      cur += out_n;
-      prior += out_n;
-    } else if (depth == 16) {
-      if (img_n != out_n) {
-        cur[filter_bytes] = 255;     // first pixel top byte
-        cur[filter_bytes + 1] = 255; // first pixel bottom byte
-      }
-      raw += filter_bytes;
-      cur += output_bytes;
-      prior += output_bytes;
-    } else {
-      raw += 1;
-      cur += 1;
-      prior += 1;
-    }
-
-    // this is a little gross, so that we don't switch per-pixel or
-    // per-component
-    if (depth < 8 || img_n == out_n) {
-      int nk = (width - 1) * filter_bytes;
-#define STBI__CASE(f)                                                          \
-  case f:                                                                      \
-    for (k = 0; k < nk; ++k)
-      switch (filter) {
-      // "none" filter turns into a memcpy here; make that explicit.
-      case STBI__F_none:
-        memcpy(cur, raw, nk);
-        break;
-        STBI__CASE(STBI__F_sub) {
-          cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
-        }
-        break;
-        STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); }
-        break;
-        STBI__CASE(STBI__F_avg) {
-          cur[k] = STBI__BYTECAST(raw[k] +
-                                  ((prior[k] + cur[k - filter_bytes]) >> 1));
-        }
-        break;
-        STBI__CASE(STBI__F_paeth) {
-          cur[k] = STBI__BYTECAST(raw[k] +
-                                  stbi__paeth(cur[k - filter_bytes], prior[k],
-                                              prior[k - filter_bytes]));
-        }
-        break;
-        STBI__CASE(STBI__F_avg_first) {
-          cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
-        }
-        break;
-        STBI__CASE(STBI__F_paeth_first) {
-          cur[k] =
-              STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0));
-        }
-        break;
-      }
-#undef STBI__CASE
-      raw += nk;
-    } else {
-      STBI_ASSERT(img_n + 1 == out_n);
-#define STBI__CASE(f)                                                          \
-  case f:                                                                      \
-    for (i = x - 1; i >= 1; --i, cur[filter_bytes] = 255, raw += filter_bytes, \
-        cur += output_bytes, prior += output_bytes)                            \
-      for (k = 0; k < filter_bytes; ++k)
-      switch (filter) {
-        STBI__CASE(STBI__F_none) { cur[k] = raw[k]; }
-        break;
-        STBI__CASE(STBI__F_sub) {
-          cur[k] = STBI__BYTECAST(raw[k] + cur[k - output_bytes]);
-        }
-        break;
-        STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); }
-        break;
-        STBI__CASE(STBI__F_avg) {
-          cur[k] = STBI__BYTECAST(raw[k] +
-                                  ((prior[k] + cur[k - output_bytes]) >> 1));
-        }
-        break;
-        STBI__CASE(STBI__F_paeth) {
-          cur[k] = STBI__BYTECAST(raw[k] +
-                                  stbi__paeth(cur[k - output_bytes], prior[k],
-                                              prior[k - output_bytes]));
-        }
-        break;
-        STBI__CASE(STBI__F_avg_first) {
-          cur[k] = STBI__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1));
-        }
-        break;
-        STBI__CASE(STBI__F_paeth_first) {
-          cur[k] =
-              STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], 0, 0));
-        }
-        break;
-      }
-#undef STBI__CASE
-
-      // the loop above sets the high byte of the pixels' alpha, but for
-      // 16 bit png files we also need the low byte set. we'll do that here.
-      if (depth == 16) {
-        cur = a->out + stride * j; // start at the beginning of the row again
-        for (i = 0; i < x; ++i, cur += output_bytes) {
-          cur[filter_bytes + 1] = 255;
-        }
-      }
-    }
-  }
-
-  // we make a separate pass to expand bits to pixels; for performance,
-  // this could run two scanlines behind the above code, so it won't
-  // interfere with filtering but will still be in the cache.
-  if (depth < 8) {
-    for (j = 0; j < y; ++j) {
-      stbi_uc *cur = a->out + stride * j;
-      stbi_uc *in = a->out + stride * j + x * out_n - img_width_bytes;
-      // unpack 1/2/4-bit into an 8-bit buffer. allows us to keep the common
-      // 8-bit path optimal at minimal cost for 1/2/4-bit pngs. to guarantee
-      // byte alignment, if width is not a multiple of 8/4/2 we'll decode
-      // dummy trailing data that will be skipped in the later loop
-      stbi_uc scale = (color == 0)
-                          ? stbi__depth_scale_table[depth]
-                          : 1; // scale grayscale values to 0..255 range
-
-      // note that the final byte might overshoot and write more data than
-      // desired. we can allocate enough data that this never writes out of
-      // memory, but it could also overwrite the next scanline. can it overwrite
-      // non-empty data on the next scanline? yes, consider 1-pixel-wide
-      // scanlines with 1-bit-per-pixel. so we need to explicitly clamp the
-      // final ones
-
-      if (depth == 4) {
-        for (k = x * img_n; k >= 2; k -= 2, ++in) {
-          *cur++ = scale * ((*in >> 4));
-          *cur++ = scale * ((*in) & 0x0f);
-        }
-        if (k > 0)
-          *cur++ = scale * ((*in >> 4));
-      } else if (depth == 2) {
-        for (k = x * img_n; k >= 4; k -= 4, ++in) {
-          *cur++ = scale * ((*in >> 6));
-          *cur++ = scale * ((*in >> 4) & 0x03);
-          *cur++ = scale * ((*in >> 2) & 0x03);
-          *cur++ = scale * ((*in) & 0x03);
-        }
-        if (k > 0)
-          *cur++ = scale * ((*in >> 6));
-        if (k > 1)
-          *cur++ = scale * ((*in >> 4) & 0x03);
-        if (k > 2)
-          *cur++ = scale * ((*in >> 2) & 0x03);
-      } else if (depth == 1) {
-        for (k = x * img_n; k >= 8; k -= 8, ++in) {
-          *cur++ = scale * ((*in >> 7));
-          *cur++ = scale * ((*in >> 6) & 0x01);
-          *cur++ = scale * ((*in >> 5) & 0x01);
-          *cur++ = scale * ((*in >> 4) & 0x01);
-          *cur++ = scale * ((*in >> 3) & 0x01);
-          *cur++ = scale * ((*in >> 2) & 0x01);
-          *cur++ = scale * ((*in >> 1) & 0x01);
-          *cur++ = scale * ((*in) & 0x01);
-        }
-        if (k > 0)
-          *cur++ = scale * ((*in >> 7));
-        if (k > 1)
-          *cur++ = scale * ((*in >> 6) & 0x01);
-        if (k > 2)
-          *cur++ = scale * ((*in >> 5) & 0x01);
-        if (k > 3)
-          *cur++ = scale * ((*in >> 4) & 0x01);
-        if (k > 4)
-          *cur++ = scale * ((*in >> 3) & 0x01);
-        if (k > 5)
-          *cur++ = scale * ((*in >> 2) & 0x01);
-        if (k > 6)
-          *cur++ = scale * ((*in >> 1) & 0x01);
-      }
-      if (img_n != out_n) {
-        int q;
-        // insert alpha = 255
-        cur = a->out + stride * j;
-        if (img_n == 1) {
-          for (q = x - 1; q >= 0; --q) {
-            cur[q * 2 + 1] = 255;
-            cur[q * 2 + 0] = cur[q];
-          }
-        } else {
-          STBI_ASSERT(img_n == 3);
-          for (q = x - 1; q >= 0; --q) {
-            cur[q * 4 + 3] = 255;
-            cur[q * 4 + 2] = cur[q * 3 + 2];
-            cur[q * 4 + 1] = cur[q * 3 + 1];
-            cur[q * 4 + 0] = cur[q * 3 + 0];
-          }
-        }
-      }
-    }
-  } else if (depth == 16) {
-    // force the image data from big-endian to platform-native.
-    // this is done in a separate pass due to the decoding relying
-    // on the data being untouched, but could probably be done
-    // per-line during decode if care is taken.
-    stbi_uc *cur = a->out;
-    stbi__uint16 *cur16 = (stbi__uint16 *)cur;
-
-    for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) {
-      *cur16 = (cur[0] << 8) | cur[1];
-    }
-  }
-
-  return 1;
-}
-
-static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data,
-                                  stbi__uint32 image_data_len, int out_n,
-                                  int depth, int color, int interlaced) {
-  int bytes = (depth == 16 ? 2 : 1);
-  int out_bytes = out_n * bytes;
-  stbi_uc *final;
-  int p;
-  if (!interlaced)
-    return stbi__create_png_image_raw(a, image_data, image_data_len, out_n,
-                                      a->s->img_x, a->s->img_y, depth, color);
-
-  // de-interlacing
-  final = (stbi_uc *)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
-  for (p = 0; p < 7; ++p) {
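-    // Adam7 interlacing: xorig/yorig give each pass's starting pixel and
-    // xspc/yspc its spacing within every 8x8 tile; pass 1 samples (0,0) of
-    // each tile and pass 7 fills in the odd rows.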
-    int xorig[] = {0, 4, 0, 2, 0, 1, 0};
-    int yorig[] = {0, 0, 4, 0, 2, 0, 1};
-    int xspc[] = {8, 8, 4, 4, 2, 2, 1};
-    int yspc[] = {8, 8, 8, 4, 4, 2, 2};
-    int i, j, x, y;
-    // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
-    x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p];
-    y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p];
-    if (x && y) {
-      stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
-      if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x,
-                                      y, depth, color)) {
-        STBI_FREE(final);
-        return 0;
-      }
-      for (j = 0; j < y; ++j) {
-        for (i = 0; i < x; ++i) {
-          int out_y = j * yspc[p] + yorig[p];
-          int out_x = i * xspc[p] + xorig[p];
-          memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes,
-                 a->out + (j * x + i) * out_bytes, out_bytes);
-        }
-      }
-      STBI_FREE(a->out);
-      image_data += img_len;
-      image_data_len -= img_len;
-    }
-  }
-  a->out = final;
-
-  return 1;
-}
-
-static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) {
-  stbi__context *s = z->s;
-  stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-  stbi_uc *p = z->out;
-
-  // compute color-based transparency, assuming we've
-  // already got 255 as the alpha value in the output
-  STBI_ASSERT(out_n == 2 || out_n == 4);
-
-  if (out_n == 2) {
-    for (i = 0; i < pixel_count; ++i) {
-      p[1] = (p[0] == tc[0] ? 0 : 255);
-      p += 2;
-    }
-  } else {
-    for (i = 0; i < pixel_count; ++i) {
-      if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-        p[3] = 0;
-      p += 4;
-    }
-  }
-  return 1;
-}
-
-static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3],
-                                        int out_n) {
-  stbi__context *s = z->s;
-  stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-  stbi__uint16 *p = (stbi__uint16 *)z->out;
-
-  // compute color-based transparency, assuming we've
-  // already got 65535 as the alpha value in the output
-  STBI_ASSERT(out_n == 2 || out_n == 4);
-
-  if (out_n == 2) {
-    for (i = 0; i < pixel_count; ++i) {
-      p[1] = (p[0] == tc[0] ? 0 : 65535);
-      p += 2;
-    }
-  } else {
-    for (i = 0; i < pixel_count; ++i) {
-      if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
-        p[3] = 0;
-      p += 4;
-    }
-  }
-  return 1;
-}
-
-static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len,
-                                    int pal_img_n) {
-  stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
-  stbi_uc *p, *temp_out, *orig = a->out;
-
-  p = (stbi_uc *)stbi__malloc_mad2(pixel_count, pal_img_n, 0);
-  if (p == NULL)
-    return stbi__err("outofmem", "Out of memory");
-
-  // between here and free(out) below, exiting would leak
-  temp_out = p;
-
-  if (pal_img_n == 3) {
-    for (i = 0; i < pixel_count; ++i) {
-      int n = orig[i] * 4;
-      p[0] = palette[n];
-      p[1] = palette[n + 1];
-      p[2] = palette[n + 2];
-      p += 3;
-    }
-  } else {
-    for (i = 0; i < pixel_count; ++i) {
-      int n = orig[i] * 4;
-      p[0] = palette[n];
-      p[1] = palette[n + 1];
-      p[2] = palette[n + 2];
-      p[3] = palette[n + 3];
-      p += 4;
-    }
-  }
-  STBI_FREE(a->out);
-  a->out = temp_out;
-
-  STBI_NOTUSED(len);
-
-  return 1;
-}
-
-static int stbi__unpremultiply_on_load = 0;
-static int stbi__de_iphone_flag = 0;
-
-STBIDEF void
-stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) {
-  stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
-}
-
-STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) {
-  stbi__de_iphone_flag = flag_true_if_should_convert;
-}
-
-static void stbi__de_iphone(stbi__png *z) {
-  stbi__context *s = z->s;
-  stbi__uint32 i, pixel_count = s->img_x * s->img_y;
-  stbi_uc *p = z->out;
-
-  if (s->img_out_n == 3) { // convert bgr to rgb
-    for (i = 0; i < pixel_count; ++i) {
-      stbi_uc t = p[0];
-      p[0] = p[2];
-      p[2] = t;
-      p += 3;
-    }
-  } else {
-    STBI_ASSERT(s->img_out_n == 4);
-    if (stbi__unpremultiply_on_load) {
-      // convert bgr to rgb and unpremultiply
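-      // un-premultiplied value = premultiplied * 255 / alpha, with + a/2 so
-      // the division rounds to nearest; e.g. 100 at alpha 200 becomes
-      // (100 * 255 + 100) / 200 = 128.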
-      for (i = 0; i < pixel_count; ++i) {
-        stbi_uc a = p[3];
-        stbi_uc t = p[0];
-        if (a) {
-          stbi_uc half = a / 2;
-          p[0] = (p[2] * 255 + half) / a;
-          p[1] = (p[1] * 255 + half) / a;
-          p[2] = (t * 255 + half) / a;
-        } else {
-          p[0] = p[2];
-          p[2] = t;
-        }
-        p += 4;
-      }
-    } else {
-      // convert bgr to rgb
-      for (i = 0; i < pixel_count; ++i) {
-        stbi_uc t = p[0];
-        p[0] = p[2];
-        p[2] = t;
-        p += 4;
-      }
-    }
-  }
-}
-
-#define STBI__PNG_TYPE(a, b, c, d)                                             \
-  (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) +      \
-   (unsigned)(d))
-
-static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
-  stbi_uc palette[1024], pal_img_n = 0;
-  stbi_uc has_trans = 0, tc[3] = {0};
-  stbi__uint16 tc16[3];
-  stbi__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0;
-  int first = 1, k, interlace = 0, color = 0, is_iphone = 0;
-  stbi__context *s = z->s;
-
-  z->expanded = NULL;
-  z->idata = NULL;
-  z->out = NULL;
-
-  if (!stbi__check_png_header(s))
-    return 0;
-
-  if (scan == STBI__SCAN_type)
-    return 1;
-
-  for (;;) {
-    stbi__pngchunk c = stbi__get_chunk_header(s);
-    switch (c.type) {
-    case STBI__PNG_TYPE('C', 'g', 'B', 'I'):
-      is_iphone = 1;
-      stbi__skip(s, c.length);
-      break;
-    case STBI__PNG_TYPE('I', 'H', 'D', 'R'): {
-      int comp, filter;
-      if (!first)
-        return stbi__err("multiple IHDR", "Corrupt PNG");
-      first = 0;
-      if (c.length != 13)
-        return stbi__err("bad IHDR len", "Corrupt PNG");
-      s->img_x = stbi__get32be(s);
-      if (s->img_x > (1 << 24))
-        return stbi__err("too large", "Very large image (corrupt?)");
-      s->img_y = stbi__get32be(s);
-      if (s->img_y > (1 << 24))
-        return stbi__err("too large", "Very large image (corrupt?)");
-      z->depth = stbi__get8(s);
-      if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 &&
-          z->depth != 16)
-        return stbi__err("1/2/4/8/16-bit only",
-                         "PNG not supported: 1/2/4/8/16-bit only");
-      color = stbi__get8(s);
-      if (color > 6)
-        return stbi__err("bad ctype", "Corrupt PNG");
-      if (color == 3 && z->depth == 16)
-        return stbi__err("bad ctype", "Corrupt PNG");
-      if (color == 3)
-        pal_img_n = 3;
-      else if (color & 1)
-        return stbi__err("bad ctype", "Corrupt PNG");
-      comp = stbi__get8(s);
-      if (comp)
-        return stbi__err("bad comp method", "Corrupt PNG");
-      filter = stbi__get8(s);
-      if (filter)
-        return stbi__err("bad filter method", "Corrupt PNG");
-      interlace = stbi__get8(s);
-      if (interlace > 1)
-        return stbi__err("bad interlace method", "Corrupt PNG");
-      if (!s->img_x || !s->img_y)
-        return stbi__err("0-pixel image", "Corrupt PNG");
-      if (!pal_img_n) {
-        s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
-        if ((1 << 30) / s->img_x / s->img_n < s->img_y)
-          return stbi__err("too large", "Image too large to decode");
-        if (scan == STBI__SCAN_header)
-          return 1;
-      } else {
-        // if paletted, then pal_n is our final components, and
-        // img_n is # components to decompress/filter.
-        s->img_n = 1;
-        if ((1 << 30) / s->img_x / 4 < s->img_y)
-          return stbi__err("too large", "Corrupt PNG");
-        // if SCAN_header, have to scan to see if we have a tRNS
-      }
-      break;
-    }
-
-    case STBI__PNG_TYPE('P', 'L', 'T', 'E'): {
-      if (first)
-        return stbi__err("first not IHDR", "Corrupt PNG");
-      if (c.length > 256 * 3)
-        return stbi__err("invalid PLTE", "Corrupt PNG");
-      pal_len = c.length / 3;
-      if (pal_len * 3 != c.length)
-        return stbi__err("invalid PLTE", "Corrupt PNG");
-      for (i = 0; i < pal_len; ++i) {
-        palette[i * 4 + 0] = stbi__get8(s);
-        palette[i * 4 + 1] = stbi__get8(s);
-        palette[i * 4 + 2] = stbi__get8(s);
-        palette[i * 4 + 3] = 255;
-      }
-      break;
-    }
-
-    case STBI__PNG_TYPE('t', 'R', 'N', 'S'): {
-      if (first)
-        return stbi__err("first not IHDR", "Corrupt PNG");
-      if (z->idata)
-        return stbi__err("tRNS after IDAT", "Corrupt PNG");
-      if (pal_img_n) {
-        if (scan == STBI__SCAN_header) {
-          s->img_n = 4;
-          return 1;
-        }
-        if (pal_len == 0)
-          return stbi__err("tRNS before PLTE", "Corrupt PNG");
-        if (c.length > pal_len)
-          return stbi__err("bad tRNS len", "Corrupt PNG");
-        pal_img_n = 4;
-        for (i = 0; i < c.length; ++i)
-          palette[i * 4 + 3] = stbi__get8(s);
-      } else {
-        if (!(s->img_n & 1))
-          return stbi__err("tRNS with alpha", "Corrupt PNG");
-        if (c.length != (stbi__uint32)s->img_n * 2)
-          return stbi__err("bad tRNS len", "Corrupt PNG");
-        has_trans = 1;
-        if (z->depth == 16) {
-          for (k = 0; k < s->img_n; ++k)
-            tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
-        } else {
-          for (k = 0; k < s->img_n; ++k)
-            tc[k] = (stbi_uc)(stbi__get16be(s) & 255) *
-                    stbi__depth_scale_table[z->depth]; // non 8-bit images will
-                                                       // be larger
-        }
-      }
-      break;
-    }
-
-    case STBI__PNG_TYPE('I', 'D', 'A', 'T'): {
-      if (first)
-        return stbi__err("first not IHDR", "Corrupt PNG");
-      if (pal_img_n && !pal_len)
-        return stbi__err("no PLTE", "Corrupt PNG");
-      if (scan == STBI__SCAN_header) {
-        s->img_n = pal_img_n;
-        return 1;
-      }
-      if ((int)(ioff + c.length) < (int)ioff)
-        return 0;
-      if (ioff + c.length > idata_limit) {
-        stbi__uint32 idata_limit_old __attribute__((unused)) = idata_limit;
-        stbi_uc *p;
-        if (idata_limit == 0)
-          idata_limit = c.length > 4096 ? c.length : 4096;
-        while (ioff + c.length > idata_limit)
-          idata_limit *= 2;
-        STBI_NOTUSED(idata_limit_old);
-        p = (stbi_uc *)STBI_REALLOC_SIZED(z->idata, idata_limit_old,
-                                          idata_limit);
-        if (p == NULL)
-          return stbi__err("outofmem", "Out of memory");
-        z->idata = p;
-      }
-      if (!stbi__getn(s, z->idata + ioff, c.length))
-        return stbi__err("outofdata", "Corrupt PNG");
-      ioff += c.length;
-      break;
-    }
-
-    case STBI__PNG_TYPE('I', 'E', 'N', 'D'): {
-      stbi__uint32 raw_len, bpl;
-      if (first)
-        return stbi__err("first not IHDR", "Corrupt PNG");
-      if (scan != STBI__SCAN_load)
-        return 1;
-      if (z->idata == NULL)
-        return stbi__err("no IDAT", "Corrupt PNG");
-      // initial guess for decoded data size to avoid unnecessary reallocs
-      bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
-      raw_len = bpl * s->img_y * s->img_n /* pixels */ +
-                s->img_y /* filter mode per row */;
-      z->expanded = (stbi_uc *)stbi_zlib_decode_malloc_guesssize_headerflag(
-          (char *)z->idata, ioff, raw_len, (int *)&raw_len, !is_iphone);
-      if (z->expanded == NULL)
-        return 0; // zlib should set error
-      STBI_FREE(z->idata);
-      z->idata = NULL;
-      if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) ||
-          has_trans)
-        s->img_out_n = s->img_n + 1;
-      else
-        s->img_out_n = s->img_n;
-      if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n,
-                                  z->depth, color, interlace))
-        return 0;
-      if (has_trans) {
-        if (z->depth == 16) {
-          if (!stbi__compute_transparency16(z, tc16, s->img_out_n))
-            return 0;
-        } else {
-          if (!stbi__compute_transparency(z, tc, s->img_out_n))
-            return 0;
-        }
-      }
-      if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
-        stbi__de_iphone(z);
-      if (pal_img_n) {
-        // pal_img_n == 3 or 4
-        s->img_n = pal_img_n; // record the actual colors we had
-        s->img_out_n = pal_img_n;
-        if (req_comp >= 3)
-          s->img_out_n = req_comp;
-        if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
-          return 0;
-      } else if (has_trans) {
-        // non-paletted image with tRNS -> source image has (constant) alpha
-        ++s->img_n;
-      }
-      STBI_FREE(z->expanded);
-      z->expanded = NULL;
-      return 1;
-    }
-
-    default:
-      // if critical, fail
-      if (first)
-        return stbi__err("first not IHDR", "Corrupt PNG");
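-      // bit 29 of the big-endian chunk type is bit 5 of its first byte: set
-      // (lowercase) marks an ancillary chunk that is safe to skip, clear
-      // (uppercase) marks a critical chunk, so an unknown one is an error.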
-      if ((c.type & (1 << 29)) == 0) {
-#ifndef STBI_NO_FAILURE_STRINGS
-        // not threadsafe
-        static char invalid_chunk[] = "XXXX PNG chunk not known";
-        invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
-        invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
-        invalid_chunk[2] = STBI__BYTECAST(c.type >> 8);
-        invalid_chunk[3] = STBI__BYTECAST(c.type >> 0);
-#endif
-        return stbi__err(invalid_chunk,
-                         "PNG not supported: unknown PNG chunk type");
-      }
-      stbi__skip(s, c.length);
-      break;
-    }
-    // end of PNG chunk, read and skip CRC
-    stbi__get32be(s);
-  }
-}
-
-static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp,
-                          stbi__result_info *ri) {
-  void *result = NULL;
-  if (req_comp < 0 || req_comp > 4)
-    return stbi__errpuc("bad req_comp", "Internal error");
-  if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-    if (p->depth < 8)
-      ri->bits_per_channel = 8;
-    else
-      ri->bits_per_channel = p->depth;
-    result = p->out;
-    p->out = NULL;
-    if (req_comp && req_comp != p->s->img_out_n) {
-      if (ri->bits_per_channel == 8)
-        result = stbi__convert_format((unsigned char *)result, p->s->img_out_n,
-                                      req_comp, p->s->img_x, p->s->img_y);
-      else
-        result = stbi__convert_format16((stbi__uint16 *)result, p->s->img_out_n,
-                                        req_comp, p->s->img_x, p->s->img_y);
-      p->s->img_out_n = req_comp;
-      if (result == NULL)
-        return result;
-    }
-    *x = p->s->img_x;
-    *y = p->s->img_y;
-    if (n)
-      *n = p->s->img_n;
-  }
-  STBI_FREE(p->out);
-  p->out = NULL;
-  STBI_FREE(p->expanded);
-  p->expanded = NULL;
-  STBI_FREE(p->idata);
-  p->idata = NULL;
-
-  return result;
-}
-
-static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri) {
-  stbi__png p;
-  p.s = s;
-  return stbi__do_png(&p, x, y, comp, req_comp, ri);
-}
-
-static int stbi__png_test(stbi__context *s) {
-  int r;
-  r = stbi__check_png_header(s);
-  stbi__rewind(s);
-  return r;
-}
-
-static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) {
-  if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
-    stbi__rewind(p->s);
-    return 0;
-  }
-  if (x)
-    *x = p->s->img_x;
-  if (y)
-    *y = p->s->img_y;
-  if (comp)
-    *comp = p->s->img_n;
-  return 1;
-}
-
-static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) {
-  stbi__png p;
-  p.s = s;
-  return stbi__png_info_raw(&p, x, y, comp);
-}
-
-static int stbi__png_is16(stbi__context *s) {
-  stbi__png p;
-  p.s = s;
-  if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
-    return 0;
-  if (p.depth != 16) {
-    stbi__rewind(p.s);
-    return 0;
-  }
-  return 1;
-}
-#endif
-
-// Microsoft/Windows BMP image
-
-#ifndef STBI_NO_BMP
-static int stbi__bmp_test_raw(stbi__context *s) {
-  int r;
-  int sz;
-  if (stbi__get8(s) != 'B')
-    return 0;
-  if (stbi__get8(s) != 'M')
-    return 0;
-  stbi__get32le(s); // discard filesize
-  stbi__get16le(s); // discard reserved
-  stbi__get16le(s); // discard reserved
-  stbi__get32le(s); // discard data offset
-  sz = stbi__get32le(s);
-  r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
-  return r;
-}
-
-static int stbi__bmp_test(stbi__context *s) {
-  int r = stbi__bmp_test_raw(s);
-  stbi__rewind(s);
-  return r;
-}
-
-// returns 0..31 for the highest set bit
-static int stbi__high_bit(unsigned int z) {
-  int n = 0;
-  if (z == 0)
-    return -1;
-  if (z >= 0x10000) {
-    n += 16;
-    z >>= 16;
-  }
-  if (z >= 0x00100) {
-    n += 8;
-    z >>= 8;
-  }
-  if (z >= 0x00010) {
-    n += 4;
-    z >>= 4;
-  }
-  if (z >= 0x00004) {
-    n += 2;
-    z >>= 2;
-  }
-  if (z >= 0x00002) {
-    n += 1; /* >>=  1;*/
-  }
-  return n;
-}
-
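-// SWAR population count: adds bits in parallel within progressively wider
-// fields; e.g. stbi__bitcount(0x7c00) == 5, the width of a 16-bit BMP's
-// 5-bit red mask.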
-static int stbi__bitcount(unsigned int a) {
-  a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
-  a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
-  a = (a + (a >> 4)) & 0x0f0f0f0f;                // max 8 per 4, now 8 bits
-  a = (a + (a >> 8));                             // max 16 per 8 bits
-  a = (a + (a >> 16));                            // max 32 per 8 bits
-  return a & 0xff;
-}
-
-// extract an arbitrarily-aligned N-bit value (N=bits)
-// from v, and then make it 8-bits long and fractionally
-// extend it to the full range.
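-// e.g. for the 5-bit red mask 0x7c00 of a 16-bit BMP, shift = 7 and
-// bits = 5, so the raw value 0b10000 expands to (0b10000 * 0x21) >> 2 =
-// 0b10000100, i.e. the 5-bit pattern replicated into 8 bits.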
-static int stbi__shiftsigned(unsigned int v, int shift, int bits) {
-  static unsigned int mul_table[9] = {
-      0,
-      0xff /*0b11111111*/,
-      0x55 /*0b01010101*/,
-      0x49 /*0b01001001*/,
-      0x11 /*0b00010001*/,
-      0x21 /*0b00100001*/,
-      0x41 /*0b01000001*/,
-      0x81 /*0b10000001*/,
-      0x01 /*0b00000001*/,
-  };
-  static unsigned int shift_table[9] = {
-      0, 0, 0, 1, 0, 2, 4, 6, 0,
-  };
-  if (shift < 0)
-    v <<= -shift;
-  else
-    v >>= shift;
-  STBI_ASSERT(v < 256);
-  v >>= (8 - bits);
-  STBI_ASSERT(bits >= 0 && bits <= 8);
-  return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits];
-}
-
-typedef struct {
-  int bpp, offset, hsz;
-  unsigned int mr, mg, mb, ma, all_a;
-} stbi__bmp_data;
-
-static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) {
-  int hsz;
-  if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M')
-    return stbi__errpuc("not BMP", "Corrupt BMP");
-  stbi__get32le(s); // discard filesize
-  stbi__get16le(s); // discard reserved
-  stbi__get16le(s); // discard reserved
-  info->offset = stbi__get32le(s);
-  info->hsz = hsz = stbi__get32le(s);
-  info->mr = info->mg = info->mb = info->ma = 0;
-
-  if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124)
-    return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
-  if (hsz == 12) {
-    s->img_x = stbi__get16le(s);
-    s->img_y = stbi__get16le(s);
-  } else {
-    s->img_x = stbi__get32le(s);
-    s->img_y = stbi__get32le(s);
-  }
-  if (stbi__get16le(s) != 1)
-    return stbi__errpuc("bad BMP", "bad BMP");
-  info->bpp = stbi__get16le(s);
-  if (hsz != 12) {
-    int compress = stbi__get32le(s);
-    if (compress == 1 || compress == 2)
-      return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
-    stbi__get32le(s); // discard sizeof
-    stbi__get32le(s); // discard hres
-    stbi__get32le(s); // discard vres
-    stbi__get32le(s); // discard colorsused
-    stbi__get32le(s); // discard max important
-    if (hsz == 40 || hsz == 56) {
-      if (hsz == 56) {
-        stbi__get32le(s);
-        stbi__get32le(s);
-        stbi__get32le(s);
-        stbi__get32le(s);
-      }
-      if (info->bpp == 16 || info->bpp == 32) {
-        if (compress == 0) {
-          if (info->bpp == 32) {
-            info->mr = 0xffu << 16;
-            info->mg = 0xffu << 8;
-            info->mb = 0xffu << 0;
-            info->ma = 0xffu << 24;
-            info->all_a = 0; // if all_a is 0 at end, then we loaded alpha
-                             // channel but it was all 0
-          } else {
-            info->mr = 31u << 10;
-            info->mg = 31u << 5;
-            info->mb = 31u << 0;
-          }
-        } else if (compress == 3) {
-          info->mr = stbi__get32le(s);
-          info->mg = stbi__get32le(s);
-          info->mb = stbi__get32le(s);
-          // not documented, but generated by photoshop and handled by mspaint
-          if (info->mr == info->mg && info->mg == info->mb) {
-            // ?!?!?
-            return stbi__errpuc("bad BMP", "bad BMP");
-          }
-        } else
-          return stbi__errpuc("bad BMP", "bad BMP");
-      }
-    } else {
-      int i;
-      if (hsz != 108 && hsz != 124)
-        return stbi__errpuc("bad BMP", "bad BMP");
-      info->mr = stbi__get32le(s);
-      info->mg = stbi__get32le(s);
-      info->mb = stbi__get32le(s);
-      info->ma = stbi__get32le(s);
-      stbi__get32le(s); // discard color space
-      for (i = 0; i < 12; ++i)
-        stbi__get32le(s); // discard color space parameters
-      if (hsz == 124) {
-        stbi__get32le(s); // discard rendering intent
-        stbi__get32le(s); // discard offset of profile data
-        stbi__get32le(s); // discard size of profile data
-        stbi__get32le(s); // discard reserved
-      }
-    }
-  }
-  return (void *)1;
-}
-
-static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri) {
-  stbi_uc *out;
-  unsigned int mr = 0, mg = 0, mb = 0, ma = 0, all_a;
-  stbi_uc pal[256][4];
-  int psize = 0, i, j, width;
-  int flip_vertically, pad, target;
-  stbi__bmp_data info;
-  STBI_NOTUSED(ri);
-
-  info.all_a = 255;
-  if (stbi__bmp_parse_header(s, &info) == NULL)
-    return NULL; // error code already set
-
-  flip_vertically = ((int)s->img_y) > 0;
-  s->img_y = abs((int)s->img_y);
-
-  mr = info.mr;
-  mg = info.mg;
-  mb = info.mb;
-  ma = info.ma;
-  all_a = info.all_a;
-
-  if (info.hsz == 12) {
-    if (info.bpp < 24)
-      psize = (info.offset - 14 - 24) / 3;
-  } else {
-    if (info.bpp < 16)
-      psize = (info.offset - 14 - info.hsz) >> 2;
-  }
-
-  if (info.bpp == 24 && ma == 0xff000000)
-    s->img_n = 3;
-  else
-    s->img_n = ma ? 4 : 3;
-  if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
-    target = req_comp;
-  else
-    target = s->img_n; // if they want monochrome, we'll post-convert
-
-  // sanity-check size
-  if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
-    return stbi__errpuc("too large", "Corrupt BMP");
-
-  out = (stbi_uc *)stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
-  if (!out)
-    return stbi__errpuc("outofmem", "Out of memory");
-  if (info.bpp < 16) {
-    int z = 0;
-    if (psize == 0 || psize > 256) {
-      STBI_FREE(out);
-      return stbi__errpuc("invalid", "Corrupt BMP");
-    }
-    for (i = 0; i < psize; ++i) {
-      pal[i][2] = stbi__get8(s);
-      pal[i][1] = stbi__get8(s);
-      pal[i][0] = stbi__get8(s);
-      if (info.hsz != 12)
-        stbi__get8(s);
-      pal[i][3] = 255;
-    }
-    stbi__skip(s,
-               info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
-    if (info.bpp == 1)
-      width = (s->img_x + 7) >> 3;
-    else if (info.bpp == 4)
-      width = (s->img_x + 1) >> 1;
-    else if (info.bpp == 8)
-      width = s->img_x;
-    else {
-      STBI_FREE(out);
-      return stbi__errpuc("bad bpp", "Corrupt BMP");
-    }
-    pad = (-width) & 3;
-    if (info.bpp == 1) {
-      for (j = 0; j < (int)s->img_y; ++j) {
-        int bit_offset = 7, v = stbi__get8(s);
-        for (i = 0; i < (int)s->img_x; ++i) {
-          int color = (v >> bit_offset) & 0x1;
-          out[z++] = pal[color][0];
-          out[z++] = pal[color][1];
-          out[z++] = pal[color][2];
-          if (target == 4)
-            out[z++] = 255;
-          if (i + 1 == (int)s->img_x)
-            break;
-          if ((--bit_offset) < 0) {
-            bit_offset = 7;
-            v = stbi__get8(s);
-          }
-        }
-        stbi__skip(s, pad);
-      }
-    } else {
-      for (j = 0; j < (int)s->img_y; ++j) {
-        for (i = 0; i < (int)s->img_x; i += 2) {
-          int v = stbi__get8(s), v2 = 0;
-          if (info.bpp == 4) {
-            v2 = v & 15;
-            v >>= 4;
-          }
-          out[z++] = pal[v][0];
-          out[z++] = pal[v][1];
-          out[z++] = pal[v][2];
-          if (target == 4)
-            out[z++] = 255;
-          if (i + 1 == (int)s->img_x)
-            break;
-          v = (info.bpp == 8) ? stbi__get8(s) : v2;
-          out[z++] = pal[v][0];
-          out[z++] = pal[v][1];
-          out[z++] = pal[v][2];
-          if (target == 4)
-            out[z++] = 255;
-        }
-        stbi__skip(s, pad);
-      }
-    }
-  } else {
-    int rshift = 0, gshift = 0, bshift = 0, ashift = 0, rcount = 0, gcount = 0,
-        bcount = 0, acount = 0;
-    int z = 0;
-    int easy = 0;
-    stbi__skip(s, info.offset - 14 - info.hsz);
-    if (info.bpp == 24)
-      width = 3 * s->img_x;
-    else if (info.bpp == 16)
-      width = 2 * s->img_x;
-    else /* bpp = 32 and pad = 0 */
-      width = 0;
-    pad = (-width) & 3;
-    if (info.bpp == 24) {
-      easy = 1;
-    } else if (info.bpp == 32) {
-      if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
-        easy = 2;
-    }
-    if (!easy) {
-      if (!mr || !mg || !mb) {
-        STBI_FREE(out);
-        return stbi__errpuc("bad masks", "Corrupt BMP");
-      }
-      // right shift amt to put high bit in position #7
-      rshift = stbi__high_bit(mr) - 7;
-      rcount = stbi__bitcount(mr);
-      gshift = stbi__high_bit(mg) - 7;
-      gcount = stbi__bitcount(mg);
-      bshift = stbi__high_bit(mb) - 7;
-      bcount = stbi__bitcount(mb);
-      ashift = stbi__high_bit(ma) - 7;
-      acount = stbi__bitcount(ma);
-    }
-    for (j = 0; j < (int)s->img_y; ++j) {
-      if (easy) {
-        for (i = 0; i < (int)s->img_x; ++i) {
-          unsigned char a;
-          out[z + 2] = stbi__get8(s);
-          out[z + 1] = stbi__get8(s);
-          out[z + 0] = stbi__get8(s);
-          z += 3;
-          a = (easy == 2 ? stbi__get8(s) : 255);
-          all_a |= a;
-          if (target == 4)
-            out[z++] = a;
-        }
-      } else {
-        int bpp = info.bpp;
-        for (i = 0; i < (int)s->img_x; ++i) {
-          stbi__uint32 v =
-              (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s));
-          unsigned int a;
-          out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
-          out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
-          out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
-          a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
-          all_a |= a;
-          if (target == 4)
-            out[z++] = STBI__BYTECAST(a);
-        }
-      }
-      stbi__skip(s, pad);
-    }
-  }
-
-  // if alpha channel is all 0s, replace with all 255s
-  if (target == 4 && all_a == 0)
-    for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4)
-      out[i] = 255;
-
-  if (flip_vertically) {
-    stbi_uc t;
-    for (j = 0; j < ((int)s->img_y >> 1); ++j) {
-      stbi_uc *p1 = out + j * s->img_x * target;
-      stbi_uc *p2 = out + (s->img_y - 1 - j) * s->img_x * target;
-      for (i = 0; i < (int)s->img_x * target; ++i) {
-        t = p1[i];
-        p1[i] = p2[i];
-        p2[i] = t;
-      }
-    }
-  }
-
-  if (req_comp && req_comp != target) {
-    out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
-    if (out == NULL)
-      return out; // stbi__convert_format frees input on failure
-  }
-
-  *x = s->img_x;
-  *y = s->img_y;
-  if (comp)
-    *comp = s->img_n;
-  return out;
-}
-#endif
-
-// Targa Truevision - TGA
-// by Jonathan Dummer
-#ifndef STBI_NO_TGA
-// returns STBI_rgb or whatever, 0 on error
-static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int *is_rgb16) {
-  // only RGB or RGBA (incl. 16bit) or grey allowed
-  if (is_rgb16)
-    *is_rgb16 = 0;
-  switch (bits_per_pixel) {
-  case 8:
-    return STBI_grey;
-  case 16:
-    if (is_grey)
-      return STBI_grey_alpha;
-    // fallthrough
-  case 15:
-    if (is_rgb16)
-      *is_rgb16 = 1;
-    return STBI_rgb;
-  case 24: // fallthrough
-  case 32:
-    return bits_per_pixel / 8;
-  default:
-    return 0;
-  }
-}
-
-static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) {
-  int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel,
-      tga_colormap_bpp;
-  int sz, tga_colormap_type;
-  stbi__get8(s);                     // discard Offset
-  tga_colormap_type = stbi__get8(s); // colormap type
-  if (tga_colormap_type > 1) {
-    stbi__rewind(s);
-    return 0; // only RGB or indexed allowed
-  }
-  tga_image_type = stbi__get8(s); // image type
-  if (tga_colormap_type == 1) {   // colormapped (paletted) image
-    if (tga_image_type != 1 && tga_image_type != 9) {
-      stbi__rewind(s);
-      return 0;
-    }
-    stbi__skip(s,
-               4); // skip index of first colormap entry and number of entries
-    sz = stbi__get8(s); //   check bits per palette color entry
-    if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) {
-      stbi__rewind(s);
-      return 0;
-    }
-    stbi__skip(s, 4); // skip image x and y origin
-    tga_colormap_bpp = sz;
-  } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
-    if ((tga_image_type != 2) && (tga_image_type != 3) &&
-        (tga_image_type != 10) && (tga_image_type != 11)) {
-      stbi__rewind(s);
-      return 0; // only RGB or grey allowed, +/- RLE
-    }
-    stbi__skip(s, 9); // skip colormap specification and image x/y origin
-    tga_colormap_bpp = 0;
-  }
-  tga_w = stbi__get16le(s);
-  if (tga_w < 1) {
-    stbi__rewind(s);
-    return 0; // test width
-  }
-  tga_h = stbi__get16le(s);
-  if (tga_h < 1) {
-    stbi__rewind(s);
-    return 0; // test height
-  }
-  tga_bits_per_pixel = stbi__get8(s); // bits per pixel
-  stbi__get8(s);                      // ignore alpha bits
-  if (tga_colormap_bpp != 0) {
-    if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
-      // when using a colormap, tga_bits_per_pixel is the size of the indexes
-      // I don't think anything but 8 or 16bit indexes makes sense
-      stbi__rewind(s);
-      return 0;
-    }
-    tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
-  } else {
-    tga_comp = stbi__tga_get_comp(
-        tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11),
-        NULL);
-  }
-  if (!tga_comp) {
-    stbi__rewind(s);
-    return 0;
-  }
-  if (x)
-    *x = tga_w;
-  if (y)
-    *y = tga_h;
-  if (comp)
-    *comp = tga_comp;
-  return 1; // seems to have passed everything
-}
-
-static int stbi__tga_test(stbi__context *s) {
-  int res = 0;
-  int sz, tga_color_type;
-  stbi__get8(s);                  //   discard Offset
-  tga_color_type = stbi__get8(s); //   color type
-  if (tga_color_type > 1)
-    goto errorEnd;           //   only RGB or indexed allowed
-  sz = stbi__get8(s);        //   image type
-  if (tga_color_type == 1) { // colormapped (paletted) image
-    if (sz != 1 && sz != 9)
-      goto errorEnd; // colortype 1 demands image type 1 or 9
-    stbi__skip(s,
-               4); // skip index of first colormap entry and number of entries
-    sz = stbi__get8(s); //   check bits per palette color entry
-    if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
-      goto errorEnd;
-    stbi__skip(s, 4); // skip image x and y origin
-  } else {            // "normal" image w/o colormap
-    if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11))
-      goto errorEnd;  // only RGB or grey allowed, +/- RLE
-    stbi__skip(s, 9); // skip colormap specification and image x/y origin
-  }
-  if (stbi__get16le(s) < 1)
-    goto errorEnd; //   test width
-  if (stbi__get16le(s) < 1)
-    goto errorEnd;    //   test height
-  sz = stbi__get8(s); //   bits per pixel
-  if ((tga_color_type == 1) && (sz != 8) && (sz != 16))
-    goto errorEnd; // for colormapped images, bpp is size of an index
-  if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32))
-    goto errorEnd;
-
-  res = 1; // if we got this far, everything's good and we can return 1 instead
-           // of 0
-
-errorEnd:
-  stbi__rewind(s);
-  return res;
-}
-
-// read 16bit value and convert to 24bit RGB
-static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc *out) {
-  stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
-  stbi__uint16 fiveBitMask = 31;
-  // we have 3 channels with 5bits each
-  int r = (px >> 10) & fiveBitMask;
-  int g = (px >> 5) & fiveBitMask;
-  int b = px & fiveBitMask;
-  // Note that this saves the data in RGB(A) order, so it doesn't need to be
-  // swapped later
-  out[0] = (stbi_uc)((r * 255) / 31);
-  out[1] = (stbi_uc)((g * 255) / 31);
-  out[2] = (stbi_uc)((b * 255) / 31);
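-  // (v * 255) / 31 maps 0..31 onto 0..255 with exact endpoints (0 -> 0,
-  // 31 -> 255); intermediate values round down, e.g. 16 -> 131.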
-
-  // some people claim that the most significant bit might be used for alpha
-  // (possibly if an alpha-bit is set in the "image descriptor byte")
-  // but that only made 16bit test images completely translucent...
-  // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
-}
-
-static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri) {
-  //   read in the TGA header stuff
-  int tga_offset = stbi__get8(s);
-  int tga_indexed = stbi__get8(s);
-  int tga_image_type = stbi__get8(s);
-  int tga_is_RLE = 0;
-  int tga_palette_start = stbi__get16le(s);
-  int tga_palette_len = stbi__get16le(s);
-  int tga_palette_bits = stbi__get8(s);
-  int tga_x_origin = stbi__get16le(s);
-  int tga_y_origin = stbi__get16le(s);
-  int tga_width = stbi__get16le(s);
-  int tga_height = stbi__get16le(s);
-  int tga_bits_per_pixel = stbi__get8(s);
-  int tga_comp, tga_rgb16 = 0;
-  int tga_inverted = stbi__get8(s);
-  // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused
-  // (useless?)
-  //   image data
-  unsigned char *tga_data;
-  unsigned char *tga_palette = NULL;
-  int i, j;
-  unsigned char raw_data[4] = {0};
-  int RLE_count = 0;
-  int RLE_repeating = 0;
-  int read_next_pixel = 1;
-  STBI_NOTUSED(ri);
-  STBI_NOTUSED(tga_x_origin); // @TODO
-  STBI_NOTUSED(tga_y_origin); // @TODO
-
-  //   do a tiny bit of preprocessing
-  if (tga_image_type >= 8) {
-    tga_image_type -= 8;
-    tga_is_RLE = 1;
-  }
-  tga_inverted = 1 - ((tga_inverted >> 5) & 1);
-
-  //   If I'm paletted, then I'll use the number of bits from the palette
-  if (tga_indexed)
-    tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
-  else
-    tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3),
-                                  &tga_rgb16);
-
-  if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have
-                 // ensured basic consistency
-    return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
-
-  //   tga info
-  *x = tga_width;
-  *y = tga_height;
-  if (comp)
-    *comp = tga_comp;
-
-  if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
-    return stbi__errpuc("too large", "Corrupt TGA");
-
-  tga_data =
-      (unsigned char *)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
-  if (!tga_data)
-    return stbi__errpuc("outofmem", "Out of memory");
-
-  // skip to the data's starting position (offset usually = 0)
-  stbi__skip(s, tga_offset);
-
-  if (!tga_indexed && !tga_is_RLE && !tga_rgb16) {
-    for (i = 0; i < tga_height; ++i) {
-      int row = tga_inverted ? tga_height - i - 1 : i;
-      stbi_uc *tga_row = tga_data + row * tga_width * tga_comp;
-      stbi__getn(s, tga_row, tga_width * tga_comp);
-    }
-  } else {
-    //   do I need to load a palette?
-    if (tga_indexed) {
-      //   any data to skip? (offset usually = 0)
-      stbi__skip(s, tga_palette_start);
-      //   load the palette
-      tga_palette =
-          (unsigned char *)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
-      if (!tga_palette) {
-        STBI_FREE(tga_data);
-        return stbi__errpuc("outofmem", "Out of memory");
-      }
-      if (tga_rgb16) {
-        stbi_uc *pal_entry = tga_palette;
-        STBI_ASSERT(tga_comp == STBI_rgb);
-        for (i = 0; i < tga_palette_len; ++i) {
-          stbi__tga_read_rgb16(s, pal_entry);
-          pal_entry += tga_comp;
-        }
-      } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
-        STBI_FREE(tga_data);
-        STBI_FREE(tga_palette);
-        return stbi__errpuc("bad palette", "Corrupt TGA");
-      }
-    }
-    //   load the data
-    for (i = 0; i < tga_width * tga_height; ++i) {
-      //   if I'm in RLE mode, do I need to get a RLE packet?
-      if (tga_is_RLE) {
-        if (RLE_count == 0) {
-          //   yep, get the next byte as a RLE command
-          int RLE_cmd = stbi__get8(s);
-          RLE_count = 1 + (RLE_cmd & 127);
-          RLE_repeating = RLE_cmd >> 7;
-          read_next_pixel = 1;
-        } else if (!RLE_repeating) {
-          read_next_pixel = 1;
-        }
-      } else {
-        read_next_pixel = 1;
-      }
-      //   OK, if I need to read a pixel, do it now
-      if (read_next_pixel) {
-        //   load however much data we did have
-        if (tga_indexed) {
-          // read in index, then perform the lookup
-          int pal_idx =
-              (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
-          if (pal_idx >= tga_palette_len) {
-            // invalid index
-            pal_idx = 0;
-          }
-          pal_idx *= tga_comp;
-          for (j = 0; j < tga_comp; ++j) {
-            raw_data[j] = tga_palette[pal_idx + j];
-          }
-        } else if (tga_rgb16) {
-          STBI_ASSERT(tga_comp == STBI_rgb);
-          stbi__tga_read_rgb16(s, raw_data);
-        } else {
-          //   read in the data raw
-          for (j = 0; j < tga_comp; ++j) {
-            raw_data[j] = stbi__get8(s);
-          }
-        }
-        //   clear the reading flag for the next pixel
-        read_next_pixel = 0;
-      } // end of reading a pixel
-
-      // copy data
-      for (j = 0; j < tga_comp; ++j)
-        tga_data[i * tga_comp + j] = raw_data[j];
-
-      //   in case we're in RLE mode, keep counting down
-      --RLE_count;
-    }
-    //   do I need to invert the image?
-    if (tga_inverted) {
-      for (j = 0; j * 2 < tga_height; ++j) {
-        int index1 = j * tga_width * tga_comp;
-        int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
-        for (i = tga_width * tga_comp; i > 0; --i) {
-          unsigned char temp = tga_data[index1];
-          tga_data[index1] = tga_data[index2];
-          tga_data[index2] = temp;
-          ++index1;
-          ++index2;
-        }
-      }
-    }
-    //   clear my palette, if I had one
-    if (tga_palette != NULL) {
-      STBI_FREE(tga_palette);
-    }
-  }
-
-  // swap RGB - if the source data was RGB16, it already is in the right order
-  if (tga_comp >= 3 && !tga_rgb16) {
-    unsigned char *tga_pixel = tga_data;
-    for (i = 0; i < tga_width * tga_height; ++i) {
-      unsigned char temp = tga_pixel[0];
-      tga_pixel[0] = tga_pixel[2];
-      tga_pixel[2] = temp;
-      tga_pixel += tga_comp;
-    }
-  }
-
-  // convert to target component count
-  if (req_comp && req_comp != tga_comp)
-    tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width,
-                                    tga_height);
-
-  //   the things I do to get rid of an error message, and yet keep
-  //   Microsoft's C compilers happy... [8^(
-  tga_palette_start = tga_palette_len = tga_palette_bits = tga_x_origin =
-      tga_y_origin = 0;
-  STBI_NOTUSED(tga_palette_start);
-  //   OK, done
-  return tga_data;
-}
-#endif
-
-// *************************************************************************************************
-// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz,
-// tweaked by STB
-
-#ifndef STBI_NO_PSD
-static int stbi__psd_test(stbi__context *s) {
-  int r = (stbi__get32be(s) == 0x38425053);
-  stbi__rewind(s);
-  return r;
-}
-
-static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) {
-  int count, nleft, len;
-
-  count = 0;
-  while ((nleft = pixelCount - count) > 0) {
-    len = stbi__get8(s);
-    if (len == 128) {
-      // No-op.
-    } else if (len < 128) {
-      // Copy next len+1 bytes literally.
-      len++;
-      if (len > nleft)
-        return 0; // corrupt data
-      count += len;
-      while (len) {
-        *p = stbi__get8(s);
-        p += 4;
-        len--;
-      }
-    } else if (len > 128) {
-      stbi_uc val;
-      // Next -len+1 bytes in the dest are replicated from next source byte.
-      // (Interpret len as a negative 8-bit int.)
-      len = 257 - len;
-      if (len > nleft)
-        return 0; // corrupt data
-      val = stbi__get8(s);
-      count += len;
-      while (len) {
-        *p = val;
-        p += 4;
-        len--;
-      }
-    }
-  }
-
-  return 1;
-}
-
-static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri, int bpc) {
-  int pixelCount;
-  int channelCount, compression;
-  int channel, i;
-  int bitdepth;
-  int w, h;
-  stbi_uc *out;
-  STBI_NOTUSED(ri);
-
-  // Check identifier
-  if (stbi__get32be(s) != 0x38425053) // "8BPS"
-    return stbi__errpuc("not PSD", "Corrupt PSD image");
-
-  // Check file type version.
-  if (stbi__get16be(s) != 1)
-    return stbi__errpuc("wrong version", "Unsupported version of PSD image");
-
-  // Skip 6 reserved bytes.
-  stbi__skip(s, 6);
-
-  // Read the number of channels (R, G, B, A, etc).
-  channelCount = stbi__get16be(s);
-  if (channelCount < 0 || channelCount > 16)
-    return stbi__errpuc("wrong channel count",
-                        "Unsupported number of channels in PSD image");
-
-  // Read the rows and columns of the image.
-  h = stbi__get32be(s);
-  w = stbi__get32be(s);
-
-  // Make sure the depth is 8 bits.
-  bitdepth = stbi__get16be(s);
-  if (bitdepth != 8 && bitdepth != 16)
-    return stbi__errpuc("unsupported bit depth",
-                        "PSD bit depth is not 8 or 16 bit");
-
-  // Make sure the color mode is RGB.
-  // Valid options are:
-  //   0: Bitmap
-  //   1: Grayscale
-  //   2: Indexed color
-  //   3: RGB color
-  //   4: CMYK color
-  //   7: Multichannel
-  //   8: Duotone
-  //   9: Lab color
-  if (stbi__get16be(s) != 3)
-    return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
-
-  // Skip the Mode Data.  (It's the palette for indexed color; other info for
-  // other modes.)
-  stbi__skip(s, stbi__get32be(s));
-
-  // Skip the image resources.  (resolution, pen tool paths, etc)
-  stbi__skip(s, stbi__get32be(s));
-
-  // Skip the reserved data.
-  stbi__skip(s, stbi__get32be(s));
-
-  // Find out if the data is compressed.
-  // Known values:
-  //   0: no compression
-  //   1: RLE compressed
-  compression = stbi__get16be(s);
-  if (compression > 1)
-    return stbi__errpuc("bad compression",
-                        "PSD has an unknown compression format");
-
-  // Check size
-  if (!stbi__mad3sizes_valid(4, w, h, 0))
-    return stbi__errpuc("too large", "Corrupt PSD");
-
-  // Create the destination image.
-
-  if (!compression && bitdepth == 16 && bpc == 16) {
-    out = (stbi_uc *)stbi__malloc_mad3(8, w, h, 0);
-    ri->bits_per_channel = 16;
-  } else
-    out = (stbi_uc *)stbi__malloc(4 * w * h);
-
-  if (!out)
-    return stbi__errpuc("outofmem", "Out of memory");
-  pixelCount = w * h;
-
-  // Initialize the data to zero.
-  // memset( out, 0, pixelCount * 4 );
-
-  // Finally, the image data.
-  if (compression) {
-    // RLE as used by .PSD and .TIFF
-    // Loop until you get the number of unpacked bytes you are expecting:
-    //     Read the next source byte into n.
-    //     If n is between 0 and 127 inclusive, copy the next n+1 bytes
-    //       literally.
-    //     Else if n is between -127 and -1 inclusive, copy the next byte
-    //       -n+1 times.
-    //     Else if n is 128, noop.
-    // Endloop
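-    // e.g. the byte sequence 02 41 42 43 FE 44 unpacks to "ABC" followed by
-    // three copies of 'D' (0xFE = 254, so 257 - 254 = 3 repeats);
-    // stbi__psd_decode_rle writes with a stride of 4 so each channel fills
-    // its own interleaved slot.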
-
-    // The RLE-compressed data is preceded by a 2-byte data count for each row
-    // in the data, which we're going to just skip.
-    stbi__skip(s, h * channelCount * 2);
-
-    // Read the RLE data by channel.
-    for (channel = 0; channel < 4; channel++) {
-      stbi_uc *p;
-
-      p = out + channel;
-      if (channel >= channelCount) {
-        // Fill this channel with default data.
-        for (i = 0; i < pixelCount; i++, p += 4)
-          *p = (channel == 3 ? 255 : 0);
-      } else {
-        // Read the RLE data.
-        if (!stbi__psd_decode_rle(s, p, pixelCount)) {
-          STBI_FREE(out);
-          return stbi__errpuc("corrupt", "bad RLE data");
-        }
-      }
-    }
-
-  } else {
-    // We're at the raw image data.  It's each channel in order (Red, Green,
-    // Blue, Alpha, ...) where each channel consists of an 8-bit (or 16-bit)
-    // value for each pixel in the image.
-
-    // Read the data by channel.
-    for (channel = 0; channel < 4; channel++) {
-      if (channel >= channelCount) {
-        // Fill this channel with default data.
-        if (bitdepth == 16 && bpc == 16) {
-          stbi__uint16 *q = ((stbi__uint16 *)out) + channel;
-          stbi__uint16 val = channel == 3 ? 65535 : 0;
-          for (i = 0; i < pixelCount; i++, q += 4)
-            *q = val;
-        } else {
-          stbi_uc *p = out + channel;
-          stbi_uc val = channel == 3 ? 255 : 0;
-          for (i = 0; i < pixelCount; i++, p += 4)
-            *p = val;
-        }
-      } else {
-        if (ri->bits_per_channel == 16) { // output bpc
-          stbi__uint16 *q = ((stbi__uint16 *)out) + channel;
-          for (i = 0; i < pixelCount; i++, q += 4)
-            *q = (stbi__uint16)stbi__get16be(s);
-        } else {
-          stbi_uc *p = out + channel;
-          if (bitdepth == 16) { // input bpc
-            for (i = 0; i < pixelCount; i++, p += 4)
-              *p = (stbi_uc)(stbi__get16be(s) >> 8);
-          } else {
-            for (i = 0; i < pixelCount; i++, p += 4)
-              *p = stbi__get8(s);
-          }
-        }
-      }
-    }
-  }
-
-  // remove weird white matte from PSD
-  if (channelCount >= 4) {
-    if (ri->bits_per_channel == 16) {
-      for (i = 0; i < w * h; ++i) {
-        stbi__uint16 *pixel = (stbi__uint16 *)out + 4 * i;
-        if (pixel[3] != 0 && pixel[3] != 65535) {
-          float a = pixel[3] / 65535.0f;
-          float ra = 1.0f / a;
-          float inv_a = 65535.0f * (1 - ra);
-          pixel[0] = (stbi__uint16)(pixel[0] * ra + inv_a);
-          pixel[1] = (stbi__uint16)(pixel[1] * ra + inv_a);
-          pixel[2] = (stbi__uint16)(pixel[2] * ra + inv_a);
-        }
-      }
-    } else {
-      for (i = 0; i < w * h; ++i) {
-        unsigned char *pixel = out + 4 * i;
-        if (pixel[3] != 0 && pixel[3] != 255) {
-          float a = pixel[3] / 255.0f;
-          float ra = 1.0f / a;
-          float inv_a = 255.0f * (1 - ra);
-          pixel[0] = (unsigned char)(pixel[0] * ra + inv_a);
-          pixel[1] = (unsigned char)(pixel[1] * ra + inv_a);
-          pixel[2] = (unsigned char)(pixel[2] * ra + inv_a);
-        }
-      }
-    }
-  }
-
-  // convert to desired output format
-  if (req_comp && req_comp != 4) {
-    if (ri->bits_per_channel == 16)
-      out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, 4, req_comp,
-                                              w, h);
-    else
-      out = stbi__convert_format(out, 4, req_comp, w, h);
-    if (out == NULL)
-      return out; // stbi__convert_format frees input on failure
-  }
-
-  if (comp)
-    *comp = 4;
-  *y = h;
-  *x = w;
-
-  return out;
-}
-#endif
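The comment inside `stbi__psd_load` above describes the PackBits-style RLE shared by PSD and TIFF. Purely as a reference point, a minimal stand-alone decoder for that scheme might look like the sketch below; `packbits_decode` is a hypothetical helper (not part of stb_image), and unlike the loader above it writes bytes contiguously instead of interleaving one channel every 4 bytes.

```c
#include <stddef.h>

/* Minimal PackBits-style RLE decoder, mirroring the scheme described in the
   comment inside stbi__psd_load: a control byte n is followed either by n+1
   literal bytes (0 <= n <= 127) or by one byte repeated 257-n times
   (129 <= n <= 255, i.e. -127..-1 as a signed byte); n == 128 is a no-op.
   Returns the number of bytes written, or 0 on malformed/overflowing input. */
static size_t packbits_decode(const unsigned char *src, size_t src_len,
                              unsigned char *dst, size_t dst_len) {
  size_t si = 0, di = 0;
  while (si < src_len && di < dst_len) {
    unsigned char n = src[si++];
    if (n < 128) {                       /* literal run of n+1 bytes */
      size_t count = (size_t)n + 1;
      if (si + count > src_len || di + count > dst_len) return 0;
      while (count--) dst[di++] = src[si++];
    } else if (n > 128) {                /* repeat next byte 257-n times */
      size_t count = 257 - (size_t)n;
      unsigned char v;
      if (si >= src_len || di + count > dst_len) return 0;
      v = src[si++];
      while (count--) dst[di++] = v;
    }                                    /* n == 128: no-op */
  }
  return di;
}
```

Called as `packbits_decode(src, src_len, dst, expected_len)`, it returns how many output bytes were produced, which a caller can compare against the expected row length.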
-
-// *************************************************************************************************
-// Softimage PIC loader
-// by Tom Seddon
-//
-// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
-// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
-
-#ifndef STBI_NO_PIC
-static int stbi__pic_is4(stbi__context *s, const char *str) {
-  int i;
-  for (i = 0; i < 4; ++i)
-    if (stbi__get8(s) != (stbi_uc)str[i])
-      return 0;
-
-  return 1;
-}
-
-static int stbi__pic_test_core(stbi__context *s) {
-  int i;
-
-  if (!stbi__pic_is4(s, "\x53\x80\xF6\x34"))
-    return 0;
-
-  for (i = 0; i < 84; ++i)
-    stbi__get8(s);
-
-  if (!stbi__pic_is4(s, "PICT"))
-    return 0;
-
-  return 1;
-}
-
-typedef struct {
-  stbi_uc size, type, channel;
-} stbi__pic_packet;
-
-static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) {
-  int mask = 0x80, i;
-
-  for (i = 0; i < 4; ++i, mask >>= 1) {
-    if (channel & mask) {
-      if (stbi__at_eof(s))
-        return stbi__errpuc("bad file", "PIC file too short");
-      dest[i] = stbi__get8(s);
-    }
-  }
-
-  return dest;
-}
-
-static void stbi__copyval(int channel, stbi_uc *dest, const stbi_uc *src) {
-  int mask = 0x80, i;
-
-  for (i = 0; i < 4; ++i, mask >>= 1)
-    if (channel & mask)
-      dest[i] = src[i];
-}
-
-static stbi_uc *stbi__pic_load_core(stbi__context *s, int width, int height,
-                                    int *comp, stbi_uc *result) {
-  int act_comp = 0, num_packets = 0, y, chained;
-  stbi__pic_packet packets[10];
-
-  // this will (should...) cater for even some bizarre stuff like having data
-  // for the same channel in multiple packets.
-  do {
-    stbi__pic_packet *packet;
-
-    if (num_packets == sizeof(packets) / sizeof(packets[0]))
-      return stbi__errpuc("bad format", "too many packets");
-
-    packet = &packets[num_packets++];
-
-    chained = stbi__get8(s);
-    packet->size = stbi__get8(s);
-    packet->type = stbi__get8(s);
-    packet->channel = stbi__get8(s);
-
-    act_comp |= packet->channel;
-
-    if (stbi__at_eof(s))
-      return stbi__errpuc("bad file", "file too short (reading packets)");
-    if (packet->size != 8)
-      return stbi__errpuc("bad format", "packet isn't 8bpp");
-  } while (chained);
-
-  *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
-
-  for (y = 0; y < height; ++y) {
-    int packet_idx;
-
-    for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) {
-      stbi__pic_packet *packet = &packets[packet_idx];
-      stbi_uc *dest = result + y * width * 4;
-
-      switch (packet->type) {
-      default:
-        return stbi__errpuc("bad format", "packet has bad compression type");
-
-      case 0: { // uncompressed
-        int x;
-
-        for (x = 0; x < width; ++x, dest += 4)
-          if (!stbi__readval(s, packet->channel, dest))
-            return 0;
-        break;
-      }
-
-      case 1: // Pure RLE
-      {
-        int left = width, i;
-
-        while (left > 0) {
-          stbi_uc count, value[4];
-
-          count = stbi__get8(s);
-          if (stbi__at_eof(s))
-            return stbi__errpuc("bad file", "file too short (pure read count)");
-
-          if (count > left)
-            count = (stbi_uc)left;
-
-          if (!stbi__readval(s, packet->channel, value))
-            return 0;
-
-          for (i = 0; i < count; ++i, dest += 4)
-            stbi__copyval(packet->channel, dest, value);
-          left -= count;
-        }
-      } break;
-
-      case 2: { // Mixed RLE
-        int left = width;
-        while (left > 0) {
-          int count = stbi__get8(s), i;
-          if (stbi__at_eof(s))
-            return stbi__errpuc("bad file",
-                                "file too short (mixed read count)");
-
-          if (count >= 128) { // Repeated
-            stbi_uc value[4];
-
-            if (count == 128)
-              count = stbi__get16be(s);
-            else
-              count -= 127;
-            if (count > left)
-              return stbi__errpuc("bad file", "scanline overrun");
-
-            if (!stbi__readval(s, packet->channel, value))
-              return 0;
-
-            for (i = 0; i < count; ++i, dest += 4)
-              stbi__copyval(packet->channel, dest, value);
-          } else { // Raw
-            ++count;
-            if (count > left)
-              return stbi__errpuc("bad file", "scanline overrun");
-
-            for (i = 0; i < count; ++i, dest += 4)
-              if (!stbi__readval(s, packet->channel, dest))
-                return 0;
-          }
-          left -= count;
-        }
-        break;
-      }
-      }
-    }
-  }
-
-  return result;
-}
-
-static void *stbi__pic_load(stbi__context *s, int *px, int *py, int *comp,
-                            int req_comp, stbi__result_info *ri) {
-  stbi_uc *result;
-  int i, x, y, internal_comp;
-  STBI_NOTUSED(ri);
-
-  if (!comp)
-    comp = &internal_comp;
-
-  for (i = 0; i < 92; ++i)
-    stbi__get8(s);
-
-  x = stbi__get16be(s);
-  y = stbi__get16be(s);
-  if (stbi__at_eof(s))
-    return stbi__errpuc("bad file", "file too short (pic header)");
-  if (!stbi__mad3sizes_valid(x, y, 4, 0))
-    return stbi__errpuc("too large", "PIC image too large to decode");
-
-  stbi__get32be(s); // skip `ratio'
-  stbi__get16be(s); // skip `fields'
-  stbi__get16be(s); // skip `pad'
-
-  // intermediate buffer is RGBA
-  result = (stbi_uc *)stbi__malloc_mad3(x, y, 4, 0);
-  memset(result, 0xff, x * y * 4);
-
-  if (!stbi__pic_load_core(s, x, y, comp, result)) {
-    STBI_FREE(result);
-    result = 0;
-  }
-  *px = x;
-  *py = y;
-  if (req_comp == 0)
-    req_comp = *comp;
-  result = stbi__convert_format(result, 4, req_comp, x, y);
-
-  return result;
-}
-
-static int stbi__pic_test(stbi__context *s) {
-  int r = stbi__pic_test_core(s);
-  stbi__rewind(s);
-  return r;
-}
-#endif
-
-// *************************************************************************************************
-// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
-
-#ifndef STBI_NO_GIF
-typedef struct {
-  stbi__int16 prefix;
-  stbi_uc first;
-  stbi_uc suffix;
-} stbi__gif_lzw;
-
-typedef struct {
-  int w, h;
-  stbi_uc *out;        // output buffer (always 4 components)
-  stbi_uc *background; // The current "background" as far as a gif is concerned
-  stbi_uc *history;
-  int flags, bgindex, ratio, transparent, eflags;
-  stbi_uc pal[256][4];
-  stbi_uc lpal[256][4];
-  stbi__gif_lzw codes[8192];
-  stbi_uc *color_table;
-  int parse, step;
-  int lflags;
-  int start_x, start_y;
-  int max_x, max_y;
-  int cur_x, cur_y;
-  int line_size;
-  int delay;
-} stbi__gif;
-
-static int stbi__gif_test_raw(stbi__context *s) {
-  int sz;
-  if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' ||
-      stbi__get8(s) != '8')
-    return 0;
-  sz = stbi__get8(s);
-  if (sz != '9' && sz != '7')
-    return 0;
-  if (stbi__get8(s) != 'a')
-    return 0;
-  return 1;
-}
-
-static int stbi__gif_test(stbi__context *s) {
-  int r = stbi__gif_test_raw(s);
-  stbi__rewind(s);
-  return r;
-}
-
-static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4],
-                                       int num_entries, int transp) {
-  int i;
-  for (i = 0; i < num_entries; ++i) {
-    pal[i][2] = stbi__get8(s);
-    pal[i][1] = stbi__get8(s);
-    pal[i][0] = stbi__get8(s);
-    pal[i][3] = transp == i ? 0 : 255;
-  }
-}
-
-static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp,
-                            int is_info) {
-  stbi_uc version;
-  if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' ||
-      stbi__get8(s) != '8')
-    return stbi__err("not GIF", "Corrupt GIF");
-
-  version = stbi__get8(s);
-  if (version != '7' && version != '9')
-    return stbi__err("not GIF", "Corrupt GIF");
-  if (stbi__get8(s) != 'a')
-    return stbi__err("not GIF", "Corrupt GIF");
-
-  stbi__g_failure_reason = "";
-  g->w = stbi__get16le(s);
-  g->h = stbi__get16le(s);
-  g->flags = stbi__get8(s);
-  g->bgindex = stbi__get8(s);
-  g->ratio = stbi__get8(s);
-  g->transparent = -1;
-
-  if (comp != 0)
-    *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the
-               // comments
-
-  if (is_info)
-    return 1;
-
-  if (g->flags & 0x80)
-    stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1);
-
-  return 1;
-}
-
-static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) {
-  stbi__gif *g = (stbi__gif *)stbi__malloc(sizeof(stbi__gif));
-  if (!stbi__gif_header(s, g, comp, 1)) {
-    STBI_FREE(g);
-    stbi__rewind(s);
-    return 0;
-  }
-  if (x)
-    *x = g->w;
-  if (y)
-    *y = g->h;
-  STBI_FREE(g);
-  return 1;
-}
-
-static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) {
-  stbi_uc *p, *c;
-  int idx;
-
-  // recurse to decode the prefixes, since the linked-list is backwards,
-  // and working backwards through an interleaved image would be nasty
-  if (g->codes[code].prefix >= 0)
-    stbi__out_gif_code(g, g->codes[code].prefix);
-
-  if (g->cur_y >= g->max_y)
-    return;
-
-  idx = g->cur_x + g->cur_y;
-  p = &g->out[idx];
-  g->history[idx / 4] = 1;
-
-  c = &g->color_table[g->codes[code].suffix * 4];
-  if (c[3] > 128) { // don't render transparent pixels;
-    p[0] = c[2];
-    p[1] = c[1];
-    p[2] = c[0];
-    p[3] = c[3];
-  }
-  g->cur_x += 4;
-
-  if (g->cur_x >= g->max_x) {
-    g->cur_x = g->start_x;
-    g->cur_y += g->step;
-
-    while (g->cur_y >= g->max_y && g->parse > 0) {
-      g->step = (1 << g->parse) * g->line_size;
-      g->cur_y = g->start_y + (g->step >> 1);
-      --g->parse;
-    }
-  }
-}
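The `step`/`parse` bookkeeping in `stbi__out_gif_code` above implements GIF's four-pass interlacing (rows 0, 8, 16, ... then 4, 12, ..., then 2, 6, ..., then 1, 3, ...). A small sketch that prints the same row order, with hypothetical names and no stb_image dependencies, may make that easier to follow:

```c
#include <stdio.h>

/* Print the row order produced by GIF interlacing, matching the pass
   structure that the step/parse logic above walks through
   (row indices only, no pixel data). */
static void gif_interlace_order(int height) {
  int pass_step[4]  = {8, 8, 4, 2}; /* row stride of each pass */
  int pass_start[4] = {0, 4, 2, 1}; /* first row of each pass  */
  for (int pass = 0; pass < 4; ++pass)
    for (int row = pass_start[pass]; row < height; row += pass_step[pass])
      printf("pass %d -> row %d\n", pass + 1, row);
}
```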
-
-static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) {
-  stbi_uc lzw_cs;
-  stbi__int32 len, init_code;
-  stbi__uint32 first;
-  stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
-  stbi__gif_lzw *p;
-
-  lzw_cs = stbi__get8(s);
-  if (lzw_cs > 12)
-    return NULL;
-  clear = 1 << lzw_cs;
-  first = 1;
-  codesize = lzw_cs + 1;
-  codemask = (1 << codesize) - 1;
-  bits = 0;
-  valid_bits = 0;
-  for (init_code = 0; init_code < clear; init_code++) {
-    g->codes[init_code].prefix = -1;
-    g->codes[init_code].first = (stbi_uc)init_code;
-    g->codes[init_code].suffix = (stbi_uc)init_code;
-  }
-
-  // support no starting clear code
-  avail = clear + 2;
-  oldcode = -1;
-
-  len = 0;
-  for (;;) {
-    if (valid_bits < codesize) {
-      if (len == 0) {
-        len = stbi__get8(s); // start new block
-        if (len == 0)
-          return g->out;
-      }
-      --len;
-      bits |= (stbi__int32)stbi__get8(s) << valid_bits;
-      valid_bits += 8;
-    } else {
-      stbi__int32 code = bits & codemask;
-      bits >>= codesize;
-      valid_bits -= codesize;
-      // @OPTIMIZE: is there some way we can accelerate the non-clear path?
-      if (code == clear) { // clear code
-        codesize = lzw_cs + 1;
-        codemask = (1 << codesize) - 1;
-        avail = clear + 2;
-        oldcode = -1;
-        first = 0;
-      } else if (code == clear + 1) { // end of stream code
-        stbi__skip(s, len);
-        while ((len = stbi__get8(s)) > 0)
-          stbi__skip(s, len);
-        return g->out;
-      } else if (code <= avail) {
-        if (first) {
-          return stbi__errpuc("no clear code", "Corrupt GIF");
-        }
-
-        if (oldcode >= 0) {
-          p = &g->codes[avail++];
-          if (avail > 8192) {
-            return stbi__errpuc("too many codes", "Corrupt GIF");
-          }
-
-          p->prefix = (stbi__int16)oldcode;
-          p->first = g->codes[oldcode].first;
-          p->suffix = (code == avail) ? p->first : g->codes[code].first;
-        } else if (code == avail)
-          return stbi__errpuc("illegal code in raster", "Corrupt GIF");
-
-        stbi__out_gif_code(g, (stbi__uint16)code);
-
-        if ((avail & codemask) == 0 && avail <= 0x0FFF) {
-          codesize++;
-          codemask = (1 << codesize) - 1;
-        }
-
-        oldcode = code;
-      } else {
-        return stbi__errpuc("illegal code in raster", "Corrupt GIF");
-      }
-    }
-  }
-}
-
-// this function is designed to support animated gifs, although stb_image
-// doesn't support it; two_back is the image from two frames ago, used for a
-// very specific disposal format
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp,
-                                    int req_comp, stbi_uc *two_back) {
-  int dispose;
-  int first_frame;
-  int pi;
-  int pcount;
-  STBI_NOTUSED(req_comp);
-
-  // on first frame, any non-written pixels get the background colour
-  // (non-transparent)
-  first_frame = 0;
-  if (g->out == 0) {
-    if (!stbi__gif_header(s, g, comp, 0))
-      return 0; // stbi__g_failure_reason set by stbi__gif_header
-    if (!stbi__mad3sizes_valid(4, g->w, g->h, 0))
-      return stbi__errpuc("too large", "GIF image is too large");
-    pcount = g->w * g->h;
-    g->out = (stbi_uc *)stbi__malloc(4 * pcount);
-    g->background = (stbi_uc *)stbi__malloc(4 * pcount);
-    g->history = (stbi_uc *)stbi__malloc(pcount);
-    if (!g->out || !g->background || !g->history)
-      return stbi__errpuc("outofmem", "Out of memory");
-
-    // image is treated as "transparent" at the start - ie, nothing overwrites
-    // the current background; background colour is only used for pixels that
-    // are not rendered first frame, after that "background" color refers to the
-    // color that was there the previous frame.
-    memset(g->out, 0x00, 4 * pcount);
-    memset(g->background, 0x00,
-           4 * pcount); // state of the background (starts transparent)
-    memset(g->history, 0x00,
-           pcount); // pixels that were affected previous frame
-    first_frame = 1;
-  } else {
-    // second frame - how do we dispose of the previous one?
-    dispose = (g->eflags & 0x1C) >> 2;
-    pcount = g->w * g->h;
-
-    if ((dispose == 3) && (two_back == 0)) {
-      dispose = 2; // if I don't have an image to revert back to, default to the
-                   // old background
-    }
-
-    if (dispose == 3) { // use previous graphic
-      for (pi = 0; pi < pcount; ++pi) {
-        if (g->history[pi]) {
-          memcpy(&g->out[pi * 4], &two_back[pi * 4], 4);
-        }
-      }
-    } else if (dispose == 2) {
-      // restore what was changed last frame to background before that frame;
-      for (pi = 0; pi < pcount; ++pi) {
-        if (g->history[pi]) {
-          memcpy(&g->out[pi * 4], &g->background[pi * 4], 4);
-        }
-      }
-    } else {
-      // This is a non-disposal case either way, so just
-      // leave the pixels as is, and they will become the new background
-      // 1: do not dispose
-      // 0:  not specified.
-    }
-
-    // background is what out is after the undoing of the previous frame;
-    memcpy(g->background, g->out, 4 * g->w * g->h);
-  }
-
-  // clear my history;
-  memset(g->history, 0x00,
-         g->w * g->h); // pixels that were affected previous frame
-
-  for (;;) {
-    int tag = stbi__get8(s);
-    switch (tag) {
-    case 0x2C: /* Image Descriptor */
-    {
-      stbi__int32 x, y, w, h;
-      stbi_uc *o;
-
-      x = stbi__get16le(s);
-      y = stbi__get16le(s);
-      w = stbi__get16le(s);
-      h = stbi__get16le(s);
-      if (((x + w) > (g->w)) || ((y + h) > (g->h)))
-        return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
-
-      g->line_size = g->w * 4;
-      g->start_x = x * 4;
-      g->start_y = y * g->line_size;
-      g->max_x = g->start_x + w * 4;
-      g->max_y = g->start_y + h * g->line_size;
-      g->cur_x = g->start_x;
-      g->cur_y = g->start_y;
-
-      // if the width of the specified rectangle is 0, that means
-      // we may not see *any* pixels or the image is malformed;
-      // to make sure this is caught, move the current y down to
-      // max_y (which is what out_gif_code checks).
-      if (w == 0)
-        g->cur_y = g->max_y;
-
-      g->lflags = stbi__get8(s);
-
-      if (g->lflags & 0x40) {
-        g->step = 8 * g->line_size; // first interlaced spacing
-        g->parse = 3;
-      } else {
-        g->step = g->line_size;
-        g->parse = 0;
-      }
-
-      if (g->lflags & 0x80) {
-        stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7),
-                                   g->eflags & 0x01 ? g->transparent : -1);
-        g->color_table = (stbi_uc *)g->lpal;
-      } else if (g->flags & 0x80) {
-        g->color_table = (stbi_uc *)g->pal;
-      } else
-        return stbi__errpuc("missing color table", "Corrupt GIF");
-
-      o = stbi__process_gif_raster(s, g);
-      if (!o)
-        return NULL;
-
-      // if this was the first frame,
-      pcount = g->w * g->h;
-      if (first_frame && (g->bgindex > 0)) {
-        // if first frame, any pixel not drawn to gets the background color
-        for (pi = 0; pi < pcount; ++pi) {
-          if (g->history[pi] == 0) {
-            g->pal[g->bgindex][3] =
-                255; // just in case it was made transparent, undo that; It will
-                     // be reset next frame if need be;
-            memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4);
-          }
-        }
-      }
-
-      return o;
-    }
-
-    case 0x21: // Comment Extension.
-    {
-      int len;
-      int ext = stbi__get8(s);
-      if (ext == 0xF9) { // Graphic Control Extension.
-        len = stbi__get8(s);
-        if (len == 4) {
-          g->eflags = stbi__get8(s);
-          g->delay =
-              10 * stbi__get16le(
-                       s); // delay - 1/100th of a second, saving as 1/1000ths.
-
-          // unset old transparent
-          if (g->transparent >= 0) {
-            g->pal[g->transparent][3] = 255;
-          }
-          if (g->eflags & 0x01) {
-            g->transparent = stbi__get8(s);
-            if (g->transparent >= 0) {
-              g->pal[g->transparent][3] = 0;
-            }
-          } else {
-            // don't need transparent
-            stbi__skip(s, 1);
-            g->transparent = -1;
-          }
-        } else {
-          stbi__skip(s, len);
-          break;
-        }
-      }
-      while ((len = stbi__get8(s)) != 0) {
-        stbi__skip(s, len);
-      }
-      break;
-    }
-
-    case 0x3B:             // gif stream termination code
-      return (stbi_uc *)s; // using '1' causes warning on some compilers
-
-    default:
-      return stbi__errpuc("unknown code", "Corrupt GIF");
-    }
-  }
-}
-
-static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
-                                 int *z, int *comp, int req_comp) {
-  if (stbi__gif_test(s)) {
-    int layers = 0;
-    stbi_uc *u = 0;
-    stbi_uc *out = 0;
-    stbi_uc *two_back = 0;
-    stbi__gif g;
-    int stride;
-    memset(&g, 0, sizeof(g));
-    if (delays) {
-      *delays = 0;
-    }
-
-    do {
-      u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
-      if (u == (stbi_uc *)s)
-        u = 0; // end of animated gif marker
-
-      if (u) {
-        *x = g.w;
-        *y = g.h;
-        ++layers;
-        stride = g.w * g.h * 4;
-
-        if (out) {
-          out = (stbi_uc *)STBI_REALLOC(out, layers * stride);
-          if (delays) {
-            *delays = (int *)STBI_REALLOC(*delays, sizeof(int) * layers);
-          }
-        } else {
-          out = (stbi_uc *)stbi__malloc(layers * stride);
-          if (delays) {
-            *delays = (int *)stbi__malloc(layers * sizeof(int));
-          }
-        }
-        memcpy(out + ((layers - 1) * stride), u, stride);
-        if (layers >= 2) {
-          two_back = out - 2 * stride;
-        }
-
-        if (delays) {
-          (*delays)[layers - 1U] = g.delay;
-        }
-      }
-    } while (u != 0);
-
-    // free temp buffer;
-    STBI_FREE(g.out);
-    STBI_FREE(g.history);
-    STBI_FREE(g.background);
-
-    // do the final conversion after loading everything;
-    if (req_comp && req_comp != 4)
-      out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
-
-    *z = layers;
-    return out;
-  } else {
-    return stbi__errpuc("not GIF", "Image was not as a gif type.");
-  }
-}
-
-static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri) {
-  stbi_uc *u = 0;
-  stbi__gif g;
-  memset(&g, 0, sizeof(g));
-  STBI_NOTUSED(ri);
-
-  u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
-  if (u == (stbi_uc *)s)
-    u = 0; // end of animated gif marker
-  if (u) {
-    *x = g.w;
-    *y = g.h;
-
-    // moved conversion to after successful load so that the same
-    // can be done for multiple frames.
-    if (req_comp && req_comp != 4)
-      u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
-  } else if (g.out) {
-    // if there was an error and we allocated an image buffer, free it!
-    STBI_FREE(g.out);
-  }
-
-  // free buffers needed for multiple frame loading;
-  STBI_FREE(g.history);
-  STBI_FREE(g.background);
-
-  return u;
-}
-
-static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) {
-  return stbi__gif_info_raw(s, x, y, comp);
-}
-#endif
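For reference, `stbi__load_gif_main` above is what backs the public `stbi_load_gif_from_memory` entry point declared earlier in this header, which returns all frames stacked in one buffer plus an optional per-frame delay array (in milliseconds, per the comment in the Graphic Control Extension handling above). A hedged usage sketch, assuming `buffer`/`len` hold a complete GIF file in memory and that `stb_image.h` is on the include path; `dump_gif_frames` is a hypothetical helper:

```c
#include <stdio.h>
#include <stdlib.h>
#include "stb_image.h"

/* Load every frame of an in-memory GIF as RGBA and print the frame delays. */
static void dump_gif_frames(const unsigned char *buffer, int len) {
  int w, h, frames, comp;
  int *delays = NULL;
  unsigned char *pixels =
      stbi_load_gif_from_memory(buffer, len, &delays, &w, &h, &frames, &comp, 4);
  if (!pixels) {
    fprintf(stderr, "decode failed: %s\n", stbi_failure_reason());
    return;
  }
  for (int i = 0; i < frames; ++i) {
    /* frames are stacked one after another in the returned buffer */
    unsigned char *frame = pixels + (size_t)i * w * h * 4;
    printf("frame %d at %p, delay %d ms\n", i, (void *)frame, delays[i]);
  }
  stbi_image_free(pixels);
  free(delays); /* assumes the default malloc-based allocator */
}
```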
-
-// *************************************************************************************************
-// Radiance RGBE HDR loader
-// originally by Nicolas Schulz
-#ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s, const char *signature) {
-  int i;
-  for (i = 0; signature[i]; ++i)
-    if (stbi__get8(s) != signature[i])
-      return 0;
-  stbi__rewind(s);
-  return 1;
-}
-
-static int stbi__hdr_test(stbi__context *s) {
-  int r = stbi__hdr_test_core(s, "#?RADIANCE\n");
-  stbi__rewind(s);
-  if (!r) {
-    r = stbi__hdr_test_core(s, "#?RGBE\n");
-    stbi__rewind(s);
-  }
-  return r;
-}
-
-#define STBI__HDR_BUFLEN 1024
-static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) {
-  int len = 0;
-  char c = '\0';
-
-  c = (char)stbi__get8(z);
-
-  while (!stbi__at_eof(z) && c != '\n') {
-    buffer[len++] = c;
-    if (len == STBI__HDR_BUFLEN - 1) {
-      // flush to end of line
-      while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
-        ;
-      break;
-    }
-    c = (char)stbi__get8(z);
-  }
-
-  buffer[len] = 0;
-  return buffer;
-}
-
-static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) {
-  if (input[3] != 0) {
-    float f1;
-    // Exponent
-    f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8));
-    if (req_comp <= 2)
-      output[0] = (input[0] + input[1] + input[2]) * f1 / 3;
-    else {
-      output[0] = input[0] * f1;
-      output[1] = input[1] * f1;
-      output[2] = input[2] * f1;
-    }
-    if (req_comp == 2)
-      output[1] = 1;
-    if (req_comp == 4)
-      output[3] = 1;
-  } else {
-    switch (req_comp) {
-    case 4:
-      output[3] = 1; /* fallthrough */
-    case 3:
-      output[0] = output[1] = output[2] = 0;
-      break;
-    case 2:
-      output[1] = 1; /* fallthrough */
-    case 1:
-      output[0] = 0;
-      break;
-    }
-  }
-}
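The shared-exponent math in `stbi__hdr_convert` above is easiest to see on one concrete pixel: an RGBE quadruple (R, G, B, E) maps to R*2^(E-136), G*2^(E-136), B*2^(E-136), so E = 128 scales each 8-bit mantissa by 1/256. The sketch below applies the same `ldexp` scaling for the 3-component case; `rgbe_to_float` is a hypothetical helper, not part of stb_image.

```c
#include <math.h>
#include <stdio.h>

/* Convert one RGBE pixel to linear floats using the same shared-exponent
   scaling as the 3-component path above (E == 0 means black). */
static void rgbe_to_float(const unsigned char rgbe[4], float out[3]) {
  if (rgbe[3] == 0) {
    out[0] = out[1] = out[2] = 0.0f;
  } else {
    float scale = (float)ldexp(1.0, (int)rgbe[3] - (128 + 8)); /* 2^(E-136) */
    out[0] = rgbe[0] * scale;
    out[1] = rgbe[1] * scale;
    out[2] = rgbe[2] * scale;
  }
}

int main(void) {
  unsigned char px[4] = {128, 64, 32, 128}; /* E = 128 -> scale = 1/256 */
  float rgb[3];
  rgbe_to_float(px, rgb);
  printf("%g %g %g\n", rgb[0], rgb[1], rgb[2]); /* prints 0.5 0.25 0.125 */
  return 0;
}
```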
-
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp,
-                             int req_comp, stbi__result_info *ri) {
-  char buffer[STBI__HDR_BUFLEN];
-  char *token;
-  int valid = 0;
-  int width, height;
-  stbi_uc *scanline;
-  float *hdr_data;
-  int len;
-  unsigned char count, value;
-  int i, j, k, c1, c2, z;
-  const char *headerToken;
-  STBI_NOTUSED(ri);
-
-  // Check identifier
-  headerToken = stbi__hdr_gettoken(s, buffer);
-  if (strcmp(headerToken, "#?RADIANCE") != 0 &&
-      strcmp(headerToken, "#?RGBE") != 0)
-    return stbi__errpf("not HDR", "Corrupt HDR image");
-
-  // Parse header
-  for (;;) {
-    token = stbi__hdr_gettoken(s, buffer);
-    if (token[0] == 0)
-      break;
-    if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
-      valid = 1;
-  }
-
-  if (!valid)
-    return stbi__errpf("unsupported format", "Unsupported HDR format");
-
-  // Parse width and height
-  // can't use sscanf() if we're not using stdio!
-  token = stbi__hdr_gettoken(s, buffer);
-  if (strncmp(token, "-Y ", 3))
-    return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-  token += 3;
-  height = (int)strtol(token, &token, 10);
-  while (*token == ' ')
-    ++token;
-  if (strncmp(token, "+X ", 3))
-    return stbi__errpf("unsupported data layout", "Unsupported HDR format");
-  token += 3;
-  width = (int)strtol(token, NULL, 10);
-
-  *x = width;
-  *y = height;
-
-  if (comp)
-    *comp = 3;
-  if (req_comp == 0)
-    req_comp = 3;
-
-  if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
-    return stbi__errpf("too large", "HDR image is too large");
-
-  // Read data
-  hdr_data =
-      (float *)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
-  if (!hdr_data)
-    return stbi__errpf("outofmem", "Out of memory");
-
-  // Load image data
-  // image data is stored as some number of scanlines
-  if (width < 8 || width >= 32768) {
-    // Read flat data
-    for (j = 0; j < height; ++j) {
-      for (i = 0; i < width; ++i) {
-        stbi_uc rgbe[4];
-      main_decode_loop:
-        stbi__getn(s, rgbe, 4);
-        stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe,
-                          req_comp);
-      }
-    }
-  } else {
-    // Read RLE-encoded data
-    scanline = NULL;
-
-    for (j = 0; j < height; ++j) {
-      c1 = stbi__get8(s);
-      c2 = stbi__get8(s);
-      len = stbi__get8(s);
-      if (c1 != 2 || c2 != 2 || (len & 0x80)) {
-        // not run-length encoded, so we have to actually use THIS data as a
-        // decoded pixel (note this can't be a valid pixel--one of RGB must be
-        // >= 128)
-        stbi_uc rgbe[4];
-        rgbe[0] = (stbi_uc)c1;
-        rgbe[1] = (stbi_uc)c2;
-        rgbe[2] = (stbi_uc)len;
-        rgbe[3] = (stbi_uc)stbi__get8(s);
-        stbi__hdr_convert(hdr_data, rgbe, req_comp);
-        i = 1;
-        j = 0;
-        STBI_FREE(scanline);
-        goto main_decode_loop; // yes, this makes no sense
-      }
-      len <<= 8;
-      len |= stbi__get8(s);
-      if (len != width) {
-        STBI_FREE(hdr_data);
-        STBI_FREE(scanline);
-        return stbi__errpf("invalid decoded scanline length", "corrupt HDR");
-      }
-      if (scanline == NULL) {
-        scanline = (stbi_uc *)stbi__malloc_mad2(width, 4, 0);
-        if (!scanline) {
-          STBI_FREE(hdr_data);
-          return stbi__errpf("outofmem", "Out of memory");
-        }
-      }
-
-      for (k = 0; k < 4; ++k) {
-        int nleft;
-        i = 0;
-        while ((nleft = width - i) > 0) {
-          count = stbi__get8(s);
-          if (count > 128) {
-            // Run
-            value = stbi__get8(s);
-            count -= 128;
-            if (count > nleft) {
-              STBI_FREE(hdr_data);
-              STBI_FREE(scanline);
-              return stbi__errpf("corrupt", "bad RLE data in HDR");
-            }
-            for (z = 0; z < count; ++z)
-              scanline[i++ * 4 + k] = value;
-          } else {
-            // Dump
-            if (count > nleft) {
-              STBI_FREE(hdr_data);
-              STBI_FREE(scanline);
-              return stbi__errpf("corrupt", "bad RLE data in HDR");
-            }
-            for (z = 0; z < count; ++z)
-              scanline[i++ * 4 + k] = stbi__get8(s);
-          }
-        }
-      }
-      for (i = 0; i < width; ++i)
-        stbi__hdr_convert(hdr_data + (j * width + i) * req_comp,
-                          scanline + i * 4, req_comp);
-    }
-    if (scanline)
-      STBI_FREE(scanline);
-  }
-
-  return hdr_data;
-}
-
-static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp) {
-  char buffer[STBI__HDR_BUFLEN];
-  char *token;
-  int valid = 0;
-  int dummy;
-
-  if (!x)
-    x = &dummy;
-  if (!y)
-    y = &dummy;
-  if (!comp)
-    comp = &dummy;
-
-  if (stbi__hdr_test(s) == 0) {
-    stbi__rewind(s);
-    return 0;
-  }
-
-  for (;;) {
-    token = stbi__hdr_gettoken(s, buffer);
-    if (token[0] == 0)
-      break;
-    if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0)
-      valid = 1;
-  }
-
-  if (!valid) {
-    stbi__rewind(s);
-    return 0;
-  }
-  token = stbi__hdr_gettoken(s, buffer);
-  if (strncmp(token, "-Y ", 3)) {
-    stbi__rewind(s);
-    return 0;
-  }
-  token += 3;
-  *y = (int)strtol(token, &token, 10);
-  while (*token == ' ')
-    ++token;
-  if (strncmp(token, "+X ", 3)) {
-    stbi__rewind(s);
-    return 0;
-  }
-  token += 3;
-  *x = (int)strtol(token, NULL, 10);
-  *comp = 3;
-  return 1;
-}
-#endif // STBI_NO_HDR
-
-#ifndef STBI_NO_BMP
-static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) {
-  void *p;
-  stbi__bmp_data info;
-
-  info.all_a = 255;
-  p = stbi__bmp_parse_header(s, &info);
-  stbi__rewind(s);
-  if (p == NULL)
-    return 0;
-  if (x)
-    *x = s->img_x;
-  if (y)
-    *y = s->img_y;
-  if (comp) {
-    if (info.bpp == 24 && info.ma == 0xff000000)
-      *comp = 3;
-    else
-      *comp = info.ma ? 4 : 3;
-  }
-  return 1;
-}
-#endif
-
-#ifndef STBI_NO_PSD
-static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) {
-  int channelCount, dummy, depth;
-  if (!x)
-    x = &dummy;
-  if (!y)
-    y = &dummy;
-  if (!comp)
-    comp = &dummy;
-  if (stbi__get32be(s) != 0x38425053) {
-    stbi__rewind(s);
-    return 0;
-  }
-  if (stbi__get16be(s) != 1) {
-    stbi__rewind(s);
-    return 0;
-  }
-  stbi__skip(s, 6);
-  channelCount = stbi__get16be(s);
-  if (channelCount < 0 || channelCount > 16) {
-    stbi__rewind(s);
-    return 0;
-  }
-  *y = stbi__get32be(s);
-  *x = stbi__get32be(s);
-  depth = stbi__get16be(s);
-  if (depth != 8 && depth != 16) {
-    stbi__rewind(s);
-    return 0;
-  }
-  if (stbi__get16be(s) != 3) {
-    stbi__rewind(s);
-    return 0;
-  }
-  *comp = 4;
-  return 1;
-}
-
-static int stbi__psd_is16(stbi__context *s) {
-  int channelCount, depth;
-  if (stbi__get32be(s) != 0x38425053) {
-    stbi__rewind(s);
-    return 0;
-  }
-  if (stbi__get16be(s) != 1) {
-    stbi__rewind(s);
-    return 0;
-  }
-  stbi__skip(s, 6);
-  channelCount = stbi__get16be(s);
-  if (channelCount < 0 || channelCount > 16) {
-    stbi__rewind(s);
-    return 0;
-  }
-  (void)stbi__get32be(s);
-  (void)stbi__get32be(s);
-  depth = stbi__get16be(s);
-  if (depth != 16) {
-    stbi__rewind(s);
-    return 0;
-  }
-  return 1;
-}
-#endif
-
-#ifndef STBI_NO_PIC
-static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) {
-  int act_comp = 0, num_packets = 0, chained, dummy;
-  stbi__pic_packet packets[10];
-
-  if (!x)
-    x = &dummy;
-  if (!y)
-    y = &dummy;
-  if (!comp)
-    comp = &dummy;
-
-  if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) {
-    stbi__rewind(s);
-    return 0;
-  }
-
-  stbi__skip(s, 88);
-
-  *x = stbi__get16be(s);
-  *y = stbi__get16be(s);
-  if (stbi__at_eof(s)) {
-    stbi__rewind(s);
-    return 0;
-  }
-  if ((*x) != 0 && (1 << 28) / (*x) < (*y)) {
-    stbi__rewind(s);
-    return 0;
-  }
-
-  stbi__skip(s, 8);
-
-  do {
-    stbi__pic_packet *packet;
-
-    if (num_packets == sizeof(packets) / sizeof(packets[0]))
-      return 0;
-
-    packet = &packets[num_packets++];
-    chained = stbi__get8(s);
-    packet->size = stbi__get8(s);
-    packet->type = stbi__get8(s);
-    packet->channel = stbi__get8(s);
-    act_comp |= packet->channel;
-
-    if (stbi__at_eof(s)) {
-      stbi__rewind(s);
-      return 0;
-    }
-    if (packet->size != 8) {
-      stbi__rewind(s);
-      return 0;
-    }
-  } while (chained);
-
-  *comp = (act_comp & 0x10 ? 4 : 3);
-
-  return 1;
-}
-#endif
-
-// *************************************************************************************************
-// Portable Gray Map and Portable Pixel Map loader
-// by Ken Miller
-//
-// PGM: http://netpbm.sourceforge.net/doc/pgm.html
-// PPM: http://netpbm.sourceforge.net/doc/ppm.html
-//
-// Known limitations:
-//    Does not support comments in the header section
-//    Does not support ASCII image data (formats P2 and P3)
-//    Does not support 16-bit-per-channel
-
-#ifndef STBI_NO_PNM
-
-static int stbi__pnm_test(stbi__context *s) {
-  char p, t;
-  p = (char)stbi__get8(s);
-  t = (char)stbi__get8(s);
-  if (p != 'P' || (t != '5' && t != '6')) {
-    stbi__rewind(s);
-    return 0;
-  }
-  return 1;
-}
-
-static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp,
-                            int req_comp, stbi__result_info *ri) {
-  stbi_uc *out;
-  STBI_NOTUSED(ri);
-
-  if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
-    return 0;
-
-  *x = s->img_x;
-  *y = s->img_y;
-  if (comp)
-    *comp = s->img_n;
-
-  if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
-    return stbi__errpuc("too large", "PNM too large");
-
-  out = (stbi_uc *)stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
-  if (!out)
-    return stbi__errpuc("outofmem", "Out of memory");
-  stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
-
-  if (req_comp && req_comp != s->img_n) {
-    out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
-    if (out == NULL)
-      return out; // stbi__convert_format frees input on failure
-  }
-  return out;
-}
-
-static int stbi__pnm_isspace(char c) {
-  return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
-         c == '\r';
-}
-
-static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) {
-  for (;;) {
-    while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-      *c = (char)stbi__get8(s);
-
-    if (stbi__at_eof(s) || *c != '#')
-      break;
-
-    while (!stbi__at_eof(s) && *c != '\n' && *c != '\r')
-      *c = (char)stbi__get8(s);
-  }
-}
-
-static int stbi__pnm_isdigit(char c) { return c >= '0' && c <= '9'; }
-
-static int stbi__pnm_getinteger(stbi__context *s, char *c) {
-  int value = 0;
-
-  while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
-    value = value * 10 + (*c - '0');
-    *c = (char)stbi__get8(s);
-  }
-
-  return value;
-}
-
-static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) {
-  int maxv, dummy;
-  char c, p, t;
-
-  if (!x)
-    x = &dummy;
-  if (!y)
-    y = &dummy;
-  if (!comp)
-    comp = &dummy;
-
-  stbi__rewind(s);
-
-  // Get identifier
-  p = (char)stbi__get8(s);
-  t = (char)stbi__get8(s);
-  if (p != 'P' || (t != '5' && t != '6')) {
-    stbi__rewind(s);
-    return 0;
-  }
-
-  *comp =
-      (t == '6') ? 3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm
-
-  c = (char)stbi__get8(s);
-  stbi__pnm_skip_whitespace(s, &c);
-
-  *x = stbi__pnm_getinteger(s, &c); // read width
-  stbi__pnm_skip_whitespace(s, &c);
-
-  *y = stbi__pnm_getinteger(s, &c); // read height
-  stbi__pnm_skip_whitespace(s, &c);
-
-  maxv = stbi__pnm_getinteger(s, &c); // read max value
-
-  if (maxv > 255)
-    return stbi__err("max value > 255", "PPM image not 8-bit");
-  else
-    return 1;
-}
-#endif
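As a concrete illustration of what `stbi__pnm_info` above parses, a binary PPM header is just the `P6` magic, whitespace-separated width and height, and a max value of at most 255, followed by raw RGB bytes. A small sketch that produces such a file; `write_tiny_ppm` is a hypothetical helper, not part of stb_image:

```c
#include <stdio.h>

/* Write a tiny 2x2 binary PPM (P6) whose header matches what
   stbi__pnm_info expects: magic, width, height, maxval <= 255. */
static int write_tiny_ppm(const char *path) {
  static const unsigned char pixels[2 * 2 * 3] = {
      255, 0, 0,    0, 255, 0,    /* red, green  */
      0, 0, 255,    255, 255, 255 /* blue, white */
  };
  FILE *f = fopen(path, "wb");
  if (!f) return 0;
  fprintf(f, "P6\n2 2\n255\n");              /* header */
  fwrite(pixels, 1, sizeof(pixels), f);      /* raw interleaved RGB data */
  fclose(f);
  return 1;
}
```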
-
-static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) {
-#ifndef STBI_NO_JPEG
-  if (stbi__jpeg_info(s, x, y, comp))
-    return 1;
-#endif
-
-#ifndef STBI_NO_PNG
-  if (stbi__png_info(s, x, y, comp))
-    return 1;
-#endif
-
-#ifndef STBI_NO_GIF
-  if (stbi__gif_info(s, x, y, comp))
-    return 1;
-#endif
-
-#ifndef STBI_NO_BMP
-  if (stbi__bmp_info(s, x, y, comp))
-    return 1;
-#endif
-
-#ifndef STBI_NO_PSD
-  if (stbi__psd_info(s, x, y, comp))
-    return 1;
-#endif
-
-#ifndef STBI_NO_PIC
-  if (stbi__pic_info(s, x, y, comp))
-    return 1;
-#endif
-
-#ifndef STBI_NO_PNM
-  if (stbi__pnm_info(s, x, y, comp))
-    return 1;
-#endif
-
-#ifndef STBI_NO_HDR
-  if (stbi__hdr_info(s, x, y, comp))
-    return 1;
-#endif
-
-// test tga last because it's a crappy test!
-#ifndef STBI_NO_TGA
-  if (stbi__tga_info(s, x, y, comp))
-    return 1;
-#endif
-  return stbi__err("unknown image type",
-                   "Image not of any known type, or corrupt");
-}
-
-static int stbi__is_16_main(stbi__context *s) {
-#ifndef STBI_NO_PNG
-  if (stbi__png_is16(s))
-    return 1;
-#endif
-
-#ifndef STBI_NO_PSD
-  if (stbi__psd_is16(s))
-    return 1;
-#endif
-
-  return 0;
-}
-
-#ifndef STBI_NO_STDIO
-STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) {
-  FILE *f = stbi__fopen(filename, "rb");
-  int result;
-  if (!f)
-    return stbi__err("can't fopen", "Unable to open file");
-  result = stbi_info_from_file(f, x, y, comp);
-  fclose(f);
-  return result;
-}
-
-STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) {
-  int r;
-  stbi__context s;
-  long pos = ftell(f);
-  stbi__start_file(&s, f);
-  r = stbi__info_main(&s, x, y, comp);
-  fseek(f, pos, SEEK_SET);
-  return r;
-}
-
-STBIDEF int stbi_is_16_bit(char const *filename) {
-  FILE *f = stbi__fopen(filename, "rb");
-  int result;
-  if (!f)
-    return stbi__err("can't fopen", "Unable to open file");
-  result = stbi_is_16_bit_from_file(f);
-  fclose(f);
-  return result;
-}
-
-STBIDEF int stbi_is_16_bit_from_file(FILE *f) {
-  int r;
-  stbi__context s;
-  long pos = ftell(f);
-  stbi__start_file(&s, f);
-  r = stbi__is_16_main(&s);
-  fseek(f, pos, SEEK_SET);
-  return r;
-}
-#endif // !STBI_NO_STDIO
-
-STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x,
-                                  int *y, int *comp) {
-  stbi__context s;
-  stbi__start_mem(&s, buffer, len);
-  return stbi__info_main(&s, x, y, comp);
-}
-
-STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user,
-                                     int *x, int *y, int *comp) {
-  stbi__context s;
-  stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
-  return stbi__info_main(&s, x, y, comp);
-}
-
-STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) {
-  stbi__context s;
-  stbi__start_mem(&s, buffer, len);
-  return stbi__is_16_main(&s);
-}
-
-STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c,
-                                          void *user) {
-  stbi__context s;
-  stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user);
-  return stbi__is_16_main(&s);
-}
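These `stbi_info*` and `stbi_is_16_bit*` wrappers are the public probing API: they report dimensions and channel count without decoding pixel data. A minimal usage sketch against an in-memory buffer, assuming `stb_image.h` is on the include path; `probe_image` is a hypothetical helper:

```c
#include <stdio.h>
#include "stb_image.h"

/* Probe an image held in memory before deciding how to decode it. */
static void probe_image(const unsigned char *buf, int len) {
  int w, h, channels;
  if (!stbi_info_from_memory(buf, len, &w, &h, &channels)) {
    fprintf(stderr, "unrecognized image: %s\n", stbi_failure_reason());
    return;
  }
  printf("%dx%d, %d channel(s), %s per channel\n", w, h, channels,
         stbi_is_16_bit_from_memory(buf, len) ? "16 bits" : "8 bits");
}
```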
-
-#endif // STB_IMAGE_IMPLEMENTATION
-
-/*
-   revision history:
-      2.20  (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs
-      2.19  (2018-02-11) fix warning
-      2.18  (2018-01-30) fix warnings
-      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
-                         1-bit BMP
-                         *_is_16_bit api
-                         avoid warnings
-      2.16  (2017-07-23) all functions have 16-bit variants;
-                         STBI_NO_STDIO works again;
-                         compilation fixes;
-                         fix rounding in unpremultiply;
-                         optimize vertical flip;
-                         disable raw_len validation;
-                         documentation fixes
-      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
-                         warning fixes; disable run-time SSE detection on gcc;
-                         uniform handling of optional "return" values;
-                         thread-safe initialization of zlib tables
-      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
-      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
-      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
-      2.11  (2016-04-02) allocate large structures on the stack
-                         remove white matting for transparent PSD
-                         fix reported channel count for PNG & BMP
-                         re-enable SSE2 in non-gcc 64-bit
-                         support RGB-formatted JPEG
-                         read 16-bit PNGs (only as 8-bit)
-      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
-      2.09  (2016-01-16) allow comments in PNM files
-                         16-bit-per-pixel TGA (not bit-per-component)
-                         info() for TGA could break due to .hdr handling
-                         info() for BMP now shares code instead of sloppy parse
-                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
-                         code cleanup
-      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
-      2.07  (2015-09-13) fix compiler warnings
-                         partial animated GIF support
-                         limited 16-bpc PSD support
-                         #ifdef unused functions
-                         bug with < 92 byte PIC,PNM,HDR,TGA
-      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
-      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
-      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
-      2.03  (2015-04-12) extra corruption checking (mmozeiko)
-                         stbi_set_flip_vertically_on_load (nguillemot)
-                         fix NEON support; fix mingw support
-      2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
-                         progressive JPEG (stb)
-                         PGM/PPM support (Ken Miller)
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
-                         GIF bugfix -- seemingly never worked
-                         STBI_NO_*, STBI_ONLY_*
-      1.48  (2014-12-14) fix incorrectly-named assert()
-      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
-                         optimize PNG (ryg)
-                         fix bug in interlaced PNG with user-specified channel count (stb)
-      1.46  (2014-08-26) fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
-      1.45  (2014-08-16) fix MSVC-ARM internal compiler error by wrapping malloc
-      1.44  (2014-08-07) various warning fixes from Ronny Chevalier
-      1.43  (2014-07-15)
-              fix MSVC-only compiler problem in code changed in 1.42
-      1.42  (2014-07-09)
-              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
-              fixes to stbi__cleanup_jpeg path
-              added STBI_ASSERT to avoid requiring assert.h
-      1.41  (2014-06-25)
-              fix search&replace from 1.36 that messed up comments/error messages
-      1.40  (2014-06-22) fix gcc struct-initialization warning
-      1.39  (2014-06-15) fix to TGA optimization when req_comp != number of components in TGA;
-                         fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
-                         add support for BMP version 5 (more ignored fields)
-      1.38  (2014-06-06) suppress MSVC warnings on integer casts truncating values
-                         fix accidental rename of 'skip' field of I/O
-      1.37  (2014-06-04) remove duplicate typedef
-      1.36  (2014-06-03) convert to header file single-file library
-                         if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
-      1.35  (2014-05-27) various warnings
-                         fix broken STBI_SIMD path
-                         fix bug where stbi_load_from_file no longer left file pointer in correct place
-                         fix broken non-easy path for 32-bit BMP (possibly never used)
-                         TGA optimization by Arseny Kapoulkine
-      1.34  (unknown)    use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
-      1.33  (2011-07-14) make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
-      1.32  (2011-07-13) support for "info" function for all supported filetypes (SpartanJ)
-      1.31  (2011-06-20) a few more leak fixes, bug in PNG handling (SpartanJ)
-      1.30  (2011-06-11) added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
-              removed deprecated format-specific test/load functions
-              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
-              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
-              fix inefficiency in decoding 32-bit BMP (David Woo)
-      1.29  (2010-08-16) various warning fixes from Aurelien Pocheville
-      1.28  (2010-08-01) fix bug in GIF palette transparency (SpartanJ)
-      1.27  (2010-08-01) cast-to-stbi_uc to fix warnings
-      1.26  (2010-07-24) fix bug in file buffering for PNG reported by SpartanJ
-      1.25  (2010-07-17) refix trans_data warning (Won Chun)
-      1.24  (2010-07-12) perf improvements reading from files on platforms with lock-heavy fgetc()
-                         minor perf improvements for jpeg
-                         deprecated type-specific functions so we'll get feedback if they're needed
-                         attempt to fix trans_data warning (Won Chun)
-      1.23    fixed bug in iPhone support
-      1.22  (2010-07-10) removed image *writing* support
-              stbi_info support from Jetro Lauha
-              GIF support from Jean-Marc Lienher
-              iPhone PNG-extensions from James Brown
-              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva)
-      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
-      1.20    added support for Softimage PIC, by Tom Seddon
-      1.19    bug in interlaced PNG corruption check (found by ryg)
-      1.18  (2008-08-02) fix a threading bug (local mutable static)
-      1.17    support interlaced PNG
-      1.16    major bugfix - stbi__convert_format converted one too many pixels
-      1.15    initialize some fields for thread safety
-      1.14    fix threadsafe conversion bug
-              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
-      1.13    threadsafe
-      1.12    const qualifiers in the API
-      1.11    Support installable IDCT, colorspace conversion routines
-      1.10    Fixes for 64-bit (don't use "unsigned long")
-              optimized upsampling by Fabian "ryg" Giesen
-      1.09    Fix format-conversion for PSD code (bad global variables!)
-      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
-      1.07    attempt to fix C++ warning/errors again
-      1.06    attempt to fix C++ warning/errors again
-      1.05    fix TGA loading to return correct *comp and use good luminance calc
-      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
-      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
-      1.02    support for (subset of) HDR files, float interface for preferred access to them
-      1.01    fix bug: possible bug in handling right-side up bmps... not sure
-              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
-      1.00    interface to zlib that skips zlib header
-      0.99    correct handling of alpha in palette
-      0.98    TGA loader by lonesock; dynamically add loaders (untested)
-      0.97    jpeg errors on too large a file; also catch another malloc failure
-      0.96    fix detection of invalid v value - particleman@mollyrocket forum
-      0.95    during header scan, seek to markers in case of padding
-      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
-      0.93    handle jpegtran output; verbose errors
-      0.92    read 4,8,16,24,32-bit BMP files of several formats
-      0.91    output 24-bit Windows 3.0 BMP files
-      0.90    fix a few more warnings; bump version number to approach 1.0
-      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
-      0.60    fix compiling as c++
-      0.59    fix warnings: merge Dave Moore's -Wall fixes
-      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
-      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
-      0.56    fix bug: zlib uncompressed mode len vs. nlen
-      0.55    fix bug: restart_interval not initialized to 0
-      0.54    allow NULL for 'int *comp'
-      0.53    fix bug in png 3->4; speedup png decoding
-      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
-      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
-              on 'test' only check type, not whether we support this variant
-      0.50  (2006-11-19)
-              first released version
-*/
-
-/*
-------------------------------------------------------------------------------
-This software is available under 2 licenses -- choose whichever you prefer.
-------------------------------------------------------------------------------
-ALTERNATIVE A - MIT License
-Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-------------------------------------------------------------------------------
-ALTERNATIVE B - Public Domain (www.unlicense.org)
-This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------
-*/
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h
deleted file mode 100644
index 84b84981b44876c35c9bb6cce1af402ec302c3eb..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h
+++ /dev/null
@@ -1,1933 +0,0 @@
-/* stb_image_write - v1.13 - public domain - http://nothings.org/stb
-   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
-                                     no warranty implied; use at your own risk
-
-   Before #including,
-
-       #define STB_IMAGE_WRITE_IMPLEMENTATION
-
-   in the file that you want to have the implementation.
-
-   Will probably not work correctly with strict-aliasing optimizations.
-
-ABOUT:
-
-   This header file is a library for writing images to C stdio or a callback.
-
-   The PNG output is not optimal; it is 20-50% larger than the file
-   written by a decent optimizing implementation; though providing a custom
-   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
-   This library is designed for source code compactness and simplicity,
-   not optimal image file size or run-time performance.
-
-BUILDING:
-
-   You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
-   You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
-   malloc,realloc,free.
-   You can #define STBIW_MEMMOVE() to replace memmove()
-   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress
-   function for PNG compression (instead of the builtin one); it must have the
-   following signature:
-       unsigned char *my_compress(unsigned char *data, int data_len, int *out_len, int quality);
-   The returned data will be freed with STBIW_FREE() (free() by default), so it
-   must be heap allocated with STBIW_MALLOC() (malloc() by default).
-
-UNICODE:
-
-   If compiling for Windows and you wish to use Unicode filenames, compile
-   with
-       #define STBIW_WINDOWS_UTF8
-   and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert
-   Windows wchar_t filenames to utf8.
-
-USAGE:
-
-   There are five functions, one for each image file format:
-
-     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
-     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
-     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
-
-     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
-
-   There are also five equivalent functions that use an arbitrary write
-function. You are expected to open/close your file-equivalent before and after
-calling these:
-
-     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data, int stride_in_bytes);
-     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
-     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void *data);
-     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
-     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
-
-   where the callback is:
-      void stbi_write_func(void *context, void *data, int size);
-
-   You can configure it with these global variables:
-      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
-      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
-      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
-
-
-   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
-   functions, so the library will not use stdio.h at all. However, this will
-   also disable HDR writing, because it requires stdio for formatted output.
-
-   Each function returns 0 on failure and non-0 on success.
-
-   The functions create an image file defined by the parameters. The image
-   is a rectangle of pixels stored from left-to-right, top-to-bottom.
-   Each pixel contains 'comp' channels of data stored interleaved with 8-bits
-   per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
-   monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
-   The *data pointer points to the first byte of the top-left-most pixel.
-   For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
-   a row of pixels to the first byte of the next row of pixels.
-
-   PNG creates output files with the same number of components as the input.
-   The BMP format expands Y to RGB in the file format and does not
-   output alpha.
-
-   PNG supports writing rectangles of data even when the bytes storing rows of
-   data are not consecutive in memory (e.g. sub-rectangles of a larger image),
-   by supplying the stride between the beginning of adjacent rows. The other
-   formats do not. (Thus you cannot write a native-format BMP through the BMP
-   writer, both because it is in BGR order and because it may have padding
-   at the end of the line.)
-
-   PNG allows you to set the deflate compression level by setting the global
-   variable 'stbi_write_png_compression_level' (it defaults to 8).
-
-   HDR expects linear float data. Since the format is always 32-bit rgb(e)
-   data, alpha (if provided) is discarded, and for monochrome data it is
-   replicated across all three channels.
-
-   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
-   data, set the global variable 'stbi_write_tga_with_rle' to 0.
-
-   JPEG ignores alpha channels in input data; quality is between 1 and 100.
-   Higher quality looks better but results in a bigger image.
-   Only baseline JPEG is written (no progressive JPEG).
-
-CREDITS:
-
-
-   Sean Barrett           -    PNG/BMP/TGA
-   Baldur Karlsson        -    HDR
-   Jean-Sebastien Guay    -    TGA monochrome
-   Tim Kelsey             -    misc enhancements
-   Alan Hickman           -    TGA RLE
-   Emmanuel Julien        -    initial file IO callback implementation
-   Jon Olick              -    original jo_jpeg.cpp code
-   Daniel Gibson          -    integrate JPEG, allow external zlib
-   Aarni Koskela          -    allow choosing PNG filter
-
-   bugfixes:
-      github:Chribba
-      Guillaume Chereau
-      github:jry2
-      github:romigrou
-      Sergio Gonzalez
-      Jonas Karlsson
-      Filip Wasil
-      Thatcher Ulrich
-      github:poppolopoppo
-      Patrick Boettcher
-      github:xeekworx
-      Cap Petschulat
-      Simon Rodriguez
-      Ivan Tikhonov
-      github:ignotion
-      Adam Schackart
-
-LICENSE
-
-  See end of file for license information.
-
-*/
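For context alongside the USAGE notes above, a minimal sketch of the file-based API; the include path, output filename, and pixel contents are illustrative, and the implementation must be defined in exactly one translation unit.

```c
/* Minimal sketch of the file-based API described above; the include path,
 * filename, and pixel contents are illustrative. */
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "stb_image_write.h"

int main(void) {
    enum { W = 64, H = 64, COMP = 4 };           /* 4 = RGBA, 8 bits per channel */
    static unsigned char pixels[W * H * COMP];   /* top-left pixel comes first   */
    for (int i = 0; i < W * H; ++i) {
        pixels[i * COMP + 0] = (unsigned char)((i % W) * 4); /* red gradient */
        pixels[i * COMP + 3] = 255;                          /* opaque alpha */
    }
    /* stride_in_bytes: distance between rows; W * COMP for tightly packed data */
    return stbi_write_png("out.png", W, H, COMP, pixels, W * COMP) ? 0 : 1;
}
```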
-
-#ifndef INCLUDE_STB_IMAGE_WRITE_H
-#define INCLUDE_STB_IMAGE_WRITE_H
-
-#include <stdlib.h>
-
-// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline'
-// or 'static inline'
-#ifndef STBIWDEF
-#ifdef STB_IMAGE_WRITE_STATIC
-#define STBIWDEF static
-#else
-#ifdef __cplusplus
-#define STBIWDEF extern "C"
-#else
-#define STBIWDEF extern
-#endif
-#endif
-#endif
-
-#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations
-extern int stbi_write_tga_with_rle;
-extern int stbi_write_png_compression_level;
-extern int stbi_write_force_png_filter;
-#endif
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp,
-                            const void *data, int stride_in_bytes);
-STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp,
-                            const void *data);
-STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp,
-                            const void *data);
-STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp,
-                            const float *data);
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp,
-                            const void *data, int quality);
-
-#ifdef STBIW_WINDOWS_UTF8
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
-                                         const wchar_t *input);
-#endif
-#endif
-
-typedef void stbi_write_func(void *context, void *data, int size);
-
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w,
-                                    int h, int comp, const void *data,
-                                    int stride_in_bytes);
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w,
-                                    int h, int comp, const void *data);
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w,
-                                    int h, int comp, const void *data);
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w,
-                                    int h, int comp, const float *data);
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
-                                    int y, int comp, const void *data,
-                                    int quality);
-
-STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
-
-#endif // INCLUDE_STB_IMAGE_WRITE_H
-
-#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
-
-#ifdef _WIN32
-#ifndef _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_WARNINGS
-#endif
-#ifndef _CRT_NONSTDC_NO_DEPRECATE
-#define _CRT_NONSTDC_NO_DEPRECATE
-#endif
-#endif
-
-#ifndef STBI_WRITE_NO_STDIO
-#include <stdio.h>
-#endif // STBI_WRITE_NO_STDIO
-
-#include <math.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-
-#if defined(STBIW_MALLOC) && defined(STBIW_FREE) &&                            \
-    (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
-// ok
-#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) &&                        \
-    !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
-// ok
-#else
-#error                                                                         \
-    "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
-#endif
-
-#ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz) malloc(sz)
-#define STBIW_REALLOC(p, newsz) realloc(p, newsz)
-#define STBIW_FREE(p) free(p)
-#endif
-
-#ifndef STBIW_REALLOC_SIZED
-#define STBIW_REALLOC_SIZED(p, oldsz, newsz) STBIW_REALLOC(p, newsz)
-#endif
-
-#ifndef STBIW_MEMMOVE
-#define STBIW_MEMMOVE(a, b, sz) memmove(a, b, sz)
-#endif
-
-#ifndef STBIW_ASSERT
-#include <assert.h>
-#define STBIW_ASSERT(x) assert(x)
-#endif
-
-#define STBIW_UCHAR(x) (unsigned char)((x)&0xff)
-
-#ifdef STB_IMAGE_WRITE_STATIC
-static int stbi__flip_vertically_on_write = 0;
-static int stbi_write_png_compression_level = 8;
-static int stbi_write_tga_with_rle = 1;
-static int stbi_write_force_png_filter = -1;
-#else
-int stbi_write_png_compression_level = 8;
-int stbi__flip_vertically_on_write = 0;
-int stbi_write_tga_with_rle = 1;
-int stbi_write_force_png_filter = -1;
-#endif
-
-STBIWDEF void stbi_flip_vertically_on_write(int flag) {
-  stbi__flip_vertically_on_write = flag;
-}
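For illustration, a sketch of setting the tuning knobs declared above before any write call; the chosen values are arbitrary.

```c
/* Illustrative configuration of the writer's global knobs (values arbitrary). */
static void configure_writer(void) {
    stbi_write_png_compression_level = 9; /* default 8; higher trades speed for size      */
    stbi_write_force_png_filter = -1;     /* default -1 = pick per row; 0..4 forces one    */
    stbi_write_tga_with_rle = 0;          /* default 1; 0 writes uncompressed TGA data     */
    stbi_flip_vertically_on_write(1);     /* treat the supplied rows as bottom-up          */
}
```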
-
-typedef struct {
-  stbi_write_func *func;
-  void *context;
-} stbi__write_context;
-
-// initialize a callback-based context
-static void stbi__start_write_callbacks(stbi__write_context *s,
-                                        stbi_write_func *c, void *context) {
-  s->func = c;
-  s->context = context;
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-
-static void stbi__stdio_write(void *context, void *data, int size) {
-  fwrite(data, 1, size, (FILE *)context);
-}
-
-#if defined(_MSC_VER) && defined(STBIW_WINDOWS_UTF8)
-#ifdef __cplusplus
-#define STBIW_EXTERN extern "C"
-#else
-#define STBIW_EXTERN extern
-#endif
-STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(
-    unsigned int cp, unsigned long flags, const char *str, int cbmb,
-    wchar_t *widestr, int cchwide);
-STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(
-    unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide,
-    char *str, int cbmb, const char *defchar, int *used_default);
-
-STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen,
-                                         const wchar_t *input) {
-  return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer,
-                             (int)bufferlen, NULL, NULL);
-}
-#endif
-
-static FILE *stbiw__fopen(char const *filename, char const *mode) {
-  FILE *f;
-#if defined(_MSC_VER) && defined(STBIW_WINDOWS_UTF8)
-  wchar_t wMode[64];
-  wchar_t wFilename[1024];
-  if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename,
-                               sizeof(wFilename)))
-    return 0;
-
-  if (0 ==
-      MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
-    return 0;
-
-#if _MSC_VER >= 1400
-  if (0 != _wfopen_s(&f, wFilename, wMode))
-    f = 0;
-#else
-  f = _wfopen(wFilename, wMode);
-#endif
-
-#elif defined(_MSC_VER) && _MSC_VER >= 1400
-  if (0 != fopen_s(&f, filename, mode))
-    f = 0;
-#else
-  f = fopen(filename, mode);
-#endif
-  return f;
-}
-
-static int stbi__start_write_file(stbi__write_context *s,
-                                  const char *filename) {
-  FILE *f = stbiw__fopen(filename, "wb");
-  stbi__start_write_callbacks(s, stbi__stdio_write, (void *)f);
-  return f != NULL;
-}
-
-static void stbi__end_write_file(stbi__write_context *s) {
-  fclose((FILE *)s->context);
-}
-
-#endif // !STBI_WRITE_NO_STDIO
-
-typedef unsigned int stbiw_uint32;
-typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1];
-
-static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) {
-  while (*fmt) {
-    switch (*fmt++) {
-    case ' ':
-      break;
-    case '1': {
-      unsigned char x = STBIW_UCHAR(va_arg(v, int));
-      s->func(s->context, &x, 1);
-      break;
-    }
-    case '2': {
-      int x = va_arg(v, int);
-      unsigned char b[2];
-      b[0] = STBIW_UCHAR(x);
-      b[1] = STBIW_UCHAR(x >> 8);
-      s->func(s->context, b, 2);
-      break;
-    }
-    case '4': {
-      stbiw_uint32 x = va_arg(v, int);
-      unsigned char b[4];
-      b[0] = STBIW_UCHAR(x);
-      b[1] = STBIW_UCHAR(x >> 8);
-      b[2] = STBIW_UCHAR(x >> 16);
-      b[3] = STBIW_UCHAR(x >> 24);
-      s->func(s->context, b, 4);
-      break;
-    }
-    default:
-      STBIW_ASSERT(0);
-      return;
-    }
-  }
-}
-
-static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) {
-  va_list v;
-  va_start(v, fmt);
-  stbiw__writefv(s, fmt, v);
-  va_end(v);
-}
-
-static void stbiw__putc(stbi__write_context *s, unsigned char c) {
-  s->func(s->context, &c, 1);
-}
-
-static void stbiw__write3(stbi__write_context *s, unsigned char a,
-                          unsigned char b, unsigned char c) {
-  unsigned char arr[3];
-  arr[0] = a;
-  arr[1] = b;
-  arr[2] = c;
-  s->func(s->context, arr, 3);
-}
-
-static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp,
-                               int write_alpha, int expand_mono,
-                               unsigned char *d) {
-  unsigned char bg[3] = {255, 0, 255}, px[3];
-  int k;
-
-  if (write_alpha < 0)
-    s->func(s->context, &d[comp - 1], 1);
-
-  switch (comp) {
-  case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as
-          // 1-channel case
-  case 1:
-    if (expand_mono)
-      stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
-    else
-      s->func(s->context, d, 1); // monochrome TGA
-    break;
-  case 4:
-    if (!write_alpha) {
-      // composite against pink background
-      for (k = 0; k < 3; ++k)
-        px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
-      stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
-      break;
-    }
-    /* FALLTHROUGH */
-  case 3:
-    stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
-    break;
-  }
-  if (write_alpha > 0)
-    s->func(s->context, &d[comp - 1], 1);
-}
-
-static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir,
-                                int x, int y, int comp, void *data,
-                                int write_alpha, int scanline_pad,
-                                int expand_mono) {
-  stbiw_uint32 zero = 0;
-  int i, j, j_end;
-
-  if (y <= 0)
-    return;
-
-  if (stbi__flip_vertically_on_write)
-    vdir *= -1;
-
-  if (vdir < 0) {
-    j_end = -1;
-    j = y - 1;
-  } else {
-    j_end = y;
-    j = 0;
-  }
-
-  for (; j != j_end; j += vdir) {
-    for (i = 0; i < x; ++i) {
-      unsigned char *d = (unsigned char *)data + (j * x + i) * comp;
-      stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
-    }
-    s->func(s->context, &zero, scanline_pad);
-  }
-}
-
-static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x,
-                          int y, int comp, int expand_mono, void *data,
-                          int alpha, int pad, const char *fmt, ...) {
-  if (y < 0 || x < 0) {
-    return 0;
-  } else {
-    va_list v;
-    va_start(v, fmt);
-    stbiw__writefv(s, fmt, v);
-    va_end(v);
-    stbiw__write_pixels(s, rgb_dir, vdir, x, y, comp, data, alpha, pad,
-                        expand_mono);
-    return 1;
-  }
-}
-
-static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp,
-                               const void *data) {
-  int pad = (-x * 3) & 3;
-  return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void *)data, 0, pad,
-                        "11 4 22 4"
-                        "4 44 22 444444",
-                        'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0,
-                        14 + 40,                            // file header
-                        40, x, y, 1, 24, 0, 0, 0, 0, 0, 0); // bitmap header
-}
-
-STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x,
-                                    int y, int comp, const void *data) {
-  stbi__write_context s;
-  stbi__start_write_callbacks(&s, func, context);
-  return stbi_write_bmp_core(&s, x, y, comp, data);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp,
-                            const void *data) {
-  stbi__write_context s;
-  if (stbi__start_write_file(&s, filename)) {
-    int r = stbi_write_bmp_core(&s, x, y, comp, data);
-    stbi__end_write_file(&s);
-    return r;
-  } else
-    return 0;
-}
-#endif //! STBI_WRITE_NO_STDIO
-
-static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp,
-                               void *data) {
-  int has_alpha = (comp == 2 || comp == 4);
-  int colorbytes = has_alpha ? comp - 1 : comp;
-  int format =
-      colorbytes < 2
-          ? 3
-          : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
-
-  if (y < 0 || x < 0)
-    return 0;
-
-  if (!stbi_write_tga_with_rle) {
-    return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *)data, has_alpha, 0,
-                          "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y,
-                          (colorbytes + has_alpha) * 8, has_alpha * 8);
-  } else {
-    int i, j, k;
-    int jend, jdir;
-
-    stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y,
-                  (colorbytes + has_alpha) * 8, has_alpha * 8);
-
-    if (stbi__flip_vertically_on_write) {
-      j = 0;
-      jend = y;
-      jdir = 1;
-    } else {
-      j = y - 1;
-      jend = -1;
-      jdir = -1;
-    }
-    for (; j != jend; j += jdir) {
-      unsigned char *row = (unsigned char *)data + j * x * comp;
-      int len;
-
-      for (i = 0; i < x; i += len) {
-        unsigned char *begin = row + i * comp;
-        int diff = 1;
-        len = 1;
-
-        if (i < x - 1) {
-          ++len;
-          diff = memcmp(begin, row + (i + 1) * comp, comp);
-          if (diff) {
-            const unsigned char *prev = begin;
-            for (k = i + 2; k < x && len < 128; ++k) {
-              if (memcmp(prev, row + k * comp, comp)) {
-                prev += comp;
-                ++len;
-              } else {
-                --len;
-                break;
-              }
-            }
-          } else {
-            for (k = i + 2; k < x && len < 128; ++k) {
-              if (!memcmp(begin, row + k * comp, comp)) {
-                ++len;
-              } else {
-                break;
-              }
-            }
-          }
-        }
-
-        if (diff) {
-          unsigned char header = STBIW_UCHAR(len - 1);
-          s->func(s->context, &header, 1);
-          for (k = 0; k < len; ++k) {
-            stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
-          }
-        } else {
-          unsigned char header = STBIW_UCHAR(len - 129);
-          s->func(s->context, &header, 1);
-          stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
-        }
-      }
-    }
-  }
-  return 1;
-}
-
-STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x,
-                                    int y, int comp, const void *data) {
-  stbi__write_context s;
-  stbi__start_write_callbacks(&s, func, context);
-  return stbi_write_tga_core(&s, x, y, comp, (void *)data);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp,
-                            const void *data) {
-  stbi__write_context s;
-  if (stbi__start_write_file(&s, filename)) {
-    int r = stbi_write_tga_core(&s, x, y, comp, (void *)data);
-    stbi__end_write_file(&s);
-    return r;
-  } else
-    return 0;
-}
-#endif
-
-// *************************************************************************************************
-// Radiance RGBE HDR writer
-// by Baldur Karlsson
-
-#define stbiw__max(a, b) ((a) > (b) ? (a) : (b))
-
-static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) {
-  int exponent;
-  float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
-
-  if (maxcomp < 1e-32f) {
-    rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
-  } else {
-    float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp;
-
-    rgbe[0] = (unsigned char)(linear[0] * normalize);
-    rgbe[1] = (unsigned char)(linear[1] * normalize);
-    rgbe[2] = (unsigned char)(linear[2] * normalize);
-    rgbe[3] = (unsigned char)(exponent + 128);
-  }
-}
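A standalone sketch (not part of the library) reproducing the shared-exponent math above on one illustrative pixel, together with the standard RGBE decode relation:

```c
/* Standalone sketch of the RGBE math above; the pixel value is illustrative. */
#include <math.h>
#include <stdio.h>

int main(void) {
    float lin[3] = {1.0f, 0.5f, 0.25f};
    int e;
    float maxc = lin[0];                            /* largest component here   */
    float norm = frexpf(maxc, &e) * 256.0f / maxc;  /* 1.0 = 0.5 * 2^1 -> 128.0 */
    unsigned char rgbe[4] = {(unsigned char)(lin[0] * norm),  /* 128 */
                             (unsigned char)(lin[1] * norm),  /*  64 */
                             (unsigned char)(lin[2] * norm),  /*  32 */
                             (unsigned char)(e + 128)};       /* 129 */
    /* decoding: value = mantissa_byte * 2^(rgbe[3] - 128 - 8) */
    printf("R decodes to %g\n", rgbe[0] * ldexp(1.0, rgbe[3] - 128 - 8)); /* 1.0 */
    return 0;
}
```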
-
-static void stbiw__write_run_data(stbi__write_context *s, int length,
-                                  unsigned char databyte) {
-  unsigned char lengthbyte = STBIW_UCHAR(length + 128);
-  STBIW_ASSERT(length + 128 <= 255);
-  s->func(s->context, &lengthbyte, 1);
-  s->func(s->context, &databyte, 1);
-}
-
-static void stbiw__write_dump_data(stbi__write_context *s, int length,
-                                   unsigned char *data) {
-  unsigned char lengthbyte = STBIW_UCHAR(length);
-  STBIW_ASSERT(length <=
-               128); // inconsistent with spec but consistent with official code
-  s->func(s->context, &lengthbyte, 1);
-  s->func(s->context, data, length);
-}
-
-static void stbiw__write_hdr_scanline(stbi__write_context *s, int width,
-                                      int ncomp, unsigned char *scratch,
-                                      float *scanline) {
-  unsigned char scanlineheader[4] = {2, 2, 0, 0};
-  unsigned char rgbe[4];
-  float linear[3];
-  int x;
-
-  scanlineheader[2] = (width & 0xff00) >> 8;
-  scanlineheader[3] = (width & 0x00ff);
-
-  /* skip RLE for images too small or large */
-  if (width < 8 || width >= 32768) {
-    for (x = 0; x < width; x++) {
-      switch (ncomp) {
-      case 4: /* fallthrough */
-      case 3:
-        linear[2] = scanline[x * ncomp + 2];
-        linear[1] = scanline[x * ncomp + 1];
-        linear[0] = scanline[x * ncomp + 0];
-        break;
-      default:
-        linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
-        break;
-      }
-      stbiw__linear_to_rgbe(rgbe, linear);
-      s->func(s->context, rgbe, 4);
-    }
-  } else {
-    int c, r;
-    /* encode into scratch buffer */
-    for (x = 0; x < width; x++) {
-      switch (ncomp) {
-      case 4: /* fallthrough */
-      case 3:
-        linear[2] = scanline[x * ncomp + 2];
-        linear[1] = scanline[x * ncomp + 1];
-        linear[0] = scanline[x * ncomp + 0];
-        break;
-      default:
-        linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0];
-        break;
-      }
-      stbiw__linear_to_rgbe(rgbe, linear);
-      scratch[x + width * 0] = rgbe[0];
-      scratch[x + width * 1] = rgbe[1];
-      scratch[x + width * 2] = rgbe[2];
-      scratch[x + width * 3] = rgbe[3];
-    }
-
-    s->func(s->context, scanlineheader, 4);
-
-    /* RLE each component separately */
-    for (c = 0; c < 4; c++) {
-      unsigned char *comp = &scratch[width * c];
-
-      x = 0;
-      while (x < width) {
-        // find first run
-        r = x;
-        while (r + 2 < width) {
-          if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2])
-            break;
-          ++r;
-        }
-        if (r + 2 >= width)
-          r = width;
-        // dump up to first run
-        while (x < r) {
-          int len = r - x;
-          if (len > 128)
-            len = 128;
-          stbiw__write_dump_data(s, len, &comp[x]);
-          x += len;
-        }
-        // if there's a run, output it
-        if (r + 2 < width) { // same test as what we break out of in search
-                             // loop, so only true if we break'd
-          // find next byte after run
-          while (r < width && comp[r] == comp[x])
-            ++r;
-          // output run up to r
-          while (x < r) {
-            int len = r - x;
-            if (len > 127)
-              len = 127;
-            stbiw__write_run_data(s, len, comp[x]);
-            x += len;
-          }
-        }
-      }
-    }
-  }
-}
-
-static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp,
-                               float *data) {
-  if (y <= 0 || x <= 0 || data == NULL)
-    return 0;
-  else {
-    // Each component is stored separately. Allocate scratch space for full
-    // output scanline.
-    unsigned char *scratch = (unsigned char *)STBIW_MALLOC(x * 4);
-    int i, len;
-    char buffer[128];
-    char header[] =
-        "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
-    s->func(s->context, header, sizeof(header) - 1);
-
-#ifdef __STDC_WANT_SECURE_LIB__
-    len =
-        sprintf_s(buffer, sizeof(buffer),
-                  "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
-#else
-    len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n",
-                  y, x);
-#endif
-    s->func(s->context, buffer, len);
-
-    for (i = 0; i < y; i++)
-      stbiw__write_hdr_scanline(
-          s, x, comp, scratch,
-          data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i));
-    STBIW_FREE(scratch);
-    return 1;
-  }
-}
-
-STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x,
-                                    int y, int comp, const float *data) {
-  stbi__write_context s;
-  stbi__start_write_callbacks(&s, func, context);
-  return stbi_write_hdr_core(&s, x, y, comp, (float *)data);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp,
-                            const float *data) {
-  stbi__write_context s;
-  if (stbi__start_write_file(&s, filename)) {
-    int r = stbi_write_hdr_core(&s, x, y, comp, (float *)data);
-    stbi__end_write_file(&s);
-    return r;
-  } else
-    return 0;
-}
-#endif // STBI_WRITE_NO_STDIO
-
-//////////////////////////////////////////////////////////////////////////////
-//
-// PNG writer
-//
-
-#ifndef STBIW_ZLIB_COMPRESS
-// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount()
-// == vector<>::size()
-#define stbiw__sbraw(a) ((int *)(a)-2)
-#define stbiw__sbm(a) stbiw__sbraw(a)[0]
-#define stbiw__sbn(a) stbiw__sbraw(a)[1]
-
-#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a))
-#define stbiw__sbmaybegrow(a, n)                                               \
-  (stbiw__sbneedgrow(a, (n)) ? stbiw__sbgrow(a, n) : 0)
-#define stbiw__sbgrow(a, n) stbiw__sbgrowf((void **)&(a), (n), sizeof(*(a)))
-
-#define stbiw__sbpush(a, v)                                                    \
-  (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v))
-#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0)
-#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0)
-
-static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) {
-  int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1;
-  void *p = STBIW_REALLOC_SIZED(
-      *arr ? stbiw__sbraw(*arr) : 0,
-      *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0,
-      itemsize * m + sizeof(int) * 2);
-  STBIW_ASSERT(p);
-  if (p) {
-    if (!*arr)
-      ((int *)p)[1] = 0;
-    *arr = (void *)((int *)p + 2);
-    stbiw__sbm(*arr) = m;
-  }
-  return *arr;
-}
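For orientation, a sketch of how these file-local stretchy-buffer macros behave; the demo function is hypothetical and would only compile inside this implementation section:

```c
/* Sketch only: exercising the file-local stretchy-buffer macros defined above. */
static int stbiw__sb_demo(void) {
    int *values = NULL;              /* empty buffer; header allocated on first push */
    int v, n;
    for (v = 0; v < 10; ++v)
        stbiw__sbpush(values, v);    /* grows geometrically via stbiw__sbgrowf       */
    n = stbiw__sbcount(values);      /* n == 10                                      */
    stbiw__sbfree(values);           /* frees the raw block, including its 2-int header */
    return n;
}
```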
-
-static unsigned char *stbiw__zlib_flushf(unsigned char *data,
-                                         unsigned int *bitbuffer,
-                                         int *bitcount) {
-  while (*bitcount >= 8) {
-    stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
-    *bitbuffer >>= 8;
-    *bitcount -= 8;
-  }
-  return data;
-}
-
-static int stbiw__zlib_bitrev(int code, int codebits) {
-  int res = 0;
-  while (codebits--) {
-    res = (res << 1) | (code & 1);
-    code >>= 1;
-  }
-  return res;
-}
-
-static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b,
-                                       int limit) {
-  int i;
-  for (i = 0; i < limit && i < 258; ++i)
-    if (a[i] != b[i])
-      break;
-  return i;
-}
-
-static unsigned int stbiw__zhash(unsigned char *data) {
-  stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16);
-  hash ^= hash << 3;
-  hash += hash >> 5;
-  hash ^= hash << 4;
-  hash += hash >> 17;
-  hash ^= hash << 25;
-  hash += hash >> 6;
-  return hash;
-}
-
-#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount))
-#define stbiw__zlib_add(code, codebits)                                        \
-  (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
-#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c)
-// default huffman tables
-#define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 8)
-#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9)
-#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256, 7)
-#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280, 8)
-#define stbiw__zlib_huff(n)                                                    \
-  ((n) <= 143 ? stbiw__zlib_huff1(n)                                           \
-              : (n) <= 255 ? stbiw__zlib_huff2(n)                              \
-                           : (n) <= 279 ? stbiw__zlib_huff3(n)                 \
-                                        : stbiw__zlib_huff4(n))
-#define stbiw__zlib_huffb(n)                                                   \
-  ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
-
-#define stbiw__ZHASH 16384
-
-#endif // STBIW_ZLIB_COMPRESS
-
-STBIWDEF unsigned char *stbi_zlib_compress(unsigned char *data, int data_len,
-                                           int *out_len, int quality) {
-#ifdef STBIW_ZLIB_COMPRESS
-  // user provided a zlib compress implementation, use that
-  return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
-#else  // use builtin
-  static unsigned short lengthc[] = {
-      3,  4,  5,  6,  7,  8,  9,  10, 11,  13,  15,  17,  19,  23,  27,
-      31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259};
-  static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
-                                     1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
-                                     4, 4, 4, 4, 5, 5, 5, 5, 0};
-  static unsigned short distc[] = {
-      1,    2,    3,    4,    5,    7,     9,     13,    17,   25,   33,
-      49,   65,   97,   129,  193,  257,   385,   513,   769,  1025, 1537,
-      2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768};
-  static unsigned char disteb[] = {0, 0, 0,  0,  1,  1,  2,  2,  3,  3,
-                                   4, 4, 5,  5,  6,  6,  7,  7,  8,  8,
-                                   9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
-  unsigned int bitbuf = 0;
-  int i, j, bitcount = 0;
-  unsigned char *out = NULL;
-  unsigned char ***hash_table =
-      (unsigned char ***)STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char **));
-  if (hash_table == NULL)
-    return NULL;
-  if (quality < 5)
-    quality = 5;
-
-  stbiw__sbpush(out, 0x78); // DEFLATE 32K window
-  stbiw__sbpush(out, 0x5e); // FLEVEL = 1
-  stbiw__zlib_add(1, 1);    // BFINAL = 1
-  stbiw__zlib_add(1, 2);    // BTYPE = 1 -- fixed huffman
-
-  for (i = 0; i < stbiw__ZHASH; ++i)
-    hash_table[i] = NULL;
-
-  i = 0;
-  while (i < data_len - 3) {
-    // hash next 3 bytes of data to be compressed
-    int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3;
-    unsigned char *bestloc = 0;
-    unsigned char **hlist = hash_table[h];
-    int n = stbiw__sbcount(hlist);
-    for (j = 0; j < n; ++j) {
-      if (hlist[j] - data > i - 32768) { // if entry lies within window
-        int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i);
-        if (d >= best) {
-          best = d;
-          bestloc = hlist[j];
-        }
-      }
-    }
-    // when hash table entry is too long, delete half the entries
-    if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) {
-      STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality,
-                    sizeof(hash_table[h][0]) * quality);
-      stbiw__sbn(hash_table[h]) = quality;
-    }
-    stbiw__sbpush(hash_table[h], data + i);
-
-    if (bestloc) {
-      // "lazy matching" - check match at *next* byte, and if it's better, do
-      // cur byte as literal
-      h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1);
-      hlist = hash_table[h];
-      n = stbiw__sbcount(hlist);
-      for (j = 0; j < n; ++j) {
-        if (hlist[j] - data > i - 32767) {
-          int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1);
-          if (e > best) { // if next match is better, bail on current match
-            bestloc = NULL;
-            break;
-          }
-        }
-      }
-    }
-
-    if (bestloc) {
-      int d = (int)(data + i - bestloc); // distance back
-      STBIW_ASSERT(d <= 32767 && best <= 258);
-      for (j = 0; best > lengthc[j + 1] - 1; ++j)
-        ;
-      stbiw__zlib_huff(j + 257);
-      if (lengtheb[j])
-        stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
-      for (j = 0; d > distc[j + 1] - 1; ++j)
-        ;
-      stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5);
-      if (disteb[j])
-        stbiw__zlib_add(d - distc[j], disteb[j]);
-      i += best;
-    } else {
-      stbiw__zlib_huffb(data[i]);
-      ++i;
-    }
-  }
-  // write out final bytes
-  for (; i < data_len; ++i)
-    stbiw__zlib_huffb(data[i]);
-  stbiw__zlib_huff(256); // end of block
-  // pad with 0 bits to byte boundary
-  while (bitcount)
-    stbiw__zlib_add(0, 1);
-
-  for (i = 0; i < stbiw__ZHASH; ++i)
-    (void)stbiw__sbfree(hash_table[i]);
-  STBIW_FREE(hash_table);
-
-  {
-    // compute adler32 on input
-    unsigned int s1 = 1, s2 = 0;
-    int blocklen = (int)(data_len % 5552);
-    j = 0;
-    while (j < data_len) {
-      for (i = 0; i < blocklen; ++i) {
-        s1 += data[j + i];
-        s2 += s1;
-      }
-      s1 %= 65521;
-      s2 %= 65521;
-      j += blocklen;
-      blocklen = 5552;
-    }
-    stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
-    stbiw__sbpush(out, STBIW_UCHAR(s2));
-    stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
-    stbiw__sbpush(out, STBIW_UCHAR(s1));
-  }
-  *out_len = stbiw__sbn(out);
-  // make returned pointer freeable
-  STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
-  return (unsigned char *)stbiw__sbraw(out);
-#endif // STBIW_ZLIB_COMPRESS
-}
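Illustrative only: calling stbi_zlib_compress directly on a caller-supplied buffer; `raw` and `raw_len` are hypothetical inputs, and quality 8 mirrors the PNG writer's default compression level.

```c
/* Hypothetical caller: compress an arbitrary buffer into a zlib stream. */
static int demo_compress(unsigned char *raw, int raw_len) {
    int zlen;
    unsigned char *z = stbi_zlib_compress(raw, raw_len, &zlen, 8);
    if (!z)
        return 0;
    /* ... emit z[0 .. zlen) somewhere, e.g. as a PNG IDAT payload ... */
    STBIW_FREE(z);   /* the returned block is owned by the caller */
    return zlen;
}
```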
-
-static unsigned int stbiw__crc32(unsigned char *buffer, int len) {
-#ifdef STBIW_CRC32
-  return STBIW_CRC32(buffer, len);
-#else
-  static unsigned int crc_table[256] = {
-      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
-      0xE963A535, 0x9E6495A3, 0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
-      0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
-      0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9,
-      0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
-      0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
-      0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
-      0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
-      0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
-      0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
-      0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
-      0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
-      0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
-      0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
-      0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
-      0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
-      0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
-      0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
-      0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
-      0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
-      0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
-      0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
-      0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
-      0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
-      0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
-      0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
-      0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
-      0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
-      0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
-      0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
-      0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
-      0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
-      0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D};
-
-  unsigned int crc = ~0u;
-  int i;
-  for (i = 0; i < len; ++i)
-    crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
-  return ~crc;
-#endif
-}
-
-#define stbiw__wpng4(o, a, b, c, d)                                            \
-  ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c),  \
-   (o)[3] = STBIW_UCHAR(d), (o) += 4)
-#define stbiw__wp32(data, v)                                                   \
-  stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v));
-#define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3])
-
-static void stbiw__wpcrc(unsigned char **data, int len) {
-  unsigned int crc = stbiw__crc32(*data - len - 4, len + 4);
-  stbiw__wp32(*data, crc);
-}
-
-static unsigned char stbiw__paeth(int a, int b, int c) {
-  int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
-  if (pa <= pb && pa <= pc)
-    return STBIW_UCHAR(a);
-  if (pb <= pc)
-    return STBIW_UCHAR(b);
-  return STBIW_UCHAR(c);
-}
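A worked example of the predictor, with illustrative byte values:

```c
/* Worked example (illustrative values) for stbiw__paeth:
 *   left a = 100, above b = 200, upper-left c = 50
 *   p  = a + b - c = 250
 *   pa = |p - a| = 150, pb = |p - b| = 50, pc = |p - c| = 200
 *   pb is smallest, so the prediction is b = 200; filter type 4 then
 *   stores the difference z[i] - 200 for that byte. */
```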
-
-// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
-static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes,
-                                   int width, int height, int y, int n,
-                                   int filter_type, signed char *line_buffer) {
-  static int mapping[] = {0, 1, 2, 3, 4};
-  static int firstmap[] = {0, 1, 0, 5, 6};
-  int *mymap = (y != 0) ? mapping : firstmap;
-  int i;
-  int type = mymap[filter_type];
-  unsigned char *z =
-      pixels +
-      stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y);
-  int signed_stride =
-      stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
-
-  if (type == 0) {
-    memcpy(line_buffer, z, width * n);
-    return;
-  }
-
-  // first loop isn't optimized since it's just one pixel
-  for (i = 0; i < n; ++i) {
-    switch (type) {
-    case 1:
-      line_buffer[i] = z[i];
-      break;
-    case 2:
-      line_buffer[i] = z[i] - z[i - signed_stride];
-      break;
-    case 3:
-      line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
-      break;
-    case 4:
-      line_buffer[i] =
-          (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
-      break;
-    case 5:
-      line_buffer[i] = z[i];
-      break;
-    case 6:
-      line_buffer[i] = z[i];
-      break;
-    }
-  }
-  switch (type) {
-  case 1:
-    for (i = n; i < width * n; ++i)
-      line_buffer[i] = z[i] - z[i - n];
-    break;
-  case 2:
-    for (i = n; i < width * n; ++i)
-      line_buffer[i] = z[i] - z[i - signed_stride];
-    break;
-  case 3:
-    for (i = n; i < width * n; ++i)
-      line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
-    break;
-  case 4:
-    for (i = n; i < width * n; ++i)
-      line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride],
-                                           z[i - signed_stride - n]);
-    break;
-  case 5:
-    for (i = n; i < width * n; ++i)
-      line_buffer[i] = z[i] - (z[i - n] >> 1);
-    break;
-  case 6:
-    for (i = n; i < width * n; ++i)
-      line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
-    break;
-  }
-}
-
-STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels,
-                                              int stride_bytes, int x, int y,
-                                              int n, int *out_len) {
-  int force_filter = stbi_write_force_png_filter;
-  int ctype[5] = {-1, 0, 4, 2, 6};
-  unsigned char sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
-  unsigned char *out, *o, *filt, *zlib;
-  signed char *line_buffer;
-  int j, zlen;
-
-  if (stride_bytes == 0)
-    stride_bytes = x * n;
-
-  if (force_filter >= 5) {
-    force_filter = -1;
-  }
-
-  filt = (unsigned char *)STBIW_MALLOC((x * n + 1) * y);
-  if (!filt)
-    return 0;
-  line_buffer = (signed char *)STBIW_MALLOC(x * n);
-  if (!line_buffer) {
-    STBIW_FREE(filt);
-    return 0;
-  }
-  for (j = 0; j < y; ++j) {
-    int filter_type;
-    if (force_filter > -1) {
-      filter_type = force_filter;
-      stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j,
-                             n, force_filter, line_buffer);
-    } else { // Estimate the best filter by running through all of them:
-      int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
-      for (filter_type = 0; filter_type < 5; filter_type++) {
-        stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j,
-                               n, filter_type, line_buffer);
-
-        // Estimate the entropy of the line using this filter; the less, the
-        // better.
-        est = 0;
-        for (i = 0; i < x * n; ++i) {
-          est += abs((signed char)line_buffer[i]);
-        }
-        if (est < best_filter_val) {
-          best_filter_val = est;
-          best_filter = filter_type;
-        }
-      }
-      if (filter_type != best_filter) { // If the last iteration already got us
-                                        // the best filter, don't redo it
-        stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j,
-                               n, best_filter, line_buffer);
-        filter_type = best_filter;
-      }
-    }
-    // when we get here, filter_type contains the filter type, and line_buffer
-    // contains the data
-    filt[j * (x * n + 1)] = (unsigned char)filter_type;
-    STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, x * n);
-  }
-  STBIW_FREE(line_buffer);
-  zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen,
-                            stbi_write_png_compression_level);
-  STBIW_FREE(filt);
-  if (!zlib)
-    return 0;
-
-  // each tag requires 12 bytes of overhead
-  out = (unsigned char *)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12);
-  if (!out)
-    return 0;
-  *out_len = 8 + 12 + 13 + 12 + zlen + 12;
-
-  o = out;
-  STBIW_MEMMOVE(o, sig, 8);
-  o += 8;
-  stbiw__wp32(o, 13); // header length
-  stbiw__wptag(o, "IHDR");
-  stbiw__wp32(o, x);
-  stbiw__wp32(o, y);
-  *o++ = 8;
-  *o++ = STBIW_UCHAR(ctype[n]);
-  *o++ = 0;
-  *o++ = 0;
-  *o++ = 0;
-  stbiw__wpcrc(&o, 13);
-
-  stbiw__wp32(o, zlen);
-  stbiw__wptag(o, "IDAT");
-  STBIW_MEMMOVE(o, zlib, zlen);
-  o += zlen;
-  STBIW_FREE(zlib);
-  stbiw__wpcrc(&o, zlen);
-
-  stbiw__wp32(o, 0);
-  stbiw__wptag(o, "IEND");
-  stbiw__wpcrc(&o, 0);
-
-  STBIW_ASSERT(o == out + *out_len);
-
-  return out;
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp,
-                            const void *data, int stride_bytes) {
-  FILE *f;
-  int len;
-  unsigned char *png = stbi_write_png_to_mem((const unsigned char *)data,
-                                             stride_bytes, x, y, comp, &len);
-  if (png == NULL)
-    return 0;
-
-  f = stbiw__fopen(filename, "wb");
-  if (!f) {
-    STBIW_FREE(png);
-    return 0;
-  }
-  fwrite(png, 1, len, f);
-  fclose(f);
-  STBIW_FREE(png);
-  return 1;
-}
-#endif
-
-STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x,
-                                    int y, int comp, const void *data,
-                                    int stride_bytes) {
-  int len;
-  unsigned char *png = stbi_write_png_to_mem((const unsigned char *)data,
-                                             stride_bytes, x, y, comp, &len);
-  if (png == NULL)
-    return 0;
-  func(context, png, len);
-  STBIW_FREE(png);
-  return 1;
-}
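Illustrative only: a write callback that accumulates the encoded PNG in memory; the `byte_sink` type and its fixed-size buffer are hypothetical, not part of the library.

```c
/* Hypothetical sink for the *_to_func variants: collect the PNG bytes in RAM. */
#include <string.h>

struct byte_sink {
    unsigned char buf[1 << 20];  /* sketch only: fixed 1 MiB, no overflow check */
    int len;
};

static void sink_write(void *context, void *data, int size) {
    struct byte_sink *s = (struct byte_sink *)context;
    memcpy(s->buf + s->len, data, size);
    s->len += size;
}

/* usage:
 *   static struct byte_sink sink;
 *   stbi_write_png_to_func(sink_write, &sink, w, h, 4, pixels, w * 4);
 *   // sink.buf now holds sink.len bytes of PNG data                    */
```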
-
-/* ***************************************************************************
- *
- * JPEG writer
- *
- * This is based on Jon Olick's jo_jpeg.cpp:
- * public domain Simple, Minimalistic JPEG writer -
- * http://www.jonolick.com/code.html
- */
-
-static const unsigned char stbiw__jpg_ZigZag[] = {
-    0,  1,  5,  6,  14, 15, 27, 28, 2,  4,  7,  13, 16, 26, 29, 42,
-    3,  8,  12, 17, 25, 30, 41, 43, 9,  11, 18, 24, 31, 40, 44, 53,
-    10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
-    21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};
-
-static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP,
-                                 int *bitCntP, const unsigned short *bs) {
-  int bitBuf = *bitBufP, bitCnt = *bitCntP;
-  bitCnt += bs[1];
-  bitBuf |= bs[0] << (24 - bitCnt);
-  while (bitCnt >= 8) {
-    unsigned char c = (bitBuf >> 16) & 255;
-    stbiw__putc(s, c);
-    if (c == 255) {
-      stbiw__putc(s, 0);
-    }
-    bitBuf <<= 8;
-    bitCnt -= 8;
-  }
-  *bitBufP = bitBuf;
-  *bitCntP = bitCnt;
-}
-
-static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p,
-                           float *d4p, float *d5p, float *d6p, float *d7p) {
-  float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p,
-        d6 = *d6p, d7 = *d7p;
-  float z1, z2, z3, z4, z5, z11, z13;
-
-  float tmp0 = d0 + d7;
-  float tmp7 = d0 - d7;
-  float tmp1 = d1 + d6;
-  float tmp6 = d1 - d6;
-  float tmp2 = d2 + d5;
-  float tmp5 = d2 - d5;
-  float tmp3 = d3 + d4;
-  float tmp4 = d3 - d4;
-
-  // Even part
-  float tmp10 = tmp0 + tmp3; // phase 2
-  float tmp13 = tmp0 - tmp3;
-  float tmp11 = tmp1 + tmp2;
-  float tmp12 = tmp1 - tmp2;
-
-  d0 = tmp10 + tmp11; // phase 3
-  d4 = tmp10 - tmp11;
-
-  z1 = (tmp12 + tmp13) * 0.707106781f; // c4
-  d2 = tmp13 + z1;                     // phase 5
-  d6 = tmp13 - z1;
-
-  // Odd part
-  tmp10 = tmp4 + tmp5; // phase 2
-  tmp11 = tmp5 + tmp6;
-  tmp12 = tmp6 + tmp7;
-
-  // The rotator is modified from fig 4-8 to avoid extra negations.
-  z5 = (tmp10 - tmp12) * 0.382683433f; // c6
-  z2 = tmp10 * 0.541196100f + z5;      // c2-c6
-  z4 = tmp12 * 1.306562965f + z5;      // c2+c6
-  z3 = tmp11 * 0.707106781f;           // c4
-
-  z11 = tmp7 + z3; // phase 5
-  z13 = tmp7 - z3;
-
-  *d5p = z13 + z2; // phase 6
-  *d3p = z13 - z2;
-  *d1p = z11 + z4;
-  *d7p = z11 - z4;
-
-  *d0p = d0;
-  *d2p = d2;
-  *d4p = d4;
-  *d6p = d6;
-}
-
-static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
-  int tmp1 = val < 0 ? -val : val;
-  val = val < 0 ? val - 1 : val;
-  bits[1] = 1;
-  while (tmp1 >>= 1) {
-    ++bits[1];
-  }
-  bits[0] = val & ((1 << bits[1]) - 1);
-}
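A worked example of the magnitude coding, with illustrative coefficient values:

```c
/* Worked example for stbiw__jpg_calcBits (illustrative values):
 *   val = +3  ->  bits[1] = 2 (category), bits[0] = 3  (binary 11)
 *   val = -3  ->  val becomes -4, bits[1] = 2, bits[0] = (-4) & 3 = 0 (binary 00)
 * The category indexes the Huffman table; bits[0] is then appended verbatim,
 * which is how stbiw__jpg_processDU writes each nonzero coefficient. */
```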
-
-static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf,
-                                int *bitCnt, float *CDU, float *fdtbl, int DC,
-                                const unsigned short HTDC[256][2],
-                                const unsigned short HTAC[256][2]) {
-  const unsigned short EOB[2] = {HTAC[0x00][0], HTAC[0x00][1]};
-  const unsigned short M16zeroes[2] = {HTAC[0xF0][0], HTAC[0xF0][1]};
-  int dataOff, i, diff, end0pos;
-  int DU[64];
-
-  // DCT rows
-  for (dataOff = 0; dataOff < 64; dataOff += 8) {
-    stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2],
-                   &CDU[dataOff + 3], &CDU[dataOff + 4], &CDU[dataOff + 5],
-                   &CDU[dataOff + 6], &CDU[dataOff + 7]);
-  }
-  // DCT columns
-  for (dataOff = 0; dataOff < 8; ++dataOff) {
-    stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16],
-                   &CDU[dataOff + 24], &CDU[dataOff + 32], &CDU[dataOff + 40],
-                   &CDU[dataOff + 48], &CDU[dataOff + 56]);
-  }
-  // Quantize/descale/zigzag the coefficients
-  for (i = 0; i < 64; ++i) {
-    float v = CDU[i] * fdtbl[i];
-    // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
-    // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
-    DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f);
-  }
-
-  // Encode DC
-  diff = DU[0] - DC;
-  if (diff == 0) {
-    stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
-  } else {
-    unsigned short bits[2];
-    stbiw__jpg_calcBits(diff, bits);
-    stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
-    stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-  }
-  // Encode ACs
-  end0pos = 63;
-  for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) {
-  }
-  // end0pos = first element in reverse order !=0
-  if (end0pos == 0) {
-    stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-    return DU[0];
-  }
-  for (i = 1; i <= end0pos; ++i) {
-    int startpos = i;
-    int nrzeroes;
-    unsigned short bits[2];
-    for (; DU[i] == 0 && i <= end0pos; ++i) {
-    }
-    nrzeroes = i - startpos;
-    if (nrzeroes >= 16) {
-      int lng = nrzeroes >> 4;
-      int nrmarker;
-      for (nrmarker = 1; nrmarker <= lng; ++nrmarker)
-        stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
-      nrzeroes &= 15;
-    }
-    stbiw__jpg_calcBits(DU[i], bits);
-    stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]);
-    stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
-  }
-  if (end0pos != 63) {
-    stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
-  }
-  return DU[0];
-}
-
-static int stbi_write_jpg_core(stbi__write_context *s, int width, int height,
-                               int comp, const void *data, int quality) {
-  // Constants that don't pollute global namespace
-  static const unsigned char std_dc_luminance_nrcodes[] = {
-      0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0};
-  static const unsigned char std_dc_luminance_values[] = {0, 1, 2, 3, 4,  5,
-                                                          6, 7, 8, 9, 10, 11};
-  static const unsigned char std_ac_luminance_nrcodes[] = {
-      0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d};
-  static const unsigned char std_ac_luminance_values[] = {
-      0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
-      0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
-      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72,
-      0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
-      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,
-      0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
-      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75,
-      0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
-      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3,
-      0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
-      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9,
-      0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
-      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4,
-      0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
-  static const unsigned char std_dc_chrominance_nrcodes[] = {
-      0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0};
-  static const unsigned char std_dc_chrominance_values[] = {0, 1, 2, 3, 4,  5,
-                                                            6, 7, 8, 9, 10, 11};
-  static const unsigned char std_ac_chrominance_nrcodes[] = {
-      0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77};
-  static const unsigned char std_ac_chrominance_values[] = {
-      0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41,
-      0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
-      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1,
-      0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
-      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44,
-      0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
-      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74,
-      0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a,
-      0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
-      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
-      0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
-      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4,
-      0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa};
-  // Huffman tables
-  static const unsigned short YDC_HT[256][2] = {
-      {0, 2},  {2, 3},  {3, 3},  {4, 3},   {5, 3},   {6, 3},
-      {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}};
-  static const unsigned short UVDC_HT[256][2] = {
-      {0, 2},  {1, 2},   {2, 2},   {6, 3},   {14, 4},    {30, 5},
-      {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}};
-  static const unsigned short YAC_HT[256][2] = {
-      {10, 4},     {0, 2},      {1, 2},      {4, 3},      {11, 4},
-      {26, 5},     {120, 7},    {248, 8},    {1014, 10},  {65410, 16},
-      {65411, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {12, 4},     {27, 5},     {121, 7},
-      {502, 9},    {2038, 11},  {65412, 16}, {65413, 16}, {65414, 16},
-      {65415, 16}, {65416, 16}, {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {28, 5},     {249, 8},
-      {1015, 10},  {4084, 12},  {65417, 16}, {65418, 16}, {65419, 16},
-      {65420, 16}, {65421, 16}, {65422, 16}, {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {58, 6},
-      {503, 9},    {4085, 12},  {65423, 16}, {65424, 16}, {65425, 16},
-      {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {59, 6},     {1016, 10},  {65430, 16}, {65431, 16}, {65432, 16},
-      {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {122, 7},    {2039, 11},  {65438, 16}, {65439, 16},
-      {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16},
-      {65445, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {123, 7},    {4086, 12},  {65446, 16},
-      {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16},
-      {65452, 16}, {65453, 16}, {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {250, 8},    {4087, 12},
-      {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16},
-      {65459, 16}, {65460, 16}, {65461, 16}, {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {504, 9},
-      {32704, 15}, {65462, 16}, {65463, 16}, {65464, 16}, {65465, 16},
-      {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {505, 9},    {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16},
-      {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {506, 9},    {65479, 16}, {65480, 16}, {65481, 16},
-      {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16},
-      {65487, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {1017, 10},  {65488, 16}, {65489, 16},
-      {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16},
-      {65495, 16}, {65496, 16}, {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {1018, 10},  {65497, 16},
-      {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16},
-      {65503, 16}, {65504, 16}, {65505, 16}, {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {2040, 11},
-      {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16},
-      {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16},
-      {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {2041, 11},  {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16},
-      {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16},
-      {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0}};
-  static const unsigned short UVAC_HT[256][2] = {
-      {0, 2},      {1, 2},      {4, 3},      {10, 4},     {24, 5},
-      {25, 5},     {56, 6},     {120, 7},    {500, 9},    {1014, 10},
-      {4084, 12},  {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {11, 4},     {57, 6},     {246, 8},
-      {501, 9},    {2038, 11},  {4085, 12},  {65416, 16}, {65417, 16},
-      {65418, 16}, {65419, 16}, {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {26, 5},     {247, 8},
-      {1015, 10},  {4086, 12},  {32706, 15}, {65420, 16}, {65421, 16},
-      {65422, 16}, {65423, 16}, {65424, 16}, {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {27, 5},
-      {248, 8},    {1016, 10},  {4087, 12},  {65425, 16}, {65426, 16},
-      {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {58, 6},     {502, 9},    {65431, 16}, {65432, 16}, {65433, 16},
-      {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {59, 6},     {1017, 10},  {65439, 16}, {65440, 16},
-      {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16},
-      {65446, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {121, 7},    {2039, 11},  {65447, 16},
-      {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16},
-      {65453, 16}, {65454, 16}, {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {122, 7},    {2040, 11},
-      {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16},
-      {65460, 16}, {65461, 16}, {65462, 16}, {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {249, 8},
-      {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16},
-      {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {503, 9},    {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16},
-      {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {504, 9},    {65481, 16}, {65482, 16}, {65483, 16},
-      {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16},
-      {65489, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {505, 9},    {65490, 16}, {65491, 16},
-      {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16},
-      {65497, 16}, {65498, 16}, {0, 0},      {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {506, 9},    {65499, 16},
-      {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16},
-      {65505, 16}, {65506, 16}, {65507, 16}, {0, 0},      {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {2041, 11},
-      {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16},
-      {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {16352, 14}, {65517, 16}, {65518, 16}, {65519, 16}, {65520, 16},
-      {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16},
-      {0, 0},      {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {1018, 10},  {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16},
-      {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16},
-      {65534, 16}, {0, 0},      {0, 0},      {0, 0},      {0, 0},
-      {0, 0}};
-  static const int YQT[] = {
-      16, 11, 10, 16, 24,  40,  51,  61,  12, 12, 14, 19, 26,  58,  60,  55,
-      14, 13, 16, 24, 40,  57,  69,  56,  14, 17, 22, 29, 51,  87,  80,  62,
-      18, 22, 37, 56, 68,  109, 103, 77,  24, 35, 55, 64, 81,  104, 113, 92,
-      49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99};
-  static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99,
-                             99, 99, 99, 24, 26, 56, 99, 99, 99, 99, 99, 47, 66,
-                             99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                             99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                             99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99};
-  static const float aasf[] = {
-      1.0f * 2.828427125f,         1.387039845f * 2.828427125f,
-      1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
-      1.0f * 2.828427125f,         0.785694958f * 2.828427125f,
-      0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f};
-
-  int row, col, i, k;
-  float fdtbl_Y[64], fdtbl_UV[64];
-  unsigned char YTable[64], UVTable[64];
-
-  if (!data || !width || !height || comp > 4 || comp < 1) {
-    return 0;
-  }
-
-  quality = quality ? quality : 90;
-  quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
-  quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
-
-  for (i = 0; i < 64; ++i) {
-    int uvti, yti = (YQT[i] * quality + 50) / 100;
-    YTable[stbiw__jpg_ZigZag[i]] =
-        (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255 : yti);
-    uvti = (UVQT[i] * quality + 50) / 100;
-    UVTable[stbiw__jpg_ZigZag[i]] =
-        (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
-  }
-
-  for (row = 0, k = 0; row < 8; ++row) {
-    for (col = 0; col < 8; ++col, ++k) {
-      fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-      fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
-    }
-  }
-
-  // Write Headers
-  {
-    static const unsigned char head0[] = {
-        0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F',  'I',  'F', 0,    1, 1,
-        0,    0,    1,    0,    1, 0,    0,   0xFF, 0xDB, 0,   0x84, 0};
-    static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC,  3, 1,    0,
-                                          2,    0x11, 3, 0x11, 0, 0x3F, 0};
-    const unsigned char head1[] = {0xFF,
-                                   0xC0,
-                                   0,
-                                   0x11,
-                                   8,
-                                   (unsigned char)(height >> 8),
-                                   STBIW_UCHAR(height),
-                                   (unsigned char)(width >> 8),
-                                   STBIW_UCHAR(width),
-                                   3,
-                                   1,
-                                   0x11,
-                                   0,
-                                   2,
-                                   0x11,
-                                   1,
-                                   3,
-                                   0x11,
-                                   1,
-                                   0xFF,
-                                   0xC4,
-                                   0x01,
-                                   0xA2,
-                                   0};
-    s->func(s->context, (void *)head0, sizeof(head0));
-    s->func(s->context, (void *)YTable, sizeof(YTable));
-    stbiw__putc(s, 1);
-    s->func(s->context, UVTable, sizeof(UVTable));
-    s->func(s->context, (void *)head1, sizeof(head1));
-    s->func(s->context, (void *)(std_dc_luminance_nrcodes + 1),
-            sizeof(std_dc_luminance_nrcodes) - 1);
-    s->func(s->context, (void *)std_dc_luminance_values,
-            sizeof(std_dc_luminance_values));
-    stbiw__putc(s, 0x10); // HTYACinfo
-    s->func(s->context, (void *)(std_ac_luminance_nrcodes + 1),
-            sizeof(std_ac_luminance_nrcodes) - 1);
-    s->func(s->context, (void *)std_ac_luminance_values,
-            sizeof(std_ac_luminance_values));
-    stbiw__putc(s, 1); // HTUDCinfo
-    s->func(s->context, (void *)(std_dc_chrominance_nrcodes + 1),
-            sizeof(std_dc_chrominance_nrcodes) - 1);
-    s->func(s->context, (void *)std_dc_chrominance_values,
-            sizeof(std_dc_chrominance_values));
-    stbiw__putc(s, 0x11); // HTUACinfo
-    s->func(s->context, (void *)(std_ac_chrominance_nrcodes + 1),
-            sizeof(std_ac_chrominance_nrcodes) - 1);
-    s->func(s->context, (void *)std_ac_chrominance_values,
-            sizeof(std_ac_chrominance_values));
-    s->func(s->context, (void *)head2, sizeof(head2));
-  }
-
-  // Encode 8x8 macroblocks
-  {
-    static const unsigned short fillBits[] = {0x7F, 7};
-    const unsigned char *imageData = (const unsigned char *)data;
-    int DCY = 0, DCU = 0, DCV = 0;
-    int bitBuf = 0, bitCnt = 0;
-    // comp == 2 is grey+alpha (alpha is ignored)
-    int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
-    int x, y, pos;
-    for (y = 0; y < height; y += 8) {
-      for (x = 0; x < width; x += 8) {
-        float YDU[64], UDU[64], VDU[64];
-        for (row = y, pos = 0; row < y + 8; ++row) {
-          // row >= height => use last input row
-          int clamped_row = (row < height) ? row : height - 1;
-          int base_p =
-              (stbi__flip_vertically_on_write ? (height - 1 - clamped_row)
-                                              : clamped_row) *
-              width * comp;
-          for (col = x; col < x + 8; ++col, ++pos) {
-            float r, g, b;
-            // if col >= width => use pixel from last input column
-            int p = base_p + ((col < width) ? col : (width - 1)) * comp;
-
-            r = imageData[p + 0];
-            g = imageData[p + ofsG];
-            b = imageData[p + ofsB];
-            YDU[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128;
-            UDU[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b;
-            VDU[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b;
-          }
-        }
-
-        DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY,
-                                   YDC_HT, YAC_HT);
-        DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU,
-                                   UVDC_HT, UVAC_HT);
-        DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV,
-                                   UVDC_HT, UVAC_HT);
-      }
-    }
-
-    // Do the bit alignment of the EOI marker
-    stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
-  }
-
-  // EOI
-  stbiw__putc(s, 0xFF);
-  stbiw__putc(s, 0xD9);
-
-  return 1;
-}
-
-STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x,
-                                    int y, int comp, const void *data,
-                                    int quality) {
-  stbi__write_context s;
-  stbi__start_write_callbacks(&s, func, context);
-  return stbi_write_jpg_core(&s, x, y, comp, (void *)data, quality);
-}
-
-#ifndef STBI_WRITE_NO_STDIO
-STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp,
-                            const void *data, int quality) {
-  stbi__write_context s;
-  if (stbi__start_write_file(&s, filename)) {
-    int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
-    stbi__end_write_file(&s);
-    return r;
-  } else
-    return 0;
-}
-#endif
-
-#endif // STB_IMAGE_WRITE_IMPLEMENTATION
-
-/* Revision history
-      1.11  (2019-08-11)
-
-      1.10  (2019-02-07)
-             support utf8 filenames in Windows; fix warnings and platform ifdefs
-      1.09  (2018-02-11)
-             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
-      1.08  (2018-01-29)
-             add stbi__flip_vertically_on_write, external zlib, zlib quality,
-             choose PNG filter
-      1.07  (2017-07-24)
-             doc fix
-      1.06  (2017-07-23)
-             writing JPEG (using Jon Olick's code)
-      1.05   ???
-      1.04  (2017-03-03)
-             monochrome BMP expansion
-      1.03   ???
-      1.02  (2016-04-02)
-             avoid allocating large structures on the stack
-      1.01  (2016-01-16)
-             STBIW_REALLOC_SIZED: support allocators with no realloc support
-             avoid race-condition in crc initialization
-             minor compile issues
-      1.00  (2015-09-14)
-             installable file IO function
-      0.99  (2015-09-13)
-             warning fixes; TGA rle support
-      0.98 (2015-04-08)
-             added STBIW_MALLOC, STBIW_ASSERT etc
-      0.97 (2015-01-18)
-             fixed HDR asserts, rewrote HDR rle logic
-      0.96 (2015-01-17)
-             add HDR output
-             fix monochrome BMP
-      0.95 (2014-08-17)
-                       add monochrome TGA output
-      0.94 (2014-05-31)
-             rename private functions to avoid conflicts with stb_image.h
-      0.93 (2014-05-27)
-             warning fixes
-      0.92 (2010-08-01)
-             casts to unsigned char to fix warnings
-      0.91 (2010-07-17)
-             first public release
-      0.90   first internal release
-*/
-
-/*
-------------------------------------------------------------------------------
-This software is available under 2 licenses -- choose whichever you prefer.
-------------------------------------------------------------------------------
-ALTERNATIVE A - MIT License
-Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-------------------------------------------------------------------------------
-ALTERNATIVE B - Public Domain (www.unlicense.org)
-This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------
-*/
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
deleted file mode 100644
index 52f08730620945d3559c58d26051e81437996eac..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef IMG_TENSOR_RUNTIME_H
-#define IMG_TENSOR_RUNTIME_H
-
-#include "device_math.h"
-#include "img_tensor_utils.h"
-#include <cstddef>
-
-// ***                        Runtime declaration                        *** //
-void *tensorFft(void *input, bool inverse);
-void *tensorFftHalf(void *input, bool inverse);
-void *tensorReduce(void *input, size_t axis, MathOp func,
-                   float skip_ratio = 0.0f);
-void *tensorReduceHalf(void *input, size_t axis, MathOp func,
-                       float skip_ratio = 0.0f);
-void *tensorProjectiveT(void *input, void *transformation);
-void *tensorMap1(MathOp f, void *i);
-void *tensorMap2(MathOp f2, void *i1, void *i2);
-void *tensorMap3(MathOp f3, void *i1, void *i2, void *i3);
-void *tensorMap1Half(MathOp f, void *i);
-void *tensorMap2Half(MathOp f2, void *i1, void *i2);
-void *tensorMap3Half(MathOp f3, void *i1, void *i2, void *i3);
-
-// ***                      Wrapper API declaration                      *** //
-extern "C" {
-void *wrapper_tensorFft(const char *hpvm_node_id, void *input, bool inverse);
-void *wrapper_tensorReduce(const char *hpvm_node_id, void *input, int axis,
-                           int func);
-void *wrapper_tensorProjectiveT(const char *hpvm_node_id, void *input,
-                                void *transformation);
-void *wrapper_tensorMap1(const char *hpvm_node_id, int func, void *input);
-void *wrapper_tensorMap2(const char *hpvm_node_id, int func, void *input1,
-                         void *input2);
-void *wrapper_tensorMap3(const char *hpvm_node_id, int func, void *input1,
-                         void *input2, void *input3);
-
-// Tentative
-void *wrapper_tensorStencil(const char *hpvm_node_id, void *input);
-void *wrapper_tensorCosineT(const char *hpvm_node_id, void *input);
-}
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
deleted file mode 100644
index 5dc3fe3dbc3cec9ea81fa33bc56471e2d6daaae5..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-img_tensor_utils.h
-Util functions for image load/save, image quality calculation (PSNR), etc.
-*/
-#ifndef IMG_TENSOR_UTILS
-#define IMG_TENSOR_UTILS
-
-#include <string>
-#include <vector>
-
-#include "tensor.h"
-
-const size_t N_RGB_CHAN = 3;
-
-// Loader constructor
-void *loadAsImage(const char *filename, size_t n_color = N_RGB_CHAN);
-
-void saveToImage(const char *filename, Tensor *tensor);
-
-Tensor *readDataSet(const char *path, size_t start = 0,
-                    size_t count = std::string::npos,
-                    size_t n_color = N_RGB_CHAN);
-
-void saveDataSet(const char *path, Tensor *batch, size_t start_idx = 0,
-                 size_t write_n = 0);
-
-// Kernel constructor
-void *createFilterFromData(int data_type, void *data, size_t w, size_t h,
-                           size_t n_chan);
-
-std::vector<float> PSNR(void *gold_ptr, void *approx_ptr);
-
-float violationRate(const std::vector<float> &values, float threshold,
-                    bool higher_better = true);
-
-float mean(const std::vector<float> &values);
-
-std::vector<float> SSIM(void *lhs_ptr, void *rhs_ptr);
-
-void *sliceTensorInBatch(void *whole, size_t start, size_t end);
-
-void reshape(void *t, const std::vector<size_t> &shape);
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
index f2c732cb2743daebadec4fddc2ad88d799959dbb..a766c02d6cc724fe91e4ef581871497cfddee788 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
@@ -4,5 +4,4 @@ void llvm_hpvm_initializeRuntimeController(const char *);
 void llvm_hpvm_clearRuntimeController();
 void llvm_hpvm_invokeRtControl(void *result, const char *str, int start,
                                int end);
-void llvm_hpvm_imgInvokeRtControl(void *result, void *gold, int start, int end);
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
index 4bb703bbd2596980fb4d930b36aaa749c7144044..d070d7755c1f5982c2c9fabf1acdca83bd446870 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
@@ -1,122 +1,79 @@
-//===--------------------------- tensor_cpu_runtime.h -----------------------===//
+//===------------------------- tensor_cpu_runtime.h ----------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 // This header file contains the API to the tensor routines for CPU.
 // This also contains the interfaces to the approximated versions of tensor
 // operations that are supported on CPU.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include <stdio.h>
 #include <cstdlib>
 #include <cmath>
 #include <memory>
 #include <string>
 
-
 #ifndef TENSOR_CPU_HEADER
 #define TENSOR_CPU_HEADER
 
+extern "C" {
+/****  Initialization Routine - Must be inserted at program start (in the
+ * backend)  ****/
+void llvm_hpvm_initTensorRtCPU();
+void llvm_hpvm_cleanupTensorRtCPU();
 
-extern "C"{
-  /****  Initialization Routine - Must be inserted at program start (in the backend)  ****/
-  void llvm_hpvm_initTensorRtCPU();
-  void llvm_hpvm_cleanupTensorRtCPU();
+// Routine for moving tensor data (to and from GPU/CPU)
+void hpvm_request_tensorCPU(void *tensor, int destination);
 
-  // Routine to moving tensor data (from and to GPU,CPU)
-  void hpvm_request_tensorCPU(void* tensor, int destination);
+// NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported
+// for cuDNN operations.
+// NOTE: The only data format supported as of now is: NCHW
+// (batch_dimension, channels, Height, Width)
+// void* create4DTensor(int data_type, int data_format, size_t dim1_size,
+//                      size_t dim2_size, size_t dim3_size, size_t dim4_size,
+//                      bool freeMemory = true);
 
+void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes);
 
-  // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
-  // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width)
-  //void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-	///	       size_t dim3_size, size_t dim4_size, bool freeMemory = true);
-  
-  void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);
+/********** Tensor Operation API ******/
 
-  /********** Tensor Operation API ******/
+// NOTE: For conv_mode, only value '1' is supported
+void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int compute_precision, int row, int col,
+                           int skip_every, int start);
 
-  // NOTE: For conv_mode, only value '1' is supported
-void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr,
-                          int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride,
-                          int conv_mode, int compute_precision,
-                          int row, int col, int skip_every, int start);
+void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                          int horizontal_pad, int vertical_stride,
+                          int horizontal_stride, int conv_mode,
+                          int compute_precision, int row, int col,
+                          int skip_every, int start);
 
-void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr,
-                          int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride,
-                          int conv_mode, int compute_precision,
-                          int row, int col, int skip_every, int start);
+void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups);
 
-void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups);
-			
- void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-                         void* mean_ptr, void* variance_ptr, double epsilon);
+void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                         void *mean_ptr, void *variance_ptr, double epsilon);
 
+void *tensorPoolingCPU(void *input, int poolFunction, int window_height,
+                       int window_width, int vertical_pad, int horizontal_pad,
+                       int vertical_stride, int horizontal_stride);
 
-  void* tensorPoolingCPU(void* input,
-			 int poolFunction,
-			 int window_height, int window_width,
-			 int vertical_pad, int horizontal_pad,
-			 int vertical_stride, int horizontal_stride);
+void *tensorGemmCPU(void *lhs, void *rhs);
 
-  void* tensorGemmCPU(void* lhs, void* rhs);
+void *tensorAddCPU(void *x, void *bias);
 
-  void* tensorAddCPU(void* x, void* bias);
+void *tensorReluCPU(void *input);
 
-  void* tensorReluCPU(void* input);
-
-  void* tensorRelu2CPU(void* input, float min, float max);
-  
-  void* tensorTanhCPU(void* input);
-  
-  void* tensorSoftmaxCPU(void* input);
-    
-}
+void *tensorRelu2CPU(void *input, float min, float max);
 
+void *tensorTanhCPU(void *input);
 
-/*
-void dummyFunction(){
-
-  void* initRT = (void*) &llvm_hpvm_initTensorRt;
-  void* cleanRT = (void*) &llvm_hpvm_cleanupTensorRt;
-  void* request_tensorPtr = (void*) &hpvm_request_tensor;
-  void* startProf = (void*) &startProfiling;
-  void* stopProf = (void*) &stopProfiling;
-  void* create2Dptr = (void*) &create2DTensor;
-  void* create3Dptr = (void*) &create3DTensor;
-  void* create4Dptr = (void*) &create4DTensor;
-  void* initTensorPtr = (void*) &initTensorData;
-  void* tensorSplitPtr = (void*) &tensorSplit;
-  void* tensorConcatPtr = (void*) &tensorConcat;
-  void* tensorConvPtr = (void*) &tensorConvolution;
-  void* tensorHConvPtr = (void*) &tensorHalfConvolution;
-  void* tensorPoolPtr = (void*) &tensorPooling;
-  void* tensorHalfPoolPtr = (void*) &tensorHalfPooling;
-  void* tensorLRNPtr = (void*) &tensorLRN;
-  void* tensorGemmPr = (void*) &tensorGemm;
-  void* tensorGemmCPUPtr = (void*) &tensorGemmCPU;
-  void* tensorGemmGPUPtr = (void*) &tensorGemmGPU;
-  void* tensorHgemmPtr = (void*) &tensorHalfGemm;
-  void* tensorGemmBiasPtr = (void*) &tensorGemmBias;
-  void* tensorAddPtr = (void*) &tensorAdd;
-  void* tensorHalfAddPtr = (void*) &tensorHalfAdd;
-  void* tensorReluPtr = (void*) &tensorRelu;
-  //FIXME: --void* tensorHalfReluPtr = (void*) &tensorHalfRelu;
-  void* tensorRelu2Ptr = (void*) &tensorRelu2;
-  void* tensorHalfRelu2Ptr = (void*) &tensorHalfRelu2;
-  void* tensorTanhPtr = (void*) &tensorTanh;
-  void* tensorHalfTanhPtr = (void*) &tensorHalfTanh;
-  void* tensorSoftmaxPtr = (void*) &tensorSoftmax;
-  void* tensorAddErrorPtr = (void*) &tensorAddError;    
+void *tensorSoftmaxCPU(void *input);
 }
-*/
-
 
 #endif
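The NOTE above restricts the CPU runtime to 4-D tensors in NCHW layout. Below is a minimal sketch of how a flat NCHW buffer is indexed; the helper `nchw_index` is illustrative only and is not part of the HPVM API. The CUDA kernels later in this diff decompose flat offsets with the same arithmetic.

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical illustration of NCHW indexing (not an HPVM API):
// element (n, c, h, w) of a dense NCHW tensor lives at
// ((n * C + c) * H + h) * W + w in the flat buffer.
inline std::size_t nchw_index(std::size_t n, std::size_t c, std::size_t h,
                              std::size_t w, std::size_t C, std::size_t H,
                              std::size_t W) {
  return ((n * C + c) * H + h) * W + w;
}

int main() {
  const std::size_t N = 1, C = 3, H = 4, W = 4;
  std::vector<float> buf(N * C * H * W, 0.0f);
  // Write a value into batch 0, channel 2, row 1, column 3.
  buf[nchw_index(0, 2, 1, 3, C, H, W)] = 1.0f;
  assert(buf[2 * H * W + 1 * W + 3] == 1.0f);
  return 0;
}
```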
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc
index 0de2808221adfb122860a031eea4ed8c89d6e2ba..083d733b14d4f335f4365503e5ecdb5c3de0a2fb 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc
@@ -69,37 +69,6 @@ void *tensorAdd(void *x, void *bias);
 void *tensorRelu(void *input);
 // NOTE: In-place operation
 void *tensorSoftmax(void *input);
-
-/* Error injection API - used for accuracy tuning */
-void *tensorAddError(void *x_ptr);
-}
-
-void emptyFunction() {
-
-  void *initRT = (void *)&llvm_hpvm_initTensorRt;
-  void *cleanRT = (void *)&llvm_hpvm_cleanupTensorRt;
-  void *request_tensorPtr = (void *)&hpvm_request_tensor;
-  void *startProf = (void *)&startProfiling;
-  void *stopProf = (void *)&stopProfiling;
-  void *create2Dptr = (void *)&create2DTensor;
-  void *create3Dptr = (void *)&create3DTensor;
-  void *create4Dptr = (void *)&create4DTensor;
-  void *initTensorPtr = (void *)&initTensorData;
-  void *tensorSplitPtr = (void *)&tensorSplit;
-  void *tensorConcatPtr = (void *)&tensorConcat;
-  void *tensorConvPtr = (void *)&tensorConvolution;
-  void *tensorHConvPtr = (void *)&tensorHConvolution;
-  void *tensorPoolPtr = (void *)&tensorPooling;
-  void *tensorLRNPtr = (void *)&tensorLRN;
-  void *tensorGemmPr = (void *)&tensorGemm;
-  void *tensorGemmCPUPtr = (void *)&tensorGemmCPU;
-  void *tensorGemmGPUPtr = (void *)&tensorGemmGPU;
-  void *tensorHgemmPtr = (void *)&tensorHgemm;
-  void *tensorGemmBiasPtr = (void *)&tensorGemmBias;
-  void *tensorAddPtr = (void *)&tensorAdd;
-  void *tensorReluPtr = (void *)&tensorRelu;
-  void *tensorSoftmaxPtr = (void *)&tensorSoftmax;
-  void *tensorAddErrorPtr = (void *)&tensorAddError;
 }
 
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
index b6d7f862fa60973e650c3b4306df61e89d28eb30..1b6e986a47324ab0ab663fc8e1e5171b07c135cf 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
@@ -12,8 +12,6 @@
 #include <stdio.h>
 #include <string>
 
-#include "img_tensor_runtime.h"
-
 extern "C" {
 /****  Initialization Routine - Must be inserted at program start (in the
  * backend)  ****/
@@ -111,13 +109,6 @@ void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
 void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
                           void *mean_ptr, void *variance_ptr, double epsilon);
 
-/* Error injection API - used for accuracy tuning */
-void *tensorAddError(void *x_ptr, int error_scale);
-
-void *tensorGemmModel(void *lhs, void *rhs);
-
-/*** Error Injection API End **/
-
 /****  PROMISE API *****/
 
 /*************
@@ -168,22 +159,14 @@ void *wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
                         int activation_id, // Relu, Tanh, ClipRelu
                         float out_min, float out_max);
 
+void *wrapper_ConvLayer2(
+    const char *hpvm_node_id, void *input, void *filter, void *bias,
+    int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
+    int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
+    int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
+    // NOTE: out_min, out_max are only relevant for ClippedRelu
+    float out_min, float out_max);
 
-void* wrapper_ConvLayer2(const char* hpvm_node_id,
-			  void* input, 
-			  void* filter, 
-			  void* bias, 
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id,
-			  int pool_size_v, int pool_size_h,			 
-			  int pool_pad_v, int pool_pad_h,
-			  int pool_stride_v, int pool_stride_h,
-			  int activation_id,
-			  // NOTE: out_min, out_max are only relevant for ClippedRelu
-			  float out_min, float out_max);
-  
-  
 void *wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
                       void *bias, int activation_id, float out_min,
                       float out_max);
@@ -213,11 +196,8 @@ void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr,
 
 void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr);
 
-
 void *tensor_set_node_id(unsigned int node_id);
-  
-  
-  
+
 // Utilities
 // TODO: separate utils in separate header
 void dumpAccuracyNorms();
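`wrapper_ConvLayer2` fuses convolution, optional pooling, and an activation into a single call. The sketch below only groups its parameters to show how they map onto those stages; the concrete values and the integer encodings assumed for `pool_id` and `activation_id` are illustrative, not taken from the HPVM sources.

```cpp
#include <cstdio>

// Hypothetical parameter grouping for a fused Conv+Pool+Activation call such
// as wrapper_ConvLayer2. The encodings for pool_id and activation_id below
// are assumptions made for illustration only.
struct ConvLayer2Args {
  int conv_pad_h = 1, conv_pad_w = 1;       // convolution padding
  int conv_stride_h = 1, conv_stride_w = 1; // convolution strides
  int pool_id = 0;                          // which pooling op to apply (assumed encoding)
  int pool_size_v = 2, pool_size_h = 2;     // pooling window
  int pool_pad_v = 0, pool_pad_h = 0;       // pooling padding
  int pool_stride_v = 2, pool_stride_h = 2; // pooling strides
  int activation_id = 0;                    // Relu / Tanh / ClipRelu (assumed encoding)
  float out_min = 0.0f, out_max = 0.0f;     // only used for ClippedRelu
};

int main() {
  ConvLayer2Args a;
  std::printf("conv stride = (%d, %d), pool window = (%d, %d)\n",
              a.conv_stride_h, a.conv_stride_w, a.pool_size_v, a.pool_size_h);
  return 0;
}
```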
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
index 323adbac8940ed83c51d3729565c1bda3dbf35cc..a3853fda533aa4668963826eb646f009aae02695 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
@@ -1,14 +1,14 @@
-//===--------------------------- tensor_signatures.cc -----------------------===//
+//===------------------------ tensor_signatures.cc -----------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 // This file contains the declarations of the API to the HPVM tensor runtime.
 // This is compiled to an LLVM bitcode file that is loaded by HPVM passes when
 // tensor-based applications are compiled through HPVM.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "tensor_runtime.h"
 
 void dummyFunction() {
@@ -51,7 +51,6 @@ void dummyFunction() {
   void *tensorHalfTanhPtr = (void *)&tensorHalfTanh;
   void *tensorSoftmaxPtr = (void *)&tensorSoftmax;
   void *tensorBatchNormPtr = (void *)&tensorBatchNorm;
-  void *tensorAddErrorPtr = (void *)&tensorAddError;
   void *ConvLayer = (void *)&ConvLayer_PROMISE;
   void *FCLayer = (void *)&FCLayer_PROMISE;
 
@@ -67,14 +66,5 @@ void dummyFunction() {
   void *PoolingWrapper = (void *)&wrapper_tensorPooling;
   void *softmaxWrapper = (void *)&wrapper_tensorSoftmax;
 
-  void *tensorFft = (void *)&wrapper_tensorFft;
-  void *tensorReduce = (void *)&wrapper_tensorReduce;
-  void *tensorProjectiveT = (void *)&wrapper_tensorProjectiveT;
-  void *tensorMap1 = (void *)&wrapper_tensorMap1;
-  void *tensorMap2 = (void *)&wrapper_tensorMap2;
-  void *tensorMap3 = (void *)&wrapper_tensorMap3;
-  void *tensorStencil = (void *)&wrapper_tensorStencil;
-  void *tensorCosineT = (void *)&wrapper_tensorCosineT;
-
   void *tensorNodeID = (void *)&tensor_set_node_id;
 }
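`tensor_signatures.cc` takes the address of every runtime entry point inside `dummyFunction`, which carries those declarations into the emitted LLVM bitcode that the HPVM passes load. A minimal sketch of the same idiom is shown below with placeholder functions (the names are assumptions, not HPVM APIs); it is meant to be compiled on its own with `clang++ -c -emit-llvm`, with the real definitions living elsewhere.

```cpp
// Minimal sketch of the "take the address of every API function" idiom used
// in tensor_signatures.cc. The functions here are placeholders only.
extern "C" {
void *placeholderConv(void *input, void *filter);
void *placeholderRelu(void *input);
}

// Referencing each function keeps its declaration alive in the emitted
// bitcode even though nothing here is ever executed.
void dummyFunction() {
  void *convPtr = (void *)&placeholderConv;
  void *reluPtr = (void *)&placeholderRelu;
  (void)convPtr;
  (void)reluPtr;
}
```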
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
index b272bbcab45573f03ac17305f86a99e630db2950..a0ca6f5bb0632b592b6cc6b09c9cd6068319b954 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
@@ -27,17 +27,17 @@ PerfParamSet::PerfParamSet() {
   printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE);
   std::ifstream file(GLOBAL_KNOBS_FILE);
 
-  if (!file){
+  if (!file) {
     ERROR(" Could NOT find global_knobs.txt \n");
   }
-  
+
   std::string line;
   std::string partial;
   std::vector<std::string> tokens;
 
   while (std::getline(file, line)) { // Read each line
 
-    //printf ("***** line === %s ", line);
+    // printf ("***** line === %s ", line);
     std::istringstream iss(line);
     std::string token;
     while (std::getline(iss, token, '\t')) { // Read each token in the line
@@ -64,7 +64,7 @@ PerfParamSet::PerfParamSet() {
         std::getline(token_stream, tok, ',');
         int offset = atoi(tok.c_str());
 
-        //printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob,
+        // printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob,
         //       row, col, offset);
         PerfParams params(row, col, offset);
         perf_knob_map[knob] = params;
@@ -101,10 +101,10 @@ SampParamSet::SampParamSet() {
   printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE);
   std::ifstream file(GLOBAL_KNOBS_FILE);
 
-  if (!file){
+  if (!file) {
     ERROR("Could NOT find global_knobs.txt \n");
   }
-  
+
   std::string line;
   std::string partial;
   std::vector<std::string> tokens;
@@ -124,7 +124,7 @@ SampParamSet::SampParamSet() {
         int index2 = token.find(",");
         std::string knob_str = token.substr(index2 + 1);
         int knob = atoi(knob_str.c_str());
-        //printf("knob = %d \n", knob);
+        // printf("knob = %d \n", knob);
 
         std::getline(iss, token, '\t');
         std::istringstream token_stream(token);
@@ -140,7 +140,7 @@ SampParamSet::SampParamSet() {
         std::getline(token_stream, tok, ',');
         float interpolation_id = atof(tok.c_str());
 
-        //printf("skip_every = %d, offset = %d \n", skip_every, offset);
+        // printf("skip_every = %d, offset = %d \n", skip_every, offset);
         SampParams params(skip_every, offset, interpolation_id);
         samp_knob_map[knob] = params;
       }
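`PerfParamSet` and `SampParamSet` read `global_knobs.txt` line by line, splitting each line on tabs and each field on commas. The self-contained sketch below mirrors that tokenization; the sample line is illustrative only, and the real file format may carry additional fields.

```cpp
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // Assumed shape: "<kind>,<knob>\t<v1>,<v2>,<v3>" (illustrative sample only).
  std::string line = "samp,261\t2,0,1.0";

  // Split the line on tabs.
  std::istringstream iss(line);
  std::string token;
  std::vector<std::string> tab_tokens;
  while (std::getline(iss, token, '\t'))
    tab_tokens.push_back(token);

  // First tab token: knob id appears after the comma.
  int knob =
      std::atoi(tab_tokens[0].substr(tab_tokens[0].find(',') + 1).c_str());

  // Second tab token: comma-separated parameter values.
  std::istringstream vals(tab_tokens[1]);
  std::string tok;
  std::getline(vals, tok, ',');
  int skip_every = std::atoi(tok.c_str());
  std::getline(vals, tok, ',');
  int offset = std::atoi(tok.c_str());
  std::getline(vals, tok, ',');
  float interpolation_id = std::atof(tok.c_str());

  std::printf("knob=%d skip_every=%d offset=%d interp=%f\n", knob, skip_every,
              offset, interpolation_id);
  return 0;
}
```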
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
index e9a4e50b000918c328a8b693f39c04505b6e4b79..8a8ff8435db96607917fc627036e72318409ef9b 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
@@ -1,14 +1,13 @@
 //===--------------------------- approxs_simulator.cu ---------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the emulations of implementation of software 
-// approximations for tensor convolutions. The approximations implemented are 
-// feature sampling and perforation for FP32 and FP16 compute precisions.  
+//
+//  This file consists of emulations of software approximations for tensor
+// convolutions. The approximations implemented are feature sampling and
+// perforation for FP32 and FP16 compute precisions.
 //
 //===----------------------------------------------------------------------===//
 
-
 #ifndef SIM_HEADER
 #define SIM_HEADER
 
@@ -27,7 +26,6 @@
 #include "global_data.h"
 #include "approx_knob_utils.h"
 
-
 #include <unordered_map>
 #include <sstream>
 #include <fstream>
@@ -36,77 +34,67 @@
 #include <map>
 #include <cassert>
 
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void postInterpolateRow(int N, int n, int c, int h, int w,
-			float* data, int int_row){
+// N is new_data's size
+// n, c, h, w are the dimensions of new_data
+__global__ void postInterpolateRow(int N, int n, int c, int h, int w,
+                                   float *data, int int_row) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
 
-    if((row % int_row == 1) && (row != 0) && (row != h-1))
+    if ((row % int_row == 1) && (row != 0) && (row != h - 1))
       data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] +
-	 data[n * (c * h * w) + ch * (h * w) + (row + 1)  * (w) + col]) / 2;
-
+          (data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] +
+           data[n * (c * h * w) + ch * (h * w) + (row + 1) * (w) + col]) /
+          2;
   }
 }
 
-
-
-__global__
-void postInterpolateCol(int N, int n, int c, int h, int w,
-			float* data, int int_col){
+__global__ void postInterpolateCol(int N, int n, int c, int h, int w,
+                                   float *data, int int_col) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
 
-    if((col % int_col == 1) && (col != 0) && (col != w-1))
+    if ((col % int_col == 1) && (col != 0) && (col != w - 1))
       data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(data[n * (c * h * w) + ch * (h * w) + row * (w) + (col-1) ] +
-	 data[n * (c * h * w) + ch * (h * w) + row * (w) + (col+1) ])/2;
-
+          (data[n * (c * h * w) + ch * (h * w) + row * (w) + (col - 1)] +
+           data[n * (c * h * w) + ch * (h * w) + row * (w) + (col + 1)]) /
+          2;
   }
 }
 
-
-
-
 // A 'Simulation' of perforated tensor convolution
-void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups,
-			int row, int col){
-  
+void *tensorConvPerfSim(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups,
+                        int row, int col) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   float alpha = 1.0f, beta = 0.0f;
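The `postInterpolateRow`/`postInterpolateCol` kernels in the hunk above rebuild perforated rows and columns by averaging their two neighbours, after decomposing the flat NCHW offset into `(n, c, row, col)`. Below is a CPU sketch of the row case under the same layout assumption.

```cpp
#include <cstdio>
#include <vector>

// CPU sketch of the row-interpolation step performed by postInterpolateRow:
// rows with (row % int_row == 1), excluding the first and last rows, are
// replaced by the average of the rows directly above and below.
void postInterpolateRowCPU(int n, int c, int h, int w, std::vector<float> &data,
                           int int_row) {
  for (int i = 0; i < n * c * h * w; ++i) {
    int col = ((i % (c * h * w)) % (h * w)) % w;
    int row = ((i % (c * h * w)) % (h * w)) / w;
    int ch = (i % (c * h * w)) / (h * w);
    int b = i / (c * h * w);
    if ((row % int_row == 1) && row != 0 && row != h - 1) {
      int base = b * (c * h * w) + ch * (h * w);
      data[base + row * w + col] =
          (data[base + (row - 1) * w + col] +
           data[base + (row + 1) * w + col]) / 2;
    }
  }
}

int main() {
  // 1x1x4x2 tensor; with int_row = 2, row 1 is rebuilt from rows 0 and 2.
  std::vector<float> t = {0, 0, 99, 99, 2, 2, 3, 3};
  postInterpolateRowCPU(1, 1, 4, 2, t, 2);
  std::printf("row 1 = (%g, %g)\n", t[2], t[3]); // expect (1, 1)
  return 0;
}
```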
@@ -114,13 +102,13 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-  INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
-       vertical_stride, horizontal_stride);
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+       horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
 
@@ -130,134 +118,111 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
   int new_v = vertical_stride + 0;
   int new_h = horizontal_stride + 0;
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     new_v, new_h, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-					     computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      new_v, new_h,                           // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  if(input->data_format == CUDNN_TENSOR_NCHW)
-    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-  else if(input->data_format == CUDNN_TENSOR_NHWC){
+  Tensor *output;
+  if (input->data_format == CUDNN_TENSOR_NCHW)
+    output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type,
+                                      CUDNN_TENSOR_NCHW, n, c, h, w);
+  else if (input->data_format == CUDNN_TENSOR_NHWC) {
     DEBUG("* NHWC Format \n");
-    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
-				      CUDNN_TENSOR_NHWC, n, h, w, c);
-  }
-  else
+    output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type,
+                                      CUDNN_TENSOR_NHWC, n, h, w, c);
+  } else
     ERROR("Unsupported Tensor Type");
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
-
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
 
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // FIXIT: Algo shouldn't be hardcoded
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
-
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, filter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] -
-       filter->dims.dim_sizes[2]) / vertical_stride + 1;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] -
-       filter->dims.dim_sizes[3]) / horizontal_stride + 1;
+       filter->dims.dim_sizes[2]) /
+          vertical_stride +
+      1;
 
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] -
+       filter->dims.dim_sizes[3]) /
+          horizontal_stride +
+      1;
 
-  int numBlocks = (n * c * h * w  + 127) / 128;
+  int numBlocks = (n * c * h * w + 127) / 128;
 
   if (row > 0)
-    postInterpolateRow<<<numBlocks,128>>>(n * c * h * w, n, c, h, w,
-				         (float *) output->gpu_data, row);
+    postInterpolateRow<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w,
+                                           (float *)output->gpu_data, row);
 
   if (col > 0)
-    postInterpolateCol<<<numBlocks,128>>>(n * c * h * w, n, c, h, w,
-				         (float *) output->gpu_data, col);
-
+    postInterpolateCol<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w,
+                                           (float *)output->gpu_data, col);
 
   profileEvent("tensorConv_end", true);
 
   return output;
 }
 
-
-
-
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void sampleFilterElems(int N,
-		       int n, int c, int h, int w,
-		       float* data,
-		       int skip_elem, int skip_offset,
-		       float mul_factor,
-		       float* newData){
+// N is new_data's size
+// n, c, h, w are the dimensions of new_data
+__global__ void sampleFilterElems(int N, int n, int c, int h, int w,
+                                  float *data, int skip_elem, int skip_offset,
+                                  float mul_factor, float *newData) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
@@ -265,75 +230,60 @@ void sampleFilterElems(int N,
 
     int local_index = (ch * (h * w)) + (row * w) + col;
 
-    if(skip_elem == 3 && h == 3 && w == 3){
+    if (skip_elem == 3 && h == 3 && w == 3) {
       skip_offset = (skip_offset + ch) % w; // wrap around skip offsets
     }
 
-    if(local_index % skip_elem  == skip_offset)
-       newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0;
+    if (local_index % skip_elem == skip_offset)
+      newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0;
     else
       newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-      data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor;
-      
+          data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor;
   }
 }
 
-
-
-
-
-void sampleFilter(Tensor* newFilter, Tensor* filter,
-		  int skip_rate, int skip_offset){
+void sampleFilter(Tensor *newFilter, Tensor *filter, int skip_rate,
+                  int skip_offset) {
 
   int n = filter->dims.dim_sizes[0];
   int c = filter->dims.dim_sizes[1];
   int h = filter->dims.dim_sizes[2];
   int w = filter->dims.dim_sizes[3];
-    
-  int numBlocks = (n * c * h * w  + 127) / 128;
-  int N = n * c * h * w;
 
-  float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); 
+  int numBlocks = (n * c * h * w + 127) / 128;
+  int N = n * c * h * w;
 
-  //float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
-  //mul_factor = (mul_factor + 1.0) / 2;
+  float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
 
-  
-  DEBUG ("mul_factor = %f \n", mul_factor);
+  // float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
+  // mul_factor = (mul_factor + 1.0) / 2;
 
-  
-  sampleFilterElems<<<numBlocks,128>>>(N,
-				       n, c, h, w,
-				       (float *) filter->gpu_data,
-				       skip_rate, skip_offset, mul_factor,
-				       (float *) newFilter->gpu_data);
+  DEBUG("mul_factor = %f \n", mul_factor);
 
+  sampleFilterElems<<<numBlocks, 128>>>(
+      N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset,
+      mul_factor, (float *)newFilter->gpu_data);
 }
 
-
-
 // A 'Simulation' of sampled tensor convolution
-void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups,
-			int skip_rate, int skip_offset){
-  
+void *tensorConvSampSim(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups,
+                        int skip_rate, int skip_offset) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
-  
   cudnnConvolutionDescriptor_t convDesc;
-  cudnnConvolutionFwdAlgo_t convAlgo;  
+  cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   float alpha = 1.0f, beta = 0.0f;
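`sampleFilter` in the hunk above zeroes every `skip_rate`-th filter element (at `skip_offset`) and rescales the survivors by `mul_factor = skip_rate / (skip_rate - 1)`, which roughly preserves the filter's total contribution. A small self-contained check of that scaling:

```cpp
#include <cstdio>
#include <vector>

// With skip_rate = R, 1/R of the elements are zeroed and the rest are
// multiplied by R / (R - 1), so the sum of the filter is (roughly) preserved.
int main() {
  const int skip_rate = 2, skip_offset = 0;
  const float mul_factor = (skip_rate * 1.0f) / (skip_rate - 1);

  std::vector<float> filter = {1, 1, 1, 1, 1, 1, 1, 1};
  float orig_sum = 0, sampled_sum = 0;
  for (int i = 0; i < (int)filter.size(); ++i) {
    orig_sum += filter[i];
    if (i % skip_rate == skip_offset)
      sampled_sum += 0.0f;                   // element dropped
    else
      sampled_sum += filter[i] * mul_factor; // survivor rescaled
  }
  std::printf("original sum = %g, sampled sum = %g\n", orig_sum, sampled_sum);
  return 0;
}
```

With `skip_rate = 2`, half the weights are dropped and the remaining half are doubled, so the sums match exactly for this uniform filter; for real filters the match is only approximate.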
@@ -344,24 +294,22 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
   convertToFP32(input);
   convertToFP32(filter);
 
-  Tensor* newFilter;
-  newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
-					CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
-					filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
-					filter->dims.dim_sizes[3]);
-
+  Tensor *newFilter;
+  newFilter = (Tensor *)create4DTensor(
+      (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
+      filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
+      filter->dims.dim_sizes[3]);
 
   // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
   sampleFilter(newFilter, filter, skip_rate, skip_offset);
-  
 
-  INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
-       vertical_stride, horizontal_stride);
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+       horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
 
@@ -371,147 +319,116 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
   int new_v = vertical_stride + 0;
   int new_h = horizontal_stride + 0;
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     new_v, new_h, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-					     computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      new_v, new_h,                           // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-  
+  Tensor *output;
+  output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                    CUDNN_TENSOR_NCHW, n, c, h, w);
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
 
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // NOTE: Using GEMM-based Algo
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, newFilter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-
-
- 
   freeTensor(newFilter);
   profileEvent("tensorConv_end", true);
 
   return output;
 }
 
-
-
-
-
-
-
-
-
-
-void sampleFilter2(Tensor* newFilter, Tensor* filter,
-		   int skip_rate, int skip_offset, float interpolation_rate){
+void sampleFilter2(Tensor *newFilter, Tensor *filter, int skip_rate,
+                   int skip_offset, float interpolation_rate) {
 
   int n = filter->dims.dim_sizes[0];
   int c = filter->dims.dim_sizes[1];
   int h = filter->dims.dim_sizes[2];
   int w = filter->dims.dim_sizes[3];
-    
-  int numBlocks = (n * c * h * w  + 127) / 128;
+
+  int numBlocks = (n * c * h * w + 127) / 128;
   int N = n * c * h * w;
 
   float mul_factor;
   mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
   mul_factor = 1 + (interpolation_rate * (mul_factor - 1.0));
-  DEBUG ("mul_factor = %f \n", mul_factor);
-  
-  sampleFilterElems<<<numBlocks,128>>>(N,
-				       n, c, h, w,
-				       (float *) filter->gpu_data,
-				       skip_rate, skip_offset, mul_factor,
-				       (float *) newFilter->gpu_data);
-}
-
+  DEBUG("mul_factor = %f \n", mul_factor);
 
+  sampleFilterElems<<<numBlocks, 128>>>(
+      N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset,
+      mul_factor, (float *)newFilter->gpu_data);
+}
 
 // A 'Simulation' of perforated tensor convolution
-void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad,
-			 int vertical_stride, int horizontal_stride,
-			 int conv_mode, int conv_groups,
-			 int skip_rate, int skip_offset, float interpolation_rate){
-  
+void *tensorConvSampSim2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int skip_rate, int skip_offset,
+                         float interpolation_rate) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
-  
   cudnnConvolutionDescriptor_t convDesc;
-  cudnnConvolutionFwdAlgo_t convAlgo;  
+  cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   float alpha = 1.0f, beta = 0.0f;
@@ -522,24 +439,22 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
   convertToFP32(input);
   convertToFP32(filter);
 
-  Tensor* newFilter;
-  newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
-					CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
-					filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
-					filter->dims.dim_sizes[3]);
-
+  Tensor *newFilter;
+  newFilter = (Tensor *)create4DTensor(
+      (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
+      filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
+      filter->dims.dim_sizes[3]);
 
   // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
   sampleFilter2(newFilter, filter, skip_rate, skip_offset, interpolation_rate);
-  
 
-  INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
-       vertical_stride, horizontal_stride);
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+       horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
 
@@ -549,166 +464,135 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
   int new_v = vertical_stride + 0;
   int new_h = horizontal_stride + 0;
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     new_v, new_h, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-					     computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      new_v, new_h,                           // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-  
+  Tensor *output;
+  output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                    CUDNN_TENSOR_NCHW, n, c, h, w);
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
 
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // NOTE: Using GEMM-based Algo
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, newFilter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-
-
- 
   freeTensor(newFilter);
   profileEvent("tensorConv_end", true);
 
   return output;
 }
 
+/************ NOTE: API for ApproxHPVM Wrapper runtime *******/
 
+void *PROMISE_Conv(void *input, float i_min, float i_max, void *filter,
+                   float w_min, float w_max, void *bias, float b_min,
+                   float b_max, int conv_pad_h, int conv_pad_w,
+                   int conv_stride_h, int conv_stride_w, int pool_id,
+                   int pool_size, int pool_stride,
+                   int activation_id, // Relu, Tanh, ClipRelu
+                   float out_min, float out_max, int swing) {
 
+  Tensor *input_t = (Tensor *)input;
+  Tensor *filter_t = (Tensor *)filter;
+  Tensor *bias_t = (Tensor *)bias;
 
-
-
-
-
-
-/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ 
-
-
-void* PROMISE_Conv(void* input, float i_min, float i_max,
-		   void* filter, float w_min, float w_max,
-		   void* bias, float b_min, float b_max,
-		   int conv_pad_h, int conv_pad_w,
-		   int conv_stride_h, int conv_stride_w,
-		   int pool_id, int pool_size, int pool_stride,
-		   int activation_id, // Relu, Tanh, ClipRelu
-		   float out_min, float out_max, int swing){ 
-
-
-  Tensor* input_t = (Tensor*) input;
-  Tensor* filter_t = (Tensor*) filter;
-  Tensor* bias_t = (Tensor*) bias;
-  
   int orig_type = input_t->cur_type;
 
   DEBUG("FP32 conversions \n");
-  
+
   convertToFP32(input_t);
 
   convertToFP32(filter_t);
   convertToFP32(bias_t);
 
   DEBUG("DONE FP32 conversions \n");
-  
 
-  if(swing < 8){
+  if (swing < 8) {
     input = quantizeTensorPromise(input, i_min, i_max);
     filter = quantizeTensorPromise(filter, w_min, w_max);
-    if(bias != NULL)
+    if (bias != NULL)
       bias = quantizeTensorPromise(bias, b_min, b_max);
     // aRead error
-    
+
     input = addPromiseError(input, swing);
   }
 
-  
-  void* conv_out;
-  conv_out = tensorConvolution(input, filter,
-			       conv_pad_h, conv_pad_w,
-			       conv_stride_h, conv_stride_w,
-			       1, 0);
-  
-  void* conv_add;
-  if(bias != NULL){
+  void *conv_out;
+  conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w,
+                               conv_stride_h, conv_stride_w, 1, 0);
+
+  void *conv_add;
+  if (bias != NULL) {
     conv_add = tensorAdd(conv_out, bias);
-  }
-  else{
+  } else {
     conv_add = conv_out;
   }
 
-  void* pool_out;
+  void *pool_out;
   // NOTE: Skip pooling on negative pool sizes
-  if(pool_size > 0){
-    //FIXME: Currently only using MaxPooling
-    //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size);
-    pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_stride, pool_stride);
-  }
-  else{
+  if (pool_size > 0) {
+    // FIXME: Currently only using MaxPooling
+    //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0,
+    // pool_size, pool_size);
+    pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0,
+                             pool_stride, pool_stride);
+  } else {
     pool_out = conv_add;
   }
-  
-  void* activation_out;  
-  switch(activation_id){
+
+  void *activation_out;
+  switch (activation_id) {
   case -1:
     activation_out = pool_out;
     INFO("NO Activation Function \n");
@@ -727,68 +611,54 @@ void* PROMISE_Conv(void* input, float i_min, float i_max,
     break;
   }
 
-
-  if(swing < 8 && activation_id != -1){
+  if (swing < 8 && activation_id != -1) {
     activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
   }
 
-
-
-  //NOTE: Convert back to FP16 if original type
-  if (orig_type == half_type){
-    convertToFP16((Tensor*) activation_out);
+  // NOTE: Convert back to FP16 if original type
+  if (orig_type == half_type) {
+    convertToFP16((Tensor *)activation_out);
   }
 
-  
   return activation_out;
 }
 
+void *PROMISE_FC(void *input, float i_min, float i_max, void *weights,
+                 float w_min, float w_max, void *bias, float b_min, float b_max,
+                 int activation_id, float out_min, float out_max, int swing) {
 
+  Tensor *input_t = (Tensor *)input;
+  Tensor *weights_t = (Tensor *)weights;
+  Tensor *bias_t = (Tensor *)bias;
 
-void* PROMISE_FC(void* input, float i_min, float i_max,
-		 void* weights, float w_min, float w_max,
-		 void* bias, float b_min, float b_max,
-		 int activation_id,
-		 float out_min, float out_max, int swing){
-
-
-  Tensor* input_t = (Tensor*) input;
-  Tensor* weights_t = (Tensor*) weights;
-  Tensor* bias_t = (Tensor*) bias;
-  
   int orig_type = input_t->cur_type;
-  
+
   convertToFP32(input_t);
   convertToFP32(weights_t);
   convertToFP32(bias_t);
-  
-  
-  if(swing < 8){
+
+  if (swing < 8) {
     input = quantizeTensorPromise(input, i_min, i_max);
     weights = quantizeTensorPromise(weights, w_min, w_max);
-    if(bias != NULL)
+    if (bias != NULL)
       bias = quantizeTensorPromise(bias, b_min, b_max);
 
     // NOTE: Modelling aRead error in PROMISE
     input = addPromiseError(input, swing);
   }
 
-
-  
-  void* gemm_out;
+  void *gemm_out;
   gemm_out = tensorGemmGPU(input, weights);
 
-  
-  void* gemmbias_out;
-  if(bias != NULL){
+  void *gemmbias_out;
+  if (bias != NULL) {
     gemmbias_out = tensorAdd(gemm_out, bias);
-  }
-  else{
+  } else {
     gemmbias_out = gemm_out;
   }
- 
-  void* activation_out;
-  switch(activation_id){
+
+  void *activation_out;
+  switch (activation_id) {
 
   case -1:
     activation_out = gemmbias_out;
@@ -807,86 +677,71 @@ void* PROMISE_FC(void* input, float i_min, float i_max,
     ERROR("Activation id %d NOT supported \n", activation_out);
     break;
   }
-  
-  
-  if(swing < 8 && activation_id != -1){
+
+  if (swing < 8 && activation_id != -1) {
     activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
   }
 
-
-  //NOTE: Convert back to FP16 if original type
-  if (orig_type == half_type){
-    convertToFP16((Tensor*) activation_out);
+  // NOTE: Convert back to FP16 if original type
+  if (orig_type == half_type) {
+    convertToFP16((Tensor *)activation_out);
   }
 
-
-  
   return activation_out;
 }
 
-
-
-
-
-// NOTE: Enabling the macro below is used for testing against the old PROMISE wrapper
+// NOTE: Enabling the macro below is used for testing against the old PROMISE
+// wrapper
 //#define OLD_MODEL
 
 #ifndef OLD_MODEL
 
+bool isPromiseLayer(int swing) {
 
-
-bool isPromiseLayer(int swing){
-
-  if(swing < 8)
+  if (swing < 8)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isGPULayer(int swing) {
 
-bool isGPULayer(int swing){
-
-  if(swing > 10 ) // PROMISE layers are 1-7
+  if (swing > 10) // PROMISE layers are 1-7
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isFullPrecision(int swing) {
 
-bool isFullPrecision(int swing){
-
-  if(swing == 11)
+  if (swing == 11)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isHalfPrecision(int swing) {
 
-
-bool isHalfPrecision(int swing){
-
-  if(swing == 12)
+  if (swing == 12)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isPerforation(int swing) {
 
-bool isPerforation(int swing){
-
-  if(swing >= 100 && swing <= 200)
+  if (swing >= 100 && swing <= 200)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isSampling(int swing) {
 
-bool isSampling(int swing){
-
-  if(swing >= 200 && swing <= 300)
+  if (swing >= 200 && swing <= 300)
     return true;
   else
-    return false;      
+    return false;
 }
 
 bool isReductionSampling(int swing) {
@@ -894,300 +749,227 @@ bool isReductionSampling(int swing) {
   if (swing >= 41 && swing <= 49)
     return true;
   else
-    return false;      
+    return false;
 }
 
-int getSwing(int swing){
+int getSwing(int swing) {
 
-  #ifdef PROMISE_TUNER_ENABLED
+#ifdef PROMISE_TUNER_ENABLED
 
   // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
-  if(!approxhpvm_runtime_mode){
-  
-    if(op_counter >= total_ops){
+  if (!approxhpvm_runtime_mode) {
+
+    if (op_counter >= total_ops) {
       ERROR("No accuracy flag found \n");
     }
-  
+
     swing = op_accuracies[op_counter];
     op_counter++;
   }
 
-  #endif  
+#endif
 
-   DEBUG("---- swing_value = %d \n", swing);  
+  DEBUG("---- swing_value = %d \n", swing);
 
-   return swing;
+  return swing;
 }
 
-
-
-
-//bool FP16_tuning = false;
-
+// bool FP16_tuning = false;
 
 /***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */
 
-
-
-void initializeAutotuner(){
+void initializeAutotuner() {
 
   DEBUG("initializing tuner .... \n");
-  
+
   sampParamSet = new SampParamSet;
-  perfParamSet = new PerfParamSet;  
+  perfParamSet = new PerfParamSet;
 }
 
+void *Autotuner_SampConv(void *input, float i_min, float i_max, void *filter,
+                         float w_min, float w_max, void *bias, float b_min,
+                         float b_max, int conv_pad_h, int conv_pad_w,
+                         int conv_stride_h, int conv_stride_w, int pool_id,
+                         int pool_size,
+                         int activation_id, // Relu, Tanh, ClipRelu
+                         float out_min, float out_max, int swing) {
 
-void* Autotuner_SampConv(void* input, float i_min, float i_max,
-			 void* filter, float w_min, float w_max,
-			 void* bias, float b_min, float b_max,
-			 int conv_pad_h, int conv_pad_w,
-			 int conv_stride_h, int conv_stride_w,
-			 int pool_id, int pool_size,
-			 int activation_id, // Relu, Tanh, ClipRelu
-			 float out_min, float out_max, int swing){
+  SampParams params = sampParamSet->getSampParams(swing);
 
+  DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", params.skip_rate,
+        params.skip_offset);
+
+  void *conv_out;
+
+  if (!FP16_tuning) {
 
-  SampParams params = sampParamSet->getSampParams(swing);
-  
-  DEBUG("params.skip_rate = %d, params.skip_offset = %d \n",
-	params.skip_rate, params.skip_offset);
-  
-  void* conv_out;
-  
-  if (!FP16_tuning){
- 
     /* conv_out = tensorConvSampSim(input, filter,
-				 conv_pad_h, conv_pad_w,
-				 conv_stride_h, conv_stride_w, 1, 1,
-				 params.skip_rate, params.skip_offset);
+                                 conv_pad_h, conv_pad_w,
+                                 conv_stride_h, conv_stride_w, 1, 1,
+                                 params.skip_rate, params.skip_offset);
     */
 
-
-    if (SIMULATION_MODE){
-      conv_out = tensorConvSampSim2(input, filter,
-				    conv_pad_h, conv_pad_w,
-				    conv_stride_h, conv_stride_w, 1, 1,
-				    params.skip_rate, params.skip_offset, params.interpolation_id);
+    if (SIMULATION_MODE) {
+      conv_out = tensorConvSampSim2(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.skip_rate, params.skip_offset, params.interpolation_id);
     }
-        
 
     else {
-      conv_out = tensorConvApprox(input, filter,
-				  conv_pad_h, conv_pad_w,
-				  conv_stride_h, conv_stride_w, 1, 1,
-				  1, 1, params.skip_rate, params.skip_offset);
+      conv_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w,
+                                  conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                  params.skip_rate, params.skip_offset);
     }
-    
-    
-  }
-  else{
-        
-    conv_out = tensorConvApproxHalf2(input, filter,
-				     conv_pad_h, conv_pad_w,
-				     conv_stride_h, conv_stride_w,
-				     1, 1,
-				     1, 1,
-				     params.skip_rate, params.skip_offset);    
-   
+
+  } else {
+
+    conv_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                     conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                     params.skip_rate, params.skip_offset);
   }
 
   return conv_out;
 }
 
-
-
-
-void* Autotuner_PerforatedConv(void* input, float i_min, float i_max,
-			       void* filter, float w_min, float w_max,
-			       void* bias, float b_min, float b_max,
-			       int conv_pad_h, int conv_pad_w,
-			       int conv_stride_h, int conv_stride_w,
-			       int pool_id, int pool_size,
-			       int activation_id, // Relu, Tanh, ClipRelu
-			       float out_min, float out_max, int swing){ 
-
+void *Autotuner_PerforatedConv(void *input, float i_min, float i_max,
+                               void *filter, float w_min, float w_max,
+                               void *bias, float b_min, float b_max,
+                               int conv_pad_h, int conv_pad_w,
+                               int conv_stride_h, int conv_stride_w,
+                               int pool_id, int pool_size,
+                               int activation_id, // Relu, Tanh, ClipRelu
+                               float out_min, float out_max, int swing) {
 
   PerfParams params = perfParamSet->getPerfParams(swing);
-  
+
   DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n",
-	params.row, params.col, params.skip_offset);
-    
+        params.row, params.col, params.skip_offset);
 
-  void* conv_out;
-  
-  if (!FP16_tuning){
+  void *conv_out;
 
+  if (!FP16_tuning) {
 
-    if (SIMULATION_MODE){
+    if (SIMULATION_MODE) {
 
-      conv_out = tensorConvPerfCuda(input, filter,
-				    conv_pad_h, conv_pad_w,
-				    conv_stride_h, conv_stride_w, 1, 1,
-				    params.row, params.col, params.skip_offset);
+      conv_out = tensorConvPerfCuda(input, filter, conv_pad_h, conv_pad_w,
+                                    conv_stride_h, conv_stride_w, 1, 1,
+                                    params.row, params.col, params.skip_offset);
 
+    } else {
+
+      conv_out = tensorConvApprox(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.row, params.col, 1, params.skip_offset);
     }
-    else{
-
-      conv_out = tensorConvApprox(input, filter,
-				  conv_pad_h, conv_pad_w,
-				  conv_stride_h, conv_stride_w,
-				  1, 1,
-				  params.row, params.col,
-				  1, params.skip_offset);   
-    }
-    
-    
-  }
-  else{
 
-    conv_out = tensorConvApproxHalf2(input, filter,
-				     conv_pad_h, conv_pad_w,
-				     conv_stride_h, conv_stride_w,
-				     1, 1,
-				     params.row, params.col,
-				     1, params.skip_offset);   
+  } else {
 
+    conv_out = tensorConvApproxHalf2(
+        input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1,
+        1, params.row, params.col, 1, params.skip_offset);
   }
-    
-  return conv_out;  
-}
-
-
-
 
+  return conv_out;
+}
 
+void *Autotuner_ConvOp(void *input, float i_min, float i_max, void *filter,
+                       float w_min, float w_max, void *bias, float b_min,
+                       float b_max, int conv_pad_h, int conv_pad_w,
+                       int conv_stride_h, int conv_stride_w, int pool_id,
+                       int pool_size,
+                       int activation_id, // Relu, Tanh, ClipRelu
+                       float out_min, float out_max, int swing) {
 
-void* Autotuner_ConvOp(void* input, float i_min, float i_max,
-		       void* filter, float w_min, float w_max,
-		       void* bias, float b_min, float b_max,
-		       int conv_pad_h, int conv_pad_w,
-		       int conv_stride_h, int conv_stride_w,
-		       int pool_id, int pool_size,
-		       int activation_id, // Relu, Tanh, ClipRelu
-		       float out_min, float out_max, int swing){ 
+  void *conv_out;
+  if (isPerforation(swing)) {
 
-  
-  void* conv_out;
-  if(isPerforation(swing)){
+    conv_out = Autotuner_PerforatedConv(
+        input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max,
+        conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id,
+        pool_size, activation_id, out_min, out_max, swing);
 
-    conv_out = Autotuner_PerforatedConv(input, i_min, i_max,
-					filter, w_min, w_max,
-					bias, b_min, b_max,
-					conv_pad_h, conv_pad_w,
-					conv_stride_h, conv_stride_w,
-					pool_id, pool_size,
-					activation_id, 
-					out_min, out_max, swing);
-    
   }
 
-  else if(isSampling(swing)){
+  else if (isSampling(swing)) {
 
-    conv_out = Autotuner_SampConv(input, i_min, i_max,
-				  filter, w_min, w_max,
-				  bias, b_min, b_max,
-				  conv_pad_h, conv_pad_w,
-				  conv_stride_h, conv_stride_w,
-				  pool_id, pool_size,
-				  activation_id, 
-				  out_min, out_max, swing);
+    conv_out = Autotuner_SampConv(
+        input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max,
+        conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id,
+        pool_size, activation_id, out_min, out_max, swing);
   }
-  
 
-  else if (isHalfPrecision(swing)){
+  else if (isHalfPrecision(swing)) {
 
-    if (FP16_tuning){
- 
-      conv_out = tensorHalfConvolution(input, filter,
-				     conv_pad_h, conv_pad_w,
-				     conv_stride_h, conv_stride_w,
-				     1, 0);
-    }
-    else{
-      conv_out = tensorConvolution(input, filter,
-				 conv_pad_h, conv_pad_w,
-				 conv_stride_h, conv_stride_w,
-				 1, 0);
+    if (FP16_tuning) {
+
+      conv_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w,
+                                       conv_stride_h, conv_stride_w, 1, 0);
+    } else {
+      conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w,
+                                   conv_stride_h, conv_stride_w, 1, 0);
     }
-    
-  }
 
-  else if (isFullPrecision(swing)){
-    conv_out = tensorConvolution(input, filter,
-				 conv_pad_h, conv_pad_w,
-				 conv_stride_h, conv_stride_w,
-				 1, 0);
   }
 
+  else if (isFullPrecision(swing)) {
+    conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w,
+                                 conv_stride_h, conv_stride_w, 1, 0);
+  }
 
-  return conv_out;  
+  return conv_out;
 }
 
+void *Autotuner_Add(void *input, void *bias, int swing) {
 
+  void *conv_add;
+  if (bias != NULL) {
 
-void* Autotuner_Add(void* input, void* bias, int swing){
-
-  void* conv_add;
-  if(bias != NULL){
-    
-    if( isFullPrecision(swing) || !(FP16_tuning) ){  
+    if (isFullPrecision(swing) || !(FP16_tuning)) {
       conv_add = tensorAdd(input, bias);
-    }
-    else {
+    } else {
       conv_add = tensorHalfAdd(input, bias);
     }
-  }
-  else{
+  } else {
     conv_add = input;
   }
 
   return conv_add;
 }
 
+void *Autotuner_Pooling(void *input, int pool_size, int pool_stride,
+                        int swing) {
 
+  void *pool_out;
 
-void* Autotuner_Pooling(void* input,
-			int pool_size, int pool_stride,
-			int swing){
+  if (pool_size > 0) {
 
-  void* pool_out;
-  
-  if(pool_size > 0){
-    
-    //FIXME: Currently only using MaxPooling
-    if( isFullPrecision(swing) || !(FP16_tuning) ){  
-      pool_out = tensorPooling(input, 0, pool_size, pool_size,
-			       0, 0, pool_stride, pool_stride);
+    // FIXME: Currently only using MaxPooling
+    if (isFullPrecision(swing) || !(FP16_tuning)) {
+      pool_out = tensorPooling(input, 0, pool_size, pool_size, 0, 0,
+                               pool_stride, pool_stride);
 
     }
-	
+
     else {
-      pool_out = tensorHalfPooling(input, 0, pool_size, pool_size,
-				   0, 0, pool_stride, pool_stride);  
+      pool_out = tensorHalfPooling(input, 0, pool_size, pool_size, 0, 0,
+                                   pool_stride, pool_stride);
     }
-    
-   
-  }
-  else{
+
+  } else {
     pool_out = input;
   }
-  
-  
+
   return pool_out;
 }
 
+void *Autotuner_Activation(void *input, int activation_id, int out_min,
+                           int out_max, int swing) {
 
+  void *activation_out;
 
+  if (isFullPrecision(swing) || (!FP16_tuning)) {
 
-void* Autotuner_Activation(void* input, int activation_id,
-			   int out_min, int out_max, int swing){
-
-  void* activation_out;
-
-  if ( isFullPrecision(swing) || (!FP16_tuning) ){
-    
-    switch(activation_id){
+    switch (activation_id) {
     case -1:
       activation_out = input;
       INFO("NO Activation Function \n");
@@ -1206,10 +988,10 @@ void* Autotuner_Activation(void* input, int activation_id,
       break;
     }
   }
-   
-  else{
 
-    switch(activation_id){
+  else {
+
+    switch (activation_id) {
     case -1:
       activation_out = input;
       INFO("NO Activation Function \n");
@@ -1227,310 +1009,116 @@ void* Autotuner_Activation(void* input, int activation_id,
       ERROR("Activation id %d NOT supported \n", activation_out);
       break;
     }
-
   }
 
-
   return activation_out;
 }
 
+void *Autotuner_GPU_ConvLayer(void *input, float i_min, float i_max,
+                              void *filter, float w_min, float w_max,
+                              void *bias, float b_min, float b_max,
+                              int conv_pad_h, int conv_pad_w, int conv_stride_h,
+                              int conv_stride_w, int pool_id, int pool_size,
+                              int pool_stride,
+                              int activation_id, // Relu, Tanh, ClipRelu
+                              float out_min, float out_max, int swing) {
 
-void* autotuner_tensorFft(void *input, bool inverse) {
-  if(ONLINE_PROFILING){
-    ERROR("Online Profiling cannot be enabled\n");
-    abort();
-  }
-
-  int swing = 0;
-  swing = getSwing(swing);
-
-  if (isFullPrecision(swing)) {
-    return tensorFft(input, inverse);
-  }
-  if (isHalfPrecision(swing)) {
-    return tensorFftHalf(input, inverse);
-  }
+  void *conv_out = Autotuner_ConvOp(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size,
+      activation_id, out_min, out_max, swing);
 
-  ERROR("Unsupported autotuner flag for operation fft\n");
-  abort();
-  return NULL;
-}
+  void *conv_add = Autotuner_Add(conv_out, bias, swing);
 
+  void *pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing);
 
-void* autotuner_tensorReduce(void *input, size_t axis, MathOp func) {
-  if(ONLINE_PROFILING){
-    ERROR("Online Profiling cannot be enabled\n");
-    abort();
-  }
-
-  int swing = 0;
-  swing = getSwing(swing);
-
-  if (isFullPrecision(swing)) {
-    return tensorReduce(input, axis, func, 0.0f);
-  }
+  void *activation_out =
+      Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing);
 
-  if (isHalfPrecision(swing)) {
-    return tensorReduceHalf(input, axis, func, 0.0f);
-  }
-
-  if (isReductionSampling(swing)) {
-    RedSampParams params = getRedSampParams(swing);
-    DEBUG("params.skip_ratio = %f, params.is_half = %d\n", params.skip_ratio, (int)params.is_half);
-    if (params.is_half)
-      return tensorReduceHalf(input, axis, func, params.skip_ratio);
-    else
-      return tensorReduce(input, axis, func, params.skip_ratio);
-  }
-
-  ERROR("Unsupported autotuner flag for operation reduce\n");
-  abort();
-  return NULL;
-}
-
-void* autotuner_tensorProjectiveT(void *input, void *transformation) {
-  if(ONLINE_PROFILING){
-    ERROR("Online Profiling cannot be enabled\n");
-    abort();
-  }
-
-  int swing = 0;
-  swing = getSwing(swing);
-
-  if (isFullPrecision(swing)) {
-    return tensorProjectiveT(input, transformation);
-  }
-
-  ERROR("Unsupported autotuner flag for operation projectiveT\n");
-  abort();
-  return NULL;
-}
-
-
-void* autotuner_tensorMap1(MathOp func, void *input) {
-  if(ONLINE_PROFILING){
-    ERROR("Online Profiling cannot be enabled\n");
-    abort();
-  }
-
-  int swing = 0;
-  swing = getSwing(swing);
-
-  if (isFullPrecision(swing)) {
-    return tensorMap1(func, input);
-  }
-
-  if (isHalfPrecision(swing)) {
-    return tensorMap1Half(func, input);
-  }
-
-  ERROR("Unsupported autotuner flag for operation map1\n");
-  abort();
-  return NULL;
-}
-
-void* autotuner_tensorMap2(MathOp func, void *input1, void *input2) {
-  if(ONLINE_PROFILING){
-    ERROR("Online Profiling cannot be enabled\n");
-    abort();
-  }
-
-  int swing = 0;
-  swing = getSwing(swing);
-
-  if (isFullPrecision(swing)) {
-    return tensorMap2(func, input1, input2);
-  }
-
-  if (isHalfPrecision(swing)) {
-    return tensorMap2Half(func, input1, input2);
-  }
-
-  ERROR("Unsupported autotuner flag for operation map2\n");
-  abort();
-  return NULL;
-}
-
-void* autotuner_tensorMap3(MathOp func, void *input1, void *input2,
-                           void *input3) {
-  if(ONLINE_PROFILING){
-    ERROR("Online Profiling cannot be enabled\n");
-    abort();
-  }
-
-  int swing = 0;
-  swing = getSwing(swing);
-
-  if (isFullPrecision(swing)) {
-    return tensorMap3(func, input1, input2, input3);
-  }
-
-  if (isHalfPrecision(swing)) {
-    return tensorMap3Half(func, input1, input2, input3);
-  }
-
-  ERROR("Unsupported autotuner flag for operation map3\n");
-  abort();
-  return NULL;
-}
-
-
-
-
-void* Autotuner_GPU_ConvLayer(void* input, float i_min, float i_max,
-			      void* filter, float w_min, float w_max,
-			      void* bias, float b_min, float b_max,
-			      int conv_pad_h, int conv_pad_w,
-			      int conv_stride_h, int conv_stride_w,
-			      int pool_id, int pool_size, int pool_stride, 
-			      int activation_id, // Relu, Tanh, ClipRelu
-			      float out_min, float out_max, int swing){ 
-  
-
-  void* conv_out = Autotuner_ConvOp(input, i_min, i_max,
-				    filter, w_min, w_max,
-				    bias, b_min, b_max,
-				    conv_pad_h, conv_pad_w,
-				    conv_stride_h, conv_stride_w,
-				    pool_id, pool_size,
-				    activation_id, 
-				    out_min, out_max, swing);
- 
-  
-  void* conv_add = Autotuner_Add(conv_out, bias, swing);
-
-  void* pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing);
-
-  void* activation_out = Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing);
-  
-
-  return activation_out;  
+  return activation_out;
 }
 
-
 /**** Top-level API for Handling Convolution Layers
 
       The granularity of handling is at a layer-level - not tensor-op level
-        
+
 ***/
 
-void* Autotuner_ConvLayer(void* input, float i_min, float i_max,
-			  void* filter, float w_min, float w_max,
-			  void* bias, float b_min, float b_max,
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id, int pool_size, int pool_stride, 
-			  int activation_id, // Relu, Tanh, ClipRelu
-			  float out_min, float out_max, int swing){ 
-
-  if(FP16_tuning){
-    if(ONLINE_PROFILING){
+void *Autotuner_ConvLayer(void *input, float i_min, float i_max, void *filter,
+                          float w_min, float w_max, void *bias, float b_min,
+                          float b_max, int conv_pad_h, int conv_pad_w,
+                          int conv_stride_h, int conv_stride_w, int pool_id,
+                          int pool_size, int pool_stride,
+                          int activation_id, // Relu, Tanh, ClipRelu
+                          float out_min, float out_max, int swing) {
+
+  if (FP16_tuning) {
+    if (ONLINE_PROFILING) {
       ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n");
     }
   }
 
-  swing = getSwing(swing);  
- 
-  if(isPromiseLayer(swing)){
-    
-    return PROMISE_Conv(input, i_min, i_max,
-			filter, w_min, w_max,
-			bias, b_min, b_max,
-			conv_pad_h, conv_pad_w,
-			conv_stride_h, conv_stride_w,
-			pool_id, pool_size, pool_stride,
-			activation_id, 
-			out_min, out_max, swing);
+  swing = getSwing(swing);
+
+  if (isPromiseLayer(swing)) {
+
+    return PROMISE_Conv(input, i_min, i_max, filter, w_min, w_max, bias, b_min,
+                        b_max, conv_pad_h, conv_pad_w, conv_stride_h,
+                        conv_stride_w, pool_id, pool_size, pool_stride,
+                        activation_id, out_min, out_max, swing);
   }
 
   assert(isGPULayer(swing));
 
-  return Autotuner_GPU_ConvLayer(input, i_min, i_max,
-			   filter, w_min, w_max,
-			   bias, b_min, b_max,
-			   conv_pad_h, conv_pad_w,
-			   conv_stride_h, conv_stride_w,
-			   pool_id, pool_size, pool_stride, 
-			   activation_id,
-			   out_min, out_max, swing);
-
+  return Autotuner_GPU_ConvLayer(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride,
+      activation_id, out_min, out_max, swing);
 }
 
-
-
-
-
 /**** Top-level API Unchanged for backwards compatibility  ***/
 
-void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
-			void* filter, float w_min, float w_max,
-			void* bias, float b_min, float b_max,
-			int conv_pad_h, int conv_pad_w,
-			int conv_stride_h, int conv_stride_w,
-			int pool_id, int pool_size,
-			int activation_id, // Relu, Tanh, ClipRelu
-			float out_min, float out_max, int swing){ 
-
-
-  return Autotuner_ConvLayer(input, i_min, i_max,
-			     filter, w_min, w_max,
-			     bias, b_min, b_max,
-			     conv_pad_h, conv_pad_w,
-			     conv_stride_h, conv_stride_w,
-			     pool_id, pool_size, pool_size, // FIXIT: Assumption pool_size == pool_strides
-			     activation_id,
-			     out_min, out_max, swing);
-  
-
+void *ConvLayer_PROMISE(void *input, float i_min, float i_max, void *filter,
+                        float w_min, float w_max, void *bias, float b_min,
+                        float b_max, int conv_pad_h, int conv_pad_w,
+                        int conv_stride_h, int conv_stride_w, int pool_id,
+                        int pool_size,
+                        int activation_id, // Relu, Tanh, ClipRelu
+                        float out_min, float out_max, int swing) {
+
+  return Autotuner_ConvLayer(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size,
+      pool_size, // FIXIT: Assumption pool_size == pool_strides
+      activation_id, out_min, out_max, swing);
 }
 
-
-
-
-void* ConvLayer_PROMISE2(void* input, float i_min, float i_max,
-			 void* filter, float w_min, float w_max,
-			 void* bias, float b_min, float b_max,
-			 int conv_pad_h, int conv_pad_w,
-			 int conv_stride_h, int conv_stride_w,
-			 int pool_id, int pool_size, int pool_stride, 
-			 int activation_id, // Relu, Tanh, ClipRelu
-			 float out_min, float out_max, int swing){ 
-
-
-  return Autotuner_ConvLayer(input, i_min, i_max,
-			     filter, w_min, w_max,
-			     bias, b_min, b_max,
-			     conv_pad_h, conv_pad_w,
-			     conv_stride_h, conv_stride_w,
-			     pool_id, pool_size, pool_stride, 
-			     activation_id,
-			     out_min, out_max, swing);
-  
-
+void *ConvLayer_PROMISE2(void *input, float i_min, float i_max, void *filter,
+                         float w_min, float w_max, void *bias, float b_min,
+                         float b_max, int conv_pad_h, int conv_pad_w,
+                         int conv_stride_h, int conv_stride_w, int pool_id,
+                         int pool_size, int pool_stride,
+                         int activation_id, // Relu, Tanh, ClipRelu
+                         float out_min, float out_max, int swing) {
+
+  return Autotuner_ConvLayer(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride,
+      activation_id, out_min, out_max, swing);
 }
 
+void *
+FCLayer_PROMISE(void *input, float i_min, float i_max, void *weights,
+                float w_min, float w_max, void *bias, float b_min, float b_max,
+                int activation_id, float out_min, float out_max,
+                int swing) { // NOTE: out_min, out_max apply to 'ClippedRelu'
 
+  swing = getSwing(swing);
 
+  if (isPromiseLayer(swing)) {
 
-
-
-
-void* FCLayer_PROMISE(void* input, float i_min, float i_max,
-		      void* weights, float w_min, float w_max,
-		      void* bias, float b_min, float b_max,
-		      int activation_id,
-		      float out_min, float out_max, int swing){ //NOTE: min_val, max_val apply to 'ClippedRelu'
-
-
-  swing = getSwing(swing);
-  
-  if(isPromiseLayer(swing)){
-
-    return PROMISE_FC(input, i_min, i_max,
-		      weights, w_min, w_max,
-		      bias, b_min, b_max,
-		      activation_id,
-		      out_min, out_max, swing);
+    return PROMISE_FC(input, i_min, i_max, weights, w_min, w_max, bias, b_min,
+                      b_max, activation_id, out_min, out_max, swing);
   }
 
   assert(isGPULayer(swing));
@@ -1576,18 +1164,12 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
   }
 
   return activation_out;
-
 }
 
 #endif
 
-
-
 #ifdef OLD_MODEL
 
 #endif
 
-#endif 
-
-
-
+#endif
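// ---------------------------------------------------------------------------
// Illustrative usage sketch of the layer-level wrapper API defined above.
// This is an assumed example, not code from the HPVM sources: the tensor
// handles (input, conv1_w, conv1_b, fc1_w, fc1_b) and the chosen ranges,
// padding, and pooling values are placeholders. Only the entry points
// (ConvLayer_PROMISE, FCLayer_PROMISE) and the swing encoding come from this
// file: swing < 8 selects the PROMISE path, 11 selects the FP32 GPU path,
// 12 FP16, 100-200 perforation, 200-300 sampling. When PROMISE_TUNER_ENABLED
// is defined and the autotuner drives the run, getSwing() may override the
// swing value passed in here.
static void example_conv_fc_layers(void *input, void *conv1_w, void *conv1_b,
                                   void *fc1_w, void *fc1_b) {
  // Convolution layer on the FP32 GPU path (swing 11 => isFullPrecision).
  void *c1 = ConvLayer_PROMISE(input, -1.0f, 1.0f,   // input + value range
                               conv1_w, -0.3f, 0.3f, // filter + value range
                               conv1_b, -0.1f, 0.1f, // bias + value range
                               1, 1,                 // conv padding h, w
                               1, 1,                 // conv strides h, w
                               0, 2,                 // pool_id, pool_size
                               -1,                   // no activation
                               0.0f, 0.0f, 11);      // out range, swing
  // Fully-connected layer on the PROMISE path (swing 7 => isPromiseLayer).
  void *out = FCLayer_PROMISE(c1, -1.0f, 1.0f, fc1_w, -0.3f, 0.3f, fc1_b,
                              -0.1f, 0.1f, -1, 0.0f, 0.0f, 7);
  (void)out; // the result would feed the next layer in a real network
}
// ---------------------------------------------------------------------------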
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
index c1848f126750808a9438a4d2cf7729d1bf420fd1..b97e5beadb7822cce12bdf2ee4d16407cd0483c4 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
@@ -1,13 +1,12 @@
 //===--------------------------- approxtechniques.cu ---------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file  consists of the custom implementation of software approximations
 // for tensor convolutions. The approximations implemented are feature sampling
-// and perforation for FP32 and FP16 compute precisions.  
+// and perforation for FP32 and FP16 compute precisions.
 //
 //===----------------------------------------------------------------------===//
- 
 
 #include "tensor_utils.h"
 #include "approx_utils.h"
@@ -17,406 +16,465 @@
 #include "fp16_conversion.h"
 #include "profiling.h"
 
-extern "C"{
-
-__global__ void convToGemm(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+extern "C" {
+
+__global__ void convToGemm(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmFullInput(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride,
-                    const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen
-                  if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-                      int output_col = filter_elem_num -
-                               ((filter_elem_num + skip_every)/skip_every);
-                        if(skip_every == 1) output_col = filter_elem_num;
-                        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                            output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                        input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                        else         
-                            output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                   }                
-                }              
-            }                
+__global__ void convToGemmFullInput(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h =
+      tx % (H_out * W_out) / W_out;     // output height index (row number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) {
+          int output_col =
+              filter_elem_num - ((filter_elem_num + skip_every) / skip_every);
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
+    }
+  }
 }
 
-__global__ void convToGemmHalfInputNew(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-      
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if(filter_elem_num % skip_every != skip_offset) {
-                      int output_col = filter_elem_num -
-                                        (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                     if(skip_every == 1) output_col = filter_elem_num;
-                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                          output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                      else
-                          output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                  }
-              }
-           }
+__global__ void
+convToGemmHalfInputNew(__half *const __restrict__ output,
+                       const __half *const __restrict input, const int N,
+                       const int C, const int H, const int W, const int KH,
+                       const int KW, const int V_pad, const int H_pad,
+                       const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int reduced_filter_elem,
+                       const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_offset) {
+          int output_col =
+              filter_elem_num - (filter_elem_num / skip_every +
+                                 (filter_elem_num % skip_every > skip_offset));
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+        }
       }
+    }
+  }
 }
 
-
-__global__
-void convToGemmHalf(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW,
-                    const int V_pad, const int H_pad,
-                    const int H_out, const int W_out,
-                    const int V_stride, const int H_stride){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i
-    const int n = tx / (C * H_out * W_out); //output image numbe
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number
-    const int w = tx % W_out; //output width index (col number
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
-                                            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                } else {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
-                }
-            }
+__global__ void convToGemmHalf(__half *const __restrict__ output,
+                               const __half *const __restrict input,
+                               const int N, const int C, const int H,
+                               const int W, const int KH, const int KW,
+                               const int V_pad, const int H_pad,
+                               const int H_out, const int W_out,
+                               const int V_stride, const int H_stride) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        } else {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output,
-                                        const __half * const __restrict input,
-                                        const int N, const int C,
-                                        const int H, const int W,
-                                        const int KH, const int KW, const int V_pad,
-                                        const int H_pad, const int H_out,
-                                        const int W_out, const int V_stride,
-                                        const int H_stride, const int reduced_filter_elem,
-                                        const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                     const int output_col = condition * filter_elem_num 
-                                    + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) 
-                                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));                   		     
-                    const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                       output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                     else
-                       output[out_index] = 0;
-              }
-            }
+__global__ void convToGemmHalfInputNewIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
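+        // Keep only filter elements whose index is not congruent to
+        // skip_offset (mod skip_every); output_col is the element's position
+        // after the skipped ones are removed.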
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+          const int out_index =
+              ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
+          //((output_col*N + n) * H_out + h) * W_out + w;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output,
-                                                const __half * const __restrict input,
-                                                const int N, const int C,
-                                                const int H, const int W,
-                                                const int KH, const int KW, const int V_pad,
-                                                const int H_pad, const int H_out,
-                                                const int W_out, const int V_stride,
-                                                const int H_stride, const int reduced_filter_elem,
-                                                const int skip_every, const int skip_offset) {
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_out * W_out); //output image number
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-
-	        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                    const int output_col = condition * filter_elem_num
-                                        + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
-                                        - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-
-                    const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
-                    
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
+__global__ void convToGemmHalfInputNewIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+
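+          // Same re-packing as above, but written in (col, N, H_out, W_out)
+          // layout instead of (N, col, H_out, W_out).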
+          const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
+
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-
-
-__global__ void convToGemmHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void convToGemmHalf2(__half *const __restrict__ output,
+                                const __half *const __restrict input,
+                                const int N, const int C, const int H,
+                                const int W, const int KH, const int KW,
+                                const int V_pad, const int H_pad,
+                                const int H_out, const int W_out,
+                                const int V_stride, const int H_stride,
+                                const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) { 
-      for(int j = 0; j < KW; j++) { 
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element 
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmPerfRow(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void
+convToGemmPerfRow(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
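+    // Map row h of the perforated output (H_eff rows) to the original output
+    // row index, skipping rows start, start + x, start + 2x, ...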
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-
-    for(int i = 0; i < KH; i++) {
-        for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
-	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[out_index] = 0;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w,
-			  float *old_data, float *new_data, int x, int start){
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-    
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) { 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) { 
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                (old_data[output_index] + old_data[output_index - w]) / 2;
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); 
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
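+    // Rebuild all h rows from the old_h computed rows: copy rows that were
+    // computed, and fill each skipped row with the average of its two
+    // neighbouring computed rows (boundary rows copy the nearest one).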
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - w]) / 2;
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfCol(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
+__global__ void
+convToGemmPerfCol(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
-    const int inW = w_index * H_stride - H_pad; 
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW  + i * KW + j; //index of this filter element
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w,
-			                        float *old_data, float *new_data, int x, int start) { 
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-   	 const int row = tx % (h * w) / w; //output height index (row number)
-    	const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] 
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-       		int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                	    (old_data[output_index] + old_data[output_index - 1]) / 2;
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);  
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-       	 	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
+__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - 1]) / 2;
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void convToGemmPerfRowHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-
-    
-   for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-    	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
           output[out_index] = 0;
       }
@@ -424,844 +482,903 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
   }
 }
 
-__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_eff * W_out); //output image numbe
-    if(n < N) { 
-        const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-        const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        int h_index;                   
-        if(h < start) {                
-            h_index = h;               
-        } else {                       
-            h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;                                                            
-        }                              
-        const int inH = h_index * V_stride - V_pad;
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-
-
-	for(int i = 0; i < KH; i++) {
-	  for(int j = 0; j < KW; j++) {
-	    const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-	    const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
-
-	    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	      output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	    else
-	      output[out_index] = 0;
-
-	  }
-	}
-	
+__global__ void convToGemmPerfRowHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
+    int h_index;
+    if (h < start) {
+      h_index = h;
+    } else {
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inH = h_index * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-				__hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col];
-       } else if(row == h-1) {
-           new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            const int row_index = row - ((row + 1 - start) / x);
-            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
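+    // Note: old_data is indexed channel-major here, i.e. as (c, b, old_h, w).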
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      const int row_index = row - ((row + 1 - start) / x);
+      const int output_index =
+          ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      const int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      const int output_index =
+          ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-
-__global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
+__global__ void convToGemmPerfColHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inW = w_index * H_stride - H_pad;
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
           output[out_index] = 0;
-
       }
     }
   }
 }
 
-__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                        const int H, const int W, const int KH, const int KW, const int V_pad,
-                        const int H_pad, const int H_out, const int W_out, const int V_stride,
-                        const int H_stride, const int x, const int start, const int W_eff){
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_eff); //output image number
-      if(n < N) {
-          const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-          const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-          const int w = tx % W_eff; //output width index (col number)
-          int w_index;
-          if(w < start) {
-              w_index = w;
-          } else {
-              w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
-          }
-          const int inW = w_index * H_stride - H_pad;
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-
-
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
-                  const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
-                  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                  else
-                      output[out_index] = 0;
-              }
-        }
+__global__ void convToGemmPerfColHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
+    int w_index;
+    if (w < start) {
+      w_index = w;
+    } else {
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
     }
+  }
 }
 
-
-__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-    	const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-        	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-			__hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
-   }
+__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
+    }
+  }
 }
 
-__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //output chan number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-        if(col < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                        = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
-   
-        } else if(col == w - 1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-   
-        } else if (col == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
-   
-        } else if((col - start) % x == 0) {
-            const int col_index = col - ((col + 1 - start) / x);
-            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-        } else {
-            const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
+
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
+
+    } else if ((col - start) % x == 0) {
+      const int col_index = col - ((col + 1 - start) / x);
+      const int output_index =
+          ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      const int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      const int output_index =
+          ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
+__global__ void
+convToGemmFullInputRegular(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int ch = (fi * C) / reduced_filter_elem;
+      const int offset = (skip_offset + ch) % skip_every;
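+      // Map compacted filter index fi back to the original filter element
+      // in_index; elements congruent to offset (mod skip_every) were skipped.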
+      int in_index;
+      if (fi < offset) {
+        in_index = fi;
+      } else {
+        in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                   (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                   offset - 1;
+      }
 
-__global__ void convToGemmFullInputRegular(float * const __restrict__ output,
-				    const float * const __restrict input,
-				    const int N, const int C,
-				    const int H, const int W,
-				    const int KH, const int KW, const int V_pad,
-				    const int H_pad, const int H_out,
-				    const int W_out, const int V_stride,
-				    const int H_stride, const int reduced_filter_elem,
-				    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (H_out * W_out); //output image number
-  if(n < N) {
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         const int ch = (fi * C) / reduced_filter_elem;
-         const int offset = (skip_offset + ch) % skip_every;
-         int in_index;
-         if(fi < offset) {
-             in_index = fi;
-         } else {
-             in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                        + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-        }
-	 
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; 
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            int in_index;
-            if(fi < skip_offset) {
-                in_index = fi;
-            } else {
-                in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                            + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-            }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmFullInputIrregular(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
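+  // Same expansion as the regular variant, but the skip pattern runs over
+  // the flattened C*KH*KW filter index, so the source channel is recovered
+  // from in_index instead of being derived from fi.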
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      int in_index;
+      if (fi < skip_offset) {
+        in_index = fi;
+      } else {
+        in_index =
+            ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+            (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+            skip_offset - 1;
+      }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
-
-    
+  }
 }
 
-__global__ void createReducedFiltersFullRegular(float * output,
-						const float * const __restrict input, const int NF,
-						const int num_filter_elem, const int reduced_filter_elem, 
-						const int channels,
-						const int skip_every, const int skip_offset, const float fac) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) { 
-    const int offset = tx % reduced_filter_elem; //offset within filter
+__global__ void createReducedFiltersFullRegular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
+
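+  // Compact each filter: keep only the sampled elements and scale them by
+  // `fac` to compensate for the dropped weights.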
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
-      int in_index;
-      if(offset < channel_offset) {
-        in_index = offset;
-      }
-      else {
-         in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                  + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-      }
-      
-      output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+    int in_index;
+    if (offset < channel_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+           0) +
+          channel_offset - 1;
+    }
+
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
   }
 }
 
-__global__ void createReducedFiltersFullIrregular(float * output,
-                     const float * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int fIdx = tx / reduced_filter_elem; //filter index
-      if(fIdx < NF) {
-        const int offset = tx % reduced_filter_elem; //offset within filter
-        int in_index;
-        if(offset < skip_offset) {
-            in_index = offset;
-        } else {
-            in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; 
-        }
-        output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+__global__ void createReducedFiltersFullIrregular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
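+  // Filter compaction with the skip pattern applied to the flattened
+  // filter index rather than per channel.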
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
+    int in_index;
+    if (offset < skip_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+          skip_offset - 1;
     }
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
+  }
 }
 
-__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-      #pragma unroll
-      for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-         const int fi = ch * (reduced_filter_elem / C) + ki;
-         const int offset = (skip_offset + ch) % skip_every;
-   
-         const bool condition = (fi < offset);
-         const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-  
-         const int i = (in_index % (KW * KH)) / KW;
-         const int j = in_index % KW;
-         const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-         if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
-             output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-         } else {
-            output[out_index] = 0;
-         }
+__global__ void
+convToGemmHalfInputRegular(__half *const __restrict__ output,
+                           const __half *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
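+  // FP16 counterpart of convToGemmFullInputRegular; the index mapping uses
+  // a branch-free select via `condition` instead of an if/else.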
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      const int offset = (skip_offset + ch) % skip_every;
+
+      const bool condition = (fi < offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C, 
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      if(n < N) {
-           const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-          const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-          const int w = tx % W_out; //output width index (col number)
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          const int inW = w * H_stride - H_pad; //input width index (col number)
-          
-          #pragma unroll
-           for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-
-	      const int fi = ch * (reduced_filter_elem / C) + ki;	          
-              const int offset = (skip_offset + ch) % skip_every;
-              const int condition = (fi < offset);
-              const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                          + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-         
-              const int i = (in_index % (KW * KH)) / KW;
-              const int j = in_index % KW;
-              const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-              if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                  output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-              }
-	      else {
-                  output[out_index] = 0;
-             }
-        }
+__global__ void convToGemmHalfInputRegular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
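+  // Same as convToGemmHalfInputRegular, except the unrolled matrix is laid
+  // out filter-element-major: out_index places fi outermost rather than n.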
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      const int offset = (skip_offset + ch) % skip_every;
+      const int condition = (fi < offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-
-	    const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW; 
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            }
-	    else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmHalfInputIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
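+  // FP16 counterpart of convToGemmFullInputIrregular (per-image layout).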
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-       #pragma unroll 
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                 + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-      
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmHalfInputIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
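+  // Irregular FP16 expansion written in filter-element-major order
+  // (fi outermost in out_index).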
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
+__global__ void createReducedFiltersHalfRegular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
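+  // FP16 filter compaction: sampled weights are scaled by `fac` using
+  // half-precision intrinsics (__float2half_rn / __hmul).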
 
-__global__ void createReducedFiltersHalfRegular(__half * output,
-                                         const __half * const __restrict input, const int NF,
-                                         const int num_filter_elem, const int reduced_filter_elem,
-                     			 const int channels,
-                                         const int skip_every, const int skip_offset, const float fac) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
 
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) {
-    const int offset = tx % reduced_filter_elem; //offset within filter
+  const int fIdx = tx / reduced_filter_elem; // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
     const int condition = (offset < channel_offset);
-    const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                          + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
-      
-    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
- }
-  
+    const int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             channel_offset - 1);
+
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
+  }
 }
 
-__global__ void createReducedFiltersHalfIrregular(__half * output,
-                     const __half * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
+__global__ void createReducedFiltersHalfIrregular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
 
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  
-  if(fIdx < NF) {
+  if (fIdx < NF) {
 
-    const int offset = tx % reduced_filter_elem; //offset within filter
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int condition = (offset < skip_offset);
-    
-    int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-        
-    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
-  }
-      
-}
 
+    int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             skip_offset - 1);
 
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
+  }
+}
 
-//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols
-__global__ void convToGemmApprox(float * const __restrict__ output,
-				 const float * const __restrict input, const int N, const int C,
-				 const int H, const int W,
-				 const int KH, const int KW, const int V_pad,
-				 const int H_pad, const int H_out,
-				 const int W_out, const int V_stride,
-				 const int H_stride, const int reduced_filter_elem,
-				 const int skip_every) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
-	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //cal output column, taking skipping into account
-	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	  else
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
-	}
+// Produces N column-major matrices with H_out*W_out rows and
+// reduced_filter_elem cols.
+__global__ void
+convToGemmApprox(float *const __restrict__ output,
+                 const float *const __restrict input, const int N, const int C,
+                 const int H, const int W, const int KH, const int KW,
+                 const int V_pad, const int H_pad, const int H_out,
+                 const int W_out, const int V_stride, const int H_stride,
+                 const int reduced_filter_elem, const int skip_every) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output channel
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every !=
+            skip_every - 1) { // are we including this filter element?
+          // calculate output column, taking skipping into account
+          const int output_col =
+              filter_elem_num - (filter_elem_num / skip_every);
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((n * reduced_filter_elem + output_col) * H_out + h) *
+                       W_out +
+                   w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((n * reduced_filter_elem + output_col) * H_out + h) *
+                       W_out +
+                   w] = 0;
+        }
       }
     }
   }
 }
 
-
 /// This function serves as an API with the custom implementation of convolution
-/// with the perforation and filter sampling support. The compute precison is FP32.
-/// This routine is invoked by the tuner for tuning approximations for convolutions.
+/// with the perforation and filter sampling support. The compute precision is
+/// FP32. This routine is invoked by the tuner for tuning approximations for
+/// convolutions.
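+/// `row` > 1 perforates output rows and `col` > 1 perforates output columns:
+/// every `row`-th row (or `col`-th column) starting at `start` is skipped in
+/// the reduced GEMM and reconstructed afterwards by interpolating neighbors.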
 ///
-void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad, int vertical_stride,
-			 int horizontal_stride, int conv_mode, int conv_groups,
-			 int row, int col, int start){
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int row, int col, int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
-  
-  Tensor* output;
+
+  Tensor *output;
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
   convertToFP32(input);
   convertToFP32(filter);
-  
+
   long int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int rem_row = (h - start) % row > 0;
   int h_eff = h - ((h - start) / row) - rem_row;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int rem_col = (w - start) % col > 0;
   int w_eff = w - ((w - start) / col) - rem_col;
 
-  Tensor* new_output;
-  if(row > 1){
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+  Tensor *new_output;
+  if (row > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float* convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, start, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
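+    // Row-perforated path: build the im2col matrix with only h_eff of the h
+    // output rows, run a batched SGEMM against the filters, then interpolate
+    // the skipped rows to recover the full h x w output.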
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, start, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w,
-					      num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h_eff * w, c * h_eff * w,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateRow<<<numBlocks, 128>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(col > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, start, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, start, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData,
-					      h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h * w_eff, c * h * w_eff,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateCol<<<numBlocks, 128>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else { 
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
+  } else {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    convToGemmApprox<<<gridSize, blockSize>>>(convData,
-					      (float *)input->gpu_data, n,
-					      input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2],
-					      input->dims.dim_sizes[3],
-					      KH, KW,
-					      vertical_pad, horizontal_pad, h, w,
-					      vertical_stride, horizontal_stride,
-					      num_filter_elem, c * h * w);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApprox<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, c * h * w);
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication
-    //Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
-    
+    // Do the matrix multiplication
+    // Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w, num_filter_elem * h * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w, c * h * w,
-					      n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w, c * h * w, n));
 
     new_output = output;
     cudaFree(convData);
   }
 
-  //Event("Conv_end"); //, true);
+  // Event("Conv_end"); //, true);
   return new_output;
 }
 
-__global__
-void switchMatrixFull(int N, int n, int c, int h, int w,
-              float *old_data, float *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-          int col = ((i % (c * h * w)) % (h * w)) % w;
-          int row = ((i % (c * h * w)) % (h * w)) / w;
-          int ch = (i % (c * h * w)) / (h * w);
-          int n_new = i / (c * h * w);
-          
-          new_data[((n_new * c + ch) * h + row ) * w + col] =
-                        old_data[((ch * n + n_new) * h + row ) * w + col];
-        }
-}
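+// Reorders data from [c][n][h][w] layout into standard NCHW [n][c][h][w].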
+__global__ void switchMatrixFull(int N, int n, int c, int h, int w,
+                                 float *old_data, float *new_data) {
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
+
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
 
 /// This function serves as an API with the custom implementation of convolution
-/// with the perforation and filter sampling support. The compute precison is FP32.
+/// with the perforation and filter sampling support. The compute precision is
+/// FP32.
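+/// Dispatches to row perforation (row > 1), column perforation (col > 1),
+/// or filter sampling (skip_every > 1).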
 ///
-void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-		       int vertical_pad, int horizontal_pad, int vertical_stride,
-		       int horizontal_stride, int conv_mode, int conv_groups,
-		       int row, int col, int skip_every, int offset){
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode, int conv_groups,
+                       int row, int col, int skip_every, int offset) {
 
   //////INFO("*** TensorConvolution approximation \n");
-  //Event("Conv");
+  // Event("Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -1275,15 +1392,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////Event("H2F_end");
 
   const int n = input->dims.dim_sizes[0];
-  const int c = filter->dims.dim_sizes[0]; //number of filters
+  const int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
   ////INFO("batch: %d\n", n);
@@ -1296,619 +1416,572 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////INFO("horizontal_stride: %d\n", horizontal_stride);
   ////INFO("output height: %d\n", h);
   ////INFO("output width: %d\n", w);
-  if(row > 1) {
+  if (row > 1) {
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
 
-    Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, offset, h_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n",
+    ////     (n * input->dims.dim_sizes[1] * h_eff * w));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, offset, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                &alpha,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (float *)filter->gpu_data, num_filter_elem, 0,
-                                                &beta,
-                                                (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-                                                n));
-    //interpolate
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateRow<<<numBlocks, blocksize>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(col > 1) {
+  } else if (col > 1) {
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, offset, w_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n",
+    ////     (n * input->dims.dim_sizes[1] * h * w_eff));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, offset, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
-					      n));
-
-    //interpolate
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateCol<<<numBlocks, blocksize>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(skip_every > 1) {
-    //reduced number after skipping
+  } else if (skip_every > 1) {
+    // reduced number after skipping
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    float* convData;
+    float *convData;
     size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    float* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
-    
+    float *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
+
     const int filtBlockSize = 128;
     ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac =  ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
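+    // `fac` rescales the retained weights to compensate for the filter
+    // elements dropped by sampling.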
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     //////INFO("fac: %f\n", fac);
     const int blockSize = 128;
-    //////INFO("n * h * w : %d\n", (n * h * w ));    
-    const int gridSize = (n * h * w + blockSize - 1) / blockSize;  
-    if(!(KH * KW % skip_every)) {
-       // ////INFO("REGULAR FILTERING\n");
-        createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                (float *)filter->gpu_data,
-								c, num_filter_elem,
-								reduced_filter_elem,
-								input->dims.dim_sizes[1], skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+    //////INFO("n * h * w : %d\n", (n * h * w ));
+    const int gridSize = (n * h * w + blockSize - 1) / blockSize;
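+    // Regular path when the filter size (KH * KW) is a multiple of skip_every;
+    // otherwise the irregular kernels handle the uneven element skipping.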
+    if (!(KH * KW % skip_every)) {
+      // ////INFO("REGULAR FILTERING\n");
+      createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+          fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputRegular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     } else {
-       // ////INFO("IRREGULAR FILTERING\n");
-        createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (float *)filter->gpu_data,
-                                    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,     
-                                                                input->dims.dim_sizes[1],                                                     
-                                                                input->dims.dim_sizes[2],                                                 
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
+      // ////INFO("IRREGULAR FILTERING\n");
+      createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, skip_every, offset, fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputIrregular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     }
     checkCudaErrors(cudaDeviceSynchronize());
-    
+
     const float alpha = 1.0;
     const float beta = 0.0;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            &alpha,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+        &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter,
+        reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w,
+        c * h * w, n));
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
 
-      //INFO("FP32 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-                               CUDNN_TENSOR_NCHW, n, c, h, w);
+    // INFO("FP32 BASELINE\n");
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w));
-    convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w, vertical_stride, horizontal_stride, 
-                           skip_every, offset);//num_filter_elem);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h * w));
+    convToGemmFullInput<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        skip_every, offset); // num_filter_elem);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     /*
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                          CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, num_filter_elem,
-                                            &alpha,
-                                            convData, h * w, num_filter_elem * h * w,
-                                            (float *)filter->gpu_data, num_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
-    */
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                       n * h * w, c, num_filter_elem,
-                        &alpha,
-                        convData,
-                        CUDA_R_32F, n * h * w,
-                        (float *) filter->gpu_data, CUDA_R_32F,
-                        num_filter_elem,
-                        &beta,
-                        (float *) output->gpu_data,
-                        CUDA_R_32F, n * h * w,
-                        CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-    
-    const int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (float *)output->gpu_data,
-                                    (float *)new_output->gpu_data);
-    
+
+    float alpha = 1.0f, beta = 0.0f;
+    /*
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)new_output->gpu_data, h * w, c * h * w, n));
+    */
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data,
+        CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data,
+        CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
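+    // The single GEMM above leaves the result in [c][n][h][w] order;
+    // switchMatrixFull reorders it into the NCHW layout of new_output.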
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (float *)output->gpu_data,
+                                         (float *)new_output->gpu_data);
+
     checkCudaErrors(cudaDeviceSynchronize());
     cudaFree(convData);
   }
 
-  //Event("Conv_end");
+  // Event("Conv_end");
   return new_output;
 }
 
-__global__
-void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-            int col = ((i % (c * h * w)) % (h * w)) % w;
-            int row = ((i % (c * h * w)) % (h * w)) / w;
-            int ch = (i % (c * h * w)) / (h * w);
-            int n_new = i / (c * h * w);
-            
-            new_data[((n_new * c + ch) * h + row ) * w + col] =
-                            old_data[((ch * n + n_new) * h + row ) * w + col];
-      }
-}
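+// Reorders a half-precision tensor from the [c][n][h][w] layout produced by
+// the batched GEMM back into NCHW.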
+__global__ void switchMatrixHalf(int N, int n, int c, int h, int w,
+                                 __half *old_data, __half *new_data) {
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
+
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
 
-/// This function serves as an API to custom implementation of the 
+/// This function serves as an API to custom implementation of the
 /// half-precision convolution with the perforation and filter sampling
-/// support. 
+/// support.
 ///
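+/// A sketch of a call site (argument values are illustrative, not taken from
+/// a real benchmark): 2x row perforation starting at offset 1, with no column
+/// perforation (col = 1) and no filter sampling (skip_every = 1):
+///
+///   void *out = tensorConvApproxHalf2(input, filter,
+///                                     /*vertical_pad=*/1, /*horizontal_pad=*/1,
+///                                     /*vertical_stride=*/1,
+///                                     /*horizontal_stride=*/1,
+///                                     /*conv_mode=*/1, /*conv_groups=*/1,
+///                                     /*row=*/2, /*col=*/1,
+///                                     /*skip_every=*/1, /*offset=*/1);
+///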
-void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad,
-			   int vertical_stride, int horizontal_stride,
-			   int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset) {
-
- //INFO("*** TensorConvolution half approximation \n");
- // profileEvent("#Conv");
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups, int row, int col, int skip_every,
+                            int offset) {
+
+  // INFO("*** TensorConvolution half approximation \n");
+  // profileEvent("#Conv");
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
 
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
-  
+
   profileEvent("F2H_start");
-   convertToFP16(input);
-   convertToFP16(filter);
+  convertToFP16(input);
+  convertToFP16(filter);
   profileEvent("F2H_end");
-  
+
   const long int n = input->dims.dim_sizes[0];
-  const long int c = filter->dims.dim_sizes[0]; //number of filters
+  const long int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   changeTensorPlacement(new_output, DEVICE);
-  //INFO("batch: %d\n", n);
+  // INFO("batch: %d\n", n);
   // INFO("channels: %d\n", input->dims.dim_sizes[1]);
   // INFO("num_filters: %d\n", c);
   // INFO("kernel height: %d\n", KH);
-  // INFO("kernel width: %d\n", KW);   
+  // INFO("kernel width: %d\n", KW);
   // INFO("num_filter_elem: %d\n", num_filter_elem);
-   //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
-   //INFO("vertical_stride: %d\n", vertical_stride);
-   //INFO("horizontal_stride: %d\n", horizontal_stride);
+  // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
+  // INFO("vertical_stride: %d\n", vertical_stride);
+  // INFO("horizontal_stride: %d\n", horizontal_stride);
   // INFO("output height: %d\n", h);
   // INFO("output width: %d\n", w);
-   //INFO("skip_every: %d\n", skip_every);
+  // INFO("skip_every: %d\n", skip_every);
   const __half alf = approx_float_to_half(1.0);
   const __half bet = approx_float_to_half(0.0);
   const __half *alpha_half = &alf;
   const __half *beta_half = &bet;
 
-  if(row > 1){
+  if (row > 1) {
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
-    
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW,
-						  n, c, h_eff, w);
+
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w);
     changeTensorPlacement(output_half, DEVICE);
 
-    __half * convData;
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    
+
     const int patchBlockSize = 256;
-    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / patchBlockSize;
+    const int numPatchBlocks =
+        (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) /
+        patchBlockSize;
     const int interpolationBlocksize = 256;
-    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
-    if(h * w <= 64) {
-        //INFO("H *W <= 64\n");
-        convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(convData,
-                                   (__half *)input->gpu_half_data, n,
-                                   input->dims.dim_sizes[1],
-                                   input->dims.dim_sizes[2],
-                                   input->dims.dim_sizes[3],
-                                   KH, KW, vertical_pad,
-                                   horizontal_pad, h, w, vertical_stride,
-                                   horizontal_stride, row, offset, h_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                         n * h_eff * w, c, num_filter_elem,
-                         alpha_half,
-                         convData, CUDA_R_16F, n * h_eff * w,
-                         (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                         beta_half,
-                         (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
-                         CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-        approxInterpolateRowHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-                                        (__half *)output_half->gpu_half_data,
-                                        (__half *)new_output->gpu_half_data,
-                                        row, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-    
-    } else {
-        //INFO("H *W > 64\n");
-        convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(convData,
-						   (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3],
-						   KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, row, offset, h_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                alpha_half,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                                beta_half,
-                                                (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w,
-                                                n));    
-        
-        approxInterpolateRowHalf<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						row, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
+    const int numInterpolationBlocks =
+        (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize;
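+    // Small output maps (h * w <= 64): fold the whole batch into one GEMM via
+    // cublasGemmEx; larger maps use a strided-batched Hgemm over the batch.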
+    if (h * w <= 64) {
+      // INFO("H *W <= 64\n");
+      convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      approxInterpolateRowHalf2<<<numInterpolationBlocks,
+                                  interpolationBlocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
 
+    } else {
+      // INFO("H *W > 64\n");
+      convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+          alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n));
+
+      approxInterpolateRowHalf<<<numInterpolationBlocks,
+                                 interpolationBlocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
     }
     freeTensor(output_half);
     cudaFree(convData);
-} else if(col > 1) {
+  } else if (col > 1) {
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff);
     changeTensorPlacement(output_half, DEVICE);
-   
-    __half * convData;
+
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    
+
     const int patchBlockSize = 256;
-    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / patchBlockSize;
+    const int numPatchBlocks =
+        (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) /
+        patchBlockSize;
     const int interpolationBlocksize = 256;
-    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
-   if(h * w <= 64) {
-         //INFO("H *W <= 64\n");
-        convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, col, offset, w_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w_eff, c, num_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w_eff,
-                                        (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-         approxInterpolateColHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                                        (__half *)output_half->gpu_half_data,
-                                                        (__half *)new_output->gpu_half_data,
-                                                        col, offset);
-          checkCudaErrors(cudaDeviceSynchronize());
+    const int numInterpolationBlocks =
+        (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize;
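+    // Same small-vs-large output-map split as the row-perforation path above.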
+    if (h * w <= 64) {
+      // INFO("H *W <= 64\n");
+      convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      approxInterpolateColHalf2<<<numInterpolationBlocks,
+                                  interpolationBlocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
     } else {
-        //INFO("H *W > 64\n");
-        convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
-		                            				   input->dims.dim_sizes[1],
-                                                       input->dims.dim_sizes[2],
-                                                       input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                       horizontal_pad, h, w, vertical_stride,
-                                                       horizontal_stride, col, offset, w_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-    
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                              CUBLAS_OP_N, CUBLAS_OP_N,
-                                              h * w_eff, c, num_filter_elem,
-                                              alpha_half,
-                                              convData, h * w_eff, num_filter_elem * h * w_eff,
-                                              (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                              beta_half,
-                                              (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff,
-                                              n));
-
-         approxInterpolateColHalf<<<numInterpolationBlocks,interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                 (__half *)output_half->gpu_half_data,
-                                 (__half *)new_output->gpu_half_data,
-                                 col, offset);
-         checkCudaErrors(cudaDeviceSynchronize());
+      // INFO("H *W > 64\n");
+      convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+          alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n));
+
+      approxInterpolateColHalf<<<numInterpolationBlocks,
+                                 interpolationBlocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
     }
 
     freeTensor(output_half);
     cudaFree(convData);
-  } else if(skip_every > 1) {
+  } else if (skip_every > 1) {
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    __half* convData;
+    __half *convData;
     size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    __half* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+    __half *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
 
     const int filtBlockSize = 256;
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac =  ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     const int blockSize = 256;
-    //const int gridSize = (n * h * w + blockSize - 1) / blockSize;
-   // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
-   // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
+    // const int gridSize = (n * h * w + blockSize - 1) / blockSize;
+    // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
+    // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
     const __half alf = approx_float_to_half(1.0);
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
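+    // Smaller filter workloads use a strided-batched Hgemm straight into
+    // new_output; larger ones run one large GemmEx into a temporary tensor and
+    // reorder it with switchMatrixHalf below.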
-    if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { 
-      if(!(KH * KW % skip_every)) {
-        //INFO("---REGULAR FILTERING\n");
-        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                (__half *)filter->gpu_half_data,
-								c, num_filter_elem,
-                                                                reduced_filter_elem,
-                                                                input->dims.dim_sizes[1], skip_every, offset, fac);
+    if (c * num_filter_elem < 500000) {
+      // previously tried cutoffs: 250 and c * reduced_filter_elem < 150000
+      if (!(KH * KW % skip_every)) {
+        // INFO("---REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
         checkCudaErrors(cudaDeviceSynchronize());
-	
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
       } else {
-        //INFO("---IRREGULAR FILTERING\n");
-        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (__half *)filter->gpu_half_data,
-				    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
+        // INFO("---IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
         checkCudaErrors(cudaDeviceSynchronize());
-        
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1]  + blockSize - 1) / blockSize;
-	    //convToGemmHalfInputIrregular
-        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,  
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-     }   
-     checkCudaErrors(cudaDeviceSynchronize());
-
-     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            alpha_half,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            beta_half,
-                                            (__half *)new_output->gpu_half_data, h * w, c * h * w,
-                                            n));
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        // convToGemmHalfInputIrregular
+        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+          alpha_half, convData, h * w, reduced_filter_elem * h * w,
+          reducedFilter, reduced_filter_elem, 0, beta_half,
+          (__half *)new_output->gpu_half_data, h * w, c * h * w, n));
     } else {
-        Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                 CUDNN_TENSOR_NCHW, n, c, h, w);
-        changeTensorPlacement(output_half, DEVICE);
-
-        if(!(KH * KW % skip_every)) {
-           //INFO("REGULAR FILTERING\n");
-            createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                        (__half *)filter->gpu_half_data,
-                                                        c, num_filter_elem,
-                                                        reduced_filter_elem,
-                                                        input->dims.dim_sizes[1], skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-        } else {
-           // INFO("IRREGULAR FILTERING\n");
-            createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                            (__half *)filter->gpu_half_data,
-                                                                            c, num_filter_elem,
-                                                                            reduced_filter_elem,
-                                                                            skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-            }
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w, c, reduced_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w,
-                                         reducedFilter, CUDA_R_16F, reduced_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-            
-            int numBlocks = (n * c * h * w  + 255) / 256;
-            switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (__half *)output_half->gpu_half_data,
-                                    (__half *)new_output->gpu_half_data);
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            freeTensor(output_half);
+      Tensor *output_half = (Tensor *)create4DTensor(
+          (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
+      changeTensorPlacement(output_half, DEVICE);
+
+      if (!(KH * KW % skip_every)) {
+        // INFO("REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      } else {
+        // INFO("IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
+          reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half,
+          (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w,
+          CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      int numBlocks = (n * c * h * w + 255) / 256;
+      switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                           (__half *)output_half->gpu_half_data,
+                                           (__half *)new_output->gpu_half_data);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      freeTensor(output_half);
     }
-    
+
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
-       //INFO("FP16 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                   CUDNN_TENSOR_NCHW, n, c, h, w);
-      
-      changeTensorPlacement(output, DEVICE);
-      __half * convData;
-      long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
-      checkCudaErrors(cudaMalloc(&convData, convDataSize));
-      
-      const int blockSize = 256;
-      const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-      //convToGemmHalf
-      convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData,
-                                                (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3],
-                                                KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, num_filter_elem,
-                                                skip_every, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        const __half alf = approx_float_to_half(1.0);
-        const __half bet = approx_float_to_half(0.0);
-        const __half *alpha_half = &alf;
-        const __half *beta_half = &bet;
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                    n * h * w, c, num_filter_elem,
-                                    alpha_half,
-                                    convData, CUDA_R_16F, n * h * w,
-                                    (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                    beta_half,
-                                    (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w,
-                                    CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        
-        const int numBlocks = (n * c * h * w  + 255) / 256;
-        switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data,
-                                            (__half *)new_output->gpu_half_data);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        freeTensor(output);
-        cudaFree(convData);
+    // INFO("FP16 BASELINE\n");
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    changeTensorPlacement(output, DEVICE);
+    __half *convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    // convToGemmHalf
+    convToGemmHalfInputNew<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, skip_every, offset);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        alpha_half, convData, CUDA_R_16F, n * h * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F,
+        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (__half *)output->gpu_half_data,
+                                         (__half *)new_output->gpu_half_data);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    freeTensor(output);
+    cudaFree(convData);
   }
 
   profileEvent("H2F_start");
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
index 6e9f88bb54e5655b18d72fc88e5a08a2478ea9fc..bdcfb2c5684d1584e1a520194066fc20e3724632 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
@@ -7,429 +7,489 @@
 #include "fp16_conversion.h"
 #include "profiling.h"
 
-extern "C"{
-
-__global__ void convToGemm(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+extern "C" {
+
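+// im2col: expands each output position into its KH * KW receptive-field
+// elements so the convolution can be computed as a GEMM.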
+__global__ void convToGemm(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmFullInput(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride,
-                    const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen
-                  if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-                      int output_col = filter_elem_num -
-                                ((filter_elem_num + skip_every)/skip_every);
-                       if(skip_every == 1)
-                           output_col = filter_elem_num;
-                        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                            output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                        input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                        else         
-                            output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                    }                
-                }              
-            }                
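+// im2col variant that drops every skip_every-th filter element (shifted by
+// skip_offset) and lays the patches out with the batch folded into the GEMM
+// row dimension.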
+__global__ void convToGemmFullInput(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h =
+      tx % (H_out * W_out) / W_out;     // output height index (row number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) {
+          int output_col =
+              filter_elem_num - ((filter_elem_num + skip_every) / skip_every);
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
+    }
+  }
 }
 
-__global__ void convToGemmHalfInputNew(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-      
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if(filter_elem_num % skip_every != skip_offset) {
-                      int output_col = filter_elem_num -
-                                        (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                      if(skip_every == 1)
-                          output_col = filter_elem_num;
-                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                          output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                      else
-                          output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                  }
-                }
-           }
+__global__ void
+convToGemmHalfInputNew(__half *const __restrict__ output,
+                       const __half *const __restrict input, const int N,
+                       const int C, const int H, const int W, const int KH,
+                       const int KW, const int V_pad, const int H_pad,
+                       const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int reduced_filter_elem,
+                       const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_offset) {
+          int output_col =
+              filter_elem_num - (filter_elem_num / skip_every +
+                                 (filter_elem_num % skip_every > skip_offset));
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+        }
       }
+    }
+  }
 }
 
-
-__global__
-void convToGemmHalf(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW,
-                    const int V_pad, const int H_pad,
-                    const int H_out, const int W_out,
-                    const int V_stride, const int H_stride){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i
-    const int n = tx / (C * H_out * W_out); //output image numbe
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number
-    const int w = tx % W_out; //output width index (col number
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
-                                            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                } else {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
-                }
-            }
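+// Plain im2col unrolling of a half-precision input (no filter-element
+// skipping); one thread handles one (n, c, h, w) output position.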
+__global__ void convToGemmHalf(__half *const __restrict__ output,
+                               const __half *const __restrict input,
+                               const int N, const int C, const int H,
+                               const int W, const int KH, const int KW,
+                               const int V_pad, const int H_pad,
+                               const int H_out, const int W_out,
+                               const int V_stride, const int H_stride) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        } else {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output,
-                                        const __half * const __restrict input,
-                                        const int N, const int C,
-                                        const int H, const int W,
-                                        const int KH, const int KW, const int V_pad,
-                                        const int H_pad, const int H_out,
-                                        const int W_out, const int V_stride,
-                                        const int H_stride, const int reduced_filter_elem,
-                                        const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  //const int ki = c * KH * KW + i;
-                  //const int kj = c * KH * KW + j;
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                     const int output_col = condition * filter_elem_num 
-                                    + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) 
-                                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));                   
-                  //if(filter_elem_num % skip_every != skip_offset) {
-                  // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                   //if(skip_every == 1)
-                   //    output_col = filter_elem_num;
-                    const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                       output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                     else
-                       output[out_index] = 0;
-              }
-            }
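+// Im2col variant that skips filter elements whose index differs from
+// skip_offset by a multiple of skip_every; the unrolled matrix is stored
+// per image as [N][reduced_filter_elem][H_out][W_out].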
+__global__ void convToGemmHalfInputNewIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        // const int ki = c * KH * KW + i;
+        // const int kj = c * KH * KW + j;
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+          // if(filter_elem_num % skip_every != skip_offset) {
+          // int output_col = filter_elem_num -
+          //  (filter_elem_num/skip_every + (filter_elem_num % skip_every >
+          //  skip_offset));
+          // if(skip_every == 1)
+          //    output_col = filter_elem_num;
+          const int out_index =
+              ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
+          //((output_col*N + n) * H_out + h) * W_out + w;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output,
-                                                const __half * const __restrict input,
-                                                const int N, const int C,
-                                                const int H, const int W,
-                                                const int KH, const int KW, const int V_pad,
-                                                const int H_pad, const int H_out,
-                                                const int W_out, const int V_stride,
-                                                const int H_stride, const int reduced_filter_elem,
-                                                const int skip_every, const int skip_offset) {
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_out * W_out); //output image number
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                //const int ki = c * KH * KW + i;
-                //const int kj = c * KH * KW + j;
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                    const int output_col = condition * filter_elem_num
-                                        + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
-                                        - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-                    //if(filter_elem_num % skip_every != skip_offset) {
-                    // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                    //if(skip_every == 1)
-                    //    output_col = filter_elem_num;
-                    const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
-                    //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
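+// Same skipping rule as convToGemmHalfInputNewIrregular, but the unrolled
+// matrix is stored as [reduced_filter_elem][N][H_out][W_out].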
+__global__ void convToGemmHalfInputNewIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        // const int ki = c * KH * KW + i;
+        // const int kj = c * KH * KW + j;
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+          // if(filter_elem_num % skip_every != skip_offset) {
+          // int output_col = filter_elem_num -
+          //  (filter_elem_num/skip_every + (filter_elem_num % skip_every >
+          //  skip_offset));
+          // if(skip_every == 1)
+          //    output_col = filter_elem_num;
+          const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
+          //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
+          //((output_col*N + n) * H_out + h) * W_out + w
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-
-
-__global__ void convToGemmHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
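+// Im2col unrolling of a half-precision input with per-image layout
+// [N][C*KH*KW][H_out][W_out]; no filter elements are skipped.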
+__global__ void convToGemmHalf2(__half *const __restrict__ output,
+                                const __half *const __restrict input,
+                                const int N, const int C, const int H,
+                                const int W, const int KH, const int KW,
+                                const int V_pad, const int H_pad,
+                                const int H_out, const int W_out,
+                                const int V_stride, const int H_stride,
+                                const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) { 
-      for(int j = 0; j < KW; j++) { 
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element 
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmPerfRow(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
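+// Im2col for row-perforated convolution (FP32): only H_eff output rows are
+// materialized; for h >= start, h_index remaps the compacted row index so
+// that one of every x output rows is skipped.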
+__global__ void
+convToGemmPerfRow(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-   //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-      //  int j = ki % KW;
-    for(int i = 0; i < KH; i++) {
-        for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
-    const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[out_index] = 0;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+    //#pragma unroll
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //  int i = ki / KW;
+    //  int j = ki % KW;
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w,
-			  float *old_data, float *new_data, int x, int start){
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-    
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) { 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) { 
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                (old_data[output_index] + old_data[output_index - w]) / 2;
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); 
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
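+// Restores the full H_out rows after row perforation: boundary rows are
+// copied and each skipped row is approximated by the average of its two
+// neighboring computed rows.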
+__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - w]) / 2;
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfCol(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
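+// Column-perforated counterpart of convToGemmPerfRow: only W_eff output
+// columns are materialized, skipping one of every x columns past `start`.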
+__global__ void
+convToGemmPerfCol(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
-    const int inW = w_index * H_stride - H_pad; 
-    const int inH = h * V_stride - V_pad; //input height index (row number)
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
     //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-       // int j = ki % KW;
-    
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW  + i * KW + j; //index of this filter element
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //  int i = ki / KW;
+    // int j = ki % KW;
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w,
-			                        float *old_data, float *new_data, int x, int start) { 
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-   	 const int row = tx % (h * w) / w; //output height index (row number)
-    	const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] 
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-       		int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                	    (old_data[output_index] + old_data[output_index - 1]) / 2;
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);  
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-       	 	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
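+// Restores the full W_out columns after column perforation: boundary columns
+// are copied and each skipped column is approximated by the average of its
+// two neighboring computed columns.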
+__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - 1]) / 2;
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
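+// Half-precision version of convToGemmPerfRow (row-perforated im2col).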
+__global__ void convToGemmPerfRowHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-  // #pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-     //   int i = ki / KW; 
-     //   int j = ki % KW;
-   
-   for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-    	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+    // #pragma unroll
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //   int i = ki / KW;
+    //   int j = ki % KW;
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
           output[out_index] = 0;
       }
@@ -437,872 +497,941 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
   }
 }
 
-__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_eff * W_out); //output image numbe
-    if(n < N) { 
-        const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-        const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        int h_index;                   
-        if(h < start) {                
-            h_index = h;               
-        } else {                       
-            h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;                                                            
-        }                              
-        const int inH = h_index * V_stride - V_pad;
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        // #pragma unroll
-        //for (int ki = 0; ki < KH * KW; ki++) {
-            //   int i = ki / KW; 
-            //   int j = ki % KW; 
-            for(int i = 0; i < KH; i++) {
-                for(int j = 0; j < KW; j++) {
-                    const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-                    const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
-                    //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
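+// Row-perforated im2col in half precision that stores the unrolled matrix as
+// [filter element][N][H_eff][W_out] rather than per image.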
+__global__ void convToGemmPerfRowHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
+    int h_index;
+    if (h < start) {
+      h_index = h;
+    } else {
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
-}
-
-__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image number
-    const int stride = blockDim.x * gridDim.x;
-    //if(n < N) {
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
-
-        //const int ch = tx % (c * h * w) / (h * w); //filter number
-        //const int row = tx % (h * w) / w; //output height index (row number)
-        //const int col = tx % w; //output width index (col number)
-
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-				__hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+    const int inH = h_index * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+    // #pragma unroll
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //   int i = ki / KW;
+    //   int j = ki % KW;
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
+        //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-    
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image numbe
-    const int stride = blockDim.x * gridDim.x;
-    //if(n < N) {
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
-        
-        //const int ch = tx % (c * h * w) / (h * w); //filter number
-        //const int row = tx % (h * w) / w; //output height index (row number)
-        //const int col = tx % w; //output width index (col number
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col];
-       } else if(row == h-1) {
-           new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            const int row_index = row - ((row + 1 - start) / x);
-            const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
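+// Half-precision row interpolation: a grid-stride loop over all N elements
+// that copies boundary rows and averages neighboring computed rows
+// (__hadd/__hdiv) for the perforated ones.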
+__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
+
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  // const int n = tx / (c * h * w); //output image number
+  const int stride = blockDim.x * gridDim.x;
+  // if(n < N) {
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
+
+    // const int ch = tx % (c * h * w) / (h * w); //filter number
+    // const int row = tx % (h * w) / w; //output height index (row number)
+    // const int col = tx % w; //output width index (col number)
+
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
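+// Row interpolation variant that indexes old_data channel-major, presumably
+// to match the layout produced by the corresponding *2 GEMM kernels.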
+__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
 
-__global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int W_eff){
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  // const int n = tx / (c * h * w); //output image number
+  const int stride = blockDim.x * gridDim.x;
+  // if(n < N) {
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
+
+    // const int ch = tx % (c * h * w) / (h * w); //filter number
+    // const int row = tx % (h * w) / w; //output height index (row number)
+    // const int col = tx % w; //output width index (col number)
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      const int row_index = row - ((row + 1 - start) / x);
+      const int output_index =
+          ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      const int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      const int output_index =
+          ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
+    }
+  }
+}
 
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
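+// Half-precision version of convToGemmPerfCol (column-perforated im2col).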
+__global__ void convToGemmPerfColHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inW = w_index * H_stride - H_pad;
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-     //#pragma unroll
-    //  for (int ki = 0; ki < KH * KW; ki++) {               
-      //    int i = ki / KW;
-       //   int j = ki % KW; 
-    
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    //#pragma unroll
+    //  for (int ki = 0; ki < KH * KW; ki++) {
+    //    int i = ki / KW;
+    //   int j = ki % KW;
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
-
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                        const int H, const int W, const int KH, const int KW, const int V_pad,
-                        const int H_pad, const int H_out, const int W_out, const int V_stride,
-                        const int H_stride, const int x, const int start, const int W_eff){
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_eff); //output image number
-      if(n < N) {
-          const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-          const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-          const int w = tx % W_eff; //output width index (col number)
-          int w_index;
-          if(w < start) {
-              w_index = w;
-          } else {
-              w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
-          }
-          const int inW = w_index * H_stride - H_pad;
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          //#pragma unroll
-          //  for (int ki = 0; ki < KH * KW; ki++) {               
-              //    int i = ki / KW;
-              //   int j = ki % KW; 
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
-                  const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
-                  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                  else
-                      output[out_index] = 0;
-              }
-        }
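+// Column-perforated im2col in half precision that stores the unrolled matrix
+// as [filter element][N][H_out][W_eff] rather than per image.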
+__global__ void convToGemmPerfColHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
+    int w_index;
+    if (w < start) {
+      w_index = w;
+    } else {
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    //#pragma unroll
+    //  for (int ki = 0; ki < KH * KW; ki++) {
+    //    int i = ki / KW;
+    //   int j = ki % KW;
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
+    }
+  }
 }
 
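+// Half-precision column interpolation: a grid-stride loop that copies
+// boundary columns and averages neighboring computed columns for the
+// skipped ones.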
+__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
 
-__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int stride = blockDim.x * gridDim.x;
 
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int stride = blockDim.x * gridDim.x;
-    
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
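+    // Each thread reconstructs one output element. Columns dropped by the
+    // perforation are filled in below by averaging the two neighbouring
+    // computed columns; the boundary columns (0 and w - 1) are copied directly.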
 
-    //const int n = tx / (c * h * w); //output image number
-    //if(n < N) {
-    	//const int ch = tx % (c * h * w) / (h * w); //output chan number
-    	//const int row = tx % (h * w) / w; //output height index (row number)
+    // const int n = tx / (c * h * w); //output image number
+    // if(n < N) {
+    // const int ch = tx % (c * h * w) / (h * w); //output chan number
+    // const int row = tx % (h * w) / w; //output height index (row number)
     //	const int col = tx % w; //output width index (col number)
 
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-        	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-			__hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
-   }
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
+    }
+  }
 }
 
-__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-    
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int stride = blockDim.x * gridDim.x;
-    
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
-        //const int n = tx / (c * h * w); //output image number
-        //if(n < N) {
-            //const int ch = tx % (c * h * w) / (h * w); //output chan number
-            //const int row = tx % (h * w) / w; //output height index (row number)
-            //  const int col = tx % w; //output width index (col number)
-        if(col < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                        = old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-        } else if(col == w - 1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-                            //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-        } else if (col == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                        old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-        } else if((col - start) % x == 0) {
-            const int col_index = col - ((col + 1 - start) / x);
-            const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-        } else {
-            const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-            const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
+
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
+    // const int n = tx / (c * h * w); //output image number
+    // if(n < N) {
+    // const int ch = tx % (c * h * w) / (h * w); //output chan number
+    // const int row = tx % (h * w) / w; //output height index (row number)
+    //  const int col = tx % w; //output width index (col number)
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col];
+      // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+      // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)];
+      // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      const int col_index = col - ((col + 1 - start) / x);
+      const int output_index =
+          ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      const int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      const int output_index =
+          ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      // const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row *
+      // old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-
-__global__ void convToGemmFullInputRegular(float * const __restrict__ output,
-				    const float * const __restrict input,
-				    const int N, const int C,
-				    const int H, const int W,
-				    const int KH, const int KW, const int V_pad,
-				    const int H_pad, const int H_out,
-				    const int W_out, const int V_stride,
-				    const int H_stride, const int reduced_filter_elem,
-				    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (H_out * W_out); //output image number
-  if(n < N) {
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         const int ch = (fi * C) / reduced_filter_elem;
-         const int offset = (skip_offset + ch) % skip_every;
-         int in_index;
-         if(fi < offset) {
-             in_index = fi;
-         } else {
-             in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                        + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-        }
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; 
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
+__global__ void
+convToGemmFullInputRegular(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int ch = (fi * C) / reduced_filter_elem;
+      const int offset = (skip_offset + ch) % skip_every;
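+      // Map the reduced filter-element index fi to the original filter-element
+      // index: every skip_every-th element is skipped, with a per-channel
+      // offset so that different channels drop different positions.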
+      int in_index;
+      if (fi < offset) {
+        in_index = fi;
+      } else {
+        in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                   (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                   offset - 1;
+      }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            int in_index;
-            if(fi < skip_offset) {
-                in_index = fi;
-            } else {
-                in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                            + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-            }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmFullInputIrregular(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
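+      // Same skip mapping as in the regular variant above, but with a single
+      // global skip_offset instead of a per-channel offset.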
+      int in_index;
+      if (fi < skip_offset) {
+        in_index = fi;
+      } else {
+        in_index =
+            ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+            (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+            skip_offset - 1;
+      }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void createReducedFiltersFullRegular(float * output,
-					 const float * const __restrict input, const int NF,
-					 const int num_filter_elem, const int reduced_filter_elem, 
-                     const int channels,
-					 const int skip_every, const int skip_offset, const float fac) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) { 
-    const int offset = tx % reduced_filter_elem; //offset within filter
+__global__ void createReducedFiltersFullRegular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
-      int in_index;
-      if(offset < channel_offset) {
-        in_index = offset;
-     } else {
-         in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                  + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-     }
-    output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+    int in_index;
+    if (offset < channel_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+           0) +
+          channel_offset - 1;
+    }
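+    // Gather the retained weight into the reduced filter; fac is a
+    // caller-supplied scaling factor (presumably compensating for the dropped
+    // filter elements).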
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
   }
 }
 
-__global__ void createReducedFiltersFullIrregular(float * output,
-                     const float * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int fIdx = tx / reduced_filter_elem; //filter index
-      if(fIdx < NF) {
-        const int offset = tx % reduced_filter_elem; //offset within filter
-        int in_index;
-        if(offset < skip_offset) {
-            in_index = offset;
-        } else {
-            in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; 
-        }
-        output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+__global__ void createReducedFiltersFullIrregular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
+    int in_index;
+    if (offset < skip_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+          skip_offset - 1;
     }
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
+  }
 }
 
-__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         //const int ch = (fi * C) / reduced_filter_elem;
-      for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-        const int fi = ch * (reduced_filter_elem / C) + ki;
-        const int offset = (skip_offset + ch) % skip_every;
-         //int in_index;
-         const bool condition = (fi < offset);
-         const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-         //if(fi < offset) {
-         //    in_index = fi;
-         //} else {
-         //    in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-           //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-       // }
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
+__global__ void
+convToGemmHalfInputRegular(__half *const __restrict__ output,
+                           const __half *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    // for(int fi = 0; fi < reduced_filter_elem; fi++) {
+    // const int ch = (fi * C) / reduced_filter_elem;
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      const int offset = (skip_offset + ch) % skip_every;
+      // int in_index;
+      const bool condition = (fi < offset);
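+      // Branchless form of the commented-out if/else below: use fi directly
+      // when fi < offset, otherwise apply the skip mapping.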
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+      // if(fi < offset) {
+      //    in_index = fi;
+      //} else {
+      //    in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+      //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0)
+      //             + offset - 1;
+      // }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C, 
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      if(n < N) {
-           const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-          const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-          const int w = tx % W_out; //output width index (col number)
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          const int inW = w * H_stride - H_pad; //input width index (col number)
-          
-          #pragma unroll
-           for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-               const int fi = ch * (reduced_filter_elem / C) + ki;
-          //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-           //   const int ch = (fi * C) / reduced_filter_elem;
-              const int offset = (skip_offset + ch) % skip_every;
-              const int condition = (fi < offset);
-             const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                          + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-             // int in_index;
-              //if(fi < offset) {
-               //   in_index = fi;
-              //} else {
-               //   in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                 //               + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-             // }
-              const int i = (in_index % (KW * KH)) / KW;
-              const int j = in_index % KW;
-              const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-              if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                  output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-              } else {
-                  output[out_index] = 0;
-             }
-        }
+__global__ void convToGemmHalfInputRegular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      // for(int fi = 0; fi < reduced_filter_elem; fi++) {
+      //   const int ch = (fi * C) / reduced_filter_elem;
+      const int offset = (skip_offset + ch) % skip_every;
+      const int condition = (fi < offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+      // int in_index;
+      // if(fi < offset) {
+      //   in_index = fi;
+      //} else {
+      //   in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+      //               + (((fi - offset + 1) * skip_every) % (skip_every - 1) >
+      //               0) + offset - 1;
+      // }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
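+      // Note: this '2' variant writes in [fi][n][h][w] order (filter element
+      // outermost), unlike convToGemmHalfInputRegular above.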
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-            //int in_index;
-            //if(fi < skip_offset) {
-             //   in_index = fi;
-            //} else {        
-              //  in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-              //              + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW; 
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmHalfInputIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+      // int in_index;
+      // if(fi < skip_offset) {
+      //   in_index = fi;
+      //} else {
+      //  in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+      //              + (((fi - skip_offset + 1) * skip_every) % (skip_every -
+      //              1) > 0) + skip_offset - 1;
+      // }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-       #pragma unroll 
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                 + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-           // int in_index;
-           // if(fi < skip_offset) {
-           //     in_index = fi;
-           // } else {
-            //    in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                   //             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-            //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmHalfInputIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+      // int in_index;
+      // if(fi < skip_offset) {
+      //     in_index = fi;
+      // } else {
+      //    in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+      //             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1)
+      //             > 0) + skip_offset - 1;
+      // }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+      // const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) *
+      // W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
+__global__ void createReducedFiltersHalfRegular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
 
-__global__ void createReducedFiltersHalfRegular(__half * output,
-                                         const __half * const __restrict input, const int NF,
-                                         const int num_filter_elem, const int reduced_filter_elem,
-                     			 const int channels,
-                                         const int skip_every, const int skip_offset, const float fac) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
   const int stride = blockDim.x * gridDim.x;
-  
+
   //#pragma unroll
   for (int i = tx; i < NF; i += stride) {
-    const int fIdx = i / reduced_filter_elem; //filter index
-  //if(fIdx < NF) {
-    const int offset = i % reduced_filter_elem; //offset within filter
+    const int fIdx = i / reduced_filter_elem; // filter index
+    // if(fIdx < NF) {
+    const int offset = i % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
     const int condition = (offset < channel_offset);
-    const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                          + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
-      
-     // int in_index;
-     // if(offset < channel_offset) {
-      //  in_index = offset;
-     //} else {
-       //  in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-         //         + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
+    const int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             channel_offset - 1);
+
+    // int in_index;
+    // if(offset < channel_offset) {
+    //  in_index = offset;
+    //} else {
+    //  in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every -
+    //  1)
+    //         + (((offset - channel_offset + 1) * skip_every) % (skip_every -
+    //         1) > 0) + channel_offset -1;
     // }
-    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
   }
 }
 
-__global__ void createReducedFiltersHalfIrregular(__half * output,
-                     const __half * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-     const int stride = blockDim.x * gridDim.x;
-      //#pragma unroll
-      for (int i = tx; i < NF; i += stride) { 
-  
-      const int fIdx = i / reduced_filter_elem; //filter index
+__global__ void createReducedFiltersHalfIrregular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int stride = blockDim.x * gridDim.x;
+  //#pragma unroll
+  for (int i = tx; i < NF; i += stride) {
+
+    const int fIdx = i / reduced_filter_elem; // filter index
     // if(fIdx < NF) {
-        const int offset = i % reduced_filter_elem; //offset within filter
-        const int condition = (offset < skip_offset);
-        int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-        //}
-        output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
+    const int offset = i % reduced_filter_elem; // offset within filter
+    const int condition = (offset < skip_offset);
+    int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             skip_offset - 1);
+    //}
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
     //}
   }
 }
 
-void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad, int vertical_stride,
-			 int horizontal_stride, int conv_mode, int conv_groups,
-			 int row, int col, int start){
+void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int row, int col, int start) {
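+  // Perforated convolution: when row > 1 (or col > 1) only a subset of output
+  // rows (or columns) is computed via im2col + batched SGEMM, and the skipped
+  // rows/columns are then reconstructed by the approxInterpolate* kernels.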
 
   //////INFO("*** TensorConvolution (output perforation) \n");
-  //Event("Conv");
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  // Event("Conv");
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
-  
-  Tensor* output;
+
+  Tensor *output;
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-  //Event("H2F_start");
+  // Event("H2F_start");
   convertToFP32(input);
   convertToFP32(filter);
-  //Event("H2F_end");
-  
+  // Event("H2F_end");
+
   long int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int rem_row = (h - start) % row > 0;
   int h_eff = h - ((h - start) / row) - rem_row;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int rem_col = (w - start) % col > 0;
   int w_eff = w - ((w - start) / col) - rem_col;
 
-  Tensor* new_output;
-  if(row > 1){
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+  Tensor *new_output;
+  if (row > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float* convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, start, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, start, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w,
-					      num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h_eff * w, c * h_eff * w,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateRow<<<numBlocks, 128>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(col > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, start, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, start, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData,
-					      h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h * w_eff, c * h * w_eff,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateCol<<<numBlocks, 128>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else { 
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
+  } else {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    convToGemmApprox<<<gridSize, blockSize>>>(convData,
-					      (float *)input->gpu_data, n,
-					      input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2],
-					      input->dims.dim_sizes[3],
-					      KH, KW,
-					      vertical_pad, horizontal_pad, h, w,
-					      vertical_stride, horizontal_stride,
-					      num_filter_elem, c * h * w);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApprox<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, c * h * w);
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication
-    //Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
-    
+    // Do the matrix multiplication
+    // Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w, num_filter_elem * h * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w, c * h * w,
-					      n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w, c * h * w, n));
 
     new_output = output;
     cudaFree(convData);
   }
 
-  //Event("Conv_end"); //, true);
+  // Event("Conv_end"); //, true);
   return new_output;
 }
 
-__global__
-void switchMatrixFull(int N, int n, int c, int h, int w,
-              float *old_data, float *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-          int col = ((i % (c * h * w)) % (h * w)) % w;
-          int row = ((i % (c * h * w)) % (h * w)) / w;
-          int ch = (i % (c * h * w)) / (h * w);
-          int n_new = i / (c * h * w);
-          
-          new_data[((n_new * c + ch) * h + row ) * w + col] =
-                        old_data[((ch * n + n_new) * h + row ) * w + col];
-        }
-}
+__global__ void switchMatrixFull(int N, int n, int c, int h, int w,
+                                 float *old_data, float *new_data) {
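+  // Reorders data from a channel-major [c][n][h][w] layout back into
+  // standard NCHW.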
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
 
-void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-		       int vertical_pad, int horizontal_pad, int vertical_stride,
-		       int horizontal_stride, int conv_mode, int conv_groups,
-		       int row, int col, int skip_every, int offset){
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
+
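+// Approximate FP32 convolution. Exactly one approximation is applied, chosen
+// in priority order: row/col > 1 perforate output rows/columns, skip_every > 1
+// samples the filter elements; otherwise a dense im2col + GEMM baseline runs.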
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode, int conv_groups,
+                       int row, int col, int skip_every, int offset) {
 
   //////INFO("*** TensorConvolution approximation \n");
-  //Event("Conv");
+  // Event("Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -1316,15 +1445,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////Event("H2F_end");
 
   const int n = input->dims.dim_sizes[0];
-  const int c = filter->dims.dim_sizes[0]; //number of filters
+  const int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
   ////INFO("batch: %d\n", n);
@@ -1337,327 +1469,299 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////INFO("horizontal_stride: %d\n", horizontal_stride);
   ////INFO("output height: %d\n", h);
   ////INFO("output width: %d\n", w);
-  if(row > 1) {
+  if (row > 1) {
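+    // Row perforation: only h_eff of the h output rows are computed; the
+    // skipped rows are filled in afterwards by approxInterpolateRow.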
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
 
-    Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, offset, h_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h_eff * w));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, offset, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                &alpha,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (float *)filter->gpu_data, num_filter_elem, 0,
-                                                &beta,
-                                                (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-                                                n));
-    //interpolate
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateRow<<<numBlocks, blocksize>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(col > 1) {
+  } else if (col > 1) {
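+    // Column perforation: only w_eff of the w output columns are computed;
+    // the skipped columns are filled in afterwards by approxInterpolateCol.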
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, offset, w_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h * w_eff));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, offset, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
-					      n));
-
-    //interpolate
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateCol<<<numBlocks, blocksize>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(skip_every > 1) {
-    //reduced number after skipping
+  } else if (skip_every > 1) {
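+    // Filter sampling: every skip_every-th filter element (starting at
+    // offset) is dropped, and the remaining weights are scaled by
+    // fac = skip_every / (skip_every - 1) to compensate.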
+    // reduced number after skipping
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    float* convData;
+    float *convData;
     size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    float* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
-    
+    float *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
+
     const int filtBlockSize = 128;
     ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac = ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     //////INFO("fac: %f\n", fac);
     const int blockSize = 128;
-    //////INFO("n * h * w : %d\n", (n * h * w ));    
-    const int gridSize = (n * h * w + blockSize - 1) / blockSize;  
-    if(!(KH * KW % skip_every)) {
-       // ////INFO("REGULAR FILTERING\n");
-        createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                (float *)filter->gpu_data,
-								c, num_filter_elem,
-								reduced_filter_elem,
-								input->dims.dim_sizes[1], skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+    //////INFO("n * h * w : %d\n", (n * h * w ));
+    const int gridSize = (n * h * w + blockSize - 1) / blockSize;
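+    // The regular variants apply when skip_every evenly divides KH * KW, i.e.
+    // the skip pattern is uniform across the kernel window; otherwise the
+    // irregular variants are used.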
+    if (!(KH * KW % skip_every)) {
+      // ////INFO("REGULAR FILTERING\n");
+      createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+          fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputRegular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     } else {
-       // ////INFO("IRREGULAR FILTERING\n");
-        createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (float *)filter->gpu_data,
-                                    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,     
-                                                                input->dims.dim_sizes[1],                                                     
-                                                                input->dims.dim_sizes[2],                                                 
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
+      // ////INFO("IRREGULAR FILTERING\n");
+      createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, skip_every, offset, fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputIrregular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     }
     checkCudaErrors(cudaDeviceSynchronize());
-    
+
     const float alpha = 1.0;
     const float beta = 0.0;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            &alpha,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+        &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter,
+        reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w,
+        c * h * w, n));
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
-      //INFO("FP32 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-                               CUDNN_TENSOR_NCHW, n, c, h, w);
+    // INFO("FP32 BASELINE\n");
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
     changeTensorPlacement(new_output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w));
-    convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w, vertical_stride, horizontal_stride, 
-                           skip_every, offset);//num_filter_elem);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n",
+    //////     (n * input->dims.dim_sizes[1] * h * w));
+    convToGemmFullInput<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        skip_every, offset); // num_filter_elem);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     /*
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                          CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, num_filter_elem,
-                                            &alpha,
-                                            convData, h * w, num_filter_elem * h * w,
-                                            (float *)filter->gpu_data, num_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
-    */
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                       n * h * w, c, num_filter_elem,
-                        &alpha,
-                        convData,
-                        CUDA_R_32F, n * h * w,
-                        (float *) filter->gpu_data, CUDA_R_32F,
-                        num_filter_elem,
-                        &beta,
-                        (float *) output->gpu_data,
-                        CUDA_R_32F, n * h * w,
-                        CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-    
-    const int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (float *)output->gpu_data,
-                                    (float *)new_output->gpu_data);
-    
+
+    float alpha = 1.0f, beta = 0.0f;
+    /*
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)new_output->gpu_data, h * w, c * h * w, n));
+    */
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data,
+        CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data,
+        CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
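+    // cublasGemmEx above produces the result in [c][n][h][w] order;
+    // switchMatrixFull permutes it back into NCHW in new_output.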
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (float *)output->gpu_data,
+                                         (float *)new_output->gpu_data);
+
     checkCudaErrors(cudaDeviceSynchronize());
     cudaFree(convData);
   }
 
-  //Event("Conv_end");
+  // Event("Conv_end");
   return new_output;
 }
 
-__global__
-void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-            int col = ((i % (c * h * w)) % (h * w)) % w;
-            int row = ((i % (c * h * w)) % (h * w)) / w;
-            int ch = (i % (c * h * w)) / (h * w);
-            int n_new = i / (c * h * w);
-            
-            new_data[((n_new * c + ch) * h + row ) * w + col] =
-                            old_data[((ch * n + n_new) * h + row ) * w + col];
-      }
-}
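+// Half-precision counterpart of switchMatrixFull: permutes [c][n][h][w] into
+// NCHW order.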
+__global__ void switchMatrixHalf(int N, int n, int c, int h, int w,
+                                 __half *old_data, __half *new_data) {
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
 
-void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad,
-			   int vertical_stride, int horizontal_stride,
-			   int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset) {
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
+
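+// FP16 counterpart of tensorConvApprox: the same perforation/filter-sampling
+// knobs, with inputs converted to half precision on entry and the result
+// converted back to FP32 before returning.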
+void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups, int row, int col, int skip_every,
+                            int offset) {
 
- //INFO("*** TensorConvolution half approximation \n");
- // profileEvent("#Conv");
+  // INFO("*** TensorConvolution half approximation \n");
+  // profileEvent("#Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
 
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
- // INFO("CONVERT\n");
+  // INFO("CONVERT\n");
   profileEvent("F2H_start");
-   convertToFP16(input);
-   convertToFP16(filter);
+  convertToFP16(input);
+  convertToFP16(filter);
   profileEvent("F2H_end");
-//INFO("CONVERTED\n");
+  // INFO("CONVERTED\n");
   const long int n = input->dims.dim_sizes[0];
-  const long int c = filter->dims.dim_sizes[0]; //number of filters
+  const long int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
-  //INFO("batch: %d\n", n);
+  // INFO("batch: %d\n", n);
   // INFO("channels: %d\n", input->dims.dim_sizes[1]);
   // INFO("num_filters: %d\n", c);
   // INFO("kernel height: %d\n", KH);
-  // INFO("kernel width: %d\n", KW);   
+  // INFO("kernel width: %d\n", KW);
   // INFO("num_filter_elem: %d\n", num_filter_elem);
-   //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
-   //INFO("vertical_stride: %d\n", vertical_stride);
-   //INFO("horizontal_stride: %d\n", horizontal_stride);
+  // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
+  // INFO("vertical_stride: %d\n", vertical_stride);
+  // INFO("horizontal_stride: %d\n", horizontal_stride);
   // INFO("output height: %d\n", h);
   // INFO("output width: %d\n", w);
-   //INFO("skip_every: %d\n", skip_every);
-  if(row > 1){
+  // INFO("skip_every: %d\n", skip_every);
+  if (row > 1) {
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
-    
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW,
-						  n, c, h_eff, w);
+
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
 
-    __half * convData;
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w));
+    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n",
+    ////     (n * input->dims.dim_sizes[1] * h_eff * w));
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-    
-    if(h * w <= 64) {
-        convToGemmPerfRowHalf2<<<gridSize, blockSize>>>(convData,
-                                   (__half *)input->gpu_half_data, n,
-                                   input->dims.dim_sizes[1],
-                                   input->dims.dim_sizes[2],
-                                   input->dims.dim_sizes[3],
-                                   KH, KW, vertical_pad,
-                                   horizontal_pad, h, w, vertical_stride,
-                                   horizontal_stride, row, offset, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
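+    // For small output maps (h * w <= 64) the *Half2 kernels fold the whole
+    // batch into a single cublasGemmEx; larger maps use the strided-batched
+    // HGEMM path.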
+    if (h * w <= 64) {
+      convToGemmPerfRowHalf2<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
     } else {
-        convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData,
-						   (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3],
-						   KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, row, offset, h_eff);
+      convToGemmPerfRowHalf<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
     }
     checkCudaErrors(cudaDeviceSynchronize());
 
@@ -1665,74 +1769,68 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-    if(h * w <= 64) {
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                     n * h_eff * w, c, num_filter_elem,
-                     alpha_half,
-                     convData, CUDA_R_16F, n * h_eff * w,
-                     (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                     beta_half,
-                     (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
-                     CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    if (h * w <= 64) {
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
-     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                alpha_half,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                                beta_half,
-                                                (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w,
-                                                n));    
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+          alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n));
     }
-    //interpolate
+    // interpolate
     int blocksize = 256;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    if(h * w <= 64) {
-        approxInterpolateRowHalf2<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-                                (__half *)output_half->gpu_half_data,
-                                (__half *)new_output->gpu_half_data,
-                                row, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    if (h * w <= 64) {
+      approxInterpolateRowHalf2<<<numBlocks, blocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
     } else {
-        approxInterpolateRowHalf<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						row, offset);
+      approxInterpolateRowHalf<<<numBlocks, blocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
     }
     checkCudaErrors(cudaDeviceSynchronize());
 
     freeTensor(output_half);
     cudaFree(convData);
-} else if(col > 1) {
+  } else if (col > 1) {
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
-   
-    __half * convData;
+
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff));
+    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n",
+    ////     (n * input->dims.dim_sizes[1] * h * w_eff));
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-    if(h * w <= 64) {
-        convToGemmPerfColHalf2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, col, offset, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+    if (h * w <= 64) {
+      convToGemmPerfColHalf2<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
     } else {
-        convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, col, offset, w_eff);
+      convToGemmPerfColHalf<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
     }
     checkCudaErrors(cudaDeviceSynchronize());
 
@@ -1740,229 +1838,211 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-    if(h * w <= 64) {
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                         n * h * w_eff, c, num_filter_elem,
-                         alpha_half,
-                         convData, CUDA_R_16F, n * h * w_eff,
-                         (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                         beta_half,
-                         (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
-                         CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    if (h * w <= 64) {
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                              CUBLAS_OP_N, CUBLAS_OP_N,
-                                              h * w_eff, c, num_filter_elem,
-                                              alpha_half,
-                                              convData, h * w_eff, num_filter_elem * h * w_eff,
-                                              (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                              beta_half,
-                                              (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff,
-                                              n));
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+          alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n));
     }
-    //interpolate
+    // interpolate
     int blocksize = 256;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    if(h * w <= 64) {
-        approxInterpolateColHalf2<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                (__half *)output_half->gpu_half_data,
-                                (__half *)new_output->gpu_half_data,
-                                col, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    if (h * w <= 64) {
+      approxInterpolateColHalf2<<<numBlocks, blocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
 
     } else {
-        approxInterpolateColHalf<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						col, offset);
-   }
-   checkCudaErrors(cudaDeviceSynchronize());
+      approxInterpolateColHalf<<<numBlocks, blocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
+    }
+    checkCudaErrors(cudaDeviceSynchronize());
 
     freeTensor(output_half);
     cudaFree(convData);
-  } else if(skip_every > 1) {
+  } else if (skip_every > 1) {
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    __half* convData;
+    __half *convData;
     size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    __half* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+    __half *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
 
     const int filtBlockSize = 256;
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac = ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     const int blockSize = 256;
-    //const int gridSize = (n * h * w + blockSize - 1) / blockSize;
-   // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
-   // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
+    // const int gridSize = (n * h * w + blockSize - 1) / blockSize;
+    // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
+    // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
     const __half alf = approx_float_to_half(1.0);
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-   if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { 
-      if(!(KH * KW % skip_every)) {
-        //INFO("REGULAR FILTERING\n");
-        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                (__half *)filter->gpu_half_data,
-								c, num_filter_elem,
-                                                                reduced_filter_elem,
-                                                                input->dims.dim_sizes[1], skip_every, offset, fac);
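+    // Small filters: a strided-batched HGEMM writes directly into new_output.
+    // Large filters: a single cublasGemmEx into a temporary tensor, followed
+    // by switchMatrixHalf to reorder the result into NCHW.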
+    // Alternate thresholds considered: 250; c * reduced_filter_elem < 150000.
+    if (c * num_filter_elem < 500000) {
+      if (!(KH * KW % skip_every)) {
+        // INFO("REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
         checkCudaErrors(cudaDeviceSynchronize());
-	
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
       } else {
-        //INFO("IRREGULAR FILTERING\n");
-        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (__half *)filter->gpu_half_data,
-				    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
+        // INFO("IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
         checkCudaErrors(cudaDeviceSynchronize());
-        
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1]  + blockSize - 1) / blockSize;
-	    //convToGemmHalfInputIrregular
-        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,  
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-     }   
-     checkCudaErrors(cudaDeviceSynchronize());
-
-     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            alpha_half,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            beta_half,
-                                            (__half *)new_output->gpu_half_data, h * w, c * h * w,
-                                            n));
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        // convToGemmHalfInputIrregular
+        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+          alpha_half, convData, h * w, reduced_filter_elem * h * w,
+          reducedFilter, reduced_filter_elem, 0, beta_half,
+          (__half *)new_output->gpu_half_data, h * w, c * h * w, n));
     } else {
-        Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                 CUDNN_TENSOR_NCHW, n, c, h, w);
-        changeTensorPlacement(output_half, DEVICE);
-
-        if(!(KH * KW % skip_every)) {
-           // INFO("REGULAR FILTERING\n");
-            createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                        (__half *)filter->gpu_half_data,
-                                                        c, num_filter_elem,
-                                                        reduced_filter_elem,
-                                                        input->dims.dim_sizes[1], skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-        } else {
-            //INFO("IRREGULAR FILTERING\n");
-            createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                            (__half *)filter->gpu_half_data,
-                                                                            c, num_filter_elem,
-                                                                            reduced_filter_elem,
-                                                                            skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-            }
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w, c, reduced_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w,
-                                         reducedFilter, CUDA_R_16F, reduced_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-            
-            int numBlocks = (n * c * h * w  + 255) / 256;
-            switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (__half *)output_half->gpu_half_data,
-                                    (__half *)new_output->gpu_half_data);
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            freeTensor(output_half);
+      Tensor *output_half = (Tensor *)create4DTensor(
+          (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
+      changeTensorPlacement(output_half, DEVICE);
+
+      if (!(KH * KW % skip_every)) {
+        // INFO("REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      } else {
+        // INFO("IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
+          reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half,
+          (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w,
+          CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      int numBlocks = (n * c * h * w + 255) / 256;
+      switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                           (__half *)output_half->gpu_half_data,
+                                           (__half *)new_output->gpu_half_data);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      freeTensor(output_half);
     }
-    
+
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
     //    INFO("BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                   CUDNN_TENSOR_NCHW, n, c, h, w);
-      
-      changeTensorPlacement(output, DEVICE);
-      __half * convData;
-      long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
-      checkCudaErrors(cudaMalloc(&convData, convDataSize));
-      
-      const int blockSize = 256;
-      const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-      //convToGemmHalf
-      convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData,
-                                                (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3],
-                                                KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, num_filter_elem,
-                                                skip_every, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        const __half alf = approx_float_to_half(1.0);
-        const __half bet = approx_float_to_half(0.0);
-        const __half *alpha_half = &alf;
-        const __half *beta_half = &bet;
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                    n * h * w, c, num_filter_elem,
-                                    alpha_half,
-                                    convData, CUDA_R_16F, n * h * w,
-                                    (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                    beta_half,
-                                    (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w,
-                                    CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        
-        const int numBlocks = (n * c * h * w  + 255) / 256;
-        switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data,
-                                            (__half *)new_output->gpu_half_data);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        freeTensor(output);
-        cudaFree(convData);
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    changeTensorPlacement(output, DEVICE);
+    __half *convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    // convToGemmHalf
+    convToGemmHalfInputNew<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, skip_every, offset);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        alpha_half, convData, CUDA_R_16F, n * h * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F,
+        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (__half *)output->gpu_half_data,
+                                         (__half *)new_output->gpu_half_data);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    freeTensor(output);
+    cudaFree(convData);
   }
-//    INFO("CONV DONE\n");
+  //    INFO("CONV DONE\n");
   profileEvent("H2F_start");
   convertToFP32_offline(new_output);
-  //convertToFP32(input);
-  //convertToFP32(filter);
+  // convertToFP32(input);
+  // convertToFP32(filter);
   profileEvent("H2F_end");
-  //profileEvent("#Conv_end");
-  //INFO("CONVOLUTION END\n");
+  // profileEvent("#Conv_end");
+  // INFO("CONVOLUTION END\n");
   return new_output;
 }
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
deleted file mode 100644
index 0fe6c20ca848c1caf8180735db9d5cce2f3b2f82..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-#include "functional/common.h"
-#include "tensor_utils.h"
-
-#include <algorithm>
-#include <functional>
-#include <numeric>
-#include <stdexcept>
-
-// TODO: this approach does not scale well.
-// The right way is probably implementing some type_traits for the Tensor_type_t
-// enum
-template <> float *convertAndGetGPUData<float>(Tensor *t) {
-  if (t->cur_type == float_type)
-    return static_cast<float *>(t->gpu_data);
-  if (t->cur_type == half_type) {
-    convertToFP32(t);
-    t->data_type = float_type;
-    return static_cast<float *>(t->gpu_data);
-  }
-  ERROR("Type %s is incompatible with target type float\n",
-        std::to_string(t->cur_type));
-}
-
-template <> half *convertAndGetGPUData<half>(Tensor *t) {
-  if (t->cur_type == half_type)
-    return static_cast<half *>(t->gpu_half_data);
-  if (t->cur_type == float_type) {
-    convertToFP16(t);
-    t->data_type = half_type;
-    return static_cast<half *>(t->gpu_half_data);
-  }
-  ERROR("Type %s is incompatible with target type half\n",
-        std::to_string(t->cur_type));
-}
-
-template <> float2 *convertAndGetGPUData<float2>(Tensor *t) {
-  if (t->cur_type == float2_type)
-    return static_cast<float2 *>(t->gpu_data);
-  if (t->cur_type == half2_type) {
-    // FIXME: hacking to make convertToFP16 realize these are "2 floats"
-    t->num_elems *= 2;
-    convertToFP32(t);
-    t->num_elems /= 2;
-    t->cur_type = t->data_type = float2_type;
-    return static_cast<float2 *>(t->gpu_data);
-  }
-  ERROR("Type %s is incompatible with target type float2\n",
-        std::to_string(t->cur_type));
-}
-
-template <> half2 *convertAndGetGPUData<half2>(Tensor *t) {
-  if (t->cur_type == half2_type)
-    return static_cast<half2 *>(t->gpu_half_data);
-  if (t->cur_type == float2_type) {
-    // FIXME: hacking to make convertToFP16 realize these are "2 floats"
-    t->num_elems *= 2;
-    convertToFP16(t);
-    t->num_elems /= 2;
-    t->cur_type = t->data_type = half2_type;
-    return static_cast<half2 *>(t->gpu_half_data);
-  }
-  ERROR("Type %s is incompatible with target type half2\n",
-        std::to_string(t->cur_type));
-}
-
-void convertToFloat2Offline(Tensor *t) {
-  if (t->cur_type == float2_type)
-    return;
-  else if (t->cur_type == half2_type) {
-    t->cur_type = t->data_type = half_type;
-    t->num_elems *= 2;
-    convertToFP32_offline(t);
-    t->num_elems /= 2;
-    t->cur_type = t->data_type = float2_type;
-  } else {
-    ERROR("Type %s is incompatible with target type half2\n",
-          std::to_string(t->cur_type));
-  }
-}
-
-std::vector<size_t> sizes(const Dimension &dim) {
-  return std::vector<size_t>(dim.dim_sizes, dim.dim_sizes + dim.num_dims);
-}
-
-std::vector<size_t> sizes(Tensor *t) { return sizes(t->dims); }
-
-size_t num_elems(const std::vector<size_t> &dim_sizes) {
-  return std::accumulate(dim_sizes.begin(), dim_sizes.end(), 1,
-                         std::multiplies<>());
-}
-
-size_t num_elems(const Dimension &dim) { return num_elems(sizes(dim)); }
-
-size_t num_elems(Tensor *t) { return num_elems(sizes(t)); }
-
-static Tensor_type_t toHalfType(Tensor_type_t float_ty) {
-  switch (float_ty) {
-  case float_type:
-    return half_type;
-  case float2_type:
-    return half2_type;
-  case half_type:
-  case half2_type:
-    return float_ty;
-  default:
-    ERROR("Types not acceptable\n");
-  }
-}
-
-static Tensor_type_t toFloatType(Tensor_type_t half_ty) {
-  switch (half_ty) {
-  case half_type:
-    return float_type;
-  case half2_type:
-    return float2_type;
-  case float_type:
-  case float2_type:
-    return half_ty;
-  default:
-    ERROR("Types not acceptable\n");
-  }
-}
-
-Tensor_type_t getCompatibleType(int t1, int t2, bool get_half) {
-  auto type1 = (Tensor_type_t)t1, type2 = (Tensor_type_t)t2;
-  if (getTypeSize(type1) > getTypeSize(type2))
-    std::swap(type1, type2);
-  if (type1 == type2)
-    return get_half ? toHalfType(type1)
-                    : toFloatType(type1); // Or type2, whatever
-  if (type1 == half_type && type2 == float_type)
-    return get_half ? half_type : float_type;
-  if (type1 == half2_type && type2 == float2_type)
-    return get_half ? half2_type : float2_type;
-  ERROR("Types not acceptable\n");
-}
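For reference, the deleted `common.cpp` centered on `convertAndGetGPUData<T>`: per-type template specializations that hand out a tensor's GPU buffer, converting between FP32 and FP16 on demand, plus small helpers for element counts and type promotion. Below is a stripped-down sketch of that lazy-conversion pattern; `MiniTensor`, `toFP32`, and `toFP16` are illustrative stand-ins, not the runtime's API.

```cuda
// Sketch of the lazy-conversion pattern from the deleted file.
enum ElemType { FloatTy, HalfTy };

struct MiniTensor {
  ElemType cur_type;
  void *fp32_buf; // device buffer holding FP32 data (when valid)
  void *fp16_buf; // device buffer holding FP16 data (when valid)
};

// Stand-ins for the runtime's convertToFP32/convertToFP16.
void toFP32(MiniTensor &t) { t.cur_type = FloatTy; }
void toFP16(MiniTensor &t) { t.cur_type = HalfTy; }

template <typename T> T *getGPUData(MiniTensor &t); // specialized per type

template <> float *getGPUData<float>(MiniTensor &t) {
  if (t.cur_type != FloatTy)
    toFP32(t); // convert lazily, then hand out the FP32 view
  return static_cast<float *>(t.fp32_buf);
}
```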
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
index fd1492fe68e8833ea4cdca4d5df6518b6ec3b37c..c18ffcea26f93fe752500983f4d4a3fcfe59ded2 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
@@ -1,13 +1,13 @@
-//===--------------------------- configuration.cpp -------------------------===//
+//===-------------------------- configuration.cpp -------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the definitions of API to get information about 
+//
+//  This file consists of the definitions of the API to get information about
 // configurations for rest of the tensor runtime to use.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "configuration.h"
 
 using G_APPROX = GPUNodeConfiguration::APPROX;
@@ -31,9 +31,8 @@ void GPUNodeConfiguration::pushNewTensorOperation(G_TENSOR_OP top) {
 void GPUNodeConfiguration::pushNewApproximationChoiceForOperation(
     G_APPROX approx, int u) {
   unsigned size = ApproxChoices.size();
-  CUSTOM_ASSERT(
-      size >= 1 &&
-      "Cannot apply approximation choice to non existent operation.");
+  CUSTOM_ASSERT(size >= 1 &&
+                "Cannot apply approximation choice to non existent operation.");
   ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
 }
 
@@ -55,9 +54,8 @@ void CPUNodeConfiguration::pushNewTensorOperation(C_TENSOR_OP top) {
 void CPUNodeConfiguration::pushNewApproximationChoiceForOperation(
     C_APPROX approx, int u) {
   unsigned size = ApproxChoices.size();
-  CUSTOM_ASSERT(
-      size >= 1 &&
-      "Cannot apply approximation choice to non existent operation.");
+  CUSTOM_ASSERT(size >= 1 &&
+                "Cannot apply approximation choice to non existent operation.");
   ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
 }
 
@@ -71,8 +69,8 @@ CPUNodeConfiguration::CPUNodeConfiguration() {
 }
 CPUNodeConfiguration::~CPUNodeConfiguration() {}
 
-Configuration::Configuration(
-    std::string &n, float f, float e, float a, float al)
+Configuration::Configuration(std::string &n, float f, float e, float a,
+                             float al)
     : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {}
 
 float Configuration::getSpeedup() { return speedup; }
@@ -82,20 +80,20 @@ float Configuration::getEnergy() { return energy; }
 float Configuration::getAccuracy() { return accuracy; }
 
 float Configuration::getAccuracyLoss() { return accuracyLoss; }
-bool ConfigurationLessThan::
-operator()(const struct Configuration &a, const struct Configuration &b) const {
+bool ConfigurationLessThan::operator()(const struct Configuration &a,
+                                       const struct Configuration &b) const {
   return (a.accuracyLoss < b.accuracyLoss);
 }
-bool ConfigurationLessThan_AL::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_AL::operator()(const struct Configuration *a,
+                                          const float &b) const {
   return (a->accuracyLoss < b);
 }
-bool ConfigurationLessThan_SP::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_SP::operator()(const struct Configuration *a,
+                                          const float &b) const {
   return (a->speedup < b);
 }
-bool ConfigurationLessThan_E::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_E::operator()(const struct Configuration *a,
+                                         const float &b) const {
   return (a->energy < b);
 }
 
@@ -286,9 +284,8 @@ void CPUNodeConfiguration::print() {
 void Configuration::print() {
 
   printf("+++++\n");
-  printf(
-      "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
-      accuracyLoss);
+  printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
+         accuracyLoss);
   for (std::map<std::string, NodeConfiguration *>::const_iterator it =
            setup.begin();
        it != setup.end(); ++it) {
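The comparator functors reflowed above (`ConfigurationLessThan` and the `_AL`/`_SP`/`_E` variants) order configurations by accuracy loss, speedup, or energy so they can be searched with standard algorithms. The sketch below shows the typical use of such functors with `std::sort` and `std::lower_bound`; `Config` and its fields are simplified stand-ins for the `Configuration` class.

```cuda
#include <algorithm>
#include <vector>

struct Config { float accuracyLoss, speedup, energy; };

struct LessByAccuracyLoss {
  bool operator()(const Config &a, const Config &b) const {
    return a.accuracyLoss < b.accuracyLoss;
  }
  // Heterogeneous form used with lower_bound on a sorted vector.
  bool operator()(const Config &a, float threshold) const {
    return a.accuracyLoss < threshold;
  }
};

// Returns the first configuration whose accuracy loss is >= maxLoss.
std::vector<Config>::iterator firstAbove(std::vector<Config> &configs,
                                         float maxLoss) {
  std::sort(configs.begin(), configs.end(), LessByAccuracyLoss{});
  return std::lower_bound(configs.begin(), configs.end(), maxLoss,
                          LessByAccuracyLoss{});
}
```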
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
deleted file mode 100644
index 3e4aecb824a93b932ef2146380b86496f71b0f28..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-
-
-#ifndef RUNTIME_DEBUG
-#define RUNTIME_DEBUG
-
-#define LOG_DEBUG 0 // Sets the debug logging to true
-#define LOG_INFO 1  // Sets the info logging to true
-#define LOG_ERROR 1  // Print Errors 
-#define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro)
-
-#include "debug.h"
-#include "tensor.h"
-#include <sstream>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-void INFO(const char *format, ...) {
-  if (!LOG_INFO) // Don't print if logging info is disabled
-    return;
-  va_list args;
-  va_start(args, format);
-  printf("INFO: ");
-  vprintf(format, args);
-  va_end(args);
-}
-
-void DEBUG(const char *format, ...) {
-  if (!LOG_DEBUG) // Don't print if logging info is disabled
-    return;
-  va_list args;
-  va_start(args, format);
-  printf("DEBUG: ");
-  vprintf(format, args);
-  va_end(args);
-}
-
-void ERROR(const char *format, ...) {
-  if (!LOG_ERROR) // Don't print if logging info is disabled
-    return;
-  va_list args;
-  va_start(args, format);
-  printf("ERROR!: ");
-  vprintf(format, args);
-  va_end(args);
-
-  abort();
-}
-
-void fillOnes(struct Tensor *tensor) {
-  // initialization is specific to the floating point type
-  if (tensor->data_type == CUDNN_DATA_FLOAT) {
-    float *data_arr = (float *)tensor->host_data;
-    for (unsigned int i = 0; i < tensor->num_elems; i++) {
-      data_arr[i] = 1.0;
-    }
-  }
-}
-
-void printTensorDescInfo(struct Tensor *tensor) {
-
-  cudnnDataType_t dType;
-  int nStride, cStride, hStride, wStride;
-  int size1, size2, size3, size4;
-  cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2,
-                             &size3, &size4, &nStride, &cStride, &hStride,
-                             &wStride);
-
-  DEBUG("dType = %d, size1 = %d, size2 = %d, size3 = %d, size4 = %d \n", dType,
-        size1, size2, size3, size4);
-
-  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride,
-        cStride, hStride, wStride);
-}
-
-#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp
index 9bec84de77fc279547eaaba8410c0e25ba3f3cd0..8e5e1fe9689853ee3ff547b62c5d44660db27b04 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp
@@ -1,8 +1,69 @@
 #include "debug.h"
+#include "tensor.h"
 #include <cstdarg>
 #include <cstdio>
 #include <cuda_runtime_api.h>
 #include <stdexcept>
+#include <sstream>
+#include <cstdlib>
+
+void INFO(const char *format, ...) {
+  if (!LOG_INFO) // Don't print if logging info is disabled
+    return;
+  va_list args;
+  va_start(args, format);
+  printf("INFO: ");
+  vprintf(format, args);
+  va_end(args);
+}
+
+void DEBUG(const char *format, ...) {
+  if (!LOG_DEBUG) // Don't print if logging info is disabled
+    return;
+  va_list args;
+  va_start(args, format);
+  printf("DEBUG: ");
+  vprintf(format, args);
+  va_end(args);
+}
+
+void ERROR(const char *format, ...) {
+  if (!LOG_ERROR) // Don't print if logging info is disabled
+    return;
+  va_list args;
+  va_start(args, format);
+  printf("ERROR!: ");
+  vprintf(format, args);
+  va_end(args);
+
+  abort();
+}
+
+void fillOnes(struct Tensor *tensor) {
+  // initialization is specific to the floating point type
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = 1.0;
+    }
+  }
+}
+
+void printTensorDescInfo(struct Tensor *tensor) {
+
+  cudnnDataType_t dType;
+  int nStride, cStride, hStride, wStride;
+  int size1, size2, size3, size4;
+  cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2,
+                             &size3, &size4, &nStride, &cStride, &hStride,
+                             &wStride);
+
+  DEBUG("dType = %d, size1 = %d, size2 = %d, size3 = %d, size4 = %d \n", dType,
+        size1, size2, size3, size4);
+
+  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride,
+        cStride, hStride, wStride);
+}
 
 void throwError(const char *file, int line, const char *fmt, ...) {
   char msg[2048];
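The hunk above consolidates the printf-style logging helpers (previously in the now-deleted `debug.cc`) into `debug.cpp`: `INFO`, `DEBUG`, and `ERROR` are variadic wrappers gated by the `LOG_INFO`/`LOG_DEBUG`/`LOG_ERROR` flags, with `ERROR` additionally aborting. A minimal usage sketch, assuming those flags are defined in `debug.h` as before (the `example` function itself is hypothetical):

```cuda
#include "debug.h"

void example(int op_id, float knob) {
  INFO("running tensor op %d \n", op_id);
  DEBUG("approximation knob = %f \n", knob); // printed only if LOG_DEBUG is set
  if (knob < 0.0f)
    ERROR("invalid knob %f for op %d \n", knob, op_id); // prints and aborts
}
```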
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
index 0e05813bb6eb5de86057bf3b2066c8fd98642e8d..032443bd7a63a1640e463c0457dd362e09733be3 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
@@ -12,8 +12,8 @@
 #define CASE_FUNC(ename, fname)                                                \
   case MathOp::ename: {                                                        \
     void *v_func_ptr = nullptr;                                                \
-    checkCudaErrors(cudaMemcpyFromSymbol(                                      \
-        &v_func_ptr, _internal::fname##_ptr, sizeof(void *)));                 \
+    checkCudaErrors(cudaMemcpyFromSymbol(&v_func_ptr, _internal::fname##_ptr,  \
+                                         sizeof(void *)));                     \
     return v_func_ptr;                                                         \
   }
 
@@ -120,7 +120,7 @@ template <> void *mathOpToFunc<float2>(MathOp op) {
     CASE_FUNC(Mul, f2mul)
   default:
     ERROR("Float2 function not found\n");
-    return nullptr;  // For some compilers
+    return nullptr; // For some compilers
   }
 }
 
@@ -129,7 +129,7 @@ template <> void *mathOpToFunc<half2>(MathOp op) {
     CASE_FUNC(Mul, h2mul)
   default:
     ERROR("Half2 function not found\n");
-    return nullptr;  // For some compilers
+    return nullptr; // For some compilers
   }
 }
 
@@ -151,7 +151,7 @@ template <> void *mathOpToFunc<float>(MathOp op) {
   default:
     ERROR("Float function not found\n");
   }
-  return nullptr;  // For some compilers
+  return nullptr; // For some compilers
 }
 
 template <> void *mathOpToFunc<half>(MathOp op) {
@@ -169,7 +169,7 @@ template <> void *mathOpToFunc<half>(MathOp op) {
   default:
     ERROR("Half function not found\n");
   }
-  return nullptr;  // For some compilers
+  return nullptr; // For some compilers
 }
 
 template <> half reduceOpToIdentity<half>(MathOp op) {
@@ -185,5 +185,5 @@ template <> half reduceOpToIdentity<half>(MathOp op) {
   default:
     ERROR("Operator does not have id value\n");
   }
-  return 0.0f;  // For some compilers
+  return 0.0f; // For some compilers
 }
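The `CASE_FUNC` macro reformatted above fetches a device function pointer out of a `__device__` symbol with `cudaMemcpyFromSymbol`, so host code can select a math operator at runtime and pass it to a kernel. A self-contained sketch of that pattern follows, with illustrative names rather than the runtime's `_internal` symbols.

```cuda
#include <cuda_runtime.h>

typedef float (*BinOp)(float, float);

__device__ float dev_add(float a, float b) { return a + b; }
// Device-side symbol holding the address of dev_add.
__device__ BinOp dev_add_ptr = dev_add;

__global__ void apply(BinOp op, const float *x, const float *y, float *out,
                      int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = op(x[i], y[i]);
}

BinOp lookupAdd() {
  BinOp h_ptr = nullptr;
  // Copy the pointer value out of the device symbol (as the macro above does).
  cudaMemcpyFromSymbol(&h_ptr, dev_add_ptr, sizeof(BinOp));
  return h_ptr; // a device address: only meaningful when passed back to a kernel
}
```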
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu
index 7a700b435efe464153fbba7997662c7dfa970385..638e06e786a8d8e4c587d4bda5d0223fa386f39a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu
@@ -2,7 +2,6 @@
 #ifndef ERROR_HEADER
 #define ERROR_HEADER
 
-
 #include <stdio.h>
 #include <stdarg.h>
 #include <cstdio>
@@ -23,7 +22,6 @@
 #include <math.h>
 #include <assert.h>
 
-
 #include "debug.h"
 #include "tensor.h"
 #include "profiling.h"
@@ -31,39 +29,33 @@
 #include "global_data.h"
 #include "error.h"
 
+extern "C" {
 
+void readSkipTensors(int *skip_tensor_ids, int op_count) {
 
-extern "C"{
-  
-  
-void readSkipTensors(int* skip_tensor_ids, int op_count){
-
-  for(int i = 0; i < op_count; i++){
+  for (int i = 0; i < op_count; i++) {
     int tensor_id = skip_tensor_ids[i];
     skip_tensors[tensor_id] = 1;
   }
-
 }
 
-
-
-void readOpenTunerFlags(const char* file_name){
+void readOpenTunerFlags(const char *file_name) {
 
   total_ops = 0;
   op_counter = 0;
   op_accuracies.clear();
-  
-  FILE* fp = fopen(file_name, "r");
-  if(fp == NULL){
+
+  FILE *fp = fopen(file_name, "r");
+  if (fp == NULL) {
     DEBUG("\n WARNING: File 'opentuner_flags' not found \n\n\n");
     return;
   }
-    
+
   int retVal = 200;
-  while(retVal != EOF){
+  while (retVal != EOF) {
 
     int op_acc;
-    if(fp != NULL)
+    if (fp != NULL)
       retVal = fscanf(fp, "%d", &op_acc);
     else
       op_acc = 0;
@@ -75,24 +67,23 @@ void readOpenTunerFlags(const char* file_name){
   fclose(fp);
 }
 
-
-void readQuantRanges(char* file_name){
+void readQuantRanges(char *file_name) {
 
   total_ops = 0;
   op_counter = 0;
   quant_ranges.clear();
-  
-  FILE* fp = fopen(file_name, "r");
-  if(fp == NULL){
+
+  FILE *fp = fopen(file_name, "r");
+  if (fp == NULL) {
     ERROR("File %s not found \n", file_name);
   }
-    
+
   int retVal = 200;
-  while(retVal != EOF && retVal != -1){
+  while (retVal != EOF && retVal != -1) {
 
     int min;
     int max;
-    if(fp != NULL){
+    if (fp != NULL) {
       retVal = fscanf(fp, "%d", &min);
       printf("min =% d \n", min);
 
@@ -100,22 +91,18 @@ void readQuantRanges(char* file_name){
       printf("max =% d \n", max);
     }
 
-    if(retVal != -1){
-      struct Range* range = (struct Range*) malloc(sizeof(struct Range));
+    if (retVal != -1) {
+      struct Range *range = (struct Range *)malloc(sizeof(struct Range));
       range->min = min;
       range->max = max;
       quant_ranges.push_back(range);
       total_ops++;
     }
   }
-  
+
   fclose(fp);
 }
 
-
-
-
-
 /*__device__ inline void atomicAdd(float* address, float value)
 
 {
@@ -133,11 +120,7 @@ void readQuantRanges(char* file_name){
 };
 */
 
-
-
-
-
-Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNorms(Tensor *x, Tensor *x_orig) {
 
   deviceToHostCopy(x);
   deviceToHostCopy(x_orig);
@@ -148,18 +131,18 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
   float inf_norm = -1.0;
   double total = 0.0;
 
-  float* arr1 = (float*) x->host_data;
-  float* arr2 = (float*) x_orig->host_data;
-  
-  for(unsigned int i = 0; i < x->num_elems; i++){
+  float *arr1 = (float *)x->host_data;
+  float *arr2 = (float *)x_orig->host_data;
+
+  for (unsigned int i = 0; i < x->num_elems; i++) {
 
     total = total + arr2[i];
-    
+
     float diff = abs(arr1[i] - arr2[i]);
     l1_norm += diff;
-    l2_norm += (arr1[i] - arr2[i]) *  (arr1[i] - arr2[i]);
+    l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
 
-    if(inf_norm < diff)
+    if (inf_norm < diff)
       inf_norm = diff;
   }
 
@@ -170,12 +153,11 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
   l1_norm = l1_norm / distribution_mean;
   l2_norm = l2_norm / distribution_mean;
 
-    
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
   norms->l1_norm = l1_norm;
   norms->l2_norm = l2_norm;
-  norms->inf_norm = inf_norm;  
-  
+  norms->inf_norm = inf_norm;
+
   INFO("l1_norm = %f \n", l1_norm);
   INFO("l2_norm = %f \n", l2_norm);
   INFO("inf_norm = %f \n", inf_norm);
@@ -183,9 +165,7 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
   return norms;
 }
 
-
-
-Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNorms2(Tensor *x, Tensor *x_orig) {
 
   deviceToHostCopy(x);
   deviceToHostCopy(x_orig);
@@ -196,50 +176,49 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
 
   double l1_norm_A = 0.0;
   double l1_norm_B = 0.0;
-  
+
   double l2_norm_A = 0.0;
   double l2_norm_B = 0.0;
   float inf_norm = -1.0;
   float orig_inf_norm = -1.0;
   double total_diff = 0.0;
   double total_diff_squared = 0.0;
- 
-  float* arr1 = (float*) x->host_data;
-  float* arr2 = (float*) x_orig->host_data;
-  
-  for(unsigned int i = 0; i < x->num_elems; i++){
 
-    if(arr2[i] != 0.0)
+  float *arr1 = (float *)x->host_data;
+  float *arr2 = (float *)x_orig->host_data;
+
+  for (unsigned int i = 0; i < x->num_elems; i++) {
+
+    if (arr2[i] != 0.0)
       l0_norm_A = l0_norm_A + 1.0;
-    if(arr1[i] != 0.0)
+    if (arr1[i] != 0.0)
       l0_norm_B = l0_norm_B + 1.0;
-        
+
     l1_norm_A = l1_norm_A + abs(arr2[i]);
     l1_norm_B = l1_norm_B + abs(arr1[i]);
 
     l2_norm_A = l2_norm_A + (arr2[i] * arr2[i]);
     l2_norm_B = l2_norm_B + (arr1[i] * arr1[i]);
-      
+
     float diff = abs(arr1[i] - arr2[i]);
     total_diff = total_diff + diff;
     float diff_squared = diff * diff;
-    total_diff_squared = total_diff_squared + diff_squared; 
-
+    total_diff_squared = total_diff_squared + diff_squared;
 
-    if(orig_inf_norm < diff){
+    if (orig_inf_norm < diff) {
       orig_inf_norm = diff;
     }
-    
+
     // Relative difference value
-    float normalized_diff = diff / arr2[i];   
-    if(inf_norm < normalized_diff){
+    float normalized_diff = diff / arr2[i];
+    if (inf_norm < normalized_diff) {
       inf_norm = normalized_diff;
-    }    
+    }
   }
 
   // Relative L1 and Mean L1 norms of the difference Matrix
-  float mean_l1 = ( total_diff ) / x->num_elems;
-  float relative_l1 = ( total_diff ) / l1_norm_A;
+  float mean_l1 = (total_diff) / x->num_elems;
+  float relative_l1 = (total_diff) / l1_norm_A;
 
   // Computing Relative L2 norm - i.e., Euclidean distance
   double norm_root_A = sqrt(l2_norm_A);
@@ -248,8 +227,9 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
   float relative_l2 = diff_root / norm_root_A;
 
   // Packing computed norms in Norm_t struct
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
-  // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
+  // Mean metrics - not normalized for the distribution - suitable for precision
+  // tuning hardware
   norms->mean_l1 = mean_l1;
   norms->mean_l2 = mean_l2;
   norms->orig_inf_norm = orig_inf_norm;
@@ -257,8 +237,8 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
   // Relative metrics (relative to distribution) - suitable for PROMISE
   norms->l1_norm = relative_l1;
   norms->l2_norm = relative_l2;
-  norms->inf_norm = inf_norm;  
-  
+  norms->inf_norm = inf_norm;
+
   INFO("l1_norm = %f \n", relative_l1);
   INFO("l2_norm = %f \n", relative_l2);
   INFO("inf_norm = %f \n", inf_norm);
@@ -266,33 +246,28 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
   return norms;
 }
 
-
-
-
-
-__global__ void normComputeKernel(float* A, float * B, double* l1_A, double* l2_A,
-				  double* l1_diff, double* l2_diff, unsigned int n){
+__global__ void normComputeKernel(float *A, float *B, double *l1_A,
+                                  double *l2_A, double *l1_diff,
+                                  double *l2_diff, unsigned int n) {
 
   int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(i < n){
-    
+  if (i < n) {
+
     double diff = fabsf(A[i] - B[i]);
-    double diff_squared = diff * diff;   
+    double diff_squared = diff * diff;
 
-    atomicAdd( l1_A,  fabsf(A[i]) );
-    atomicAdd( l2_A, (A[i] * A[i]) );
+    atomicAdd(l1_A, fabsf(A[i]));
+    atomicAdd(l2_A, (A[i] * A[i]));
 
-    atomicAdd( l1_diff, diff);
-    atomicAdd( l2_diff, diff_squared);
+    atomicAdd(l1_diff, diff);
+    atomicAdd(l2_diff, diff_squared);
   }
 }
 
-
-
 __inline__ __device__ double warpReduceSum(double val) {
 
-  for (int offset = warpSize/2; offset > 0; offset /= 2)
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
     val += __shfl_down_sync(0xFFFFFFFF, val, offset);
 
   return val;
@@ -304,36 +279,34 @@ __inline__ __device__ double blockReduceSum(double val) {
   int lane = threadIdx.x % warpSize;
   int wid = threadIdx.x / warpSize;
 
-  val = warpReduceSum(val);     // Each warp performs partial reduction
+  val = warpReduceSum(val); // Each warp performs partial reduction
 
   if (lane == 0)
-    shared[wid]=val; // Write reduced value to shared memory
+    shared[wid] = val; // Write reduced value to shared memory
 
-  
-  __syncthreads();              // Wait for all partial reductions
+  __syncthreads(); // Wait for all partial reductions
 
-  
-  //read from shared memory only if that warp existed
+  // read from shared memory only if that warp existed
   val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
 
-  if (wid == 0) val = warpReduceSum(val); //Final reduce within first warp
+  if (wid == 0)
+    val = warpReduceSum(val); // Final reduce within first warp
 
   return val;
-
 }
 
-
-
-__global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N,
-					      double* A_l1, double* A_l2,
-					      double* diff_l1, double* diff_l2) {
+__global__ void deviceReduceBlockAtomicKernel(float *A, float *B, int N,
+                                              double *A_l1, double *A_l2,
+                                              double *diff_l1,
+                                              double *diff_l2) {
 
   double sum_A_l1 = double(0);
   double sum_A_l2 = double(0);
   double sum_diff_l1 = double(0);
   double sum_diff_l2 = double(0);
 
-  for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
 
     sum_A_l1 += fabsf(A[i]);
     sum_A_l2 += (A[i] * A[i]);
@@ -347,31 +320,28 @@ __global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N,
   sum_A_l2 = blockReduceSum(sum_A_l2);
   sum_diff_l1 = blockReduceSum(sum_diff_l1);
   sum_diff_l2 = blockReduceSum(sum_diff_l2);
-  
-  if (threadIdx.x == 0){
+
+  if (threadIdx.x == 0) {
     atomicAdd(A_l1, sum_A_l1);
     atomicAdd(A_l2, sum_A_l2);
     atomicAdd(diff_l1, sum_diff_l1);
     atomicAdd(diff_l2, sum_diff_l2);
-  }   
+  }
 }
 
-
-void deviceReduce(float* A, float* B, int N,
-		  double* A_l1, double* A_l2,
-		  double* diff_l1, double* diff_l2) {
+void deviceReduce(float *A, float *B, int N, double *A_l1, double *A_l2,
+                  double *diff_l1, double *diff_l2) {
 
   int threads = 512;
   int blocks = min((N + threads - 1) / threads, 1024);
 
-  deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2, diff_l1, diff_l2);
+  deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2,
+                                                     diff_l1, diff_l2);
   //-- deviceReduceKernel<<<1, 1024>>>(out, out, blocks);
 }
 
-
-
 // Compute Norms on the GPU
-Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNormsTreeReduction(Tensor *x, Tensor *x_orig) {
 
   hostToDeviceCopy(x);
   hostToDeviceCopy(x_orig);
@@ -388,26 +358,27 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){
   double *l2_norm_A_d;
   double *l1_diff_d;
   double *l2_diff_d;
-  
-  cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l1_diff_d, sizeof(double));
-  cudaMalloc( (void**) &l2_diff_d, sizeof(double));
- 
-    
-  float* arr1 = (float*) x->gpu_data;
-  float* arr2 = (float*) x_orig->gpu_data;
-
-  //normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
-  deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d);
-  
+
+  cudaMalloc((void **)&l1_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l2_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l1_diff_d, sizeof(double));
+  cudaMalloc((void **)&l2_diff_d, sizeof(double));
+
+  float *arr1 = (float *)x->gpu_data;
+  float *arr2 = (float *)x_orig->gpu_data;
+
+  // normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d,
+  // l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
+  deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d,
+               l2_diff_d);
+
   cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
 
   INFO("l1_norm_A = %f, l2_norm_A = %f, l1_diff = %f, l2_diff = %f \n",
-       l1_norm_A, l2_norm_A,l1_diff, l2_diff);
+       l1_norm_A, l2_norm_A, l1_diff, l2_diff);
 
   // Relative L1 and Mean L1 norms of the difference Matrix
   float mean_l1 = l1_diff / x->num_elems;
@@ -420,34 +391,32 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){
   float relative_l2 = diff_root / norm_root_A;
 
   // Packing computed norms in Norm_t struct
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
-  // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
+  // Mean metrics - not normalized for the distribution - suitable for precision
+  // tuning hardware
   norms->mean_l1 = mean_l1;
   norms->mean_l2 = mean_l2;
   norms->orig_inf_norm = 0.0;
 
-  // Relative metrics (relative to distribution) 
+  // Relative metrics (relative to distribution)
   norms->l1_norm = relative_l1;
   norms->l2_norm = relative_l2;
-  norms->inf_norm = 0.0;  
-  
+  norms->inf_norm = 0.0;
+
   INFO("l1_norm = %f \n", relative_l1);
   INFO("l2_norm = %f \n", relative_l2);
 
   return norms;
 }
 
-
-
-
 // Compute Norms on the GPU
-Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNormsGPU(Tensor *x, Tensor *x_orig) {
 
   hostToDeviceCopy(x);
   hostToDeviceCopy(x_orig);
 
   // FIXIT: Move all floats to doubles - overflow is possible
-  
+
   double l1_norm_A;
   double l2_norm_A;
 
@@ -459,27 +428,26 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
   double *l2_norm_A_d;
   double *l1_diff_d;
   double *l2_diff_d;
-  
-  cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l1_diff_d, sizeof(double));
-  cudaMalloc( (void**) &l2_diff_d, sizeof(double));
- 
-    
-  float* arr1 = (float*) x->gpu_data;
-  float* arr2 = (float*) x_orig->gpu_data;
+
+  cudaMalloc((void **)&l1_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l2_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l1_diff_d, sizeof(double));
+  cudaMalloc((void **)&l2_diff_d, sizeof(double));
+
+  float *arr1 = (float *)x->gpu_data;
+  float *arr2 = (float *)x_orig->gpu_data;
 
   int blockSize = 1024;
-  int gridSize = (int) ceil ((float) x->num_elems / blockSize);
+  int gridSize = (int)ceil((float)x->num_elems / blockSize);
   INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
 
-  normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
+  normComputeKernel<<<gridSize, blockSize>>>(
+      arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
 
   cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
-  
 
   // Relative L1 and Mean L1 norms of the difference Matrix
   float mean_l1 = l1_diff / x->num_elems;
@@ -492,8 +460,9 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
   float relative_l2 = diff_root / norm_root_A;
 
   // Packing computed norms in Norm_t struct
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
-  // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
+  // Mean metrics - not normalized for the distribution - suitable for precision
+  // tuning hardware
   norms->mean_l1 = mean_l1;
   norms->mean_l2 = mean_l2;
   norms->orig_inf_norm = 0.0;
@@ -501,355 +470,47 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
   // Relative metrics (relative to distribution) - suitable for PROMISE
   norms->l1_norm = relative_l1;
   norms->l2_norm = relative_l2;
-  norms->inf_norm = 0.0;  
-  
+  norms->inf_norm = 0.0;
+
   INFO("l1_norm = %f \n", relative_l1);
   INFO("l2_norm = %f \n", relative_l2);
 
   return norms;
 }
 
-
-
-
-__global__ void vecConstMul(float* A, float mul_factor, int n){
+__global__ void vecConstMul(float *A, float mul_factor, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    A[id] = A[id] * mul_factor; 
+  if (id < n)
+    A[id] = A[id] * mul_factor;
 }
 
-
-__global__ void vecRound(float* A, int n){
+__global__ void vecRound(float *A, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    A[id] = roundf(A[id]); 
+  if (id < n)
+    A[id] = roundf(A[id]);
 }
 
-
-__global__ void vecConstDiv(float* A, float div_factor, int n){
+__global__ void vecConstDiv(float *A, float div_factor, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    A[id] = A[id] / div_factor; 
+  if (id < n)
+    A[id] = A[id] / div_factor;
 }
 
-
-
-__global__ void vecMul(float* A, float* B, int n){
+__global__ void vecMul(float *A, float *B, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    B[id] = A[id] * B[id]; 
-}
-
-
-/****  ERROR injecion routines  ******/
-
-void initRandValues(Tensor* bias, int error_scale){
-
-  float scaling_values[20];
-  
-  // FIXIT: Error knob 0 should be 0 zero
-  scaling_values[0] = 0.000;
-  scaling_values[1] = 0.0005;
-  scaling_values[2] = 0.03;
-  scaling_values[3] = 0.06;
-  scaling_values[4] = 0.08;
-  scaling_values[5] = 0.105;  
-  scaling_values[6] = 0.134;
-  scaling_values[7] = 0.16;
-  scaling_values[8] = 0.2;
-  scaling_values[9] = 0.23;
-  scaling_values[10] = 0.26;
-  scaling_values[11] = 0.3;
-  scaling_values[12] = 0.35;
-  scaling_values[13] = 0.4;
-  scaling_values[14] = 0.45;
-  scaling_values[15] = 0.55;
-  scaling_values[16] = 0.65;
-  scaling_values[17] = 0.7;
-  scaling_values[18] = 0.8;
-  scaling_values[19] = 0.9;
- 
-
-  curandGenerator_t gen;
-
-  struct timespec ts;
-
-  if(timespec_get(&ts, TIME_UTC) == 0){
-    printf("crashed \n");
-    abort();
-  }
-
-  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-
-  curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec);
-    
-  curandGenerateNormal(gen, (float*) bias->gpu_data, bias->num_elems, 0.0, 1.0 * scaling_values[error_scale]);
-
+  if (id < n)
+    B[id] = A[id] * B[id];
 }
 
-
-
-void initRandValues2(Tensor* bias, int error_scale){
-
-  float scaling_values[20];
-  
-  // FIXIT: Error knob 0 should be 0 zero
-  scaling_values[0] = 0.000;
-  scaling_values[1] = 0.0005;
-  scaling_values[2] = 0.0008;
-  scaling_values[3] = 0.001;
-  scaling_values[4] = 0.005;
-  scaling_values[5] = 0.01;  
-  scaling_values[6] = 0.02;
-  scaling_values[7] = 0.03;
-  scaling_values[8] = 0.04;
-  scaling_values[9] = 0.05;
-  scaling_values[10] = 0.06;
-  scaling_values[11] = 0.08;
-  scaling_values[12] = 0.1;
-  scaling_values[13] = 0.12;
-  scaling_values[14] = 0.15;
-  scaling_values[15] = 0.2;
-  scaling_values[16] = 0.55;
-  scaling_values[17] = 0.6;
-  scaling_values[18] = 0.65;
-  scaling_values[19] = 0.7;
-
-
-  curandGenerator_t gen;
-
-  struct timespec ts;
-
-  if(timespec_get(&ts, TIME_UTC) == 0){
-    printf("crashed \n");
-    abort();
-  }
-
-  curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-
-  curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec);
-    
-  curandGenerateNormal(gen, (float*) bias->gpu_data, bias->num_elems, 0.0, 1.0 * scaling_values[error_scale]);
-
-}
-
-
-void* addBitError(void* x_ptr, int error_scale){
-
-  if(error_scale > 6 || error_scale < 0){
-    ERROR("Error Scale out of bounds \n");
-  }
-      
-  INFO("*** TensorBitError \n");  
-  profileEvent("tensorBitError");
-
-  Tensor* x = (Tensor*) x_ptr;
-  
-  size_t* dim_sizes = x->dims.dim_sizes; 
-  Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
-					        dim_sizes[0], dim_sizes[1],
-						dim_sizes[2], dim_sizes[3]);
-
-  // Copying x data into x_original - for computing Norms 
-  tensorCopy(x, x_original);
-
-  // Quadratic Error
-  float freq_factors[6];
-  freq_factors[0] = 0.1;
-  freq_factors[1] = 0.2;
-  freq_factors[2] = 0.4;
-  freq_factors[3] = 0.6;
-  freq_factors[4] = 0.8;
-  freq_factors[5] = 1.0;
-
-  float error_freq = freq_factors[error_scale];
-  
-  deviceToHostCopy(x);
-
-  unsigned char* data_arr = reinterpret_cast<unsigned char*>(x->host_data);
-  // FIXIT: Need to be careful about floating point datatype assumptions
-  long int total_bytes = x->size_in_bytes;
-  long int error_iterations = total_bytes * 0.01 * error_freq;
-  INFO("total_bytes = %lu, error_iterations = %lu \n", total_bytes, error_iterations);
-
-  srand(time(NULL));
-  
-  for(int i = 0; i < error_iterations; i++){
-    // FIXIT: The rand() is only specific to int - need long 
-    long int index = rand() % total_bytes;
-    int N = 5; // The operation below flips the Nth bit 
-    unsigned char fil = 1UL << N;
-    unsigned char val = data_arr[index];
-    char flipped = val^fil;
-    data_arr[i] = flipped;
-  }
-  
-
-  Norm_t* norms = calculateNorms2(x, x_original);
-
-  
-  profileEvent("tensorBitError_end", true);
-  
-  return (void*) norms;
-
-}
-
-
-void randomCeilAndFloor(float* x, size_t num_elems){
-
-  INFO("randomCeilAndFloor\n");
-  
-  std::random_device rd;
-  std::mt19937 mt(rd());
-  std::normal_distribution<float> distribution(0.0, 1.0);
-
-  for(size_t i = 0; i < num_elems; i++){
-    float rand_num = distribution(mt);
-    int val = abs(((int) rand_num) % 2);
-    if(val == 0)
-      x[i] = floor(x[i]);
-    else if(val == 1)
-      x[i] = ceil(x[i]);
-  }
-
-}
-
-// Routine for Adding RoundOff Errors
-void* addRoundError(void* x_ptr, int error_scale){
-
-  if(error_scale > 11 || error_scale < 0){
-    ERROR("Error Scale out of bounds \n");
-  }
-      
-  INFO("*** TensorRoundError \n");  
-  profileEvent("tensorRoundError");
-
-  Tensor* x = (Tensor*) x_ptr;
-  
-  size_t* dim_sizes = x->dims.dim_sizes; 
-  Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
-					        dim_sizes[0], dim_sizes[1],
-						dim_sizes[2], dim_sizes[3]);
-
-  // Copying x data into x_original - for computing Norms 
-  tensorCopy(x, x_original);
-
-  float round_factors[12];
-  round_factors[0] = 1000000; // FIXIT: This should be zero error
-  round_factors[1] = 100;
-  round_factors[2] = 10;
-  round_factors[3] = 7; // Beyond this point, the error function is linear
-  round_factors[4] = 3;
-  round_factors[5] = 1;
-  round_factors[6] = 0.7;
-  round_factors[7] = 0.3;
-  round_factors[8] = 0.1;
-  round_factors[9] = 0.07;
-  round_factors[10] = 0.03;
-  round_factors[11] = 0.01;
-  
-  // THINK: Considering using error magnitudes in this scenario
-  
-
-  float round_factor = round_factors[error_scale];
-  INFO("round_factor = %f \n", round_factor);
-  
-  hostToDeviceCopy(x);
-
-  int blockSize = 128;
-  int gridSize = (int) ceil ((float) x->num_elems / blockSize);
-  INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
-
-  // NOTE: Check if a large gridSize will work with really large tensors
-  vecConstMul<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);
-  //vecRound<<<gridSize, blockSize>>>((float*) x->gpu_data, x->num_elems);
-  
-  deviceToHostCopy(x);
-  randomCeilAndFloor((float*) x->host_data, x->num_elems);
-  hostToDeviceCopy(x);
-  
-  vecConstDiv<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems);
-  
-  Norm_t* norms = calculateNorms2(x, x_original);
-  
-  profileEvent("tensorRoundError_end", true);
-  
-  return (void*) norms;
-}
-
-
-
-
-// Routine for Adding Gaussian Error
-void* addGaussianError(void* x_ptr, int error_scale){
-
-  if(error_scale > 20 || error_scale < 0){
-    ERROR("Error Scale out of bounds \n");
-  }
-      
-  INFO("*** TensorAddError \n");  
-  profileEvent("tensorAddError");
-
-  Tensor* x = (Tensor*) x_ptr;
-  
-  size_t* dim_sizes = x->dims.dim_sizes;
-  Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format,
-					  dim_sizes[0], dim_sizes[1],
-					  dim_sizes[2], dim_sizes[3]);
-  
-  Tensor* x_original = (Tensor*) create4DTensor(x->cur_type, x->data_format,
-					        dim_sizes[0], dim_sizes[1],
-						dim_sizes[2], dim_sizes[3]);
-
-  // Copying x data into x_original - for computing Norms 
-  tensorCopy(x, x_original);
-
-  // NOTE: Error scale is used to generate the bias matrix
-  initRandValues(bias, error_scale);  
-
-  hostToDeviceCopy(x);
-  //hostToDeviceCopy(bias);
-
- 
-  int blockSize = 1024;
-  int gridSize = (int) ceil ((float) x->num_elems / blockSize);
-  INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
-
-  // NOTE: Check if a large gridSize will work with really large tensors
-  vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems);
-  
-  float alpha = 1.0f;
-    
-  // FIXIT: routine fails for 3D tensors
-  checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
-			    bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));
-
-
-  //Norm_t* norms = calculateNorms2(x, x_original);
-  //Norm_t* norms = calculateNormsGPU(x, x_original);
-
-  Norm_t* norms = calculateNormsTreeReduction(x, x_original);
-  
-  freeTensor(x_original);
-  freeTensor(bias);
-  
-  
-  profileEvent("tensorAddError_end", true);
-  
-  return (void*) norms;
-}
-
-
-
-void initPromiseRandValues(Tensor* bias, int error_scale){
+void initPromiseRandValues(Tensor *bias, int error_scale) {
 
   float scaling_values[10];
 
@@ -859,98 +520,91 @@ void initPromiseRandValues(Tensor* bias, int error_scale){
   scaling_values[2] = 0.336;
   scaling_values[3] = 0.21;
   scaling_values[4] = 0.168;
-  scaling_values[5] = 0.14;  
+  scaling_values[5] = 0.14;
   scaling_values[6] = 0.11;
   scaling_values[7] = 0.0784;
   scaling_values[8] = 0.005;
   scaling_values[9] = 0.000;
 
-  
   curandGenerator_t gen;
   struct timespec ts;
-  if(timespec_get(&ts, TIME_UTC) == 0){
+  if (timespec_get(&ts, TIME_UTC) == 0) {
     printf("crashed \n");
     abort();
   }
 
   curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-  curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec);
-  curandGenerateNormal(gen,
-		       (float*) bias->gpu_data,
-		       bias->num_elems, 0.0,
-		       1.0 * scaling_values[error_scale]);
-  
+  curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec ^ ts.tv_sec);
+  curandGenerateNormal(gen, (float *)bias->gpu_data, bias->num_elems, 0.0,
+                       1.0 * scaling_values[error_scale]);
 }
 
-
 // NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16
 // Routine for Adding PROMISE bitline swing error
-void* addPromiseError(void* x_ptr, int error_scale){
+void *addPromiseError(void *x_ptr, int error_scale) {
 
-  if(error_scale > 10 || error_scale < 0){
+  if (error_scale > 10 || error_scale < 0) {
     ERROR("Error Scale out of bounds for PROMISE - 8 Swing values \n");
   }
-      
-  INFO("*** addPromiseError \n");  
+
+  INFO("*** addPromiseError \n");
   profileEvent("addPromiseError");
 
-  Tensor* x = (Tensor*) x_ptr;
-  
-  size_t* dim_sizes = x->dims.dim_sizes;
-  Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format,
-					  dim_sizes[0], dim_sizes[1],
-					  dim_sizes[2], dim_sizes[3]);
- 
+  Tensor *x = (Tensor *)x_ptr;
+
+  size_t *dim_sizes = x->dims.dim_sizes;
+  Tensor *bias =
+      (Tensor *)create4DTensor(x->cur_type, x->data_format, dim_sizes[0],
+                               dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+
   // NOTE: Error scale is used to generate the bias matrix
-  initPromiseRandValues(bias, error_scale);  
+  initPromiseRandValues(bias, error_scale);
 
   hostToDeviceCopy(x);
-  //hostToDeviceCopy(bias);
- 
+  // hostToDeviceCopy(bias);
+
   int blockSize = 1024;
-  int gridSize = (int) ceil ((float) x->num_elems / blockSize);
+  int gridSize = (int)ceil((float)x->num_elems / blockSize);
   INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
 
   // NOTE: Check if a large gridSize will work with really large tensors
-  vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems);
-  
+  vecMul<<<gridSize, blockSize>>>((float *)x->gpu_data, (float *)bias->gpu_data,
+                                  x->num_elems);
+
   float alpha = 1.0f;
-  //float beta = 0.0f;    
+  // float beta = 0.0f;
   checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
-			    bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));
+                            bias->gpu_data, &alpha, x->tensor_desc,
+                            x->gpu_data));
 
   profileEvent("addPromiseError_end", true);
-  
-  return (void*) x;
-}
-
-
 
+  return (void *)x;
+}
 
-__global__ void quantizeAndClip(float* A, int n, float mul_factor, float min, float max){
+__global__ void quantizeAndClip(float *A, int n, float mul_factor, float min,
+                                float max) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
-  if(id < n){
+  if (id < n) {
     int temp = (A[id] - min) / mul_factor;
     float result = temp * 1.0 * mul_factor;
     result = result + min;
     A[id] = result;
 
-    if(A[id] > max){
+    if (A[id] > max) {
       A[id] = max;
     }
-    if(A[id] < min){
+    if (A[id] < min) {
       A[id] = min;
     }
-    
   }
 }
 
-
-__global__ void quantizeElem(float* A, int n, float mul_factor, float min){
+__global__ void quantizeElem(float *A, int n, float mul_factor, float min) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
-  if(id < n){
+  if (id < n) {
     int temp = (A[id] - min) / mul_factor;
     float result = temp * 1.0 * mul_factor;
     result = result + min;
@@ -958,44 +612,27 @@ __global__ void quantizeElem(float* A, int n, float mul_factor, float min){
   }
 }
 
-
-void* quantizeTensorPromise(void* input_ptr, float min, float max){
+void *quantizeTensorPromise(void *input_ptr, float min, float max) {
 
   INFO("QuantizeTensorPROMISE \n");
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
-  
   int quantize_range = 256;
   float input_range = max - min;
   float mul_factor = input_range / quantize_range;
   INFO("mul_factor = %f \n", mul_factor);
 
   int blockSize = 1024;
-  int gridSize = (int) ceil ((float) input->num_elems / blockSize);
+  int gridSize = (int)ceil((float)input->num_elems / blockSize);
   INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
 
   hostToDeviceCopy(input);
 
-  quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data,
-					   input->num_elems, mul_factor, min, max);
+  quantizeAndClip<<<gridSize, blockSize>>>(
+      (float *)input->gpu_data, input->num_elems, mul_factor, min, max);
 
-  
   return input;
 }
-
-
-void* tensorAddError(void* x_ptr, int error_scale){
-
-  void * new_x = addGaussianError(x_ptr, error_scale);
-  //void * new_x = addRoundError(x_ptr, error_scale);
-  //void * new_x = addBitError(x_ptr, error_scale);
-  return new_x;
-}
-
-
-
-
 }
-  
 
 #endif
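The norm computation kept above uses the standard shuffle-based reduction: `warpReduceSum` folds values within a warp via `__shfl_down_sync`, `blockReduceSum` combines the per-warp results through shared memory, and `deviceReduceBlockAtomicKernel` walks the input with a grid-stride loop and issues one `atomicAdd` per block. Below is a condensed sketch of that idiom for a plain sum; it uses `float` accumulators so `atomicAdd` stays portable to pre-Pascal GPUs, and assumes the block size is a multiple of the warp size.

```cuda
#include <cuda_runtime.h>

__inline__ __device__ float warpSum(float v) {
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v += __shfl_down_sync(0xFFFFFFFF, v, offset);
  return v;
}

__global__ void sumKernel(const float *x, int n, float *out) {
  float local = 0.0f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x)
    local += x[i];        // grid-stride accumulation
  local = warpSum(local); // reduce within each warp

  __shared__ float warp_sums[32];
  int lane = threadIdx.x % warpSize, wid = threadIdx.x / warpSize;
  if (lane == 0)
    warp_sums[wid] = local;
  __syncthreads();

  // First warp reduces the per-warp partial sums, then one atomic per block.
  local = (threadIdx.x < blockDim.x / warpSize) ? warp_sums[lane] : 0.0f;
  if (wid == 0)
    local = warpSum(local);
  if (threadIdx.x == 0)
    atomicAdd(out, local);
}
```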
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
index 4392839f7f6dbca8df4352a19fdd689d6f8e3d5e..00334f8ecc821fdb3209e48aa94785aad0a54f37 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
@@ -1,7 +1,7 @@
 //===--------------------------- fp16_gemm.cu -----------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file  consists of the custom implementation of quantization kernels.
 // This helps HPVM to switch compute precision for tensor operations between
 // FP32 and FP16.
@@ -17,236 +17,199 @@
 #include <cuda_fp16.h>
 #include "fp16_emu.h"
 
-
-
 inline cudaError_t checkCuda(cudaError_t result) {
-    if (result != cudaSuccess)
-        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n";
-    return result;
+  if (result != cudaSuccess)
+    std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n";
+  return result;
 }
 
 inline cublasStatus_t checkCublas(cublasStatus_t result) {
-    if (result != CUBLAS_STATUS_SUCCESS)
-        std::cerr << "cuBLAS Error: " << result << "\n";
-    return result;
+  if (result != CUBLAS_STATUS_SUCCESS)
+    std::cerr << "cuBLAS Error: " << result << "\n";
+  return result;
 }
 
 template <typename T>
-inline void printArray(const T * const __restrict__ array,
+inline void printArray(const T *const __restrict__ array,
                        const unsigned elements) {
-    for (unsigned i = 0; i < elements; i++)
-        std::cout << std::to_string(array[i]) << "\n";
+  for (unsigned i = 0; i < elements; i++)
+    std::cout << std::to_string(array[i]) << "\n";
 }
 
 // initialization
 template <typename T>
-__global__ void initKernel(T * const __restrict__ array,
+__global__ void initKernel(T *const __restrict__ array,
                            const unsigned elements) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        array[idx] = 1.2;
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < elements)
+    array[idx] = 1.2;
 }
 
 template <typename T>
-void init(T * const __restrict__ array,
-          const unsigned elements) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    initKernel<<<num_blocks, block_size>>>(array, elements);
-    checkCuda(cudaDeviceSynchronize());
+void init(T *const __restrict__ array, const unsigned elements) {
+  const unsigned block_size = 512;
+  const unsigned num_blocks = (elements + block_size - 1) / block_size;
+  initKernel<<<num_blocks, block_size>>>(array, elements);
+  checkCuda(cudaDeviceSynchronize());
 }
 
 // float to half
-__global__ void f2hKernel(const float * const __restrict__ input,
+__global__ void f2hKernel(const float *const __restrict__ input,
                           const unsigned elements,
-                          half * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __float2half_rn(input[idx]);
+                          half *const __restrict__ output) {
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < elements)
+    output[idx] = __float2half_rn(input[idx]);
 }
 
-void f2h(const float * const __restrict__ input,
-         const unsigned elements,
-         half * const __restrict__ output) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    f2hKernel<<<num_blocks, block_size>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
+void f2h(const float *const __restrict__ input, const unsigned elements,
+         half *const __restrict__ output) {
+  const unsigned block_size = 512;
+  const unsigned num_blocks = (elements + block_size - 1) / block_size;
+  f2hKernel<<<num_blocks, block_size>>>(input, elements, output);
+  checkCuda(cudaDeviceSynchronize());
 }
 
 // half to float
-__global__ void h2fKernel(const half * const __restrict__ input,
+__global__ void h2fKernel(const half *const __restrict__ input,
                           const unsigned elements,
-                          float * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __half2float(input[idx]);
+                          float *const __restrict__ output) {
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < elements)
+    output[idx] = __half2float(input[idx]);
 }
 
-void h2f(const half * const __restrict__ input,
-         const unsigned elements,
-         float * const __restrict__ output) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    h2fKernel<<<num_blocks, block_size>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
+void h2f(const half *const __restrict__ input, const unsigned elements,
+         float *const __restrict__ output) {
+  const unsigned block_size = 512;
+  const unsigned num_blocks = (elements + block_size - 1) / block_size;
+  h2fKernel<<<num_blocks, block_size>>>(input, elements, output);
+  checkCuda(cudaDeviceSynchronize());
 }
 
-void sgemm(const float * const __restrict__ a,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ b,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ c) {
-    const unsigned iterations = 10;
-    float kernel_time;
-    cudaEvent_t start;
-    cudaEvent_t stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    cublasHandle_t handle;
-    checkCublas(cublasCreate(&handle));
-
-    // Enable Tensor Cores
-    checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-
-    const float alpha_ = 1.0;
-    const float beta_  = 0.0;
-    const float *alpha = &alpha_;
-    const float *beta  = &beta_;
-
-    cudaEventRecord(start, 0);
-    for (unsigned i = 0; i < iterations; i++) {
-        checkCublas(cublasGemmEx(handle,
-                                 CUBLAS_OP_N,
-                                 CUBLAS_OP_N,
-                                 // Dimensions
-                                 num_rows_a,
-                                 num_cols_b,
-                                 num_cols_a,
-                                 alpha,
-                                 // A
-                                 a,
-                                 CUDA_R_32F,
-                                 num_rows_a,
-                                 // B
-                                 b,
-                                 CUDA_R_32F,
-                                 num_rows_b,
-                                 beta,
-                                 // C
-                                 c,
-                                 CUDA_R_32F,
-                                 num_rows_a,
-                                 // Compute precision and algorithm
-                                 CUDA_R_32F,
-                                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&kernel_time, start, stop);
-
-    std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations) << " ms\n";
+void sgemm(const float *const __restrict__ a, const unsigned num_rows_a,
+           const unsigned num_cols_a, const float *const __restrict__ b,
+           const unsigned num_rows_b, const unsigned num_cols_b,
+           float *const __restrict__ c) {
+  const unsigned iterations = 10;
+  float kernel_time;
+  cudaEvent_t start;
+  cudaEvent_t stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cublasHandle_t handle;
+  checkCublas(cublasCreate(&handle));
+
+  // Enable Tensor Cores
+  checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+
+  const float alpha_ = 1.0;
+  const float beta_ = 0.0;
+  const float *alpha = &alpha_;
+  const float *beta = &beta_;
+
+  cudaEventRecord(start, 0);
+  for (unsigned i = 0; i < iterations; i++) {
+    checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             // Dimensions
+                             num_rows_a, num_cols_b, num_cols_a, alpha,
+                             // A
+                             a, CUDA_R_32F, num_rows_a,
+                             // B
+                             b, CUDA_R_32F, num_rows_b, beta,
+                             // C
+                             c, CUDA_R_32F, num_rows_a,
+                             // Compute precision and algorithm
+                             CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  }
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&kernel_time, start, stop);
+
+  std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations)
+            << " ms\n";
 }
 
-void hgemm(const float * const __restrict__ af,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ bf,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ cf) {
-    const unsigned iterations = 10;
-
-    const unsigned num_elements_a = num_rows_a * num_cols_a;
-    const unsigned num_elements_b = num_rows_b * num_cols_b;
-    const unsigned num_elements_c = num_rows_a * num_cols_b;
-
-    float to_fp16_time;
-    float to_fp32_time;
-    float kernel_time;
-    float total_time;
-
-    cudaEvent_t start;
-    cudaEvent_t stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    half *a;
-    half *b;
-    half *c;
-
-    checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a));
-    checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b));
-    checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c));
-
-    init(a, num_elements_a);
-    init(b, num_elements_b);
-    init(c, num_elements_c);
-
-    // Convert floats to halfs
-    cudaEventRecord(start, 0);
-    f2h(af, num_elements_a, a);
-    f2h(bf, num_elements_b, b);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&to_fp16_time, start, stop);
-
-    cublasHandle_t handle;
-    checkCublas(cublasCreate(&handle));
-    checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-
-    const half alpha_ = cpu_float2half_rn(1.0);
-    const half beta_  = cpu_float2half_rn(0.0);
-    const half *alpha = &alpha_;
-    const half *beta  = &beta_;
-
-    cudaEventRecord(start, 0);
-    for (unsigned i = 0; i < iterations; i++) {
-        checkCublas(cublasGemmEx(handle,
-                                 CUBLAS_OP_N,
-                                 CUBLAS_OP_N,
-                                 // Dimensions
-                                 num_rows_a,
-                                 num_cols_b,
-                                 num_cols_a,
-                                 alpha,
-                                 // A
-                                 a,
-                                 CUDA_R_16F,
-                                 num_rows_a,
-                                 // B
-                                 b,
-                                 CUDA_R_16F,
-                                 num_rows_b,
-                                 beta,
-                                 // C
-                                 c,
-                                 CUDA_R_16F,
-                                 num_rows_a,
-                                 // Compute precision and algorithm
-                                 CUDA_R_16F,
-                                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&kernel_time, start, stop);
-
-    cudaEventRecord(start, 0);
-    h2f(c, num_elements_c, cf);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&to_fp32_time, start, stop);
-
-    total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time;
-    std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n";
-    std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n";
-    std::cout << "\tKernel : " << std::to_string(kernel_time / iterations) << " ms\n";
-    std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n";
+void hgemm(const float *const __restrict__ af, const unsigned num_rows_a,
+           const unsigned num_cols_a, const float *const __restrict__ bf,
+           const unsigned num_rows_b, const unsigned num_cols_b,
+           float *const __restrict__ cf) {
+  const unsigned iterations = 10;
+
+  const unsigned num_elements_a = num_rows_a * num_cols_a;
+  const unsigned num_elements_b = num_rows_b * num_cols_b;
+  const unsigned num_elements_c = num_rows_a * num_cols_b;
+
+  float to_fp16_time;
+  float to_fp32_time;
+  float kernel_time;
+  float total_time;
+
+  cudaEvent_t start;
+  cudaEvent_t stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  half *a;
+  half *b;
+  half *c;
+
+  checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a));
+  checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b));
+  checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c));
+
+  init(a, num_elements_a);
+  init(b, num_elements_b);
+  init(c, num_elements_c);
+
+  // Convert floats to halfs
+  cudaEventRecord(start, 0);
+  f2h(af, num_elements_a, a);
+  f2h(bf, num_elements_b, b);
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&to_fp16_time, start, stop);
+
+  cublasHandle_t handle;
+  checkCublas(cublasCreate(&handle));
+  checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+
+  const half alpha_ = cpu_float2half_rn(1.0);
+  const half beta_ = cpu_float2half_rn(0.0);
+  const half *alpha = &alpha_;
+  const half *beta = &beta_;
+
+  cudaEventRecord(start, 0);
+  for (unsigned i = 0; i < iterations; i++) {
+    checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             // Dimensions
+                             num_rows_a, num_cols_b, num_cols_a, alpha,
+                             // A
+                             a, CUDA_R_16F, num_rows_a,
+                             // B
+                             b, CUDA_R_16F, num_rows_b, beta,
+                             // C
+                             c, CUDA_R_16F, num_rows_a,
+                             // Compute precision and algorithm
+                             CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  }
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&kernel_time, start, stop);
+
+  cudaEventRecord(start, 0);
+  h2f(c, num_elements_c, cf);
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&to_fp32_time, start, stop);
+
+  total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time;
+  std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n";
+  std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n";
+  std::cout << "\tKernel : " << std::to_string(kernel_time / iterations)
+            << " ms\n";
+  std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n";
 }
 
-
- 
 #endif
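
The `hgemm()` benchmark above times the `f2h()`/`h2f()` conversions that bracket the half-precision GEMM; those helpers are defined elsewhere in the tensor runtime, not in this file. A minimal sketch of what they are assumed to do, element-wise float/half casts on the GPU, is shown below; the kernel names and launch configuration here are illustrative, not the repository's implementation.

```cuda
// Sketch only: assumed shape of the f2h()/h2f() helpers called by hgemm().
#include <cuda_fp16.h>

__global__ void f2h_kernel(const float *in, unsigned n, half *out) {
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = __float2half(in[i]); // round-to-nearest float -> half
}

__global__ void h2f_kernel(const half *in, unsigned n, float *out) {
  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = __half2float(in[i]); // exact half -> float widening
}

void f2h(const float *in, unsigned n, half *out) {
  f2h_kernel<<<(n + 255) / 256, 256>>>(in, n, out);
}

void h2f(const half *in, unsigned n, float *out) {
  h2f_kernel<<<(n + 255) / 256, 256>>>(in, n, out);
}
```

With helpers of this shape, the `to_fp16_time`/`to_fp32_time` that `hgemm()` reports measure pure conversion cost on top of the tensor-core GEMM itself.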
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
index 4902043b7ce6a1240981224d98dc7dac70361500..aeb12e9f6e3fb56bfeaef3bd71bd2c3594fdcc08 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
@@ -47,5 +47,4 @@ std::string profile_data = "";
 PerfParamSet *perfParamSet;
 SampParamSet *sampParamSet;
 
-unsigned int currentTensorID = -1;
-
+unsigned int currentTensorID = ~0U;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
index 4b49a3702b1938ceed9829cc3572474c7cb82420..6a3fcc12e014205aaf81e2cae0906ed6cfbff33e 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
@@ -1,14 +1,14 @@
-//===--------------------------- group_conv.cu -----------------------------===//
+//===--------------------------- group_conv.cu ----------------------------===//
+//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  group convolutions with FP16 and FP32 compute precisions. 
+//
+//  This file implements group convolutions with FP16 and FP32 compute precisions.
 // Note that group convolutions, unlike regular convolutions, are not
 // approximable in any other way in HPVM.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "tensor_utils.h"
 #include "fp16_gemm.h"
 #include "debug.h"
@@ -17,31 +17,26 @@
 #include "op_overheads.h"
 #include "error.h"
 
+extern "C" {
 
-extern "C"{
-
-
+__global__ void depthwise_convNew8(
+    float *const __restrict__ y, const float *const __restrict__ x,
+    const float *const __restrict__ w, const int B, const int M, const int H,
+    const int W, const int KH, const int KW, const int H_out, const int W_out,
+    const int H_pad, const int W_pad, const int H_stride, const int W_stride) {
 
-__global__ void depthwise_convNew8(float* const __restrict__ y,
-				   const float* const __restrict__ x,
-				   const float* const __restrict__ w,
-				   const int B, const int M,
-				   const int H, const int W, const int KH,
-				   const int KW, const int H_out, const int W_out,
-				   const int H_pad, const int W_pad,
-				   const int H_stride, const int W_stride)
-{
-
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+#define y4d(i3, i2, i1, i0)                                                    \
+  y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+#define x4d(i3, i2, i1, i0)                                                    \
+  x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
   const int num = 8;
 
   const int b = num * blockIdx.x;
-  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
-	
-  if(m < M){
-    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
+
+  if (m < M) {
+    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
 
     const int start_h = (tx / W_out) * H_stride - H_pad;
     const int start_w = (tx % W_out) * W_stride - W_pad;
@@ -54,80 +49,73 @@ __global__ void depthwise_convNew8(float* const __restrict__ y,
     float c5 = 0;
     float c6 = 0;
     float c7 = 0;
-	
-    const float* weights = &w[m * KH * KW];
+
+    const float *weights = &w[m * KH * KW];
 
     for (int k = 0; k < KH * KW; k++) {
       int p = k / KW;
       int q = k % KW;
 
-      if (start_h + p > -1 && start_h + p < H &&
-	  start_w + q > -1 && start_w + q < W) {
-
-	c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
-	if(b + 1 < B)
-	  c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
-	if(b + 2 < B)
-	  c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
-	if(b + 3 < B)
-	  c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
-	if(b + 4 < B)
-	  c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
-	if(b + 5 < B)
-	  c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
-	if(b + 6 < B)
-	  c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
-	if(b + 7 < B)
-	  c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
-    
-
+      if (start_h + p > -1 && start_h + p < H && start_w + q > -1 &&
+          start_w + q < W) {
+
+        c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
+        if (b + 1 < B)
+          c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
+        if (b + 2 < B)
+          c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
+        if (b + 3 < B)
+          c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
+        if (b + 4 < B)
+          c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
+        if (b + 5 < B)
+          c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
+        if (b + 6 < B)
+          c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
+        if (b + 7 < B)
+          c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
       }
     }
 
-    y4d(b, m, 0, tx) = c0;	
-    if(b + 1 < B)
+    y4d(b, m, 0, tx) = c0;
+    if (b + 1 < B)
       y4d(b + 1, m, 0, tx) = c1;
-    if(b + 2 < B)
+    if (b + 2 < B)
       y4d(b + 2, m, 0, tx) = c2;
-    if(b + 3 < B)
+    if (b + 3 < B)
       y4d(b + 3, m, 0, tx) = c3;
-    if(b + 4 < B)
+    if (b + 4 < B)
       y4d(b + 4, m, 0, tx) = c4;
-    if(b + 5 < B)
+    if (b + 5 < B)
       y4d(b + 5, m, 0, tx) = c5;
-    if(b + 6 < B)
+    if (b + 6 < B)
       y4d(b + 6, m, 0, tx) = c6;
-    if(b + 7 < B)
+    if (b + 7 < B)
       y4d(b + 7, m, 0, tx) = c7;
   }
-	
-  #undef y4d 
-  #undef x4d
-}
-
-
 
+#undef y4d
+#undef x4d
+}
 
-__global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
-					const __half* const __restrict__ x,
-					const __half* const __restrict__ w,
-					const int B, const int M,
-					const int H, const int W, const int KH,
-					const int KW, const int H_out, const int W_out,
-					const int H_pad, const int W_pad,
-					const int H_stride, const int W_stride)
-{
+__global__ void depthwise_convNew8_half2(
+    __half *const __restrict__ y, const __half *const __restrict__ x,
+    const __half *const __restrict__ w, const int B, const int M, const int H,
+    const int W, const int KH, const int KW, const int H_out, const int W_out,
+    const int H_pad, const int W_pad, const int H_stride, const int W_stride) {
 
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+#define y4d(i3, i2, i1, i0)                                                    \
+  y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+#define x4d(i3, i2, i1, i0)                                                    \
+  x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
   const int num = 8;
 
   const int b = num * blockIdx.x;
-  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
-	
-  if(m < M){
-    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
+
+  if (m < M) {
+    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
 
     const int start_h = (tx / W_out) * H_stride - H_pad;
     const int start_w = (tx % W_out) * W_stride - W_pad;
@@ -136,111 +124,112 @@ __global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
     __half2 c1 = __half2half2(0);
     __half2 c2 = __half2half2(0);
     __half2 c3 = __half2half2(0);
-    	
-    const __half* weights = &w[m * KH * KW];
+
+    const __half *weights = &w[m * KH * KW];
 
     for (int k = 0; k < KH * KW; k++) {
       int p = k / KW;
       int q = k % KW;
-      if (start_h + p > -1 && start_h + p < H &&
-	  start_w + q > -1 && start_w + q < W) {
-
-      
-	__half2 t1;
-	__half2 t2;
-	__half2 t3;
-	__half2 t4;
-	if(b + 7 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	  t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q));
-	}
-	else if(b + 6 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	  t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 5 < B){
-	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	    t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	}
-	else if(b + 4 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 3 < B){
-	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	 }
-	else if(b + 2 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 1 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	}
-	else{
-	  t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
-
-	 }
-
-	
-	c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
-	c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
-	c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
-	c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
-
+      if (start_h + p > -1 && start_h + p < H && start_w + q > -1 &&
+          start_w + q < W) {
+
+        __half2 t1;
+        __half2 t2;
+        __half2 t3;
+        __half2 t4;
+        if (b + 7 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q),
+                              x4d(b + 4, m, start_h + p, start_w + q));
+          t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q),
+                              x4d(b + 6, m, start_h + p, start_w + q));
+        } else if (b + 6 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q),
+                              x4d(b + 4, m, start_h + p, start_w + q));
+          t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
+
+        } else if (b + 5 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q),
+                              x4d(b + 4, m, start_h + p, start_w + q));
+        } else if (b + 4 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
+
+        } else if (b + 3 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+        } else if (b + 2 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
+
+        } else if (b + 1 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+        } else {
+          t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
+        }
+
+        c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
+        c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
+        c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
+        c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
       }
     }
 
-    y4d(b, m, 0, tx) = __high2half(c0);	
-    if(b + 1 < B)
+    y4d(b, m, 0, tx) = __high2half(c0);
+    if (b + 1 < B)
       y4d(b + 1, m, 0, tx) = __low2half(c0);
-    if(b + 2 < B)
+    if (b + 2 < B)
       y4d(b + 2, m, 0, tx) = __high2half(c1);
-    if(b + 3 < B)
+    if (b + 3 < B)
       y4d(b + 3, m, 0, tx) = __low2half(c1);
-    if(b + 4 < B)
+    if (b + 4 < B)
       y4d(b + 4, m, 0, tx) = __high2half(c2);
-    if(b + 5 < B)
+    if (b + 5 < B)
       y4d(b + 5, m, 0, tx) = __low2half(c2);
-    if(b + 6 < B)
+    if (b + 6 < B)
       y4d(b + 6, m, 0, tx) = __high2half(c3);
-    if(b + 7 < B)
+    if (b + 7 < B)
       y4d(b + 7, m, 0, tx) = __low2half(c3);
   }
-	
-  #undef y4d 
-  #undef x4d
-}
 
+#undef y4d
+#undef x4d
+}
 
-
-void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){
-
+void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
-  //FIXME: Current hack to preserve backward compatibilty
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
 
-  Tensor* output;
+  Tensor *output;
 
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
@@ -248,43 +237,43 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
   convertToFP32(input);
   convertToFP32(filter);
 
-  
   if (conv_groups > 32) {
-    // TODO: Support other cases;  
+    // TODO: Support other cases;
     hostToDeviceCopy(input);
     hostToDeviceCopy(filter);
 
-    int n, c, h, w; // output dimensions  
+    int n, c, h, w; // output dimensions
     n = input->dims.dim_sizes[0];
     c = input->dims.dim_sizes[1];
     const int KH = filter->dims.dim_sizes[2];
     const int KW = filter->dims.dim_sizes[3];
-    h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-    w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+    h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride +
+        1;
+    w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) /
+            horizontal_stride +
+        1;
 
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
 
-
-		
     int blockSize;
     blockSize = 64;
-		
-    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
+
+    dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize);
     dim3 block(blockSize);
-    depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data,
-					 (float*)input->gpu_data, (float*)filter->gpu_data,
-					 input->dims.dim_sizes[0], input->dims.dim_sizes[1],
-					 input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					 KH, KW, h, w, vertical_pad, horizontal_pad,
-					 vertical_stride, horizontal_stride);
+    depthwise_convNew8<<<grid, block>>>(
+        (float *)output->gpu_data, (float *)input->gpu_data,
+        (float *)filter->gpu_data, input->dims.dim_sizes[0],
+        input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+        input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad,
+        vertical_stride, horizontal_stride);
 
-  }
-  else {
+  } else {
 
     cudnnConvolutionDescriptor_t convDesc;
     cudnnConvolutionFwdAlgo_t convAlgo;
@@ -297,152 +286,119 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     // FIXIT: Need to be more aware of the implications of alpha and beta
     float alpha = 1.0f, beta = 0.0f;
 
-    // TODO: Support other cases;  
+    // TODO: Support other cases;
     hostToDeviceCopy(input);
     hostToDeviceCopy(filter);
 
-    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
+    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+         horizontal_stride);
 
     checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
     // NOTE: Adding support for grouped convolution
     checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-
     cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
     // FIXIT: Think if upscaling values need to be configurable?
-    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					       vertical_pad, horizontal_pad, // conv padding
-					       vertical_stride, horizontal_stride, // conv strides
-					       1, 1, // upscaling values
-					       mode, // mode is configurable
-					       computeType)); // defines compute precision
-
-    int n, c, h, w; // output dimensions  
-    // Find dimension of convolution output
-    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     &n, &c, &h, &w));
+    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE
+    // should be used?
+    checkCUDNN(cudnnSetConvolution2dDescriptor(
+        convDesc, vertical_pad, horizontal_pad, // conv padding
+        vertical_stride, horizontal_stride,     // conv strides
+        1, 1,                                   // upscaling values
+        mode,                                   // mode is configurable
+        computeType));                          // defines compute precision
 
+    int n, c, h, w; // output dimensions
+    // Find dimension of convolution output
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+        convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h,
+          w);
 
     if (input->data_format == CUDNN_TENSOR_NCHW)
-      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+      output = (Tensor *)create4DTensor(
+          (cudnnDataType_t)float_type, // input->data_type,
+          CUDNN_TENSOR_NCHW, n, c, h, w);
     else if (input->data_format == CUDNN_TENSOR_NHWC) {
       DEBUG("* NHWC Format \n");
-      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				       CUDNN_TENSOR_NHWC, n, h, w, c);
-    }
-    else
+      output = (Tensor *)create4DTensor(
+          (cudnnDataType_t)float_type, // input->data_type,
+          CUDNN_TENSOR_NHWC, n, h, w, c);
+    } else
       ERROR("Unsupported Tensor Type");
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
 
-    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	  output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H "
+          "= %d, W = %d \n",
+          output->data_type, output->data_format, output->dims.dim_sizes[0],
+          output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+          output->dims.dim_sizes[3]);
 
     if (convDesc == NULL || input->tensor_desc == NULL ||
-	filter->filter_desc == NULL || output->tensor_desc == NULL)
+        filter->filter_desc == NULL || output->tensor_desc == NULL)
       ERROR("NULL descriptor! \n");
 
-
-    // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-    checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   convDesc,
-						   output->tensor_desc,
-						   CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						   //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						   0,
-						   &convAlgo));
-
+    // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN
+    // support is lacking
+    checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+        cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+        output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+        // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+        0, &convAlgo));
 
     DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	  CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	  CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+          CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
     // FIXIT: Algo shouldn't be hardcoded
     convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
     size_t workspace_size;
-    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						       input->tensor_desc,
-						       filter->filter_desc,
-						       convDesc,
-						       output->tensor_desc,
-						       convAlgo,
-						       &workspace_size));
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+        cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+        output->tensor_desc, convAlgo, &workspace_size));
 
     // Allocating memory for the convolution workspace
-    void* workspace;
+    void *workspace;
     checkCudaErrors(cudaMalloc(&workspace, workspace_size));
     DEBUG("workspace size = %d \n", workspace_size);
 
-
-    checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				       input->gpu_data, filter->filter_desc, filter->gpu_data,
-				       convDesc, convAlgo, workspace, workspace_size,
-				       &beta, output->tensor_desc, output->gpu_data));
+    checkCUDNN(cudnnConvolutionForward(
+        cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+        filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace,
+        workspace_size, &beta, output->tensor_desc, output->gpu_data));
   }
 
   cudaDeviceSynchronize();
   profileEvent("Conv_end", true);
 
-
-  #ifdef ERROR_INJECTION_ENABLED
-
-  if (op_counter >= total_ops) {
-    ERROR("No accuracy flag found \n");
-  }
-
-  int op_acc = op_accuracies[op_counter];
-
-  // Skip errorInjection if explicitly requested
-  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
-    op_acc = 0;
-  }
-
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorConv", op_acc);
-  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
-
-  op_counter++;
-
-  #endif   
-
   return output;
-
-
 }
 
 // FIXME: Need to properly fix the new HALF type conversion
-void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
-			    int vertical_pad, int horizontal_pad,
-			    int vertical_stride, int horizontal_stride,
-			    int conv_mode, int conv_groups){
+void *tensorHalfConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups) {
 
   INFO("*** TensorHConvolution \n");
   profileEvent("#Conv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   // FIXIT: Need to be more aware of the implications of alpha and beta
@@ -454,33 +410,34 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-
   // Float-Half Conversions
   profileEvent("F2H_start");
 
   convertToFP16(input);
-  convertToFP16(filter);  
+  convertToFP16(filter);
 
   profileEvent("F2H_end");
   /******* END OF INPUT DATA CONVERSIONS*/
 
-  
   Tensor *output;
-  if(conv_groups > 1){
+  if (conv_groups > 1) {
     int n = input->dims.dim_sizes[0];
     int c = input->dims.dim_sizes[1];
     const int KH = filter->dims.dim_sizes[2];
     const int KW = filter->dims.dim_sizes[3];
-    int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-    
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-    
+    int h =
+        (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride +
+        1;
+    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) /
+                horizontal_stride +
+            1;
+
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h,
+          w);
 
-    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, 
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
+    output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                      CUDNN_TENSOR_NCHW, n, c, h, w);
 
-  
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
@@ -488,117 +445,90 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
     int blockSize;
     blockSize = 128;
 
-    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
+    dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize);
     dim3 block(blockSize);
-    depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data,
-					      (__half*) input->gpu_half_data,
-					      (__half*) filter->gpu_half_data,
-					      input->dims.dim_sizes[0], input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					      KH, KW, h, w,
-					      vertical_pad, horizontal_pad,
-					      vertical_stride, horizontal_stride);
+    depthwise_convNew8_half2<<<grid, block>>>(
+        (__half *)output->gpu_half_data, (__half *)input->gpu_half_data,
+        (__half *)filter->gpu_half_data, input->dims.dim_sizes[0],
+        input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+        input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad,
+        vertical_stride, horizontal_stride);
     cudaDeviceSynchronize();
 
-    
-  }
-  else{    
+  } else {
     checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-    //FIXME: Current hack to preserve backward compatibilty
-    if(conv_groups == 0){
+    // FIXME: Current hack to preserve backward compatibility
+    if (conv_groups == 0) {
       conv_groups = 1;
     }
-  
+
     // NOTE: Adding support for grouped convolution
     checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-  
-    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					       vertical_pad, horizontal_pad, // conv padding
-					       vertical_stride, horizontal_stride, // conv strides
-					       1, 1, // upscaling values
-					       mode, // mode is configurable
-					       computeType)); // defines compute precision
+    checkCUDNN(cudnnSetConvolution2dDescriptor(
+        convDesc, vertical_pad, horizontal_pad, // conv padding
+        vertical_stride, horizontal_stride,     // conv strides
+        1, 1,                                   // upscaling values
+        mode,                                   // mode is configurable
+        computeType));                          // defines compute precision
 
     int n, c, h, w; // output dimensions
     // Find dimension of convolution output
-    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						     input->tensor_half_desc,
-						     filter->filter_half_desc,
-						     &n, &c, &h, &w));
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+        convDesc, input->tensor_half_desc, filter->filter_half_desc, &n, &c, &h,
+        &w));
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h,
+          w);
 
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
-    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-
-  
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
 
-    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
-	  output->data_type, output->data_format,
-	  output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W "
+          "= %d, C = %d \n",
+          output->data_type, output->data_format, output->dims.dim_sizes[0],
+          output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+          output->dims.dim_sizes[3]);
 
-    if(convDesc == NULL || input->tensor_desc == NULL ||
-       filter->filter_desc == NULL || output->tensor_desc == NULL)
+    if (convDesc == NULL || input->tensor_desc == NULL ||
+        filter->filter_desc == NULL || output->tensor_desc == NULL)
       ERROR("NULL descriptor! \n");
 
-
     // NOTE: The following algo works with TRUE half precision
     convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+    // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
-  
     size_t workspace_size;
-    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						       input->tensor_half_desc,
-						       filter->filter_half_desc,
-						       convDesc,
-						       output->tensor_half_desc,
-						       convAlgo,
-						       &workspace_size));
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+        cudnnHandle, input->tensor_half_desc, filter->filter_half_desc,
+        convDesc, output->tensor_half_desc, convAlgo, &workspace_size));
 
     // Allocating memory for the convolution workspace
     DEBUG("workspace size = %d \n", workspace_size);
-    void* workspace;
+    void *workspace;
     checkCudaErrors(cudaMalloc(&workspace, workspace_size));
 
-
-
-
-    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
-				       &alpha,
-				       input->tensor_half_desc,
-				       input->gpu_half_data,
-				       filter->filter_half_desc,
-				       filter->gpu_half_data,
-				       convDesc, convAlgo, workspace, workspace_size,
-				       &beta,
-				       output->tensor_half_desc,
-				       output->gpu_half_data));
-
+    checkCUDNN(cudnnConvolutionForward(
+        cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data,
+        filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo,
+        workspace, workspace_size, &beta, output->tensor_half_desc,
+        output->gpu_half_data));
   }
-  
+
   profileEvent("H2F_start");
 
   convertToFP32_offline(output);
-  
-  profileEvent("H2F_end");
 
+  profileEvent("H2F_end");
 
   profileEvent("#Conv_end");
 
-  
   return output;
-
 }
 
-
-  
-
-}// End of Extern C
-
+} // End of Extern C
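
For orientation: each thread of the depthwise kernels above computes one (channel, row, column) output position for up to eight images of the batch, which is why `tensorConvCutlass()` launches with `grid((n + 7) / 8, (c * h * w + blockSize - 1) / blockSize)`. The host-side sketch below reproduces that output-size and launch-geometry arithmetic; the concrete dimensions are hypothetical, and only the formulas mirror the code above.

```cuda
#include <cstdio>

int main() {
  // Hypothetical NCHW input and depthwise filter shape.
  const int n = 32, c = 32, in_h = 28, in_w = 28; // batch, channels, height, width
  const int kh = 3, kw = 3;                       // filter height/width
  const int v_pad = 1, h_pad = 1;                 // vertical/horizontal padding
  const int v_stride = 1, h_stride = 1;           // vertical/horizontal strides

  // Output spatial size, as computed in tensorConvCutlass().
  const int h = (2 * v_pad + in_h - kh) / v_stride + 1;
  const int w = (2 * h_pad + in_w - kw) / h_stride + 1;

  // Launch geometry: grid.x covers the batch in groups of 8 images; grid.y
  // spreads the c * h * w output positions across threads, one position per
  // thread, each thread handling up to 8 images.
  const int blockSize = 64; // the FP16 path uses 128
  const int grid_x = (n + 7) / 8;
  const int grid_y = (c * h * w + blockSize - 1) / blockSize;

  printf("output %dx%dx%dx%d, grid (%d, %d), block %d\n", n, c, h, w, grid_x,
         grid_y, blockSize);
  return 0;
}
```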
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
index e706080051a41dac1f7486027fcb9225793921bf..8324b18e044b37ee697a624e60ec77eb4bc7a8d5 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
@@ -1,9 +1,11 @@
-//===--------------------------- half_precision_api.cu --------------------------===//
+//===--------------------------- half_precision_api.cu --------------------===//
+//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the custom implementation of tensor precision changing
-// kernels useful for approximated and non-approximated versions of tensor 
+//
+//  This file consists of the custom implementation of tensor
+//  precision-changing kernels useful for approximated and
+// non-approximated versions of tensor
 // operations. This file also contains API for tensor operations operating on
 // tensors with half-precision.
 //
@@ -12,7 +14,6 @@
 #ifndef HALF_API_HEADER
 #define HALF_API_HEADER
 
-
 #include <stdio.h>
 #include <stdarg.h>
 #include <cstdio>
@@ -37,7 +38,6 @@
 #include <cuda_fp16.h>
 #include <driver_types.h>
 
-
 // Tensor runtime header files
 #include "../include/tensor_runtime.h"
 #include "../include/tensor_utils.h"
@@ -48,15 +48,13 @@
 #include "../include/fp16_gemm.h"
 #include "../include/fp16_conversion.h"
 
-
-
-void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){
+void *tensorHalfGemm(void *lhs_ptr, void *rhs_ptr) {
 
   INFO("*** TensorHalfGemm \n");
   profileEvent("#Mul");
 
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
 
   DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims);
   DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims);
@@ -64,65 +62,60 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){
   hostToDeviceCopy(lhs);
   hostToDeviceCopy(rhs);
 
-  
   profileEvent("F2H_start");
 
   convertToFP16(lhs);
   convertToFP16(rhs);
-  
-  profileEvent("F2H_end");
 
+  profileEvent("F2H_end");
 
   // 'm' holds the batch dimension - assuming NCHW format Tensors
   int m = lhs->dims.dim_sizes[0];
   // The rhs last dimension must contain the neurons
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
   int k = 1;
 
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
     k = k * lhs->dims.dim_sizes[j]; // input neurons
   }
 
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
   // Dimension-note: Check if k is same across the two tensors
   DEBUG("m = %d, n = %d, k = %d \n", m, n, k);
-  if(rhs_k != k){
+  if (rhs_k != k) {
     ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
   }
 
-  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines
-  Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW,
-					    m, n, 1, 1);
+  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN
+  // routines
+  Tensor *output =
+      (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, m, n, 1, 1);
 
   changeTensorPlacement(output, DEVICE);
 
-  //convertToFP16(output);
-
+  // convertToFP16(output);
 
   // INFO: cuBlas uses column-major format
   // INFO: The leading dimension is just the FIRST Dimension
-  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects
+  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN
+  // expects
   const __half alf = approx_float_to_half(1.0);
   const __half bet = approx_float_to_half(0.0);
   const __half *alpha_half = &alf;
   const __half *beta_half = &bet;
 
-
-  checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-			       n, m, k,
-			       alpha_half,
-			       (__half*) rhs->gpu_half_data, CUDA_R_16F, n,
-			       (__half*) lhs->gpu_half_data, CUDA_R_16F, k,
-			       beta_half,
-			       (__half*) output->gpu_half_data, CUDA_R_16F, n,
-			       CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
+  checkCudaErrors(cublasGemmEx(
+      cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, alpha_half,
+      (__half *)rhs->gpu_half_data, CUDA_R_16F, n, (__half *)lhs->gpu_half_data,
+      CUDA_R_16F, k, beta_half, (__half *)output->gpu_half_data, CUDA_R_16F, n,
+      CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(output);
 
-  //h2f((half*) output_half->gpu_data, output->num_elems, (float*) output->gpu_data);
+  // h2f((half*) output_half->gpu_data, output->num_elems, (float*)
+  // output->gpu_data);
 
   profileEvent("H2F_end");
 
@@ -131,32 +124,28 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){
   return output;
 }
 
-
-
-void* tensorHalfGemmGPU(void* lhs_ptr, void* rhs_ptr){
+void *tensorHalfGemmGPU(void *lhs_ptr, void *rhs_ptr) {
   return tensorHalfGemm(lhs_ptr, rhs_ptr);
 }
 
-
-
 // FIXIT: Generalize all of the routines for types {half, float, double}
-void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
-			    int vertical_pad, int horizontal_pad,
-			    int vertical_stride, int horizontal_stride,
-			    int conv_mode, int conv_groups){
+void *tensorHalfConvolution(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups) {
 
   INFO("*** TensorHConvolution \n");
   profileEvent("#Conv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  if(conv_mode == 0)
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   // FIXIT: Need to be more aware of the implications of alpha and beta
@@ -168,7 +157,6 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-
   /***** CONVERSIONS from FP32 to FP16 - on the GPU */
   profileEvent("F2H_start");
 
@@ -178,95 +166,76 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
   profileEvent("F2H_end");
   /******* END OF INPUT DATA CONVERSIONS*/
 
-  
-
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
-  
+
   // NOTE: Adding support for grouped convolution
   checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-  
   // FIXIT: Think if upscaling values need to be configurable?
   // IMP-FIXIT:  CUDNN Cross correlation is only used in the Lenet context
-  // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     vertical_stride, horizontal_stride, // conv strides
-					     1, 1, // upscaling values
-					     mode, // mode is configurable
-					     computeType)); // defines compute precision
+  // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE
+  // should be used?
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      vertical_stride, horizontal_stride,     // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-  
-  DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
+  DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, // input->data_type,
-					    CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *output =
+      (Tensor *)create4DTensor((cudnnDataType_t)half_type, // input->data_type,
+                               CUDNN_TENSOR_NCHW, n, c, h, w);
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
 
-  //convertToFP16(output);
+  // convertToFP16(output);
 
-  
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
-	output->data_type, output->data_format,
-	output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = "
+        "%d, C = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_half_desc == NULL ||
-     filter->filter_half_desc == NULL || output->tensor_half_desc == NULL)
+  if (convDesc == NULL || input->tensor_half_desc == NULL ||
+      filter->filter_half_desc == NULL || output->tensor_half_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
   // NOTE: The following algo works with TRUE half precision
 
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
-  //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+  // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
-  
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_half_desc,
-						     filter->filter_half_desc,
-						     convDesc,
-						     output->tensor_half_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_half_desc, filter->filter_half_desc, convDesc,
+      output->tensor_half_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
   DEBUG("workspace size = %d \n", workspace_size);
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
 
-
-
-
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle,
-				     &alpha,
-				     input->tensor_half_desc,
-				     input->gpu_half_data,
-				     filter->filter_half_desc,
-				     filter->gpu_half_data,
-				     convDesc, convAlgo,
-				     workspace, workspace_size,
-				     &beta,
-				     output->tensor_half_desc,
-				     output->gpu_half_data));
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data,
+      filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo,
+      workspace, workspace_size, &beta, output->tensor_half_desc,
+      output->gpu_half_data));
 
   profileEvent("H2F_start");
 
@@ -279,21 +248,18 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
   return output;
 }
 
-
-
-
-void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-           		  void* mean_ptr, void* variance_ptr, double epsilon){
+void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                          void *mean_ptr, void *variance_ptr, double epsilon) {
 
   INFO("*** TensorHalfBatchNorm \n");
   profileEvent("#BatchNorm");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* gamma = (Tensor*) gamma_ptr;
-  Tensor* beta = (Tensor*) beta_ptr;
-  Tensor* mean = (Tensor*) mean_ptr;
-  Tensor* variance = (Tensor*) variance_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *gamma = (Tensor *)gamma_ptr;
+  Tensor *beta = (Tensor *)beta_ptr;
+  Tensor *mean = (Tensor *)mean_ptr;
+  Tensor *variance = (Tensor *)variance_ptr;
+
   float alpha_val = 1.0f, beta_val = 0.0f;
   hostToDeviceCopy(input);
   hostToDeviceCopy(gamma);
@@ -301,56 +267,37 @@ void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
   hostToDeviceCopy(mean);
   hostToDeviceCopy(variance);
 
-  
   profileEvent("F2H_start");
 
   convertToFP16(input);
 
   profileEvent("F2H_end");
-  
-
-
-  checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL,
-						     &alpha_val, &beta_val,
-						     input->tensor_half_desc,
-						     input->gpu_half_data,
-						     input->tensor_half_desc,
-						     input->gpu_half_data,
-						     gamma->tensor_desc, gamma->gpu_data,
-						     beta->gpu_data, mean->gpu_data,
-						     variance->gpu_data, epsilon));
-
 
+  checkCUDNN(cudnnBatchNormalizationForwardInference(
+      cudnnHandle, CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val,
+      input->tensor_half_desc, input->gpu_half_data, input->tensor_half_desc,
+      input->gpu_half_data, gamma->tensor_desc, gamma->gpu_data, beta->gpu_data,
+      mean->gpu_data, variance->gpu_data, epsilon));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(input);
-  
-  profileEvent("H2F_end");
 
+  profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfBatchNorm_end", true);
 
-
   return input;
 }
 
-
-
-
-void* tensorHalfPooling(void* input_ptr,
-			int poolFunction,
-			int window_height, int window_width,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride){
-
-  
+void *tensorHalfPooling(void *input_ptr, int poolFunction, int window_height,
+                        int window_width, int vertical_pad, int horizontal_pad,
+                        int vertical_stride, int horizontal_stride) {
 
   INFO("*** TensorHalfPooling \n");
   profileEvent("#Pool");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   hostToDeviceCopy(input);
 
@@ -366,218 +313,185 @@ void* tensorHalfPooling(void* input_ptr,
   // FIXIT: Need to be more aware of the implications of alpha and beta
   float alpha = 1.0f, beta = 0.0f;
 
-
   checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc));
 
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
-  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride;
+  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) /
+          vertical_stride;
   h = h + 1;
-  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride;
+  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) /
+          horizontal_stride;
   w = w + 1;
 
   DEBUG("n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
   // FIXIT: Don't be specific to floats
-  Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *output =
+      (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
   // Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
 
-  //convertToFP16(output);
+  // convertToFP16(output);
 
   // FIXIT: Fix being specific to CUDNN_DATA_FLOAT and NCHW format
   // FIXIT: Is this setTensor even needed?
   checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_half_desc,
-					CUDNN_TENSOR_NCHW,
-					CUDNN_DATA_HALF,
-					n, c,
-					h, w));
+                                        CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, n,
+                                        c, h, w));
 
   cudnnPoolingMode_t pool_mode;
-  if(poolFunction == 0)
+  if (poolFunction == 0)
     pool_mode = CUDNN_POOLING_MAX;
-  else if(poolFunction == 1)
+  else if (poolFunction == 1)
     pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
 
-
   // FIXIT: Make the pool function (max, min, avg) configurable
-  checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
-					 pool_mode,
-					 CUDNN_PROPAGATE_NAN,
-					 window_height, window_width,
-					 vertical_pad, horizontal_pad,
-					 vertical_stride, horizontal_stride));
-
-  
-  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha,
-				 input->tensor_half_desc,
-				 input->gpu_half_data, &beta,
-				 output->tensor_half_desc, output->gpu_half_data));
-
+  checkCUDNN(cudnnSetPooling2dDescriptor(
+      poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width,
+      vertical_pad, horizontal_pad, vertical_stride, horizontal_stride));
 
+  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha,
+                                 input->tensor_half_desc, input->gpu_half_data,
+                                 &beta, output->tensor_half_desc,
+                                 output->gpu_half_data));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(output);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfPooling_end", true);
 
   return output;
 }
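
For readers checking the dimension arithmetic in `tensorHalfPooling` above: the output height and width follow the standard pooling relation `out = (in + 2*pad - window) / stride + 1`, which is exactly what the `h`/`w` computation does. A minimal, standalone sketch of that arithmetic with hypothetical example values (nothing here is taken from the runtime):

```cpp
#include <cstdio>

int main() {
  // Hypothetical example values, chosen only to illustrate the formula.
  int in_h = 32, in_w = 32;          // input spatial dims
  int window_h = 2, window_w = 2;    // pooling window
  int pad_v = 0, pad_h = 0;          // vertical / horizontal padding
  int stride_v = 2, stride_h = 2;    // vertical / horizontal stride

  // Same computation as in tensorHalfPooling:
  //   out = (in + 2*pad - window) / stride + 1
  int out_h = (in_h + 2 * pad_v - window_h) / stride_v + 1;  // 16
  int out_w = (in_w + 2 * pad_h - window_w) / stride_h + 1;  // 16
  printf("pooled output: %d x %d\n", out_h, out_w);
  return 0;
}
```
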
 
-
-
-
-
-void* tensorHalfRelu2(void* input_ptr, float min, float max){
+void *tensorHalfRelu2(void *input_ptr, float min, float max) {
 
   INFO("*** TensorClippedRelu \n");
   profileEvent("#Relu");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
   hostToDeviceCopy(input);
 
-
   //**** Floating point to half conversions
   profileEvent("F2H_start");
 
   convertToFP16(input);
-  
+
   profileEvent("F2H_end");
   /*** End of data type conversion **/
 
-
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
-  checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU,
-					  CUDNN_PROPAGATE_NAN, 2.0));
-
-  checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_half_desc, input->gpu_half_data, &beta,
-				    input->tensor_half_desc, input->gpu_half_data));
+  checkCUDNN(cudnnSetActivationDescriptor(
+      reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, 2.0));
 
+  checkCUDNN(cudnnActivationForward(
+      cudnnHandle, reluDesc, &alpha, input->tensor_half_desc,
+      input->gpu_half_data, &beta, input->tensor_half_desc,
+      input->gpu_half_data));
 
   profileEvent("H2F_start");
   // NOTE: Transforming half precision output to single precision
 
   convertToFP32_offline(input);
-  
+
   profileEvent("H2F_end");
 
   profileEvent("#tensorHalfClippedRelu_end");
 
-
   return input;
 }
 
-
-
-
-void* tensorHalfRelu(void* input_ptr){
+void *tensorHalfRelu(void *input_ptr) {
 
   INFO("*** TensorHalfRelu \n");
   profileEvent("#Relu");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
   hostToDeviceCopy(input);
 
-
   //**** Floating point to half conversions
   profileEvent("F2H_start");
 
   convertToFP16(input);
-	    
+
   profileEvent("F2H_end");
   /*** End of data type conversion **/
 
-
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
-  checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_half_desc, input->gpu_half_data, &beta,
-				    input->tensor_half_desc, input->gpu_half_data));
+  checkCUDNN(cudnnActivationForward(
+      cudnnHandle, reluDesc, &alpha, input->tensor_half_desc,
+      input->gpu_half_data, &beta, input->tensor_half_desc,
+      input->gpu_half_data));
 
- 
   profileEvent("H2F_start");
 
   convertToFP32_offline(input);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfRelu_end");
 
-  
   return input;
 }
 
-
-
-
-
-
-void* tensorHalfTanh(void* input_ptr){
+void *tensorHalfTanh(void *input_ptr) {
 
   INFO("*** TensorHalfTanh \n");
   profileEvent("#Tanh");
 
-
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnActivationDescriptor_t tanhDesc;
   float alpha = 1.0f, beta = 0.0f;
   hostToDeviceCopy(input);
 
-
   //**** Data conversion from float to half
   profileEvent("F2H_start");
 
   convertToFP16(input);
-  
+
   profileEvent("F2H_end");
   /**** End of data type conversion ****/
 
-
   checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
-  checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha,
-				    input->tensor_half_desc, input->gpu_half_data, &beta,
-				    input->tensor_half_desc, input->gpu_half_data));
+  checkCUDNN(cudnnActivationForward(
+      cudnnHandle, tanhDesc, &alpha, input->tensor_half_desc,
+      input->gpu_half_data, &beta, input->tensor_half_desc,
+      input->gpu_half_data));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(input);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfTanh_end");
 
-
   return input;
 }
 
+void *tensorHalfAdd(void *x_ptr, void *bias_ptr) {
 
-
-void* tensorHalfAdd(void* x_ptr, void* bias_ptr){
-
-  Tensor* x = (Tensor*) x_ptr;
-  Tensor* bias = (Tensor*) bias_ptr;
+  Tensor *x = (Tensor *)x_ptr;
+  Tensor *bias = (Tensor *)bias_ptr;
 
   INFO("*** TensorHalfAdd \n");
   profileEvent("#Add");
@@ -587,36 +501,29 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){
   hostToDeviceCopy(x);
   hostToDeviceCopy(bias);
 
-
   //**** Data conversion from float to half
   profileEvent("F2H_start");
 
   convertToFP16(x);
   convertToFP16(bias);
-  
+
   profileEvent("F2H_end");
   /*** End of data type conversions ****/
 
-
   // FIXIT: routine fails for 3D tensors
   checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_half_desc,
-			    bias->gpu_half_data, &alpha,
-			    x->tensor_half_desc, x->gpu_half_data));
-
+                            bias->gpu_half_data, &alpha, x->tensor_half_desc,
+                            x->gpu_half_data));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(x);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfAdd_end");
 
-
   return x;
 }
 
-
-
 #endif
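
Taken together, the `tensorHalf*` routines in this file all share the same shape: copy the tensor to the device, convert FP32 data to FP16 between the `F2H_start`/`F2H_end` profile events, run the cuDNN primitive on the half descriptors, then convert back to FP32 between `H2F_start`/`H2F_end` before returning. The standalone sketch below only mirrors that control flow; the types and helpers are stubs, not the runtime's:

```cpp
#include <cstdio>
#include <functional>

struct FakeTensor { float value; };  // stand-in for the runtime's Tensor

// Stubs standing in for convertToFP16 / convertToFP32_offline.
static void toHalf(FakeTensor &) { puts("F2H: convert FP32 -> FP16"); }
static void toFloat(FakeTensor &) { puts("H2F: convert FP16 -> FP32"); }

// The common wrapper pattern: convert down, run the op, convert back up.
static FakeTensor *runHalfOp(FakeTensor *t,
                             const std::function<void(FakeTensor &)> &op) {
  toHalf(*t);   // in the runtime: hostToDeviceCopy + convertToFP16
  op(*t);       // in the runtime: a cuDNN call on the half descriptors
  toFloat(*t);  // in the runtime: convertToFP32_offline
  return t;
}

int main() {
  FakeTensor t{1.0f};
  runHalfOp(&t, [](FakeTensor &x) { x.value *= 2.0f; });  // pretend op
  printf("result = %.1f\n", t.value);
  return 0;
}
```
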
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 02d1747328b65f30a20b2db2eecdb0f06f711bdf..5e1fbc99197af7797620f80ffbbc5aa41ee63517 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -1,48 +1,45 @@
-//===--------------------------- hpvm-rt-controller.cpp ---------------------===//
+//===---------------------- hpvm-rt-controller.cpp -----------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file contains code for that allows the tensor runtime to adapt 
+//
+//  This file contains code that allows the tensor runtime to adapt
 // in response to external changes in conditions (such as frequency changes)
 // by helping to choose correct approximation configurations. It also provides
 // routines for the rest of the runtime to get performance and energy profiling.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "hpvm-rt-controller.h"
-#include "img_tensor_utils.h"
 #include "global_data.h"
 #include <fstream>
 
 //-------- Functionality to read and update frequency on Jetson board -------//
 /*const char* available_freqs[] = {"140250000", "229500000", "318750000",
-                                 "408000000", "497250000", "586500000", 
+                                 "408000000", "497250000", "586500000",
                                  "675750000", "765000000", "854250000",
                                  "943500000", "1032750000", "1122000000",
                                  "1211250000", "1300500000"};
 
 */
 
-
 const int available_freqs[] = {
-140250000, // 0
-229500000, // 1
-318750000, // 2
-408000000, // 3
-497250000, // 4
-586500000, // 5
-675750000, // 6
-765000000, // 7
-854250000, // 8
-943500000, // 9
-1032750000,// 10
-1122000000,// 11
-1211250000,// 12
-1300500000 // 13
+    140250000,  // 0
+    229500000,  // 1
+    318750000,  // 2
+    408000000,  // 3
+    497250000,  // 4
+    586500000,  // 5
+    675750000,  // 6
+    765000000,  // 7
+    854250000,  // 8
+    943500000,  // 9
+    1032750000, // 10
+    1122000000, // 11
+    1211250000, // 12
+    1300500000  // 13
 };
 
-
 /*void updateJetsonGPUFreq(int freq_level) {
 
   if (freq_level < 0 || freq_level > 13) {
@@ -50,7 +47,7 @@ const int available_freqs[] = {
     abort();
   }
 
-  const char* freq_val = available_freqs[freq_level]; 
+  const char* freq_val = available_freqs[freq_level];
   printf("freq-val[0] = %s \n", freq_val);
 
   FILE* max_file =
@@ -60,7 +57,7 @@ const int available_freqs[] = {
   }
   fwrite(freq_val, strlen(freq_val), 1, max_file);
   fclose(max_file);
-  
+
   FILE* min_file =
     fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+");
   if (min_file == NULL){
@@ -81,7 +78,7 @@ unsigned long int readJetsonGPUFreq() {
 
   char buf[50];
   char* ptr;
-  
+
   fread(buf, 50, 1, cur_freq_file);
   unsigned long cur_freq = strtoul(buf, &ptr, 10);
   fclose(cur_freq_file);
@@ -90,14 +87,15 @@ unsigned long int readJetsonGPUFreq() {
 
 */
 
-
 // Sets frequency
 void setFreq(unsigned freq_index) {
 
   unsigned target_freq = available_freqs[freq_index];
-  
-  const char * const min_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
-  const char * const max_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
+
+  const char *const min_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
+  const char *const max_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
 
   std::ofstream min_stream;
   std::ofstream max_stream;
@@ -116,7 +114,8 @@ void setFreq(unsigned freq_index) {
 unsigned recordFreq() {
 
   // Current frequency file
-  const char * const cur_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
+  const char *const cur_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
   std::ifstream cur_stream;
   cur_stream.open(cur_freq_file, std::ifstream::in);
 
@@ -129,10 +128,6 @@ unsigned recordFreq() {
   return cur_freq;
 }
 
-
-
-
-
 //---------------------------------------------------------------------------//
 
 /*
@@ -146,13 +141,13 @@ bool fileExists(const std::string &file) {
 
 // There will be no frequency request for the first batch
 // Therefore, we skip the first element by initializing to 1, not 0.
-FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) :
-  idx_list(il), rep_factor(rf), count(1), idx(0) {}
+FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf)
+    : idx_list(il), rep_factor(rf), count(1), idx(0) {}
 
 unsigned FrequencyIndexList::getNextIndex() {
   if (count == rep_factor) {
     count = 0;
-    idx = (idx+1) % idx_list.size();
+    idx = (idx + 1) % idx_list.size();
   }
   count++;
   return idx_list[idx];
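
To make the cycling above concrete: each entry of `idx_list` is handed out `rep_factor` times before advancing, except the first, which is handed out one time fewer because `count` starts at 1 (matching the comment about skipping the first batch). A self-contained copy of the same logic, with a short driver, purely for illustration:

```cpp
#include <cstdio>
#include <vector>

// Same logic as FrequencyIndexList above, reproduced only for illustration.
struct FreqIdxList {
  std::vector<int> idx_list;
  unsigned rep_factor, count, idx;
  FreqIdxList(std::vector<int> il, unsigned rf)
      : idx_list(il), rep_factor(rf), count(1), idx(0) {}
  unsigned getNextIndex() {
    if (count == rep_factor) {
      count = 0;
      idx = (idx + 1) % idx_list.size();
    }
    count++;
    return idx_list[idx];
  }
};

int main() {
  FreqIdxList fil({13, 12, 11}, /*rep_factor=*/2);
  for (int i = 0; i < 6; i++)
    printf("%u ", fil.getNextIndex());  // prints: 13 12 12 11 11 13
  printf("\n");
  return 0;
}
```
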
@@ -219,7 +214,7 @@ void ProfileInfo::readIterationFrequency() {
   frequency_current_iteration = recordFreq();
 #else
   frequency_current_iteration = 0;
-#endif //JETSON_EXECUTION
+#endif // JETSON_EXECUTION
 }
 
 unsigned long ProfileInfo::getIterationFrequency() {
@@ -288,15 +283,14 @@ void ProfileInfo::printToFile() {
   // to have equal sizes, in outer and inner vectors both,
   // and all time_info and energy_info vectors must have the same size.
   unsigned iterations = tensor_time_info.size();
-  CUSTOM_ASSERT(
-      (tensor_time_info.size() == iterations) &&
-      (tensor_energy_info.size() == iterations) &&
-      (control_time_info.size() == iterations) &&
-      (control_energy_info.size() == iterations) &&
-      (config_time_info.size() == iterations) &&
-      (config_energy_info.size() == iterations) &&
-      (frequency_info.size() == iterations) &&
-      "time_info, energy_info, frequency_info size: \
+  CUSTOM_ASSERT((tensor_time_info.size() == iterations) &&
+                (tensor_energy_info.size() == iterations) &&
+                (control_time_info.size() == iterations) &&
+                (control_energy_info.size() == iterations) &&
+                (config_time_info.size() == iterations) &&
+                (config_energy_info.size() == iterations) &&
+                (frequency_info.size() == iterations) &&
+                "time_info, energy_info, frequency_info size: \
                    iteration number does not match.");
 
   for (unsigned i = 0; i < tensor_time_info.size(); i++) {
@@ -346,8 +340,8 @@ ProfileInfo::ProfileInfo()
       time_control_current_iteration(0.0), time_config_current_iteration(0.0),
       energy_compute_current_iteration(0.0),
       energy_control_current_iteration(0.0),
-      energy_config_current_iteration(0.0),
-      frequency_current_iteration(0), in_iteration(false) {}
+      energy_config_current_iteration(0.0), frequency_current_iteration(0),
+      in_iteration(false) {}
 
 Slowdowns::Slowdowns() {
   idx = 0;
@@ -389,52 +383,50 @@ void RuntimeController::stop_profiler() {
     profiler->stop_profiler();
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getSpeedupConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getSpeedupConfigurations() {
   return SpeedupConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getEnergyConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getEnergyConfigurations() {
   return EnergyConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getThreeDCurveConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getThreeDCurveConfigurations() {
   return ThreeDCurveConfigurations;
 }
 // For testing purposes only - do not use widely
 unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; }
 
 double RuntimeController::getCurrentConfigurationSpeedup() {
-  return (double) (*Configurations)[configurationIdx]->speedup;
+  return (double)(*Configurations)[configurationIdx]->speedup;
 }
 
 double RuntimeController::getCurrentConfigurationEnergy() {
-  return (double) (*Configurations)[configurationIdx]->energy;
+  return (double)(*Configurations)[configurationIdx]->energy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracy() {
-  return (double) (*Configurations)[configurationIdx]->accuracy;
+  return (double)(*Configurations)[configurationIdx]->accuracy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracyLoss() {
-  return (double) (*Configurations)[configurationIdx]->accuracyLoss;
+  return (double)(*Configurations)[configurationIdx]->accuracyLoss;
 }
 
 NodeConfiguration *RuntimeController::getNodeConfiguration(const char *data) {
 
   // if visc.node.id Not specified for this HPVM Node
-  if (currentTensorID == -1){
+  if (currentTensorID == ~0U) {
     std::string s(data);
     // All nodes are expected to have a configuration
     return (*Configurations)[configurationIdx]->setup.at(s);
-  }
-  else{
-    DEBUG("-- currentTensorID = \%u \n", currentTensorID); 
+  } else {
+    DEBUG("-- currentTensorID = %u \n", currentTensorID);
     return (*Configurations)[configurationIdx]->idConfigMap.at(currentTensorID);
   }
-  
 }
 
 void RuntimeController::init(const char *Cstr) {
@@ -443,7 +435,8 @@ void RuntimeController::init(const char *Cstr) {
   setProfileInfoFilename(Cstr);
   readConfigurationFile(Cstr);
 
-  // NOTE: Configurations is pareto-configs. InitialConfigurations is the full list (config file)
+  // NOTE: Configurations is pareto-configs. InitialConfigurations is the full
+  // list (config file)
   Configurations = NULL;
   computeParetoConfigurationPoints();
   //    compute3DParetoConfigurationPoints(); Not using 3D curve
@@ -464,8 +457,10 @@ void RuntimeController::init(const char *Cstr) {
   // Pseudo random variable (when we did few experiments)
   // or true random numbers for probabilistic control
   pseudo_rd = 0.0;
-  std::random_device rd;  //Will be used to obtain a seed for the random number engine
-  generator = std::mt19937 (rd()); //Standard mersenne_twister_engine seeded with rd()
+  std::random_device
+      rd; // Will be used to obtain a seed for the random number engine
+  generator =
+      std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd()
   distr = std::uniform_real_distribution<>(0.0, 1.0);
 
   g_freq = available_freqs[13];
@@ -487,8 +482,8 @@ void RuntimeController::end_iteration() {
     PI->end_iteration();
 }
 
-void RuntimeController::addToCurrentIterationComputeTime(
-    const char *s, double t) {
+void RuntimeController::addToCurrentIterationComputeTime(const char *s,
+                                                         double t) {
   if (PI)
     PI->addToCurrentIterationComputeTime(s, t);
 }
@@ -503,8 +498,8 @@ void RuntimeController::addToCurrentIterationConfigTime(double t) {
     PI->addToCurrentIterationConfigTime(t);
 }
 
-void RuntimeController::addToCurrentIterationComputeEnergy(
-    const char *s, double e) {
+void RuntimeController::addToCurrentIterationComputeEnergy(const char *s,
+                                                           double e) {
   if (PI)
     PI->addToCurrentIterationComputeEnergy(s, e);
 }
@@ -542,8 +537,8 @@ void RuntimeController::updateFrequency() {
   //--- updateJetsonGPUFreq(freq_idx);
 
   setFreq(freq_idx);
-  
-#endif //JETSON_EXECUTION
+
+#endif // JETSON_EXECUTION
 }
 
 void RuntimeController::writeProfileInfo() {
@@ -576,11 +571,9 @@ std::pair<double, double> RuntimeController::fc_profile(
     const unsigned num_rows_a, const unsigned num_cols_a,
     const unsigned num_rows_b, const unsigned num_cols_b,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (
-      promise ? promise->fc_profile(
-                    num_rows_a, num_cols_a, num_rows_b, num_cols_b,
-                    voltage_swing, patch_factor)
-              : std::make_pair(0.0, 0.0));
+  return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b,
+                                        num_cols_b, voltage_swing, patch_factor)
+                  : std::make_pair(0.0, 0.0));
 }
 
 std::pair<double, double> RuntimeController::conv_profile(
@@ -588,17 +581,16 @@ std::pair<double, double> RuntimeController::conv_profile(
     const unsigned c_out, const unsigned c_in, const unsigned k_h,
     const unsigned k_w, const unsigned s_h, const unsigned s_w,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (
-      promise ? promise->conv_profile(
-                    n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing,
-                    patch_factor)
-              : std::make_pair(0.0, 0.0));
+  return (promise ? promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w,
+                                          s_h, s_w, voltage_swing, patch_factor)
+                  : std::make_pair(0.0, 0.0));
 }
 
 // Constructor and destructor
 RuntimeController::RuntimeController() {
   configurationIdx = 0;
-  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10);
+  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+                               10);
 #ifdef ACTIVE_PROFILING
   PI = new ProfileInfo();
   profiler = new Profiler();
@@ -674,7 +666,6 @@ void RuntimeController::readConfigurationFile(const char *str) {
     abort();
   }
 
-  bool readingConfiguration = false;
   bool readingFirstLine = false;
 
   // Read baseline_time from first line of configuration file
@@ -682,16 +673,14 @@ void RuntimeController::readConfigurationFile(const char *str) {
   std::getline(qin, first_line);
   DEBUG("first_line: %s\n", first_line.c_str());
 
-  try{
+  try {
     baseline_time = std::stod(first_line);
     DEBUG("Baseline time: %lf\n\n", baseline_time);
-  }
-  catch(...){
+  } catch (...) {
     ERROR("Please Add/Fix Baseline Time at Top of Config File.. ");
   }
 
-  
-  unsigned int firstTensorID = 1;  
+  unsigned int firstTensorID = 1;
   for (std::string line; std::getline(qin, line);) {
     DEBUG("line: %s\n", line.c_str());
 
@@ -709,13 +698,11 @@ void RuntimeController::readConfigurationFile(const char *str) {
 
     if (tokens[0] == "+++++") { // Found new configuration start token
       // Mark the start of a new configuration
-      readingConfiguration = true;
       readingFirstLine = true;
       continue;
     }
 
     if (tokens[0] == "-----") { // Found configuration end token
-      readingConfiguration = false;
       // Mark the end of current configuration
       continue;
     }
@@ -724,10 +711,10 @@ void RuntimeController::readConfigurationFile(const char *str) {
       // Read first line, to create the new configuration struct
       readingFirstLine = false;
       firstTensorID = 1; // reset first tensor ID for new config
-      
-      InitialConfigurations.push_back(Configuration(
-          tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
-          std::stof(tokens[3]), std::stof(tokens[4])));
+
+      InitialConfigurations.push_back(
+          Configuration(tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
+                        std::stof(tokens[3]), std::stof(tokens[4])));
       continue;
     }
 
@@ -735,9 +722,8 @@ void RuntimeController::readConfigurationFile(const char *str) {
       DEBUG("Found gpu configuration\n");
 
       // There must be at least one operation, with an approximation option
-      CUSTOM_ASSERT(
-          (tokens.size() >= 5) &&
-          "Not enough operations - approximation options.");
+      CUSTOM_ASSERT((tokens.size() >= 5) &&
+                    "Not enough operations - approximation options.");
 
       GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -748,7 +734,7 @@ void RuntimeController::readConfigurationFile(const char *str) {
       InitialConfigurations.back().idConfigMap.insert(
           std::make_pair(firstTensorID, NodeConf));
       DEBUG("*** firstTensorID = %d \n\n", firstTensorID);
-      
+
       unsigned idx = 2;
       while (idx < tokens.size()) {
         if (tokens[idx] == "add") {
@@ -897,14 +883,13 @@ void RuntimeController::readConfigurationFile(const char *str) {
 
       // Update first TensorID using number of tensor ops in current node
       firstTensorID += NodeConf->getApproxChoices().size();
-      
+
     } else if (tokens[1] == "cpu") {
       DEBUG("Found gpu configuration\n");
 
       // There must be at least one operation, with an approximation option
-      CUSTOM_ASSERT(
-          (tokens.size() >= 5) &&
-          "Not enough operations - approximation options.");
+      CUSTOM_ASSERT((tokens.size() >= 5) &&
+                    "Not enough operations - approximation options.");
 
       CPUNodeConfiguration *NodeConf = new CPUNodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -1020,9 +1005,8 @@ void RuntimeController::computeParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(
-      InitialConfigurations.begin() + 1, InitialConfigurations.end(),
-      ConfigurationLessThan());
+  std::sort(InitialConfigurations.begin() + 1, InitialConfigurations.end(),
+            ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) {
@@ -1056,14 +1040,12 @@ void RuntimeController::computeParetoConfigurationPoints() {
         en_idx = i;
       }
     }
-    DEBUG(
-        "accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
-        InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
+    DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
+          InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
     // Found best speedup for this accuracy point (not dominated by any of
     // these).
-    DEBUG(
-        "accuracy loss = %f, energy = %f, at en_idx = %d\n",
-        InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
+    DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n",
+          InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
     // Found best energy for this accuracy point (not dominated by any of
     // these).
 
@@ -1133,9 +1115,8 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(
-      InitialConfigurations.begin(), InitialConfigurations.end(),
-      ConfigurationLessThan());
+  std::sort(InitialConfigurations.begin(), InitialConfigurations.end(),
+            ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) {
@@ -1169,11 +1150,10 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
         }
       }
       if (!dominated) {
-        DEBUG(
-            "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
-            InitialConfigurations[i].accuracyLoss,
-            InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
-            i);
+        DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
+              InitialConfigurations[i].accuracyLoss,
+              InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
+              i);
         Indices.push_back(i);
       }
     }
@@ -1232,31 +1212,22 @@ void RuntimeController::printConfigurations(
   }
 }
 
-unsigned long RuntimeController::getLastFrequency() {
-  return g_freq;
-}
+unsigned long RuntimeController::getLastFrequency() { return g_freq; }
 
-void RuntimeController::setLastFrequency(unsigned long f) {
-  g_freq = f;
-}
+void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; }
 
-double RuntimeController::getLastSpeedup() {
-  return g_speedup;
-}
+double RuntimeController::getLastSpeedup() { return g_speedup; }
 
-void RuntimeController::setLastSpeedup(double s) {
-  g_speedup = s;
-}
+void RuntimeController::setLastSpeedup(double s) { g_speedup = s; }
 
 void RuntimeController::findNextConfiguration() {
   configurationIdx = (configurationIdx + 1) % Configurations->size();
-  DEBUG(
-      "findNextConfiguration: Updated configurationIdx to %u.\n",
-      configurationIdx);
+  DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n",
+        configurationIdx);
 }
 
-void RuntimeController::findTargetConfiguration(
-    float goal, enum SEARCH_KIND sk) {
+void RuntimeController::findTargetConfiguration(float goal,
+                                                enum SEARCH_KIND sk) {
   // We search in range begin(), end()-1 . It is OK to decrement end(), because
   // the configurations vector always points to one of the pareto curves, and
   // they are never empty - we have always pushed at least one configuration.
@@ -1267,25 +1238,25 @@ void RuntimeController::findTargetConfiguration(
   case SPEEDUP: {
     // Assigning one of Pareto configs to 'Configurations' class attribute
     Configurations = &SpeedupConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_SP());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_SP());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ENERGY: {
     Configurations = &EnergyConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_E());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_E());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ACCURACY_LOSS: {
     Configurations = &SpeedupConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_AL());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_AL());
     if ((*low_it)->accuracyLoss > goal)
       --low_it;
     configurationIdx = low_it - Configurations->begin();
@@ -1300,9 +1271,8 @@ void RuntimeController::findTargetConfiguration(
   // After search, low_it points to the Configuration element with the goal
   // value, or to the immediately lower value if an exact match does not exist
 
-  DEBUG(
-      "findTargetConfiguration: Updated configurationIdx to %u.\n",
-      configurationIdx);
+  DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n",
+        configurationIdx);
 }
 
 void RuntimeController::adjustTargetConfiguration(float goal) {
@@ -1313,8 +1283,8 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
   // Find configuration before the selected one.
   // There is always one, unless goal is 1. Then, we would pick baseline, and
   //  both upper and lower should be the same configuration, at index 0.
-  unsigned prev_conf_idx = configurationIdx > 0 ? configurationIdx - 1
-                                                : configurationIdx;
+  unsigned prev_conf_idx =
+      configurationIdx > 0 ? configurationIdx - 1 : configurationIdx;
   // Get the two configurations' speedup, and compute the appropriate ranges
   float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup;
   float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup;
@@ -1333,32 +1303,32 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
 
     //***--- Probability adjustment strategy 1 ---***//
     // No big adjustments at edges of probability range
-//    float adjust_val = 0.0;
-//    if (low_pb < high_pb) {
-//      adjust_val = low_pb * 0.2;
-//    } else {
-//      adjust_val = high_pb * 0.2;
-//    }
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    //    float adjust_val = 0.0;
+    //    if (low_pb < high_pb) {
+    //      adjust_val = low_pb * 0.2;
+    //    } else {
+    //      adjust_val = high_pb * 0.2;
+    //    }
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 2 ---***//
     // No big adjustment at high edge of probability range
-//    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    //    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 3 ---***//
-    //Similar to 2, but higher always increases, more significantly
-//    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    // Similar to 2, but higher always increases, more significantly
+    //    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 4 ---***//
-    //Similar to 2, but higher always increases, more significantly
+    // Similar to 2, but higher always increases, more significantly
     // Low end, high end a bit less aggressive than total range
     float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6;
     adjust_val = adjust_val > high_pb ? high_pb : adjust_val;
@@ -1367,20 +1337,18 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
     //***---                                   ---***//
   }
 
-  DEBUG(
-      "**---- adjustTargetConfiguration: upper conf = %s with probability: "
-      "%f.\n",
-      ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
-  DEBUG(
-      "**---- adjustTargetConfiguration: lower conf = %s with probability: "
-      "%f.\n\n",
-      ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
+  DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: "
+        "%f.\n",
+        ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
+  DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: "
+        "%f.\n\n",
+        ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
 
   // Select a random number from 0 to 1
   // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1)
   // to the upper
   // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ;
-  //float rd = pseudo_rd;
+  // float rd = pseudo_rd;
   float rd = distr(generator);
   if (rd < low_pb) {
     // If the probability is in the low range
@@ -1414,8 +1382,8 @@ extern "C" void llvm_hpvm_clearRuntimeController() {
 //*** Methods to compute accuracy of a tensor by the runtime controller   ***//
 uint32_t *labels_from_file = NULL;
 
-uint32_t *
-hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) {
+uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
+                                         int end) {
 
   // Initialize buffer
   if (!labels_from_file) {
@@ -1424,14 +1392,14 @@ hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) {
       ERROR("Data file %s is not found. Aborting...\n", labels_file);
       abort();
     }
-    
+
     // Get number of labels
     fseek(file, 0, SEEK_END);
     long size = ftell(file);
     fseek(file, 0, SEEK_SET); // return file pointer to beginning
 
     // Allocate memory for labels
-    labels_from_file = (uint32_t *) malloc(size);
+    labels_from_file = (uint32_t *)malloc(size);
     if (labels_from_file == NULL) {
       ERROR("Memory allocation for labels unsucessfull. Aborting...\n");
       abort();
@@ -1488,10 +1456,10 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
-  
-  average_accuracy = accuracy + (average_accuracy * num_executations); 
+
+  average_accuracy = accuracy + (average_accuracy * num_executations);
   num_executations++;
-  average_accuracy = average_accuracy/num_executations;
+  average_accuracy = average_accuracy / num_executations;
 
   FILE *fp = fopen("final_accuracy", "w+");
   if (fp != NULL) {
@@ -1508,13 +1476,12 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
   return accuracy;
 }
 
-
 #define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl
 //#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
 //#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl
 
-extern "C" void llvm_hpvm_invokeRtControl_BASE(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str,
+                                               int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1531,16 +1498,15 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ITERATE(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str,
+                                                  int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1564,16 +1530,15 @@ extern "C" void llvm_hpvm_invokeRtControl_ITERATE(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str,
+                                                 int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1616,17 +1581,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST(
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
+                                                    const char *str, int start,
+                                                    int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1670,17 +1635,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result,
+                                                   const char *str, int start,
+                                                   int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1707,21 +1672,20 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO(
-      "Swapping to next configuration: %s with speedup %f\n\n",
-      next_conf_name.c_str(), next_conf_speedup);
+  INFO("Swapping to next configuration: %s with speedup %f\n\n",
+       next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result,
+                                                      const char *str,
+                                                      int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1749,21 +1713,19 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO(
-      "Swapping to next configuration: %s with speedup %f\n\n",
-      next_conf_name.c_str(), next_conf_speedup);
+  INFO("Swapping to next configuration: %s with speedup %f\n\n",
+       next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_RAND(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str,
+                                               int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1781,9 +1743,8 @@ extern "C" void llvm_hpvm_invokeRtControl_RAND(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
@@ -1794,32 +1755,7 @@ static void writeVectorToFile(const char *path, const std::vector<T> &vec) {
   std::ofstream of(path, std::ofstream::out | std::ofstream::app);
   if (!of.good())
     ERROR("Cannot write to %s file", path);
-  for (float f: vec)
+  for (float f : vec)
     of << f << ' ';
   of << '\n';
 }
-
-extern "C" void llvm_hpvm_imgInvokeRtControl(void* result, void *gold, int start, int end) {
-  RC->resume_profiler();
-
-  if (gold != nullptr) {
-    writeVectorToFile("psnr.txt", PSNR(gold, result));
-    writeVectorToFile("ssim.txt", SSIM(gold, result));
-  }
-
-  // Read stats for iteration that was just completed
-  double current_iteration_time = RC->getCurrentIterationComputeTime();
-  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
-
-  RC->pause_profiler();
-  std::pair<double, double> pinfo = RC->get_time_energy();
-  RC->reset_profiler();
-  RC->addToCurrentIterationControlTime(pinfo.first);
-  RC->addToCurrentIterationControlEnergy(pinfo.second);
-
-  INFO("current iteration time = %f, current iteration energy = %f\n\n",
-       current_iteration_time, current_iteration_energy);
-
-  // Note the end of iteration
-  RC->end_iteration();
-}
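
With the whole controller diff in view, the selection path above boils down to two steps: `findTargetConfiguration` runs a `std::lower_bound` over a Pareto-sorted configuration list to find the first point meeting the goal, and `adjustTargetConfiguration` then draws a uniform random number and picks either that configuration or its predecessor (`rd < low_pb` selects the lower one). The sketch below reproduces that flow over a plain vector of speedups; the linear interpolation used for `low_pb` is one plausible choice, not necessarily the runtime's exact formula:

```cpp
#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  // Example data: configuration speedups already sorted along the Pareto curve.
  std::vector<float> speedups = {1.0f, 1.3f, 1.7f, 2.2f, 3.0f};
  float goal = 1.5f;

  // findTargetConfiguration (SPEEDUP case): first config with speedup >= goal,
  // searching in [begin, end - 1) as the controller does.
  auto low_it = std::lower_bound(speedups.begin(), speedups.end() - 1, goal);
  size_t idx = low_it - speedups.begin();
  size_t prev = idx > 0 ? idx - 1 : idx;

  // adjustTargetConfiguration: split probability between the two neighbours.
  // This interpolation is illustrative only.
  float low_pb = (idx == prev)
                     ? 0.0f
                     : (speedups[idx] - goal) / (speedups[idx] - speedups[prev]);

  std::mt19937 generator{std::random_device{}()};
  std::uniform_real_distribution<float> distr(0.0f, 1.0f);
  size_t chosen = distr(generator) < low_pb ? prev : idx;

  printf("goal %.2f -> config %zu (speedup %.2f, low_pb %.2f)\n", goal, chosen,
         speedups[chosen], low_pb);
  return 0;
}
```
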
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
deleted file mode 100644
index 608950aa473948bc6c3663d88646c8080a5d56e1..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
+++ /dev/null
@@ -1,638 +0,0 @@
-#include "approxhpvm_img_runtime_utils.h"
-#include "debug.h"
-#include "img_tensor_runtime.h"
-
-#include "functional/map.cuh"
-#include "functional/reduce.cuh"
-#include "tensor_utils.h"
-
-#include <cufft.h>
-#include <cufftXt.h>
-#include <thrust/device_vector.h>
-
-template <typename T> struct DivFunctor {
-  const T dividend;
-
-  DivFunctor(float dividend) : dividend(dividend) {}
-
-  __host__ __device__ T operator()(const T x) const { return x / dividend; }
-};
-
-// ***                       Runtime implementation                      *** //
-void *tensorFft(void *input, bool inverse) {
-  // https://docs.nvidia.com/cuda/cufft/index.html#twod-complex-to-real-transforms
-  // Tensor checking
-  INFO("FFT\n");
-  profileEvent("tensorFft");
-  auto *t_input = (Tensor *)input;
-  int total_rank = t_input->dims.num_dims;
-  if (total_rank != 4)
-    ERROR("Only 4-dim tensor supported\n");
-  // Dimensions
-  size_t *all_dims = t_input->dims.dim_sizes;
-  int height = all_dims[2], width = all_dims[3];
-  int n_batch = int(all_dims[0]) * int(all_dims[1]);
-  // Prepare input data
-  hostToDeviceCopy(t_input);
-  // Create a 2D FFT plan
-  cufftHandle plan;
-  checkCUFFT(cufftCreate(&plan));
-  // Output
-  Tensor *out_tensor = nullptr;
-  if (inverse) {
-    int fft_dim[2] = {height, (width - 1) * 2};
-    auto *input_cuda = convertAndGetGPUData<cufftComplex>(t_input);
-    // Define output data
-    out_tensor = (Tensor *)create4DTensor(
-        (int)float_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height,
-        (width - 1) * 2);
-    changeTensorPlacement(out_tensor, DEVICE);
-    auto *output_cuda = convertAndGetGPUData<cufftReal>(out_tensor);
-    checkCUFFT(cufftPlanMany(
-        &plan, 2, fft_dim, nullptr, 1, 0, nullptr, 1, 0, CUFFT_C2R, n_batch));
-    // Execute the plan
-    checkCUFFT(cufftExecC2R(plan, input_cuda, output_cuda));
-  } else {
-    int fft_dim[2] = {height, width};
-    auto *input_cuda = convertAndGetGPUData<cufftReal>(t_input);
-    // Define output data
-    out_tensor = (Tensor *)create4DTensor(
-        (int)float2_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height,
-        (width / 2 + 1));
-    changeTensorPlacement(out_tensor, DEVICE);
-    auto *output_cuda = convertAndGetGPUData<cufftComplex>(out_tensor);
-    checkCUFFT(cufftPlanMany(
-        &plan, 2, fft_dim, nullptr, 1, 0, nullptr, 1, 0, CUFFT_R2C, n_batch));
-    // Execute the plan
-    checkCUFFT(cufftExecR2C(plan, input_cuda, output_cuda));
-  }
-  // Wait for the device to finish
-  checkCUDA(cudaDeviceSynchronize());
-
-  if (inverse) {
-    auto *output_cuda = convertAndGetGPUData<cufftReal>(out_tensor);
-    thrust::device_ptr<cufftReal> normalize_v(output_cuda);
-    size_t size = height * (width - 1) * 2;
-    DivFunctor<cufftReal> div(size);
-    thrust::transform(normalize_v, normalize_v + size, normalize_v, div);
-  }
-  // Release memory
-  cufftDestroy(plan);
-  profileEvent("tensorFft_end");
-  return out_tensor;
-}
-
-void *tensorFftHalf(void *input, bool inverse) {
-  // Tensor checking
-  INFO("FFTHalf\n");
-  profileEvent("#tensorFft");
-  auto *t_input = (Tensor *)input;
-  int total_rank = t_input->dims.num_dims;
-  if (total_rank != 4)
-    ERROR("Only 4-dim tensor supported\n");
-  // Dimensions
-  size_t *all_dims = t_input->dims.dim_sizes;
-  int height = all_dims[2], width = all_dims[3];
-  long long int n_batch = int(all_dims[0]) * int(all_dims[1]);
-  // Prepare input data
-  hostToDeviceCopy(t_input);
-  // Create a 2D FFT plan
-  cufftHandle plan;
-  checkCUFFT(cufftCreate(&plan));
-  // Output
-  Tensor *out_tensor = nullptr;
-  if (inverse) {
-    long long int fft_dim[2] = {height, (width - 1) * 2};
-    profileEvent("F2H_start");
-    auto *input_cuda = convertAndGetGPUData<half2>(t_input);
-    profileEvent("F2H_end");
-    // Define output data
-    out_tensor = (Tensor *)create4DTensor(
-        (int)half_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height,
-        (width - 1) * 2);
-    changeTensorPlacement(out_tensor, DEVICE);
-    auto *output_cuda = convertAndGetGPUData<half>(out_tensor);
-    size_t worksize = 0;
-    checkCUFFT(cufftXtMakePlanMany(
-        plan, 2, fft_dim, nullptr, 1, 0, CUDA_C_16F /*inputtype*/, nullptr, 1,
-        0, CUDA_R_16F /*outputtype*/, n_batch, &worksize,
-        CUDA_C_16F /*executiontype*/
-        ));
-    // Execute the plan
-    checkCUFFT(cufftXtExec(plan, input_cuda, output_cuda, CUFFT_INVERSE));
-  } else {
-    long long int fft_dim[2] = {height, width};
-    profileEvent("F2H_start");
-    auto *input_cuda = convertAndGetGPUData<half>(t_input);
-    profileEvent("F2H_end");
-    // Define output data
-    out_tensor = (Tensor *)create4DTensor(
-        (int)half2_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height,
-        (width / 2 + 1));
-    changeTensorPlacement(out_tensor, DEVICE);
-    auto *output_cuda = convertAndGetGPUData<half2>(out_tensor);
-    size_t worksize = 0;
-    checkCUFFT(cufftXtMakePlanMany(
-        plan, 2, fft_dim, nullptr, 1, 0, CUDA_R_16F /*inputtype*/, nullptr, 1,
-        0, CUDA_C_16F /*outputtype*/, n_batch, &worksize,
-        CUDA_C_16F /*executiontype*/
-        ));
-    // Execute the plan
-    checkCUFFT(cufftXtExec(plan, input_cuda, output_cuda, CUFFT_FORWARD));
-  }
-  // Wait for the device to finish
-  checkCUDA(cudaDeviceSynchronize());
-
-  if (inverse) {
-    auto *output_cuda = convertAndGetGPUData<half>(out_tensor);
-    thrust::device_ptr<half> normalize_v(output_cuda);
-    size_t size = height * (width - 1) * 2;
-    DivFunctor<half> div(size);
-    thrust::transform(normalize_v, normalize_v + size, normalize_v, div);
-
-    profileEvent("H2F_start");
-    convertToFP32_offline(out_tensor);
-    out_tensor->data_type = out_tensor->cur_type;
-    profileEvent("H2F_end");
-  } else {
-    profileEvent("H2F_start");
-    convertToFloat2Offline(out_tensor);
-    profileEvent("H2F_end");
-  }
-  // Release memory
-  cufftDestroy(plan);
-
-  profileEvent("#tensorFft_end");
-  return out_tensor;
-}
-
-/* Implements template instantiations in reduce.cuh */
-
-template <>
-__device__ void reduceAlongDim<float>(
-    float *target, float *src, float init, float identity, void *func,
-    size_t num_irows, size_t dim_size) {
-  auto *binary_op = (NTo1MapF<float, 2>)func;
-
-  float acc = init;
-  for (size_t col = 0; col < dim_size; ++col) {
-    acc = binary_op(acc, *src);
-    src += num_irows;
-  }
-  *target = acc;
-}
-
-template <>
-__device__ void parallelReduceAlongDim<float>(
-    float *target, float *src, float init, float identity, void *func,
-    size_t num_irows, size_t dim_size, size_t along_dim_tid,
-    size_t n_along_dim_threads) {
-  __shared__ float sbuf[CrossDimTh][AlongDimTh + 1]; // avoid bank conflict
-  float *this_buf_line = sbuf[threadIdx.y];
-
-  auto *binary_op = (NTo1MapF<float, 2>)func;
-
-  float acc = init;
-  // Sequential reduction within a thread.
-  for (size_t col = along_dim_tid; col < dim_size; col += n_along_dim_threads) {
-    acc = binary_op(acc, src[col * num_irows]);
-  }
-
-  this_buf_line[along_dim_tid] = acc;
-
-  __syncthreads();
-
-  // Reduce intermediate values to single value.
-  for (size_t s = AlongDimTh >> 1u; s > 0; s >>= 1u) {
-    if (along_dim_tid < s) {
-      float arg1 = this_buf_line[along_dim_tid];
-      float arg2 = this_buf_line[along_dim_tid + s];
-      float res = binary_op(arg1, arg2);
-      this_buf_line[along_dim_tid] = res;
-    }
-    __syncthreads();
-  }
-
-  if (along_dim_tid == 0) {
-    *target = this_buf_line[0];
-  }
-  __syncthreads();
-}
-
-static __device__ __forceinline__ half
-reduceHalf2ToHalf(NTo1MapF<half2, 2> half2_f, half2 value) {
-  half2 high = __high2half2(value), low = __low2half2(value);
-  return __high2half(half2_f(high, low)); // Or the lower half, whatever
-}
-
-template <>
-__device__ void reduceAlongDim<half>(
-    half *target, half *src, half init, half identity, void *func,
-    size_t num_irows, size_t dim_size) {
-  auto *binary_op = (NTo1MapF<half2, 2>)func;
-
-  half2 acc = __halves2half2(init, identity);
-  size_t twice_irows = num_irows << 1;
-  for (size_t col = 0; col < dim_size; col += 2) {
-    half higher = col + 1 < dim_size ? *(src + num_irows) : identity;
-    acc = binary_op(acc, __halves2half2(*src, higher));
-    src += twice_irows;
-  }
-  *target = reduceHalf2ToHalf(binary_op, acc);
-}
-
-template <>
-__device__ void parallelReduceAlongDim<half>(
-    half *target, half *src, half init, half identity, void *func,
-    size_t num_irows, size_t dim_size, size_t along_dim_tid,
-    size_t n_along_dim_threads) {
-  __shared__ half2 sbuf[CrossDimTh][AlongDimTh + 1]; // avoid bank conflict
-  half2 *this_buf_line = sbuf[threadIdx.y];
-
-  auto *binary_op = (NTo1MapF<half2, 2>)func;
-
-  // Sequential reduction within a thread.
-  half2 acc = __halves2half2(init, identity);
-  size_t src_stride = n_along_dim_threads * num_irows;
-  for (size_t col = along_dim_tid; col < dim_size;
-       col += (n_along_dim_threads << 1), src += src_stride << 1) {
-    half higher =
-        col + n_along_dim_threads < dim_size ? *(src + src_stride) : identity;
-    acc = binary_op(acc, __halves2half2(*src, higher));
-  }
-
-  this_buf_line[along_dim_tid] = acc;
-
-  __syncthreads();
-
-  // Reduce intermediate values to single value.
-  for (size_t s = AlongDimTh >> 1u; s > 0; s >>= 1u) {
-    if (along_dim_tid < s) {
-      half2 arg1 = this_buf_line[along_dim_tid];
-      half2 arg2 = this_buf_line[along_dim_tid + s];
-      half2 res = binary_op(arg1, arg2);
-      this_buf_line[along_dim_tid] = res;
-    }
-    __syncthreads();
-  }
-
-  if (along_dim_tid == 0) {
-    *target = reduceHalf2ToHalf(binary_op, this_buf_line[0]);
-  }
-  __syncthreads();
-}
-
-template <>
-__global__ void kernelMapBroadcast<float, 1>(
-    float *target, unsigned num_rows, void *func, float **srcs,
-    size_t *tail_strides) {
-  auto *n_ary_op = (NTo1MapF<float, 1>)func;
-
-  unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x,
-           stride = gridDim.x * blockDim.x;
-  for (unsigned row = threadId; row < num_rows; row += stride) {
-    target[row] = n_ary_op(srcs[0][row]);
-  }
-}
-
-template <>
-__global__ void kernelMapBroadcast<float, 2>(
-    float *target, unsigned num_rows, void *func, float **srcs,
-    size_t *tail_strides) {
-  auto *n_ary_op = (NTo1MapF<float, 2>)func;
-
-  unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x,
-           stride = gridDim.x * blockDim.x;
-  for (unsigned row = threadId; row < num_rows; row += stride) {
-    unsigned j0 = row / tail_strides[0], j1 = row / tail_strides[1];
-    target[row] = n_ary_op(srcs[0][j0], srcs[1][j1]);
-  }
-}
-
-template <>
-__global__ void kernelMapBroadcast<float, 3>(
-    float *target, unsigned num_rows, void *func, float **srcs,
-    size_t *tail_strides) {
-  auto *n_ary_op = (NTo1MapF<float, 3>)func;
-
-  unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x,
-           stride = gridDim.x * blockDim.x;
-  for (unsigned row = threadId; row < num_rows; row += stride) {
-    unsigned j0 = row / tail_strides[0], j1 = row / tail_strides[1],
-             j2 = row / tail_strides[2];
-    target[row] = n_ary_op(srcs[0][j0], srcs[1][j1], srcs[1][j2]);
-  }
-}
-
-template <>
-__global__ void kernelMapBroadcast<half, 1>(
-    half *target, unsigned num_rows, void *func, half **srcs,
-    size_t *tail_strides) {
-  auto *op = (NTo1MapF<half2, 1>)func;
-
-  unsigned threadId = (blockIdx.x * blockDim.x + threadIdx.x) << 1,
-           stride = (gridDim.x * blockDim.x) << 1;
-  unsigned row = threadId;
-  for (; row < num_rows - 1; row += stride) {
-    auto *m0 = (half2 *)&srcs[0][row], *m_out = (half2 *)&target[row];
-    *m_out = op(*m0);
-  }
-  if (row == num_rows - 1) {
-    half2 result = op(__half2half2(srcs[0][row]));
-    target[row] = __high2half(result);
-  }
-}
-
-template <>
-__global__ void kernelMapBroadcast<half, 2>(
-    half *target, unsigned num_rows, void *func, half **srcs,
-    size_t *tail_strides) {
-  auto *op = (NTo1MapF<half2, 2>)func;
-
-  unsigned o_row = (blockIdx.x * blockDim.x + threadIdx.x) << 1,
-           o_stride = gridDim.x * blockDim.x, o_stride2 = o_stride << 1;
-  for (; o_row < num_rows - 1; o_row += o_stride2) {
-    half2 *o_ptr = (half2 *)&target[o_row];
-    half2 in[2];
-    for (size_t i = 0; i < 2; i++) {
-      if (tail_strides[i] == 1) {
-        in[i] = __halves2half2(srcs[i][o_row], srcs[i][o_row + 1]);
-      } else {
-        unsigned i_row_l = o_row / tail_strides[i],
-                 i_row_r = (o_row + 1) / tail_strides[i];
-        if (i_row_l == i_row_r)
-          in[i] = __half2half2(srcs[i][i_row_l]);
-        else
-          in[i] = __halves2half2(srcs[i][i_row_l], srcs[i][i_row_r]);
-      }
-    }
-    *o_ptr = op(in[0], in[1]);
-  }
-  if (o_row == num_rows - 1) {
-    unsigned row0 = o_row / tail_strides[0], row1 = o_row / tail_strides[1];
-    half2 v0 = __half2half2(srcs[0][row0]), v1 = __half2half2(srcs[1][row1]);
-    half2 result = op(v0, v1);
-    target[o_row] = __high2half(result);
-  }
-}
-
-template <>
-__global__ void kernelMapBroadcast<half, 3>(
-    half *target, unsigned num_rows, void *func, half **srcs,
-    size_t *tail_strides) {
-  auto *op = (NTo1MapF<half2, 3>)func;
-
-  unsigned threadId = (blockIdx.x * blockDim.x + threadIdx.x) << 1,
-           stride = (gridDim.x * blockDim.x) << 1;
-  unsigned row = threadId;
-  for (; row < num_rows - 1; row += stride) {
-    unsigned row0 = row / tail_strides[0], row1 = row / tail_strides[1],
-             row2 = row / tail_strides[2];
-    auto *m0 = (half2 *)&srcs[0][row0], *m1 = (half2 *)&srcs[1][row1],
-         *m2 = (half2 *)&srcs[2][row2], *m_out = (half2 *)&target[row];
-    *m_out = op(*m0, *m1, *m2);
-  }
-  if (row == num_rows - 1) {
-    unsigned row0 = row / tail_strides[0], row1 = row / tail_strides[1],
-             row2 = row / tail_strides[2];
-    half2 v0 = __half2half2(srcs[0][row0]), v1 = __half2half2(srcs[1][row1]),
-          v2 = __half2half2(srcs[2][row2]);
-    half2 result = op(v0, v1, v2);
-    target[row] = __high2half(result);
-  }
-}
-
-void *tensorReduce(void *input, size_t axis, MathOp func, float skip_ratio) {
-  INFO("Reduce\n");
-  profileEvent("tensorReduce");
-  auto *src = (Tensor *)input;
-  if (axis >= src->dims.num_dims)
-    ERROR("Dimension out of range\n");
-  if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW)
-    ERROR("Not supported\n");
-  Tensor *ret = reduceDim<float>(src, 0.0f, func, axis, skip_ratio);
-  profileEvent("tensorReduce_end");
-  return ret;
-}
-
-void *
-tensorReduceHalf(void *input, size_t axis, MathOp func, float skip_ratio) {
-  INFO("Reduce\n");
-  profileEvent("#tensorReduce");
-  auto *src = (Tensor *)input;
-  if (axis >= src->dims.num_dims)
-    ERROR("Dimension out of range\n");
-  if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW)
-    ERROR("Not supported\n");
-  Tensor *ret = reduceDim<half>(src, 0.0f, func, axis, skip_ratio);
-  profileEvent("H2F_start");
-  convertToFP32_offline(ret);
-  profileEvent("H2F_end");
-  profileEvent("#tensorReduce_end");
-  return ret;
-}
-
-void *tensorProjectiveT(void *input, void *transformation) {
-  ERROR("ProjectiveT operation currently unsupported.\n");
-  abort();
-}
-
-void *tensorMap1(MathOp f, void *i) {
-  INFO("Map1\n");
-  profileEvent("tensorMap1");
-  auto *src = (Tensor *)i;
-  Tensor *ret = mapGeneral<float, 1>(f, {src});
-  profileEvent("tensorMap1_end");
-  return ret;
-}
-
-void *tensorMap1Half(MathOp f, void *i) {
-  INFO("Map1Half\n");
-  profileEvent("#tensorMap1");
-  auto *src = (Tensor *)i;
-  Tensor *ret = mapGeneral<half, 1>(f, {src});
-  profileEvent("H2F_start");
-  convertToFP32_offline(ret);
-  profileEvent("H2F_end");
-  profileEvent("#tensorMap1_end");
-  return ret;
-}
-
-void *tensorMap2(MathOp f2, void *i1, void *i2) {
-  INFO("Map2\n");
-  profileEvent("tensorMap2");
-  auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2;
-  Tensor_type_t common_ty =
-      getCompatibleType(src1->cur_type, src2->cur_type, false);
-  Tensor *ret = nullptr;
-  if (common_ty == float_type)
-    ret = mapGeneral<float, 2>(f2, {src1, src2});
-  else if (common_ty == float2_type)
-    ret = mapGeneral<float2, 2>(f2, {src1, src2});
-  else
-    ERROR("Type not recognized\n");
-  profileEvent("tensorMap2_end");
-  return ret;
-}
-
-void *tensorMap2Half(MathOp f2, void *i1, void *i2) {
-  INFO("Map2Half\n");
-  profileEvent("#tensorMap2");
-  auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2;
-  Tensor_type_t common_ty =
-      getCompatibleType(src1->cur_type, src2->cur_type, false);
-  if (common_ty == float_type) {
-    Tensor *ret = mapGeneral<half, 2>(f2, {src1, src2});
-    profileEvent("H2F_start");
-    convertToFP32_offline(ret);
-    profileEvent("H2F_end");
-    profileEvent("#tensorMap2_end");
-    return ret;
-  } else if (common_ty == float2_type) {
-    Tensor *ret = mapGeneral<half2, 2>(f2, {src1, src2});
-    profileEvent("H2F_start");
-    convertToFloat2Offline(ret);
-    profileEvent("H2F_end");
-    profileEvent("#tensorMap2_end");
-    return ret;
-  } else {
-    ERROR("Type not recognized\n");
-    return nullptr; // For some compilers
-  }
-}
-
-void *tensorMap3(MathOp f3, void *i1, void *i2, void *i3) {
-  INFO("Map3\n");
-  profileEvent("tensorMap3");
-  auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2, *src3 = (Tensor *)i3;
-  Tensor *ret = mapGeneral<float, 3>(f3, {src1, src2, src3});
-  profileEvent("tensorMap3_end");
-  return ret;
-}
-
-void *tensorMap3Half(MathOp f3, void *i1, void *i2, void *i3) {
-  INFO("Map3Half\n");
-  profileEvent("#tensorMap3");
-  auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2, *src3 = (Tensor *)i3;
-  Tensor *ret = mapGeneral<half, 3>(f3, {src1, src2, src3});
-  profileEvent("H2F_start");
-  convertToFP32_offline(ret);
-  profileEvent("H2F_end");
-  profileEvent("#tensorMap3_end");
-  return ret;
-}
-
-// ***                     Wrapper API implementation                    *** //
-
-void *wrapper_tensorFft(const char *hpvm_node_id, void *input, bool inverse) {
-  GPUNodeConfiguration *GPUConf =
-      (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
-  std::vector<std::pair<
-      GPUNodeConfiguration::TENSOR_OP,
-      std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
-      &ApproxChoices = GPUConf->getApproxChoices();
-  // Approximation choices must be for a fft operation
-  CUSTOM_ASSERT(
-      ApproxChoices.size() == 1 &&
-      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::FFT &&
-      "Invalid configuration generated for tensor fft wrapper operation");
-  return handleTensorFftApproximationTuples(
-      ApproxChoices[0].second, input, inverse);
-}
-
-void *wrapper_tensorReduce(
-    const char *hpvm_node_id, void *input, int axis, int func) {
-  GPUNodeConfiguration *GPUConf =
-      (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
-  std::vector<std::pair<
-      GPUNodeConfiguration::TENSOR_OP,
-      std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
-      &ApproxChoices = GPUConf->getApproxChoices();
-  // Approximation choices must be for a reduce operation
-  CUSTOM_ASSERT(
-      ApproxChoices.size() == 1 &&
-      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::REDUCE &&
-      "Invalid configuration generated for tensor reduce wrapper operation");
-  return handleTensorReduceApproximationTuples(
-      ApproxChoices[0].second, input, axis, (MathOp)func);
-}
-
-void *wrapper_tensorProjectiveT(
-    const char *hpvm_node_id, void *input, void *transformation) {
-  GPUNodeConfiguration *GPUConf =
-      (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
-  std::vector<std::pair<
-      GPUNodeConfiguration::TENSOR_OP,
-      std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
-      &ApproxChoices = GPUConf->getApproxChoices();
-  // Approximation choices must be for a projectiveT operation
-  CUSTOM_ASSERT(
-      ApproxChoices.size() == 1 &&
-      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T &&
-      "Invalid configuration generated for tensor projectiveT "
-      "wrapper operation");
-  return handleTensorProjectiveTApproximationTuples(
-      ApproxChoices[0].second, input, transformation);
-}
-
-void *wrapper_tensorMap1(const char *hpvm_node_id, int func, void *input) {
-  GPUNodeConfiguration *GPUConf =
-      (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
-  std::vector<std::pair<
-      GPUNodeConfiguration::TENSOR_OP,
-      std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
-      &ApproxChoices = GPUConf->getApproxChoices();
-  // Approximation choices must be for a map1 operation
-  CUSTOM_ASSERT(
-      ApproxChoices.size() == 1 &&
-      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP1 &&
-      "Invalid configuration generated for tensor map1 wrapper operation");
-  return handleTensorMap1ApproximationTuples(
-      ApproxChoices[0].second, (MathOp)func, input);
-}
-
-void *wrapper_tensorMap2(
-    const char *hpvm_node_id, int func, void *input1, void *input2) {
-  GPUNodeConfiguration *GPUConf =
-      (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
-  std::vector<std::pair<
-      GPUNodeConfiguration::TENSOR_OP,
-      std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
-      &ApproxChoices = GPUConf->getApproxChoices();
-  // Approximation choices must be for a map2 operation
-  CUSTOM_ASSERT(
-      ApproxChoices.size() == 1 &&
-      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP2 &&
-      "Invalid configuration generated for tensor map2 wrapper operation");
-  return handleTensorMap2ApproximationTuples(
-      ApproxChoices[0].second, (MathOp)func, input1, input2);
-}
-
-void *wrapper_tensorMap3(
-    const char *hpvm_node_id, int func, void *input1, void *input2,
-    void *input3) {
-  GPUNodeConfiguration *GPUConf =
-      (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
-  std::vector<std::pair<
-      GPUNodeConfiguration::TENSOR_OP,
-      std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
-      &ApproxChoices = GPUConf->getApproxChoices();
-  // Approximation choices must be for a map3 operation
-  CUSTOM_ASSERT(
-      ApproxChoices.size() == 1 &&
-      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP3 &&
-      "Invalid configuration generated for tensor map3 wrapper operation");
-  return handleTensorMap3ApproximationTuples(
-      ApproxChoices[0].second, (MathOp)func, input1, input2, input3);
-}
-
-// Tentative
-void *wrapper_tensorStencil(const char *hpvm_node_id, void *input) {
-  ERROR("Stencil operation currently unsupported.\n");
-  abort();
-}
-
-void *wrapper_tensorCosineT(const char *hpvm_node_id, void *input) {
-  ERROR("CosineT operation currently unsupported.\n");
-  abort();
-}
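Each of the wrapper_tensor* entry points deleted above follows the same dispatch pattern: fetch the node's GPUNodeConfiguration, assert that exactly one approximation choice was generated and that it targets the expected TENSOR_OP, then forward the approximation tuples to the corresponding handle*ApproximationTuples routine. The following is a minimal sketch of that pattern only; OpKind, Choice, NodeConfig, and dispatchWrapper are simplified stand-ins invented for illustration, not the HPVM runtime's types.

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Simplified stand-ins (hypothetical, for illustration only).
enum class OpKind { FFT, REDUCE, MAP1, MAP2, MAP3 };

struct Choice {
  OpKind op;                               // which tensor op this choice configures
  std::vector<std::pair<int, int>> tuples; // (approximation, knob) pairs
};

struct NodeConfig {
  std::vector<Choice> choices; // per-node approximation choices
};

// Generic skeleton of the wrapper pattern: validate, then dispatch.
template <typename Handler>
void *dispatchWrapper(const NodeConfig &cfg, OpKind expected, Handler handle) {
  assert(cfg.choices.size() == 1 && cfg.choices[0].op == expected &&
         "Invalid configuration generated for tensor wrapper operation");
  return handle(cfg.choices[0].tuples);
}

// Usage mirroring the deleted wrapper_tensorReduce (handleReduce is hypothetical):
//   return dispatchWrapper(cfg, OpKind::REDUCE, [&](const auto &tuples) {
//     return handleReduce(tuples, input, axis, (MathOp)func);
//   });
```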
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
deleted file mode 100644
index b4e9e3fea8a2f0638267f6386698d5434a6b91fc..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
+++ /dev/null
@@ -1,445 +0,0 @@
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <experimental/filesystem>
-#include <numeric>
-#include <sstream>
-#include <string>
-
-#include "debug.h"
-#include "device_math.h"
-#include "functional/common.h"
-#include "img_tensor_runtime.h"
-#include "img_tensor_utils.h"
-#include "tensor_utils.h"
-
-// Image I/O utilities
-#define STB_IMAGE_IMPLEMENTATION
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-
-#include "image/stb_image.h"
-#include "image/stb_image_write.h"
-
-static inline uint8_t *float_to_uint8(const float *fl, size_t count) {
-  auto *ret = new uint8_t[count];
-  float max_v = *std::max_element(fl, fl + count),
-        min_v = *std::min_element(fl, fl + count);
-  if (max_v - min_v < 1e-3) {
-    for (size_t i = 0; i < count; i++)
-      ret[i] = 0;
-  } else {
-    float frac = 255 / (max_v - min_v);
-    for (size_t i = 0; i < count; i++)
-      ret[i] = uint8_t(frac * (fl[i] - min_v));
-  }
-  return ret;
-}
-
-static inline float *uint8_to_float(const uint8_t *ui, size_t len) {
-  auto *ret = new float[len];
-  for (size_t i = 0; i < len; i++)
-    ret[i] = float(ui[i]) / 255;
-  return ret;
-}
-
-static Tensor *to_nhwc(Tensor *t) {
-  if (t->data_format == CUDNN_TENSOR_NHWC) {
-    DEBUG("Tensor already in NHWC format, no conversion needed\n");
-    return t;
-  } else if (t->data_format != CUDNN_TENSOR_NCHW) {
-    ERROR("Unknown tensor format: %s\n", std::to_string(t->data_format));
-  } else {
-    DEBUG("Converting to NHWC format\n");
-  }
-
-  size_t *dim_arr = t->dims.dim_sizes;
-  size_t n = dim_arr[0], c = dim_arr[1], h = dim_arr[2], w = dim_arr[3];
-  auto *out_tensor =
-      (Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NHWC, n, h, w, c);
-  size_t nhwc_offset = 0;
-  size_t element_size = getTypeSize(t->data_type);
-  char *out_data = (char *)(out_tensor->host_data),
-       *in_data = (char *)(t->host_data);
-  for (int n0 = 0; n0 < n; n0++)
-    for (int h0 = 0; h0 < h; h0++)
-      for (int w0 = 0; w0 < w; w0++)
-        for (int c0 = 0; c0 < c; c0++) {
-          size_t nc = n0 * c + c0, nch = nc * h + h0, nchw_idx = nch * w + w0,
-                 nchw_offset = nchw_idx * element_size;
-          std::memcpy(out_data + nhwc_offset, in_data + nchw_offset,
-                      element_size);
-          nhwc_offset += element_size;
-        }
-  return out_tensor;
-}
-
-static Tensor *to_nchw(Tensor *t) {
-  if (t->data_format == CUDNN_TENSOR_NCHW) {
-    DEBUG("Tensor already in NCHW format, no conversion needed\n");
-    return t;
-  } else if (t->data_format != CUDNN_TENSOR_NHWC) {
-    ERROR("Unknown tensor format: %s\n", std::to_string(t->data_format));
-  } else {
-    DEBUG("Converting to NCHW format\n");
-  }
-  size_t *dim_arr = t->dims.dim_sizes;
-  size_t n = dim_arr[0], h = dim_arr[1], w = dim_arr[2], c = dim_arr[3];
-  Tensor *out_tensor =
-      (Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NCHW, n, c, h, w);
-  size_t nchw_offset = 0;
-  size_t element_size = getTypeSize(t->data_type);
-  char *out_data = (char *)(out_tensor->host_data),
-       *in_data = (char *)(t->host_data);
-  for (int n0 = 0; n0 < n; n0++)
-    for (int c0 = 0; c0 < c; c0++)
-      for (int h0 = 0; h0 < h; h0++)
-        for (int w0 = 0; w0 < w; w0++) {
-          size_t nh = n0 * h + h0, nhw = nh * w + w0, nhwc_idx = nhw * c + c0,
-                 nhwc_offset = nhwc_idx * element_size;
-          std::memcpy(out_data + nchw_offset, in_data + nhwc_offset,
-                      element_size);
-          nchw_offset += element_size;
-        }
-  return out_tensor;
-}
-
-namespace fs = std::experimental::filesystem;
-
-// List all files in a folder.
-static inline std::vector<std::string> listFiles(const std::string &folder) {
-  std::vector<std::string> ret;
-  for (const auto &entry : fs::directory_iterator(folder))
-    ret.push_back(entry.path().string());
-  std::sort(ret.begin(), ret.end());
-  return ret;
-}
-
-// return in[start:start+count]
-template <typename T>
-std::vector<T> sliceVector(const std::vector<T> &in, size_t start,
-                           size_t count) {
-  auto slice_begin = in.begin() + start;
-  if (slice_begin > in.end())
-    slice_begin = in.end();
-  auto slice_end = count == std::string::npos ? in.end() : slice_begin + count;
-  if (slice_end > in.end())
-    slice_end = in.end();
-  return std::vector<T>(slice_begin, slice_end);
-}
-
-// Read an image dataset from a folder with each image as a file.
-Tensor *readDataSet(const char *path, size_t start, size_t count,
-                    size_t n_color) {
-  INFO("Loading image dataset from path %s\n", path);
-  std::vector<std::string> filenames =
-      sliceVector(listFiles(path), start, count);
-  if (filenames.empty()) {
-    INFO("Folder is empty or slice is empty\n");
-    return nullptr;
-  }
-
-  auto *first_image = (Tensor *)loadAsImage(filenames[0].c_str(), n_color);
-  std::vector<size_t> sizes = ::sizes(first_image);
-  size_t h = sizes[2], w = sizes[3];
-  DEBUG("Loading shape: (%lu, %lu, %lu, %lu)\n", filenames.size(), n_color, h,
-        w);
-  auto *batch = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC,
-                                         filenames.size(), h, w, n_color);
-  size_t n_floats = n_color * h * w;
-  auto *base_data = (float *)batch->host_data;
-  for (const auto &path : filenames) {
-    int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
-    uint8_t *data = stbi_load(path.c_str(), &x, &y, &n, n_color);
-    if (data == nullptr)
-      ERROR("Image load failed\n");
-    if (x != h || y != w) {
-      std::ostringstream os;
-      os << "Image file " << path << " have different shape (" << x << ", " << y
-         << ")";
-      ERROR("%s\n", os.str());
-    }
-    float *converted = uint8_to_float(data, n_floats);
-    stbi_image_free(data);
-    std::memcpy(base_data, converted, n_floats * sizeof(float));
-    delete[] converted;
-    base_data += n_floats;
-  }
-  auto *nchw_batch = to_nchw(batch);
-  DEBUG("Loaded all images.\n");
-  return nchw_batch;
-}
-
-// Convert complex-domain image to float valued image.
-static Tensor *complexToFloat(Tensor *batch) {
-  convertAndGetGPUData<float2>(batch); // Convert to float2
-  deviceToHostCopy(batch);
-  auto *in_data = (float2 *)batch->host_data;
-  size_t n_elem = batch->num_elems;
-  std::vector<float> magnitudes(n_elem, 0.0f);
-  for (size_t i = 0; i < batch->num_elems; i++) {
-    magnitudes[i] = hypot(in_data[i].x, in_data[i].y);
-  }
-
-  size_t *dims = batch->dims.dim_sizes;
-  auto *ret = (Tensor *)create4DTensor(float_type, batch->data_format, dims[0],
-                                       dims[1], dims[2], dims[3]);
-  auto *out_data = (float *)ret->host_data;
-  for (size_t i = 0; i < magnitudes.size(); i++) {
-    float f = magnitudes[i];
-    out_data[i] = f > 1.0f ? log(f) : 0;
-  }
-  return ret;
-}
-
-// Save an image tensor image-by-image to a folder.
-void saveDataSet(const char *path, Tensor *batch, size_t start_idx,
-                 size_t write_n) {
-  INFO("Saving image dataset to path %s\n", path);
-  Tensor *float_batch = batch;
-  if (batch->data_type == float2_type || batch->data_type == half2_type)
-    float_batch = complexToFloat(float_batch); // Already copied
-  else {
-    DEBUG("Copying to CPU before printing\n");
-    convertAndGetGPUData<float>(float_batch);
-    deviceToHostCopy(float_batch);
-  }
-  Tensor *converted_batch = float_batch;
-  if (converted_batch->data_format == CUDNN_TENSOR_NCHW) {
-    DEBUG("Copy-converting to NHWC format\n");
-    converted_batch = to_nhwc(converted_batch);
-  }
-  std::vector<size_t> sizes = ::sizes(converted_batch);
-  size_t h = sizes[1], w = sizes[2], c = sizes[3];
-  auto *base_data = (float *)converted_batch->host_data;
-  if (write_n == 0)
-    write_n = sizes[0];
-  else
-    write_n = std::min(write_n, sizes[0]);
-  for (size_t i = start_idx; i < start_idx + write_n; i++) {
-    std::string name = path;
-    name += "/";
-    std::string number = std::to_string(i);
-    // FIXME: pad to 6 digits. Ordering will break when we have more than 1M
-    // files.
-    number = std::string(6 - number.length(), '0') + number;
-    name += number + ".png";
-
-    uint8_t *ldr_data = float_to_uint8(base_data, h * w * c);
-    if (!stbi_write_png(name.c_str(), w, h, c, ldr_data, 0))
-      ERROR("Write file failed\n");
-    delete[] ldr_data;
-
-    base_data += h * w * c;
-  }
-}
-
-// Load 1 file as an image into a tensor.
-void *loadAsImage(const char *filename, size_t n_color) {
-  INFO("Loading image from path=%s\n", filename);
-  int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
-  uint8_t *data = stbi_load(filename, &x, &y, &n, n_color);
-  if (data == nullptr)
-    ERROR("Image load failed\n");
-  float *converted = uint8_to_float(data, x * y * n);
-  DEBUG("Loading shape: (1, %lu, %lu, %lu)(NHWC)\n", y, x, n_color);
-  auto *image =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, 1, y, x, n);
-  std::memcpy(image->host_data, converted, x * y * n * sizeof(float));
-  auto *nchw_image = to_nchw(image);
-  stbi_image_free(data);
-  return nchw_image;
-}
-
-// Save 1 tensor as an image into a file.
-void saveToImage(const char *filename, Tensor *tensor) {
-  INFO("Saving image data to path=%s\n", filename);
-  deviceToHostCopy(tensor);
-  Tensor *converted_tensor = tensor;
-  if (tensor->data_format == CUDNN_TENSOR_NCHW) {
-    DEBUG("Copy-converting to NHWC format\n");
-    converted_tensor = to_nhwc(tensor);
-  }
-  auto *hdr_data = (float *)converted_tensor->host_data;
-  size_t *dims = converted_tensor->dims.dim_sizes;
-  size_t w = dims[2], h = dims[1], c = dims[3];
-  uint8_t *ldr = float_to_uint8(hdr_data, w * h * c);
-  stbi_write_png(filename, w, h, c, ldr, 0);
-  delete[] ldr;
-}
-
-// Make a conv2d filter from 2-dim data.
-void *createFilterFromData(int data_type, void *data, size_t w, size_t h,
-                           size_t n_chan) {
-  DEBUG("Creating filter from data\n");
-  auto *tensor =
-      (Tensor *)create4DTensor(data_type, CUDNN_TENSOR_NCHW, n_chan, 1, h, w);
-  char *tensor_data;
-  if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_FLOAT)
-    tensor_data = (char *)tensor->host_data;
-  else {
-    ERROR("Data type unsupported as filter\n");
-  }
-  size_t channel_sz = tensor->size_in_bytes / n_chan;
-  for (size_t i = 0; i < n_chan; i++, tensor_data += channel_sz) {
-    std::memcpy(tensor_data, data, channel_sz);
-  }
-  return tensor;
-}
-
-// Normalize an image tensor.
-static void *normalize(void *image) {
-  auto *max_1D = tensorReduce(image, 2, MathOp::Max);
-  auto *max = tensorReduce(max_1D, 3, MathOp::Max);
-  auto *img_norm = tensorMap2(MathOp::Div, image, max);
-  freeTensor(max_1D);
-  freeTensor(max);
-  return img_norm;
-}
-
-float compute_mean(float *arr, int left, int right) {
-  float sum = 0;
-  for (int i = left; i < right; i++) {
-    sum += arr[i];
-  }
-  return sum / (right - left);
-}
-
-float compute_variance(float *arr, int left, int right, float mean) {
-  float sum = 0;
-  for (int i = left; i < right; i++) {
-    sum += (arr[i] - mean) * (arr[i] - mean);
-  }
-  return sum / (right - left - 1);
-}
-
-float compute_covariance(float *x, float *y, int left, int right, float x_mean,
-                         float y_mean) {
-  float sum = 0;
-  for (int i = left; i < right; i++) {
-    sum += (x[i] - x_mean) * (y[i] - y_mean);
-  }
-  return sum / (right - left - 1);
-}
-
-std::vector<float> SSIM(void *lhs_ptr, void *rhs_ptr) {
-  auto *lhs = (Tensor *)lhs_ptr, *rhs = (Tensor *)rhs_ptr;
-
-  lhs = (Tensor *)normalize(lhs);
-  rhs = (Tensor *)normalize(rhs);
-
-  convertToFP32(lhs);
-  convertToFP32(rhs);
-  deviceToHostCopy(lhs);
-  deviceToHostCopy(rhs);
-
-  float L = 1.0;
-  float K1 = 0.01;
-  float K2 = 0.03;
-  float C1 = (K1 * L) * (K1 * L);
-  float C2 = (K2 * L) * (K2 * L);
-
-  int n = lhs->dims.dim_sizes[0];
-  int c = lhs->dims.dim_sizes[1];
-  int h = lhs->dims.dim_sizes[2];
-  int w = lhs->dims.dim_sizes[3];
-
-  float *lhs_arr = (float *)lhs->host_data;
-  float *rhs_arr = (float *)rhs->host_data;
-
-  std::vector<float> scores;
-  for (int i = 0; i < n; i++) {
-    int left = i * c * h * w;
-    int right = (i + 1) * c * h * w;
-
-    float x_mean = compute_mean(lhs_arr, left, right);
-    float y_mean = compute_mean(rhs_arr, left, right);
-    float x_var = compute_variance(lhs_arr, left, right, x_mean);
-    float y_var = compute_variance(rhs_arr, left, right, y_mean);
-    float covariance =
-        compute_covariance(lhs_arr, rhs_arr, left, right, x_mean, y_mean);
-
-    scores.push_back(
-        ((2 * x_mean * y_mean + C1) * (2 * covariance + C2)) /
-        ((x_mean * x_mean + y_mean * y_mean + C1) * (x_var + y_var + C2)));
-  }
-  return scores;
-}
-
-std::vector<float> PSNR(void *gold_ptr, void *approx_ptr) {
-  auto *gold_tensor = (Tensor *)gold_ptr, *approx_tensor = (Tensor *)approx_ptr;
-  convertToFP32(gold_tensor);
-  convertToFP32(approx_tensor);
-
-  size_t *dim_sizes = gold_tensor->dims.dim_sizes;
-  size_t batch_dim = dim_sizes[0];
-  size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
-  float image_size_f = image_size;
-  DEBUG("batch_dim = %lu, image_size = %lu\n", batch_dim, image_size);
-  auto *image_size_tensor =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 1, 1);
-  std::memcpy(image_size_tensor->host_data, &image_size_f, sizeof(float));
-
-  gold_tensor = (Tensor *)normalize(gold_tensor);
-  approx_tensor = (Tensor *)normalize(approx_tensor);
-  auto *diff = tensorMap2(MathOp::Sub, gold_tensor, approx_tensor);
-  auto *diffsqr = tensorMap2(MathOp::Mul, diff, diff);
-  auto *mse_sum_2d = tensorReduce(diffsqr, 3, MathOp::Add);
-  auto *mse_sum_1d = tensorReduce(mse_sum_2d, 2, MathOp::Add);
-  auto *mse_sum = tensorReduce(mse_sum_1d, 1, MathOp::Add);
-  auto *mse_avg = tensorMap2(MathOp::Div, mse_sum, image_size_tensor);
-  auto *psnr_val = (Tensor *)tensorMap1(MathOp::PSNR, mse_avg);
-  deviceToHostCopy(psnr_val);
-
-  auto *float_data = (float *)psnr_val->host_data;
-  return std::vector<float>(float_data, float_data + batch_dim);
-}
-
-float violationRate(const std::vector<float> &values, float threshold,
-                    bool higher_better) {
-  if (values.empty())
-    return 0.0f;
-  size_t violation = 0;
-  for (float v : values) {
-    if (std::isnan(v))
-      ++violation;
-    if ((higher_better && v < threshold) || (!higher_better && v > threshold))
-      ++violation;
-  }
-  return (float)violation / values.size();
-}
-
-float mean(const std::vector<float> &values) {
-  std::vector<float> non_nan;
-  for (float f : values)
-    if (!std::isnan(f))
-      non_nan.push_back(f);
-  if (non_nan.empty())
-    return 0.0f;
-  return std::accumulate(non_nan.begin(), non_nan.end(), 0.0f, std::plus<>()) /
-         (float)non_nan.size();
-}
-
-void *sliceTensorInBatch(void *whole, size_t start, size_t end) {
-  auto *whole_tensor = (Tensor *)whole;
-  size_t *dim_sizes = whole_tensor->dims.dim_sizes;
-  auto *output =
-      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, end - start,
-                               dim_sizes[1], dim_sizes[2], dim_sizes[3]);
-  size_t single_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
-  auto *in_data = (float *)(whole_tensor->host_data) + start * single_size;
-  memcpy(output->host_data, in_data, (end - start) * single_size);
-  return output;
-}
-
-void reshape(void *t, const std::vector<size_t> &shape) {
-  auto *tensor = (Tensor *)t;
-  size_t in_n = num_elems(tensor), out_n = num_elems(shape);
-  if (in_n != out_n)
-    ERROR("Reshaping cannot change number of elements\n");
-  tensor->dims.num_dims = shape.size();
-  free(tensor->dims.dim_sizes);
-  tensor->dims.dim_sizes = (size_t *)malloc(sizeof(size_t) * shape.size());
-  std::copy(shape.begin(), shape.end(), tensor->dims.dim_sizes);
-  set4DTensorDescriptor(tensor, tensor->data_format, shape[0], shape[1],
-                        shape[2], shape[3]);
-}
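For reference, the SSIM() removed above scores each image in the batch with a single global-statistics SSIM (no sliding window), using L = 1, K1 = 0.01, K2 = 0.03 and sample variance/covariance over all pixels of the image. A self-contained sketch of that per-image computation, with illustrative function names, is:

```cpp
#include <cstddef>

// Mean over n values (assumes n > 0).
static float meanOf(const float *a, size_t n) {
  float s = 0;
  for (size_t i = 0; i < n; i++)
    s += a[i];
  return s / n;
}

// Sample variance (divide by n - 1, assumes n > 1), as in the deleted helpers.
static float varOf(const float *a, size_t n, float m) {
  float s = 0;
  for (size_t i = 0; i < n; i++)
    s += (a[i] - m) * (a[i] - m);
  return s / (n - 1);
}

// Sample covariance of x and y (divide by n - 1, assumes n > 1).
static float covOf(const float *x, const float *y, size_t n, float mx, float my) {
  float s = 0;
  for (size_t i = 0; i < n; i++)
    s += (x[i] - mx) * (y[i] - my);
  return s / (n - 1);
}

// Global SSIM between two images of n pixels each, normalized to [0, 1].
float ssimGlobal(const float *x, const float *y, size_t n) {
  const float L = 1.0f, K1 = 0.01f, K2 = 0.03f;
  const float C1 = (K1 * L) * (K1 * L), C2 = (K2 * L) * (K2 * L);
  const float mx = meanOf(x, n), my = meanOf(y, n);
  const float vx = varOf(x, n, mx), vy = varOf(y, n, my);
  const float cxy = covOf(x, y, n, mx, my);
  return ((2 * mx * my + C1) * (2 * cxy + C2)) /
         ((mx * mx + my * my + C1) * (vx + vy + C2));
}
```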
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
index 8b5c4aaf93db40c038c4a9a30569318ae00d6be1..b322ee2be37b60487e15c9109d4230adf1ad84e2 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
@@ -50,10 +50,6 @@ void llvm_hpvm_initTensorRt(int gpuid) {
 
 #endif
 
-#ifdef ERROR_INJECTION_ENABLED
-    readOpenTunerFlags("opentuner_flags");
-#endif
-
     runtime_initialized = true;
   }
 
@@ -72,14 +68,7 @@ void llvm_hpvm_initApproxhpvmRt(int gpuid) {
 
 void llvm_hpvm_cleanupApproxhpvmRt() {}
 
-void dumpAccuracyNorms() {
-
-#ifdef ERROR_INJECTION_ENABLED
-
-#endif
-
-  dump_result("accuracy_summary");
-}
+void dumpAccuracyNorms() { dump_result("accuracy_summary"); }
 
 // Returns the number of GPUs active on the platform
 unsigned int getGPUCount() {
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
index ad1d2e137d19d1c158afb031f35f278d9cdefaa0..08f13bf0f891e03f3d13e0c2f2e8bc97bacb3b64 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
@@ -1,13 +1,12 @@
 //===----------------------------- profling.cc  ---------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file contains code provides the definition of the interface for
 // applications to start and stop profiling for energy and performance.
 //
 //===----------------------------------------------------------------------===//
 
-
 #ifndef PROFILING_HEADER
 #define PROFILING_HEADER
 
@@ -52,7 +51,7 @@ void stopProfiling() {
 void profileEvent(const char *event_name, bool compare_previous = false) {
 
   checkCudaErrors(cudaDeviceSynchronize());
-  
+
   auto it = func_counters.find(event_name);
   if (it == func_counters.end()) {
     func_counters[event_name] = 1;
@@ -73,7 +72,7 @@ void profileEvent(const char *event_name, bool compare_previous = false) {
       time_reading - zero_time;
 
   DEBUG("AbsoluteTime, Event = %s, Time = %f \n", event_name,
-       current_time.count());
+        current_time.count());
   profile_data.append(event_name);
   profile_data.append(event_count);
   profile_data.append("\t");
@@ -86,14 +85,13 @@ void profileEvent(const char *event_name, bool compare_previous = false) {
     profile_data.append("\t");
     profile_data.append(std::to_string(duration_time.count()));
     DEBUG("TimeDuration, Event = %s, Time = %f \n", event_name,
-         duration_time.count());
+          duration_time.count());
   }
 
   profile_data.append("\n");
 
   previous_time = time_reading; // set the previous time reading to the current
                                 // profiled time
-
 }
 }
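As context for the formatting changes above: profileEvent() synchronizes the device, increments a per-event-name counter, and appends the absolute time since profiling started (and, when compare_previous is set, the duration since the previous event) to the profile data. A simplified CPU-only sketch of that bookkeeping, with illustrative names and stdout in place of the runtime's profile buffer, might look like:

```cpp
#include <chrono>
#include <cstdio>
#include <map>
#include <string>

using Clock = std::chrono::steady_clock;

static std::map<std::string, int> counters;              // occurrences per event name
static const Clock::time_point zero_time = Clock::now(); // profiling start
static Clock::time_point previous_time = zero_time;      // last recorded event

void profileEventSketch(const char *event_name, bool compare_previous = false) {
  const auto now = Clock::now();
  const int count = ++counters[event_name];
  const std::chrono::duration<double> absolute = now - zero_time;
  std::printf("AbsoluteTime, Event = %s#%d, Time = %f\n", event_name, count,
              absolute.count());
  if (compare_previous) {
    const std::chrono::duration<double> delta = now - previous_time;
    std::printf("TimeDuration, Event = %s#%d, Time = %f\n", event_name, count,
                delta.count());
  }
  previous_time = now; // next event's duration is measured from here
}
```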
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 9250810a2010a235074c0d29b8fe8bd63650324c..939f6e061985b27b4369b37925c0d2bf6a7c9a5d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -1,11 +1,11 @@
 //===--------------------------- tensor_runtime_cpu.cc --------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the custom implementation of non-approximated and 
-// approximated  versions of tensor operations to execute on CPUs. The 
-// software approximations implemented for tensor convolutions are feature 
-// sampling and perforation for FP32 compute precisions only.  
+//
+//  This file consists of the custom implementations of non-approximated and
+// approximated versions of tensor operations that execute on CPUs. The
+// software approximations implemented for tensor convolutions are feature
+// sampling and perforation, for FP32 compute precision only.
 //
 //===----------------------------------------------------------------------===//
 
@@ -29,7 +29,7 @@
 #include <string>
 #include <vector>
 #include <math.h>
-#include<bits/stdc++.h>
+#include <bits/stdc++.h>
 #include <pthread.h>
 #include <omp.h>
 
@@ -39,1081 +39,1130 @@
 #include "tensor_cpu_runtime.h"
 
 void llvm_hpvm_initTensorRtCPU() {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
 
 void llvm_hpvm_cleanupTensorRtCPU() {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
 
 void hpvm_request_tensorCPU(void *tensor, int destination) {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
-  
+
 std::vector<void *> PtrVect;
 
 void freeBatchMemory() {
-    for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
-        free(*it);
-    }
-    PtrVect.erase(PtrVect.begin(), PtrVect.end());
+  for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
+    free(*it);
+  }
+  PtrVect.erase(PtrVect.begin(), PtrVect.end());
 }
 
-
-int getTypeSizeCPU(int data_type)  __attribute__((always_inline));
+int getTypeSizeCPU(int data_type) __attribute__((always_inline));
 inline int getTypeSizeCPU(int data_type) {
-    return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
+  return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
 }
 
-void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline));
-inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) {
-    int type_size = getTypeSizeCPU(data_type);
-    size_t size_in_bytes = type_size * num_elems;
-    tensor->size_in_bytes = size_in_bytes;
+void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems)
+    __attribute__((always_inline));
+inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type,
+                              size_t num_elems) {
+  int type_size = getTypeSizeCPU(data_type);
+  size_t size_in_bytes = type_size * num_elems;
+  tensor->size_in_bytes = size_in_bytes;
 }
 
-void allocateMemCPU(struct Tensor *tensor, int data_type, 
-                    size_t num_elems, bool freeMemory = true) __attribute__((always_inline));
-inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) {
-    setSizeInBytesCPU(tensor, data_type, num_elems);
-    tensor->data_type = data_type;
-    tensor->num_elems = num_elems;
-    tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
-    if(freeMemory)
-        PtrVect.push_back(tensor->host_data);
+void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
+                    bool freeMemory = true) __attribute__((always_inline));
+inline void allocateMemCPU(struct Tensor *tensor, int data_type,
+                           size_t num_elems, bool freeMemory) {
+  setSizeInBytesCPU(tensor, data_type, num_elems);
+  tensor->data_type = data_type;
+  tensor->num_elems = num_elems;
+  tensor->host_data =
+      (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
+  if (freeMemory)
+    PtrVect.push_back(tensor->host_data);
 }
 
-void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) __attribute__((always_inline));
-inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
-    Tensor *tensor = (Tensor *)tensor_ptr;
-    if (tensor->size_in_bytes != size_in_bytes) {
-        printf("The destination and source sizes don't match");
-    }
-    memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough?
+void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes)
+    __attribute__((always_inline));
+inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr,
+                              size_t size_in_bytes) {
+  Tensor *tensor = (Tensor *)tensor_ptr;
+  if (tensor->size_in_bytes != size_in_bytes) {
+    printf("The destination and source sizes don't match\n");
+  }
+  memcpy(tensor->host_data, data_ptr,
+         size_in_bytes); // Is this efficient enough?
 }
 
 void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
-                     size_t dim2_size, size_t dim3_size, size_t dim4_size, 
-                    bool freeMemory = true) __attribute__((always_inline));
-inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,         
-                                    size_t dim2_size, size_t dim3_size, 
-                                    size_t dim4_size, bool freeMemory) {
-    struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-    if(freeMemory)
-        PtrVect.push_back(tensor);
-    allocateMemCPU(tensor, data_type, num_elems, freeMemory);
-    
-    // Setting the tensor dimensions
-    size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    dim_sizes[2] = dim3_size;
-    dim_sizes[3] = dim4_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 4;
-    tensor->data_placement = HOST;    
-    return tensor;
+                        size_t dim2_size, size_t dim3_size, size_t dim4_size,
+                        bool freeMemory = true) __attribute__((always_inline));
+inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
+                               size_t dim2_size, size_t dim3_size,
+                               size_t dim4_size, bool freeMemory) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  if (freeMemory)
+    PtrVect.push_back(tensor);
+  allocateMemCPU(tensor, data_type, num_elems, freeMemory);
+
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  dim_sizes[3] = dim4_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 4;
+  tensor->data_placement = HOST;
+  return tensor;
 }
 
-void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                    int horizontal_pad, int vertical_stride,
-                                    int horizontal_stride, int conv_mode,
-                                    int compute_precision) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-    int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int output_size = output_width * output_height;
-    printf("--CREATE 4D TENSOR\n");    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    printf("CREATED 4D TENSOR\n");
-    long int conv_data_size = 
-        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(conv_data_size);
-    printf("host data: %p\n", host_data);
-    printf("number of batches: %d\n", batch_size);
-    omp_set_num_threads(4);
-     #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                        + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                                && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                        + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+  int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
+                          horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int output_size = output_width * output_height;
+  printf("--CREATE 4D TENSOR\n");
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+  printf("CREATED 4D TENSOR\n");
+  long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(conv_data_size);
+  printf("host data: %p\n", host_data);
+  printf("number of batches: %d\n", batch_size);
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        for (int w = 0; w < output_width; w++) {
+          const int inH = h * vertical_stride - vertical_pad;
+          const int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
-        for (int p = 0; p < num_filters; ++p) {
-             for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+      }
+    }
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(host_data);
-    printf("END: %p\n", output);
-    return output;
+  }
+  free(host_data);
+  printf("END: %p\n", output);
+  return output;
 }
 
-void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
-                                                int vertical_pad, int horizontal_pad, 
-                                                int vertical_stride, int horizontal_stride, 
-                                                int conv_mode, int compute_precision, 
-                                                int skip_every, int start) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
-    const int kernel_height = filter->dims.dim_sizes[2];
-    const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int num_filter_elem = kernel_height * kernel_width * channels;
-
-    const int remainder = ((num_filter_elem - start) % skip_every > 0);
-    const int reduced_num_filter_elem = 
-            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
-                                    * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-   
-    const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
-    float *reduced_kernels = (float *) malloc(reduced_filer_size);
-   
-    float fac =  (((float) skip_every) / ((float) skip_every - 1));
-    int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-    // Create reduced filter
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int f = 0; f < num_filters; f++) {
-        for(int i = 0; i < reduced_num_filter_elem; i++) {
-            int ch = i / reduced_filter_dim;
-            int offset  = (start + ch) % skip_every; 
-            int in_index;
-            if(i < offset) {
-                in_index = i;
-            } else {
-                in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) 
-                        + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1;
-            }
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                                fac * host_filter[num_filter_elem * f + in_index];
+void *tensorRegularFilterSamplingConvolutionCPU(
+    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int compute_precision, int skip_every, int start) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int num_filter_elem = kernel_height * kernel_width * channels;
+
+  const int remainder = ((num_filter_elem - start) % skip_every > 0);
+  const int reduced_num_filter_elem =
+      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  const int reduced_filter_size =
+      sizeof(float) * num_filters * reduced_num_filter_elem;
+  float *reduced_kernels = (float *)malloc(reduced_filter_size);
+
+  float fac = (((float)skip_every) / ((float)skip_every - 1));
+  int reduced_filter_dim = reduced_num_filter_elem / channels;
+
+  // Create reduced filter
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int f = 0; f < num_filters; f++) {
+    for (int i = 0; i < reduced_num_filter_elem; i++) {
+      int ch = i / reduced_filter_dim;
+      int offset = (start + ch) % skip_every;
+      int in_index;
+      if (i < offset) {
+        in_index = i;
+      } else {
+        in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) +
+                   (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                   offset - 1;
+      }
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          fac * host_filter[num_filter_elem * f + in_index];
+    }
+  }
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int h = 0; h < output_height; h++) {
+      for (int w = 0; w < output_width; w++) {
+        const int inH = h * vertical_stride - vertical_pad;
+        const int inW = w * horizontal_stride - horizontal_pad;
+        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
+          int in_index;
+          const int ch = fi / reduced_filter_dim;
+          const int offset = (start + ch) % skip_every;
+          if (fi < offset) {
+            in_index = fi;
+          } else {
+            in_index =
+                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                offset - 1;
+          }
+          const int i =
+              (in_index % (kernel_width * kernel_height)) / kernel_width;
+          const int j = in_index % kernel_width;
+          const int output_index = h * output_width + w;
+          const int out_index = b * reduced_num_filter_elem * output_size +
+                                output_index * reduced_num_filter_elem + fi;
+          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+              inW + j < image_width) {
+            host_data[out_index] =
+                host_image[((b * channels + ch) * image_height + (inH + i)) *
+                               image_width +
+                           (inW + j)];
+          } else {
+            host_data[out_index] = 0;
+          }
         }
+      }
     }
 
-    omp_set_num_threads(4);   
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
-                        int in_index;
-                        const int ch = fi / reduced_filter_dim;
-                        const int offset  = (start + ch) % skip_every;
-                        if(fi < offset) {
-                            in_index = fi;
-                        } else {
-                            in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-                        }
-                        const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; 
-                        const int j = in_index % kernel_width;
-                        const int output_index = h * output_width + w;
-                        const int out_index = b * reduced_num_filter_elem * output_size 
-                                            + output_index * reduced_num_filter_elem + fi;
-                        if(inH + i >= 0 && inH + i < image_height 
-                        && inW + j >= 0 && inW + j < image_width) {
-                            host_data[out_index] = 
-                                host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                        } else {
-                            host_data[out_index] = 0;
-                        }
-                }
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < reduced_num_filter_elem; ++k) {
+          int input_index = k + reduced_num_filter_elem * m +
+                            b * reduced_num_filter_elem * output_size;
+          sum += host_data[input_index] *
+                 reduced_kernels[p * reduced_num_filter_elem + k];
         }
-
-         // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < reduced_num_filter_elem; ++k) {
-                    int input_index = k + reduced_num_filter_elem * m 
-                                    + b * reduced_num_filter_elem * output_size;
-                    sum += host_data[input_index] 
-                            * reduced_kernels[p * reduced_num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
-        }
-
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(reduced_kernels);
-    free(host_data);
-  
-    return output;
+  }
+  free(reduced_kernels);
+  free(host_data);
+
+  return output;
 }
 
-void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
-                                                  int vertical_pad, int horizontal_pad, 
-                                                  int vertical_stride, int horizontal_stride, 
-                                                  int conv_mode, int compute_precision, 
-                                                  int skip_every, int start) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
-    const int kernel_height = filter->dims.dim_sizes[2];
-    const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int num_filter_elem = kernel_height * kernel_width * channels;
-
-    const int remainder = ((num_filter_elem - start) % skip_every > 0);
-    const int reduced_num_filter_elem = 
-            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
-                                    * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-   
-    const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
-    float *reduced_kernels = (float *) malloc(reduced_filer_size);
-   
-    float fac =  (((float) skip_every) / ((float) skip_every - 1));
-    int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-    // Create Reduced filter
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int f = 0; f < num_filters; f++) {
-        for(int i = 0; i < start; i++) {
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                                        host_filter[num_filter_elem * f + i];
-        }
-        #pragma omp simd
-        for(int i = start; i < reduced_num_filter_elem; i++) {
-            int in_index = ((i - start + 1) * skip_every) / (skip_every - 1)
-                    + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1;
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                            fac * host_filter[num_filter_elem * f + in_index];
-        }
+void *tensorIrregularFilterSamplingConvolutionCPU(
+    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int compute_precision, int skip_every, int start) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int num_filter_elem = kernel_height * kernel_width * channels;
+
+  const int remainder = ((num_filter_elem - start) % skip_every > 0);
+  const int reduced_num_filter_elem =
+      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  const int reduced_filter_size =
+      sizeof(float) * num_filters * reduced_num_filter_elem;
+  float *reduced_kernels = (float *)malloc(reduced_filter_size);
+
+  float fac = (((float)skip_every) / ((float)skip_every - 1));
+
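+  // Filter sampling: one of every `skip_every` filter elements at or after
+  // index `start` is dropped, and the surviving weights are scaled by
+  // fac = skip_every / (skip_every - 1) to compensate for the removed
+  // contributions.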
+  // Create Reduced filter
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int f = 0; f < num_filters; f++) {
+    for (int i = 0; i < start; i++) {
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          host_filter[num_filter_elem * f + i];
     }
-
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
-                        int in_index;
-                        int offset = start;
-                        if(fi < offset) {
-                            in_index = fi;
-                        } else {
-                            in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-                             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-                        }
-                        const int ch = in_index / (kernel_width * kernel_height);
-                        const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; 
-                        const int j = in_index % kernel_width;
-                        const int output_index = h * output_width + w;
-                        const int out_index = b * reduced_num_filter_elem * output_size 
-                                            + output_index * reduced_num_filter_elem + fi;
-                        if(inH + i >= 0 && inH + i < image_height 
-                        && inW + j >= 0 && inW + j < image_width) {
-                            host_data[out_index] = 
-                                host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                        } else {
-                            host_data[out_index] = 0;
-                        }
-                }
-            }
+#pragma omp simd
+    for (int i = start; i < reduced_num_filter_elem; i++) {
+      int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) +
+                     (((i - start + 1) * skip_every) % (skip_every - 1) > 0) +
+                     start - 1;
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          fac * host_filter[num_filter_elem * f + in_index];
+    }
+  }
+
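+  // Gather the sampled input values into host_data in an im2col-style layout
+  // (one row of reduced_num_filter_elem values per output pixel, per batch
+  // image); locations that fall in the padding are zero-filled.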
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int h = 0; h < output_height; h++) {
+      for (int w = 0; w < output_width; w++) {
+        const int inH = h * vertical_stride - vertical_pad;
+        const int inW = w * horizontal_stride - horizontal_pad;
+        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
+          int in_index;
+          int offset = start;
+          if (fi < offset) {
+            in_index = fi;
+          } else {
+            in_index =
+                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                offset - 1;
+          }
+          const int ch = in_index / (kernel_width * kernel_height);
+          const int i =
+              (in_index % (kernel_width * kernel_height)) / kernel_width;
+          const int j = in_index % kernel_width;
+          const int output_index = h * output_width + w;
+          const int out_index = b * reduced_num_filter_elem * output_size +
+                                output_index * reduced_num_filter_elem + fi;
+          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+              inW + j < image_width) {
+            host_data[out_index] =
+                host_image[((b * channels + ch) * image_height + (inH + i)) *
+                               image_width +
+                           (inW + j)];
+          } else {
+            host_data[out_index] = 0;
+          }
         }
+      }
+    }
 
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < reduced_num_filter_elem; ++k) {
-                    int input_index = k + reduced_num_filter_elem * m 
-                                    + b * reduced_num_filter_elem * output_size;
-                    sum += host_data[input_index] 
-                                * reduced_kernels[p * reduced_num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+    // Tensor Multiply
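+    // output[b][p][m] = sum_k host_data[b][m][k] * reduced_kernels[p][k],
+    // i.e. a per-image GEMM of the gathered patches with the reduced filters.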
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < reduced_num_filter_elem; ++k) {
+          int input_index = k + reduced_num_filter_elem * m +
+                            b * reduced_num_filter_elem * output_size;
+          sum += host_data[input_index] *
+                 reduced_kernels[p * reduced_num_filter_elem + k];
         }
-
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(reduced_kernels);
-    free(host_data);
-  
-    return output;
-}
+  }
+  free(reduced_kernels);
+  free(host_data);
 
-void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
-                                int conv_mode, int compute_precision, int row, int start) {
-    
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-
-    int full_output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int full_output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int full_output_size = full_output_height * full_output_width;
-
-    Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                            full_output_height, full_output_width);
-    float * __restrict__ full_output_data = (float *)full_output->host_data;
-   
-    int remainder = (full_output_height - start) % row > 0;
-    int output_height = 
-            full_output_height - ((full_output_height - start) / row) - remainder;
-
-    int output_width = full_output_width;
-    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
-                                                * output_height * output_width);   
-    int output_size = output_width * output_height;
-    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
-                                                        * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
+  return output;
+}
 
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                int inH;
-                if(h < start) {
-                    inH = h * vertical_stride - vertical_pad;
-                } else {
-                    int h_index = ((h - start + 1) * row) / (row - 1) 
-                                + (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
-                    inH = h_index * vertical_stride - vertical_pad;
-                }
-                for(int w = 0; w < output_width; w++) {
-                    int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = 
-                                        (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                    + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                            && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision, int row,
+                                  int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+
+  int full_output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int full_output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int full_output_size = full_output_height * full_output_width;
+
+  Tensor *full_output = (Tensor *)create4DTensorCPU(
+      0, 0, batch_size, num_filters, full_output_height, full_output_width);
+  float *__restrict__ full_output_data = (float *)full_output->host_data;
+
+  int remainder = (full_output_height - start) % row > 0;
+  int output_height =
+      full_output_height - ((full_output_height - start) / row) - remainder;
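+  // Row perforation: only output_height of the full_output_height rows are
+  // computed; one of every `row` output rows at or after `start` is skipped
+  // and reconstructed later by interpolation.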
+
+  int output_width = full_output_width;
+  float *output_data = (float *)malloc(
+      sizeof(float) * batch_size * num_filters * output_height * output_width);
+  int output_size = output_width * output_height;
+  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        int inH;
+        if (h < start) {
+          inH = h * vertical_stride - vertical_pad;
+        } else {
+          int h_index = ((h - start + 1) * row) / (row - 1) +
+                        (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
+          inH = h_index * vertical_stride - vertical_pad;
+        }
+        for (int w = 0; w < output_width; w++) {
+          int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
+      }
+    }
 
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
+    }
 
-        // Interpolate
-        for (int p = 0; p < num_filters; ++p) {
-            for(int h = 0; h < full_output_height; h++) { 
-                for(int w = 0; w < full_output_width; w++) {
-                   int full_output_index = b * num_filters * full_output_size 
-                            + p * full_output_size + h * full_output_width  + w;
-                   if(h < start) {
-                       int output_index = b * num_filters * output_size 
-                                        + p * output_size + h * output_width  + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                   } else if(h == full_output_height - 1) {
-                       int output_index = b * num_filters * output_size + p * output_size 
-                                                + (output_height - 1) * output_width  + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                    } else if(h == 0) {
-                        int output_index = b * num_filters * output_size 
-                                            + p * output_size + 0 * output_width  + w;
-                        full_output_data[full_output_index] = output_data[output_index]; 
-                    } else if((h - start) % row == 0) {
-                        int row_index = h - ((h + 1 - start) / row); 
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + row_index * output_width + w;
-                        full_output_data[full_output_index] = 
-                            (output_data[output_index] + output_data[output_index - output_width]) / 2;
-                   } else {
-                       int remainder = ((h + 1 - start) % row) > 0;
-                       int row_index = h - ((h + 1 - start) / row) - remainder;
-                       int output_index = b * num_filters * output_size + p * output_size 
-                                                        + row_index * output_width + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                  }
-                }
-            }
-         }
+    // Interpolate
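+    // Rows that were computed are copied through; each skipped row is filled
+    // with the average of the two neighbouring computed rows (boundary rows
+    // fall back to the nearest computed row).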
+    for (int p = 0; p < num_filters; ++p) {
+      for (int h = 0; h < full_output_height; h++) {
+        for (int w = 0; w < full_output_width; w++) {
+          int full_output_index = b * num_filters * full_output_size +
+                                  p * full_output_size + h * full_output_width +
+                                  w;
+          if (h < start) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (h == full_output_height - 1) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               (output_height - 1) * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (h == 0) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               0 * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if ((h - start) % row == 0) {
+            int row_index = h - ((h + 1 - start) / row);
+            int output_index = b * num_filters * output_size + p * output_size +
+                               row_index * output_width + w;
+            full_output_data[full_output_index] =
+                (output_data[output_index] +
+                 output_data[output_index - output_width]) /
+                2;
+          } else {
+            int remainder = ((h + 1 - start) % row) > 0;
+            int row_index = h - ((h + 1 - start) / row) - remainder;
+            int output_index = b * num_filters * output_size + p * output_size +
+                               row_index * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          }
+        }
+      }
     }
-    free(output_data);
-    free(host_data);
+  }
+  free(output_data);
+  free(host_data);
 
-    return full_output;
+  return full_output;
 }
 
-void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
-                                int conv_mode, int compute_precision, int col, int start) {
-    
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-    
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-    int full_output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int full_output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int full_output_size = full_output_height * full_output_width;
-
-    Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    full_output_height, full_output_width);
-    float * __restrict__ full_output_data = (float *)full_output->host_data;
-
-    int remainder = (full_output_width - start) % col > 0;
-    int output_width = full_output_width - ((full_output_width - start) / col) - remainder;
-
-    int output_height = full_output_height;
-    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
-                                                    * output_height * output_width);
-    int output_size = output_width * output_height;
-    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
-                                                        * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                int inH = h * vertical_stride - vertical_pad;
-                for(int w = 0; w < output_width; w++) {
-                    int inW;
-                    if(w < start) {
-                        inW = w * horizontal_stride - horizontal_pad;
-                    } else {
-                        int w_index = ((w - start + 1) * col) / (col - 1) 
-                                + (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
-                        inW = w_index * horizontal_stride - horizontal_pad;
-                    }
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = 
-                                        (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                    + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                            && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m 
-                                            + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
+void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision, int col,
+                                  int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+  int full_output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int full_output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int full_output_size = full_output_height * full_output_width;
+
+  Tensor *full_output = (Tensor *)create4DTensorCPU(
+      0, 0, batch_size, num_filters, full_output_height, full_output_width);
+  float *__restrict__ full_output_data = (float *)full_output->host_data;
+
+  int remainder = (full_output_width - start) % col > 0;
+  int output_width =
+      full_output_width - ((full_output_width - start) / col) - remainder;
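+  // Column perforation: only output_width of the full_output_width columns
+  // are computed; one of every `col` output columns at or after `start` is
+  // skipped and reconstructed later by interpolation.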
+
+  int output_height = full_output_height;
+  float *output_data = (float *)malloc(
+      sizeof(float) * batch_size * num_filters * output_height * output_width);
+  int output_size = output_width * output_height;
+  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        int inH = h * vertical_stride - vertical_pad;
+        for (int w = 0; w < output_width; w++) {
+          int inW;
+          if (w < start) {
+            inW = w * horizontal_stride - horizontal_pad;
+          } else {
+            int w_index = ((w - start + 1) * col) / (col - 1) +
+                          (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
+            inW = w_index * horizontal_stride - horizontal_pad;
+          }
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
+      }
+    }
 
-        // Interpolate
-        for (int p = 0; p < num_filters; ++p) {
-            for(int h = 0; h < full_output_height; h++) {
-                for(int w = 0; w < full_output_width; w++) {
-                    int full_output_index = b * num_filters * full_output_size 
-                                + p * full_output_size + h * full_output_width  + w;
-                     if(w < start) {
-                         int output_index = b * num_filters * output_size 
-                                        + p * output_size + h * output_width + w;
-                         full_output_data[full_output_index] = output_data[output_index];
-                    } else if(w == full_output_width - 1) {
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                    + h * output_width  + output_width - 1;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    } else if(w == 0) {
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                                + h * output_width  + 0;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    } else if((w - start) % col == 0) {
-                        int col_index = w - ((w + 1 - start) / col);
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + h * output_width + col_index;
-                        full_output_data[full_output_index] = 
-                            (output_data[output_index] + output_data[output_index - 1]) / 2;
-                    } else {
-                        int remainder = ((w + 1 - start) % col) > 0;
-                        int col_index = w - ((w + 1 - start) / col) - remainder;
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + h * output_width + col_index;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    }
-                }
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(output_data);
-    free(host_data);
-
-    return full_output;
-}
 
-void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, 
-                          int vertical_pad, int horizontal_pad, 
-                          int vertical_stride, int horizontal_stride, 
-                          int conv_mode, int compute_precision, 
-                          int row, int col, int skip_every, int start) {
-    if(row > 1) {
-        printf("ROW PERFORATION\n");
-        return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                        horizontal_pad, vertical_stride, horizontal_stride, conv_mode, 
-                        compute_precision, row, start);
-    } 
-    if(col > 1) {
-     printf("COL PERFORATION\n");
-     return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                             horizontal_pad, vertical_stride, horizontal_stride, conv_mode, 
-                            compute_precision, col, start);
-    }  
-    if(skip_every > 1) {
-        printf("INPUT FILTERING\n");
-        Tensor *input = (Tensor *)input_ptr;
-        Tensor *filter = (Tensor *)filter_ptr;
-
-        const int kernel_height = filter->dims.dim_sizes[2];
-        const int kernel_width = filter->dims.dim_sizes[3];
-
-        if(!(kernel_height * kernel_width % skip_every)) {
-            return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
-                                    vertical_pad, horizontal_pad, vertical_stride,
-                                    horizontal_stride, conv_mode, 
-                                    compute_precision, skip_every, start);
+    // Interpolate
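+    // Columns that were computed are copied through; each skipped column is
+    // filled with the average of the two neighbouring computed columns
+    // (boundary columns fall back to the nearest computed column).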
+    for (int p = 0; p < num_filters; ++p) {
+      for (int h = 0; h < full_output_height; h++) {
+        for (int w = 0; w < full_output_width; w++) {
+          int full_output_index = b * num_filters * full_output_size +
+                                  p * full_output_size + h * full_output_width +
+                                  w;
+          if (w < start) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (w == full_output_width - 1) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + output_width - 1;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (w == 0) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + 0;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if ((w - start) % col == 0) {
+            int col_index = w - ((w + 1 - start) / col);
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + col_index;
+            full_output_data[full_output_index] =
+                (output_data[output_index] + output_data[output_index - 1]) / 2;
+          } else {
+            int remainder = ((w + 1 - start) % col) > 0;
+            int col_index = w - ((w + 1 - start) / col) - remainder;
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + col_index;
+            full_output_data[full_output_index] = output_data[output_index];
+          }
         }
-        return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
-                                    vertical_pad, horizontal_pad, vertical_stride, 
-                                    horizontal_stride, conv_mode, 
-                                    compute_precision, skip_every, start);
+      }
     }
-    printf("---REGULAR CONV\n");
-    return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                                 horizontal_pad, vertical_stride, 
-                                 horizontal_stride, conv_mode, compute_precision);
+  }
+  free(output_data);
+  free(host_data);
+
+  return full_output;
 }
 
-void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){
-	
-    Tensor *input = (Tensor *)input_ptr;
+void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                          int horizontal_pad, int vertical_stride,
+                          int horizontal_stride, int conv_mode,
+                          int compute_precision, int row, int col,
+                          int skip_every, int start) {
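+  // Dispatch on the approximation knobs: row > 1 selects row perforation,
+  // col > 1 selects column perforation, skip_every > 1 selects filter
+  // sampling (regular when the kernel size is divisible by skip_every,
+  // irregular otherwise); otherwise fall back to the exact convolution.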
+  if (row > 1) {
+    printf("ROW PERFORATION\n");
+    return tensorRowPerfConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, row, start);
+  }
+  if (col > 1) {
+    printf("COL PERFORATION\n");
+    return tensorColPerfConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, col, start);
+  }
+  if (skip_every > 1) {
+    printf("INPUT FILTERING\n");
     Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
+
     const int kernel_height = filter->dims.dim_sizes[2];
     const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int filter_dim = kernel_height * kernel_width;
-    const int num_filter_elem = filter_dim * channels;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, channels, 
-                                                    output_height * output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int conv_data_size = 
-        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(conv_data_size);
-   
-    omp_set_num_threads(4);
-     #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                        + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                                && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                        + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+
+    if (!(kernel_height * kernel_width % skip_every)) {
+      return tensorRegularFilterSamplingConvolutionCPU(
+          input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+          horizontal_stride, conv_mode, compute_precision, skip_every, start);
+    }
+    return tensorIrregularFilterSamplingConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, skip_every, start);
+  }
+  printf("---REGULAR CONV\n");
+  return tensorRegularConvolutionCPU(
+      input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+      horizontal_stride, conv_mode, compute_precision);
+}
+
+void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups) {
+
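+  // This variant keeps per-channel partial results: the output is laid out as
+  // [batch][filter][channel][output pixel], and each channel's contribution
+  // is stored separately rather than summed across channels.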
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int filter_dim = kernel_height * kernel_width;
+  const int num_filter_elem = filter_dim * channels;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(
+      0, 0, batch_size, num_filters, channels, output_height * output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int conv_data_size = sizeof(float) * num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(conv_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        for (int w = 0; w < output_width; w++) {
+          const int inH = h * vertical_stride - vertical_pad;
+          const int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
-        for (int p = 0; p < num_filters; ++p) {
-             for (int m = 0; m < output_size; ++m) {
-                for (int ch = 0; ch < channels; ch++) {
-                    float sum = 0;
-                    #pragma omp simd reduction(+:sum)
-                    for (int k = 0; k < filter_dim; ++k) {
-                        int input_index = k + ch * filter_dim + num_filter_elem * m + b * num_filter_elem * output_size;
-                        sum += host_data[input_index] * host_filter[p * num_filter_elem + ch * filter_dim + k];
-                    }
-                    output_data[b * (output_size * num_filters * channels) + p * output_size * channels + ch * output_size + m] = sum;
-                }
-            }
+      }
+    }
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        for (int ch = 0; ch < channels; ch++) {
+          float sum = 0;
+#pragma omp simd reduction(+ : sum)
+          for (int k = 0; k < filter_dim; ++k) {
+            int input_index = k + ch * filter_dim + num_filter_elem * m +
+                              b * num_filter_elem * output_size;
+            sum += host_data[input_index] *
+                   host_filter[p * num_filter_elem + ch * filter_dim + k];
+          }
+          output_data[b * (output_size * num_filters * channels) +
+                      p * output_size * channels + ch * output_size + m] = sum;
         }
+      }
     }
+  }
 
-    free(host_data);
-    return output;
+  free(host_data);
+  return output;
 }
 
-void* tensorAddCPU(void *x_ptr, void *bias_ptr) {
-    Tensor *x = (Tensor *)x_ptr;
-    Tensor *bias = (Tensor *)bias_ptr;
-    
-    float * __restrict__ x_data = (float *)x->host_data;
-    float * __restrict__ bias_data = (float *)bias->host_data;
-    int n = x->dims.dim_sizes[0];
-    int c = x->dims.dim_sizes[1];
-    int h = x->dims.dim_sizes[2];
-    int w = x->dims.dim_sizes[3];
-    
-    if(x->num_elems == bias->num_elems) {
-        int const1 = c * h * w;
-        int const2 = h * w;
-         omp_set_num_threads(4);
-        #pragma omp parallel for
-        for (int i = 0; i < n; i++) { 
-            for (int j = 0; j < c; j++) {
-                 #pragma omp simd collapse(2)
-                for (int k = 0; k < h; k++) {
-                    for (int l = 0; l < w; l++) {
-                        x_data[i * const1 + j * const2 + (k * w)  + l] += 
-                                bias_data[i * const1 + j * const2 + (k*w) + l];
-                    }
-                }
-            }
+void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
+  Tensor *x = (Tensor *)x_ptr;
+  Tensor *bias = (Tensor *)bias_ptr;
+
+  float *__restrict__ x_data = (float *)x->host_data;
+  float *__restrict__ bias_data = (float *)bias->host_data;
+  int n = x->dims.dim_sizes[0];
+  int c = x->dims.dim_sizes[1];
+  int h = x->dims.dim_sizes[2];
+  int w = x->dims.dim_sizes[3];
+
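+  // If bias has the same number of elements as x, add element-wise;
+  // otherwise treat bias as one value per channel and broadcast it across
+  // N, H, and W.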
+  if (x->num_elems == bias->num_elems) {
+    int const1 = c * h * w;
+    int const2 = h * w;
+    omp_set_num_threads(4);
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+#pragma omp simd collapse(2)
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            x_data[i * const1 + j * const2 + (k * w) + l] +=
+                bias_data[i * const1 + j * const2 + (k * w) + l];
+          }
         }
-    } else {
-         omp_set_num_threads(4);
-        #pragma omp parallel for
-        for (int i = 0; i < n; i++) {
-            for (int j = 0; j < c; j++) {
-                #pragma omp simd collapse(2)
-                for (int k = 0; k < h; k++) {
-                    for (int l = 0; l < w; l++) {
-                        x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
-                    }
-                }
-            }
-        }   
+      }
+    }
+  } else {
+    omp_set_num_threads(4);
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+#pragma omp simd collapse(2)
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
+          }
+        }
+      }
     }
-    
-    return x;
+  }
+
+  return x;
 }
 
 float max(float v1, float v2) __attribute__((always_inline));
-inline float maximum(float v1, float v2){
-    return (v1 < v2) ? v2 : v1;
-}
+inline float maximum(float v1, float v2) { return (v1 < v2) ? v2 : v1; }
 
 void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
-             int window_width, int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride) {
-   
-    Tensor *input = (Tensor *)input_ptr;
-    float * __restrict__ input_data = (float *)input->host_data;
-    
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    
-    int output_height = 
-        1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
-    int output_width = 
-        1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride);
-    
-    int center_x = (window_width - 1) / 2 - horizontal_pad;
-    int center_y = (window_height - 1) / 2 - vertical_pad;
-    int x_radius = (window_width - 1) / 2;
-    int y_radius = (window_height - 1) / 2;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, channels, 
-                                                output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-   
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for (int b = 0; b < batch_size; b++) {
-        for (int ch = 0; ch < channels; ch++) {
-            int ii = 0, jj = 0;
-            for (int r = center_y; r < image_height + vertical_pad - y_radius; 
-                                                        r += vertical_stride) {
-                for (int c = center_x; c < image_width + horizontal_pad - x_radius; 
-                                                            c += horizontal_stride) {
-                    float val = (poolFunction == 0) ? -3.40282e+38 : 0;
-                    int y_radius_var = y_radius - r;
-                    int y_radius_var_max = y_radius_var + image_height;
-                    int x_radius_var = x_radius - c;
-                    int x_radius_var_max = x_radius_var + image_width;
-                    int ki_min = (y_radius_var > 0) ? 
-                        ((y_radius_var < window_height) ? y_radius_var : -1) : 0;
-                    int ki_max = (y_radius_var_max < window_height) ? 
-                                 ((y_radius_var_max >= 0) ?  y_radius_var_max : -1) : window_height;
-                    int kj_min = (x_radius_var > 0) ? 
-                                ((x_radius_var < window_width) ? x_radius_var : -1) : 0;
-                    int kj_max = (x_radius_var_max < window_width) ? 
-                                    ((x_radius_var_max >= 0) ?  x_radius_var_max : -1) : window_width;
-                                        
-                    if(ki_min != ki_max && kj_min != kj_max && ki_min != -1 
-                            && ki_max != -1 && kj_min != -1 && kj_max != -1) {
-                        if(!poolFunction) {
-                            for (int ki = 0; ki < window_height; ki++) {
-                                for (int kj = 0; kj < window_width; kj++) {
-                                    val = maximum(
-                                    val,
-                                    input_data[b * (channels * image_height * image_width) +
-                                    ch * (image_height * image_width) +
-                                    (r - y_radius + ki) * image_width + (c - x_radius + kj)]);
-                                }
-                            }
-                        } else {
-                            for (int ki = 0; ki < window_height; ki++) {
-                                for (int kj = 0; kj < window_width; kj++) {
-                                    val += input_data[b * (channels * image_height * image_width) 
-                                            + ch * (image_height * image_width) +
-                                            (r - y_radius + ki) * image_width + (c - x_radius + kj)];
-                                }
-                            }
-                        }
-                    }
-                    if (poolFunction == 1) {
-                        val /= window_height * window_width;
-                    }
-                    output_data[b * (channels * output_height * output_width) +
-                        ch * (output_height * output_width) + ii * output_width + jj] = val;
-                    jj++;
-                    if (jj == output_width) {
-                        jj = 0;
-                        ii++;
-                    }
+                       int window_width, int vertical_pad, int horizontal_pad,
+                       int vertical_stride, int horizontal_stride) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  float *__restrict__ input_data = (float *)input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+
+  int output_height =
+      1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
+  int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) /
+                          horizontal_stride);
+
+  int center_x = (window_width - 1) / 2 - horizontal_pad;
+  int center_y = (window_height - 1) / 2 - vertical_pad;
+  int x_radius = (window_width - 1) / 2;
+  int y_radius = (window_height - 1) / 2;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
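+  // poolFunction == 0: max pooling (start from the lowest float value);
+  // poolFunction == 1: average pooling (accumulate, then divide by the
+  // window area).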
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      int ii = 0, jj = 0;
+      for (int r = center_y; r < image_height + vertical_pad - y_radius;
+           r += vertical_stride) {
+        for (int c = center_x; c < image_width + horizontal_pad - x_radius;
+             c += horizontal_stride) {
+          float val = (poolFunction == 0) ? -3.40282e+38 : 0;
+          int y_radius_var = y_radius - r;
+          int y_radius_var_max = y_radius_var + image_height;
+          int x_radius_var = x_radius - c;
+          int x_radius_var_max = x_radius_var + image_width;
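+          // Clamp the pooling window to the valid part of the image; a bound
+          // of -1 marks a window that lies entirely in the padded border and
+          // is skipped below.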
+          int ki_min =
+              (y_radius_var > 0)
+                  ? ((y_radius_var < window_height) ? y_radius_var : -1)
+                  : 0;
+          int ki_max = (y_radius_var_max < window_height)
+                           ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
+                           : window_height;
+          int kj_min = (x_radius_var > 0)
+                           ? ((x_radius_var < window_width) ? x_radius_var : -1)
+                           : 0;
+          int kj_max = (x_radius_var_max < window_width)
+                           ? ((x_radius_var_max >= 0) ? x_radius_var_max : -1)
+                           : window_width;
+
+          if (ki_min != ki_max && kj_min != kj_max && ki_min != -1 &&
+              ki_max != -1 && kj_min != -1 && kj_max != -1) {
+            if (!poolFunction) {
+              for (int ki = ki_min; ki < ki_max; ki++) {
+                for (int kj = kj_min; kj < kj_max; kj++) {
+                  val = maximum(
+                      val,
+                      input_data[b * (channels * image_height * image_width) +
+                                 ch * (image_height * image_width) +
+                                 (r - y_radius + ki) * image_width +
+                                 (c - x_radius + kj)]);
                 }
+              }
+            } else {
+              for (int ki = ki_min; ki < ki_max; ki++) {
+                for (int kj = kj_min; kj < kj_max; kj++) {
+                  val +=
+                      input_data[b * (channels * image_height * image_width) +
+                                 ch * (image_height * image_width) +
+                                 (r - y_radius + ki) * image_width +
+                                 (c - x_radius + kj)];
+                }
+              }
             }
+          }
+          if (poolFunction == 1) {
+            val /= window_height * window_width;
+          }
+          output_data[b * (channels * output_height * output_width) +
+                      ch * (output_height * output_width) + ii * output_width +
+                      jj] = val;
+          jj++;
+          if (jj == output_width) {
+            jj = 0;
+            ii++;
+          }
         }
+      }
     }
-  
-    return output;
+  }
+
+  return output;
 }
 
 void *tensorTanhCPU(void *input_ptr) {
-    Tensor *input = (Tensor *)input_ptr;
-    
-    float *input_data = (float *)input->host_data;
-    size_t num_elems = input->num_elems;
-    
-     omp_set_num_threads(4);
-     #pragma omp parallel for
-    for (size_t i = 0; i < num_elems; i++) {
-        input_data[i] = tanhf(input_data[i]);
-    }
-   
-    return input;
+  Tensor *input = (Tensor *)input_ptr;
+
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = tanhf(input_data[i]);
+  }
+
+  return input;
 }
 
 void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
-    Tensor *lhs = (Tensor *)lhs_ptr;
-    Tensor *rhs = (Tensor *)rhs_ptr;
-    
-    int m = lhs->dims.dim_sizes[0];
-    int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
-    int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
-    
-    Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
-
-    float * __restrict__ lhs_arr = (float *)lhs->host_data;
-    float * __restrict__ rhs_arr = (float *)rhs->host_data;
-    float * __restrict__ output_arr = (float *)output->host_data;
-    
-    int k = 1;
-    #pragma unroll 4   // Can we unroll more???
-    for (int j = 1; j < lhs->dims.num_dims; j++) {
-        k = k * lhs->dims.dim_sizes[j]; // input neurons
-    }
-    float *tran_rhs = (float *) malloc(sizeof(float) * k * n);
-    omp_set_num_threads(4);
-    #pragma omp parallel for simd
-    for (int l = 0; l < k; l++) {
-        for (int j = 0; j < n; j++) {
-            tran_rhs[j * k + l] = rhs_arr[l * n + j];
-        }   
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
+
+  int m = lhs->dims.dim_sizes[0];
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
+
+  float *__restrict__ lhs_arr = (float *)lhs->host_data;
+  float *__restrict__ rhs_arr = (float *)rhs->host_data;
+  float *__restrict__ output_arr = (float *)output->host_data;
+
+  int k = 1;
+#pragma unroll 4 // Can we unroll more???
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
+    k = k * lhs->dims.dim_sizes[j]; // input neurons
+  }
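+  // Transpose rhs up front so both operands are read contiguously in the
+  // inner-product loop below.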
+  float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
+  omp_set_num_threads(4);
+#pragma omp parallel for simd
+  for (int l = 0; l < k; l++) {
+    for (int j = 0; j < n; j++) {
+      tran_rhs[j * k + l] = rhs_arr[l * n + j];
     }
-    
-    #pragma omp parallel for
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-           float sum = 0.0;
-          #pragma omp simd reduction(+:sum)
-           for (int l = 0; l < k; l++) {
-                sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
-            }
-            output_arr[i * n + j] = sum;
-        }
+  }
+
+#pragma omp parallel for
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < n; j++) {
+      float sum = 0.0;
+#pragma omp simd reduction(+ : sum)
+      for (int l = 0; l < k; l++) {
+        sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
+      }
+      output_arr[i * n + j] = sum;
     }
-    free(tran_rhs);
-    return output;
+  }
+  free(tran_rhs);
+  return output;
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
-    Tensor *input = (Tensor *)input_ptr;
-    
-    float *logits = (float *)input->host_data;
-    int n = input->dims.dim_sizes[0];
-    int c = input->dims.dim_sizes[1];
-    
-     omp_set_num_threads(4);
-    #pragma omp parallel for
-    for (int i = 0; i < n; i++) {
-        float x = 0;
-        for(int j = i*c; j < c + i*c; j++) {
-            logits[j] = expf(logits[j]);
-        }
-       
-        #pragma omp simd reduction(+:x)
-        for(int j = i*c; j < i*c+c; j++) {
-            x += logits[j];
-        }
-        
-         #pragma omp simd
-        for(int j = i*c; j < i*c + c; j++) {
-            logits[j] /= x;
-        }                                                                                                                                                   
+  Tensor *input = (Tensor *)input_ptr;
+
+  float *logits = (float *)input->host_data;
+  int n = input->dims.dim_sizes[0];
+  int c = input->dims.dim_sizes[1];
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int i = 0; i < n; i++) {
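+    // Row-wise softmax: exponentiate, accumulate the row sum, then normalize.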
+    float x = 0;
+    for (int j = i * c; j < c + i * c; j++) {
+      logits[j] = expf(logits[j]);
     }
 
-    return input;
-}
-
-void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-                         void* mean_ptr, void* variance_ptr, double epsilon) {
-    
-    Tensor* input = (Tensor*) input_ptr;
-    Tensor* gamma = (Tensor*) gamma_ptr;
-    Tensor* beta = (Tensor*) beta_ptr;
-    Tensor* mean = (Tensor*) mean_ptr;
-    Tensor* variance = (Tensor*) variance_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_beta = (float *)beta->host_data;
-    float * __restrict__ host_gamma = (float *)gamma->host_data;
-    float * __restrict__ host_mean = (float *)mean->host_data;
-    float * __restrict__ host_variance = (float *)variance->host_data;
-    
-    float alpha_val = 1.0f, beta_val = 0.0f;
-    size_t num_elems = input->num_elems;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int image_dim = image_height * image_width;
+#pragma omp simd reduction(+ : x)
+    for (int j = i * c; j < i * c + c; j++) {
+      x += logits[j];
+    }
 
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            float mean = 0;
-            #pragma omp simd reduction(+:mean)
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                mean += host_image[index];
-            }
-            mean = mean / channels;
-         
-            float variance = 0;
-            #pragma omp simd reduction(+:variance)
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                float tmp = host_image[index] - mean;
-                variance += (tmp * tmp);  
-            }
-            variance = variance / channels;
-            
-           #pragma omp simd 
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                host_image[index] = host_beta[ch] 
-                                  + (host_gamma[ch] * ((host_image[index] - mean) / sqrt(epsilon + variance)));
-            }
-        }
+#pragma omp simd
+    for (int j = i * c; j < i * c + c; j++) {
+      logits[j] /= x;
     }
-    return input;
+  }
+
+  return input;
 }
 
- void *tensorReluCPU(void *input_ptr) {
-     Tensor *input = (Tensor *)input_ptr;
-     float *input_data = (float *)input->host_data;
-     size_t num_elems = input->num_elems;
-     
-     #pragma omp simd
-     for (size_t i = 0; i < num_elems; i++) {
-         input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                         void *mean_ptr, void *variance_ptr, double epsilon) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *gamma = (Tensor *)gamma_ptr;
+  Tensor *beta = (Tensor *)beta_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_beta = (float *)beta->host_data;
+  float *__restrict__ host_gamma = (float *)gamma->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int image_dim = image_height * image_width;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
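+      // Mean and variance are computed per (batch, channel) from the input
+      // itself; the mean/variance arguments are not used on this CPU path.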
+      float mean = 0;
+#pragma omp simd reduction(+ : mean)
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        mean += host_image[index];
+      }
+      mean = mean / channels;
+
+      float variance = 0;
+#pragma omp simd reduction(+ : variance)
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        float tmp = host_image[index] - mean;
+        variance += (tmp * tmp);
+      }
+      variance = variance / channels;
+
+#pragma omp simd
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        host_image[index] =
+            host_beta[ch] + (host_gamma[ch] * ((host_image[index] - mean) /
+                                               sqrt(epsilon + variance)));
+      }
     }
+  }
+  return input;
+}
 
-    return input;
+void *tensorReluCPU(void *input_ptr) {
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+#pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+  }
+
+  return input;
 }
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
-    Tensor *input = (Tensor *)input_ptr;
-    float *input_data = (float *)input->host_data;
-    size_t num_elems = input->num_elems;
-    
-    #pragma omp simd
-    for (size_t i = 0; i < num_elems; i++) {
-        input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ? 
-                                                        max : input_data[i]);
-    }       
-
-    return input;
-}         
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
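+  // Clamp each element into [min, max] (clipped ReLU).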
+#pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < min)
+                        ? min
+                        : ((input_data[i] > max) ? max : input_data[i]);
+  }
+
+  return input;
+}
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 319936b482c455af2fcc0280adb15d7c126c088a..253f7614337908e72c82ba986f860dd58c7c9b3f 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -1,8 +1,9 @@
-/* This file includes the API implementation of the HPVM tensor runtime built on cublas, cudnn
-**
-**  Author: Hashim Sharif
-**  Email: hsharif3@illinois.edu
-*/
+/* This file includes the API implementation of the HPVM tensor runtime built
+ * on cublas, cudnn
+ *
+ *  Author: Hashim Sharif
+ *  Email: hsharif3@illinois.edu
+ */
 
 #include <stdio.h>
 #include <stdarg.h>
@@ -31,7 +32,6 @@
 #include <cuda_fp16.h>
 #include <driver_types.h>
 
-
 // Tensor runtime header files
 #include "tensor_runtime.h"
 #include "tensor_utils.h"
@@ -46,236 +46,177 @@
 #include "half_precision_api.h"
 #include "approx_simulation.h"
 
+// FIXIT: tensorAdd currently only works for 4D tensors
+void *tensorAdd(void *x_ptr, void *bias_ptr) {
 
+  Tensor *x = (Tensor *)x_ptr;
+  Tensor *bias = (Tensor *)bias_ptr;
 
-
-
-// FIXIT: tensorAdd currently only works for 4D tensors
-void* tensorAdd(void* x_ptr, void* bias_ptr){
-  
-  Tensor* x = (Tensor*) x_ptr;
-  Tensor* bias = (Tensor*) bias_ptr;
-  
-  INFO("*** TensorAdd \n");  
+  INFO("*** TensorAdd \n");
   profileEvent("Add");
-    
+
   float alpha = 1.0f;
-  //float beta = 0.0f;
+  // float beta = 0.0f;
   hostToDeviceCopy(x);
   hostToDeviceCopy(bias);
 
   convertToFP32(x);
   convertToFP32(bias);
 
-  
   DEBUG("x->num_elems = %d \n", x->num_elems);
   DEBUG("bias->num_elems = %d \n", bias->num_elems);
 
-  if(cudnnHandle == NULL){
-    ERROR("cudnnHandle NOT initialized!! \n");    
+  if (cudnnHandle == NULL) {
+    ERROR("cudnnHandle NOT initialized!! \n");
   }
-  
+
   // FIXIT: routine fails for 3D tensors
   checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
-			    bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));
+                            bias->gpu_data, &alpha, x->tensor_desc,
+                            x->gpu_data));
 
   profileEvent("Add_end", true);
 
-  #ifdef ERROR_INJECTION_ENABLED  
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-
-  void* error_norms = tensorAddError(x, op_acc);
-  add_norms(error_norms, "tensorAdd", op_acc);
-  add_bias_overheads(x, op_acc);
-  op_counter++;
-  
-  #endif
-  
-  
   return x;
 }
 
-
 // FIXIT: Generalize all of the routines for types {half, float, double}
-void* tensorConvolution(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){  
-  
+void *tensorConvolution(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups) {
+
   INFO("*** TensorConvolution \n");
   profileEvent("Conv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  if(conv_mode == 0)
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   mode = CUDNN_CROSS_CORRELATION;
   // FIXIT: Need to be more aware of the implications of alpha and beta
   float alpha = 1.0f, beta = 0.0f;
-  
-  // TODO: Support other cases;  
+
+  // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
   convertToFP32(input);
   convertToFP32(filter);
 
-  
-  DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);  
+  DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+        horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
-  }  
-  
-  
+  }
+
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     vertical_stride, horizontal_stride, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-                                             computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      vertical_stride, horizontal_stride,     // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   // NOTE: Set conv groups for grouped convolution e.g., depthwise convolution
   checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-  int n, c, h, w; // output dimensions  
+  int n, c, h, w; // output dimensions
   // Find dimension of convolution output
 
-  if(input->tensor_desc == NULL || filter->filter_desc == NULL)
+  if (input->tensor_desc == NULL || filter->filter_desc == NULL)
     ERROR("Input or Filter descriptor is NULL");
-    
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
 
-    
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
+
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  if(input->data_format == CUDNN_TENSOR_NCHW)
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type,  
-			              CUDNN_TENSOR_NCHW, n, c, h, w);
-  else if(input->data_format == CUDNN_TENSOR_NHWC){
+  Tensor *output;
+  if (input->data_format == CUDNN_TENSOR_NCHW)
+    output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                      CUDNN_TENSOR_NCHW, n, c, h, w);
+  else if (input->data_format == CUDNN_TENSOR_NHWC) {
     DEBUG("* NHWC Format \n");
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-			              CUDNN_TENSOR_NHWC, n, h, w, c);
-  }
-  else
+    output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                      CUDNN_TENSOR_NHWC, n, h, w, c);
+  } else
     ERROR("Unsupported Tensor Type");
 
   // NOTE: Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
-    
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
-
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
-    ERROR("NULL descriptor! \n");
 
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
+
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
+    ERROR("NULL descriptor! \n");
 
   // Debugging info prints
   printTensorDescInfo(input);
   printTensorDescInfo(filter);
   printTensorDescInfo(output);
 
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,	 
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
-  
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
+
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	 CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	 CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-	 
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // NOTE: Currently using GEMM based convolution - other algorithms available
-  // TODO: Benchmark other convolution algorithms e.g., winograd 
+  // TODO: Benchmark other convolution algorithms e.g., winograd
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
-  checkCudaErrors(cudaMalloc(&workspace, workspace_size)); 
+  void *workspace;
+  checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, filter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-		       
   profileEvent("Conv_end", true);
-
-
-  #ifdef ERROR_INJECTION_ENABLED
-
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-  
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorConv", op_acc);
-  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
-
-  op_counter++;
-  
-  #endif
-
-  
   return output;
 }
 
-
-
 // NOTE: Supports Max and Avg Pooling
-void* tensorPooling(void* input_ptr,
-		    int poolFunction,
-		    int window_height, int window_width,
-		    int vertical_pad, int horizontal_pad,
-		    int vertical_stride, int horizontal_stride){
+void *tensorPooling(void *input_ptr, int poolFunction, int window_height,
+                    int window_width, int vertical_pad, int horizontal_pad,
+                    int vertical_stride, int horizontal_stride) {
 
   INFO("*** TensorPooling \n");
   profileEvent("Pool");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnPoolingDescriptor_t poolDesc;
   // FIXIT: Need to be more aware of the implications of alpha and beta
@@ -285,83 +226,57 @@ void* tensorPooling(void* input_ptr,
 
   convertToFP32(input);
 
-  
-  checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc));            
+  checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc));
 
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
-  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride;
+  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) /
+          vertical_stride;
   h = h + 1;
-  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride;
+  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) /
+          horizontal_stride;
   w = w + 1;
 
-  DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n",
-	n, c, h, w, input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
-  
+  DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n", n, c, h, w,
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
 
-  Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *output =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w);
   // Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
 
   // FIXIT: The output tensor is hardcoded to NCHW
-  checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc,
-					CUDNN_TENSOR_NCHW,
-					CUDNN_DATA_FLOAT,
-					n, c,
-					h, w));
+  checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc, CUDNN_TENSOR_NCHW,
+                                        CUDNN_DATA_FLOAT, n, c, h, w));
 
   // Select between Max-Pooling and Avg-Pooling
   cudnnPoolingMode_t pool_mode;
-  if(poolFunction == 0)
+  if (poolFunction == 0)
     pool_mode = CUDNN_POOLING_MAX;
-  else if(poolFunction == 1)
+  else if (poolFunction == 1)
     pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-  
-  checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
-					 pool_mode,
-					 CUDNN_PROPAGATE_NAN,
-					 window_height, window_width,
-					 vertical_pad, horizontal_pad,
-					 vertical_stride, horizontal_stride));
-     
-  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, input->tensor_desc,
-				 input->gpu_data, &beta, output->tensor_desc, output->gpu_data));
-
-  profileEvent("Pool_end", true);
 
+  checkCUDNN(cudnnSetPooling2dDescriptor(
+      poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width,
+      vertical_pad, horizontal_pad, vertical_stride, horizontal_stride));
 
-  #ifdef ERROR_INJECTION_ENABLED
+  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha,
+                                 input->tensor_desc, input->gpu_data, &beta,
+                                 output->tensor_desc, output->gpu_data));
 
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorPooling", op_acc);
-  add_pool_overheads(input, window_height, vertical_stride, op_acc);
-
-  op_counter++;
-  
-  #endif
-
-  
+  profileEvent("Pool_end", true);
   return output;
 }
 
-
-
-
-
-/* Reference Implementation based on: https://gist.github.com/peterwittek/6303527 */
-void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){
+/* Reference Implementation based on:
+ * https://gist.github.com/peterwittek/6303527 */
+void *tensorGemmGPU(void *lhs_ptr, void *rhs_ptr) {
 
   INFO("*** TensorGemmGPU \n");
   profileEvent("Mul");
 
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
-
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
 
   DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims);
   DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims);
@@ -371,30 +286,30 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){
   // 'm' holds the batch dimension - assuming NCHW format Tensors
   int m = lhs->dims.dim_sizes[0];
   // The rhs last dimension must contain the neurons
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
   int k = 1;
-  
+
   // Flattening the dimensions after the batch dimension
   // NOTE: Allowing any number of dimensions > 2 for lhs
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
     k = k * lhs->dims.dim_sizes[j]; // input neurons
   }
 
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
   // Dimension-note: Check if k is same across the two tensors
   DEBUG("m = %d, n = %d, k = %d \n", m, n, k);
-  if(rhs_k != k){
+  if (rhs_k != k) {
     ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
   }
 
-  Tensor* output = NULL;
+  Tensor *output = NULL;
   DEBUG("Creating new TENSOR * \n");
-  output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
+  output =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
 
-   
   DEBUG("Changing placement *\n");
   // Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
 
   DEBUG("Changed Placement * \n\n");
 
@@ -404,175 +319,105 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){
   convertToFP32(lhs);
   convertToFP32(rhs);
 
-  
   DEBUG("CuBlasSgemm *\n");
-   
+
   // INFO: cuBlas uses column-major format
   // INFO: The leading dimension is just the FIRST Dimension
-  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects
-  checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-			      n, m, k,
-			      &alpha,
-			      (float*) rhs->gpu_data, n,
-			      (float*) lhs->gpu_data, k,
-			      &beta,
-			      (float*) output->gpu_data, n));  
-
-  
-  profileEvent("Mul_end", true);
-
-
+  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN
+  // expects
+  checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
+                              &alpha, (float *)rhs->gpu_data, n,
+                              (float *)lhs->gpu_data, k, &beta,
+                              (float *)output->gpu_data, n));
 
-  #ifdef ERROR_INJECTION_ENABLED
-
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-  
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorGemm", op_acc);
-  add_gemm_overheads(lhs_ptr, rhs_ptr, op_acc);
-
-  op_counter++;
-  
-  #endif
- 
-  
+  profileEvent("Mul_end", true);
   return output;
 }
 
-
-
-
-
-
-void* tensorRelu(void* input_ptr){
+void *tensorRelu(void *input_ptr) {
 
   DEBUG("*** TensorRelu \n");
   profileEvent("Relu");
 
-  Tensor* input = (Tensor*) input_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
 
   hostToDeviceCopy(input);
 
   convertToFP32(input);
-  
-  
+
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
   checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_desc, input->gpu_data, &beta,
-				    input->tensor_desc, input->gpu_data));
+                                    input->tensor_desc, input->gpu_data, &beta,
+                                    input->tensor_desc, input->gpu_data));
 
   profileEvent("Relu_end", true);
-
-
-  #ifdef ERROR_INJECTION_ENABLED
-  
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-    
-  void* error_norms = tensorAddError(input, op_acc);
-  add_norms(error_norms, "tensorRelu", op_acc);
-  add_relu_overheads(input, op_acc);
-  op_counter++;  
-  #endif
-  
-
   return input;
 }
 
-
 // Think: Should Softmax be broken into multiple IR operations?
-void* tensorSoftmax(void* input_ptr){
+void *tensorSoftmax(void *input_ptr) {
 
   INFO("*** TensorSoftmax \n");
   profileEvent("Softmax");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
   float alpha = 1.0f, beta = 0.0f;
 
   hostToDeviceCopy(input);
-  convertToFP32(input); 
-     
-  checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
-				 &alpha, input->tensor_desc, input->gpu_data, &beta,
-				 input->tensor_desc, input->gpu_data));
+  convertToFP32(input);
 
-  deviceToHostCopy(input);  
+  checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE,
+                                 CUDNN_SOFTMAX_MODE_CHANNEL, &alpha,
+                                 input->tensor_desc, input->gpu_data, &beta,
+                                 input->tensor_desc, input->gpu_data));
+
+  deviceToHostCopy(input);
   profileEvent("Softmax_end", true);
-  
+
   return input;
 }
 
-
-
-
-void* tensorRelu2(void* input_ptr, float min, float max){
+void *tensorRelu2(void *input_ptr, float min, float max) {
 
   INFO("*** TensorClippedRelu *** \n");
   profileEvent("Relu");
 
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
-  
-  Tensor* input = (Tensor*) input_ptr;
+
+  Tensor *input = (Tensor *)input_ptr;
 
   hostToDeviceCopy(input);
 
   convertToFP32(input);
-  
 
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
-  checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU,
-					  CUDNN_PROPAGATE_NAN, max));
+  checkCUDNN(cudnnSetActivationDescriptor(
+      reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, max));
 
   checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_desc, input->gpu_data, &beta,
-				    input->tensor_desc, input->gpu_data));
+                                    input->tensor_desc, input->gpu_data, &beta,
+                                    input->tensor_desc, input->gpu_data));
 
-  
-  
   profileEvent("Relu_end", true);
-
-
-  #ifdef ERROR_INJECTION_ENABLED
-  
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-  void* error_norms = tensorAddError(input, op_acc);
-  add_norms(error_norms, "tensorClippedRelu", op_acc);
-  add_relu_overheads(input, op_acc);
-  op_counter++;  
-  #endif
-  
-
   return input;
 }
 
-
-void* tensorTanh(void* input_ptr){
+void *tensorTanh(void *input_ptr) {
 
   INFO("*** TensorTanh \n");
   profileEvent("Tanh");
 
-  Tensor* input = (Tensor*) input_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+
   cudnnActivationDescriptor_t tanhDesc;
   float alpha = 1.0f, beta = 0.0f;
 
@@ -580,55 +425,36 @@ void* tensorTanh(void* input_ptr){
 
   convertToFP32(input);
 
-  
   checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
   checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha,
-				    input->tensor_desc, input->gpu_data, &beta,
-				    input->tensor_desc, input->gpu_data));
+                                    input->tensor_desc, input->gpu_data, &beta,
+                                    input->tensor_desc, input->gpu_data));
 
   profileEvent("Tanh_end", true);
-
-
-  #ifdef ERROR_INJECTION_ENABLED
-  
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-  void* error_norms = tensorAddError(input, op_acc);
-  add_norms(error_norms, "tensorTanh", op_acc);
-  add_relu_overheads(input, op_acc);
-  op_counter++;  
-  #endif
-  
-
   return input;
 }
 
-
-
-
-void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-		      void* mean_ptr, void* variance_ptr, double epsilon){
+void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                      void *mean_ptr, void *variance_ptr, double epsilon) {
 
   INFO("*** TensorBatchNorm \n");
   profileEvent("BatchNorm");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* gamma = (Tensor*) gamma_ptr;
-  Tensor* beta = (Tensor*) beta_ptr;
-  Tensor* mean = (Tensor*) mean_ptr;
-  Tensor* variance = (Tensor*) variance_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *gamma = (Tensor *)gamma_ptr;
+  Tensor *beta = (Tensor *)beta_ptr;
+  Tensor *mean = (Tensor *)mean_ptr;
+  Tensor *variance = (Tensor *)variance_ptr;
 
-  if (input == NULL || gamma == NULL || beta == NULL || mean == NULL || variance == NULL){
+  if (input == NULL || gamma == NULL || beta == NULL || mean == NULL ||
+      variance == NULL) {
     ERROR("NULL Input Tensor");
   }
-  
+
   float alpha_val = 1.0f, beta_val = 0.0f;
   hostToDeviceCopy(input);
   hostToDeviceCopy(gamma);
@@ -638,149 +464,127 @@ void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
 
   convertToFP32(input);
 
- 
-  
-  checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL,
-						     &alpha_val, &beta_val,
-						     input->tensor_desc, input->gpu_data,
-						     input->tensor_desc, input->gpu_data,
-						     gamma->tensor_desc, gamma->gpu_data,
-						     beta->gpu_data, mean->gpu_data,
-						     variance->gpu_data,
-						     epsilon));
+  checkCUDNN(cudnnBatchNormalizationForwardInference(
+      cudnnHandle, CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val,
+      input->tensor_desc, input->gpu_data, input->tensor_desc, input->gpu_data,
+      gamma->tensor_desc, gamma->gpu_data, beta->gpu_data, mean->gpu_data,
+      variance->gpu_data, epsilon));
 
   profileEvent("BatchNorm_end", true);
-
-
-  #ifdef ERROR_INJECTION_ENABLED
-  
-  if(op_counter >= total_ops){
-    ERROR("No accuracy flag found \n");
-  }
-  
-  int op_acc = op_accuracies[op_counter];
-  void* error_norms = tensorAddError(input, op_acc);
-  add_norms(error_norms, "tensorBatchNorm", op_acc);
-  add_relu_overheads(input, op_acc);
-  op_counter++;  
-  #endif
-  
-
   return input;
 }
 
-
-
-
 // TODO: benchmark performance of tensorSplit
-void** tensorSplit(void* tensor_ptr, int num_splits, int split_dim){
+void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) {
 
-  INFO("*** TensorSplit \n");  
+  INFO("*** TensorSplit \n");
   profileEvent("tensorSplit");
 
-  Tensor* tensor = (Tensor*) tensor_ptr;
-  
+  Tensor *tensor = (Tensor *)tensor_ptr;
+
   deviceToHostCopy(tensor); // Splitting done on the host
 
-  Tensor** splits = (Tensor**) malloc(sizeof(Tensor*) * num_splits);
-  size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensor->dims.num_dims);
-  for(unsigned int i = 0; i < tensor->dims.num_dims; i++){
+  Tensor **splits = (Tensor **)malloc(sizeof(Tensor *) * num_splits);
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * tensor->dims.num_dims);
+  for (unsigned int i = 0; i < tensor->dims.num_dims; i++) {
     dim_sizes[i] = tensor->dims.dim_sizes[i];
   }
 
-  
   dim_sizes[split_dim] = tensor->dims.dim_sizes[split_dim] / num_splits;
-  if(dim_sizes[split_dim] < 1)
+  if (dim_sizes[split_dim] < 1)
     ERROR("Split Dimension < 1 after splitting");
 
   size_t copy_size = getTypeSize(tensor->data_type);
-  for(unsigned int i = split_dim; i < tensor->dims.num_dims; i++){
+  for (unsigned int i = split_dim; i < tensor->dims.num_dims; i++) {
     copy_size = copy_size * dim_sizes[i];
   }
-  
-  for(unsigned int i = 0; i < num_splits; i++){
 
-    DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n",
-	 dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+  for (unsigned int i = 0; i < num_splits; i++) {
+
+    DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, "
+          "dim_sizes[3] = %d \n",
+          dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+
+    Tensor *split = (Tensor *)create4DTensor(
+        tensor->data_type, tensor->data_format, dim_sizes[0], dim_sizes[1],
+        dim_sizes[2], dim_sizes[3]);
 
-    Tensor* split = (Tensor*) create4DTensor(tensor->data_type, tensor->data_format,
-					  dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
-    
     size_t copy_start = i * copy_size;
     size_t copy_stride = num_splits * copy_size;
-    DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, tensor->size_in_bytes = %d \n",
-	 copy_size, copy_start, copy_stride, tensor->size_in_bytes);
+    DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, "
+          "tensor->size_in_bytes = %d \n",
+          copy_size, copy_start, copy_stride, tensor->size_in_bytes);
 
     int index = 0;
-    while(copy_start + copy_size <= tensor->size_in_bytes){
-      memcpy(((char*) split->host_data + (index * copy_size)),
-	     ((char*)tensor->host_data + copy_start),
-	     copy_size);
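+    // Gather every num_splits-th chunk of copy_size bytes into this split.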
+    while (copy_start + copy_size <= tensor->size_in_bytes) {
+      memcpy(((char *)split->host_data + (index * copy_size)),
+             ((char *)tensor->host_data + copy_start), copy_size);
       copy_start += copy_stride;
       index++;
     }
-   	
-    splits[i] = split;     
+
+    splits[i] = split;
   }
 
   profileEvent("tensorSplit_end", true);
 
-  return (void**) splits;
+  return (void **)splits;
 }
 
+void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) {
 
-void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){
-
-  INFO("*** TensorConcat \n");  
+  INFO("*** TensorConcat \n");
   profileEvent("tensorConcat");
 
-  Tensor** tensors = (Tensor**) tensors_ptr;
+  Tensor **tensors = (Tensor **)tensors_ptr;
 
-  for(int i = 0; i < num_splits; i++){
+  for (int i = 0; i < num_splits; i++) {
     deviceToHostCopy(tensors[i]); // Concatenation done on the host
   }
-  
+
   // The no of dimensions of concatenated tensor are the same
-  size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensors[0]->dims.num_dims);
-  for(unsigned int i = 0; i < tensors[0]->dims.num_dims; i++){
+  size_t *dim_sizes =
+      (size_t *)malloc(sizeof(size_t) * tensors[0]->dims.num_dims);
+  for (unsigned int i = 0; i < tensors[0]->dims.num_dims; i++) {
     dim_sizes[i] = tensors[0]->dims.dim_sizes[i];
   }
-  
+
   size_t copy_size = getTypeSize(tensors[0]->data_type);
-  for(unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++){
+  for (unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++) {
     copy_size = copy_size * dim_sizes[i];
   }
 
   dim_sizes[split_dim] = dim_sizes[split_dim] * num_splits;
-  if(dim_sizes[split_dim] < 1)
+  if (dim_sizes[split_dim] < 1)
     ERROR("Split Dimension < 1 after concat");
 
-  Tensor* output = (Tensor*) create4DTensor(tensors[0]->data_type, tensors[0]->data_format,
-					 dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
-
-  DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n",
-       dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+  Tensor *output = (Tensor *)create4DTensor(
+      tensors[0]->data_type, tensors[0]->data_format, dim_sizes[0],
+      dim_sizes[1], dim_sizes[2], dim_sizes[3]);
 
+  DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] "
+        "= %d \n",
+        dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
 
   int num_copies = 1;
-  for(unsigned int i = 0; i < split_dim; i++){
+  for (unsigned int i = 0; i < split_dim; i++) {
     num_copies = num_copies * dim_sizes[i];
   }
-  
+
   size_t copy_stride = num_splits * copy_size;
-  DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, output->size_in_bytes = %d \n",
-       copy_size, num_copies, copy_stride, output->size_in_bytes);
+  DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, "
+        "output->size_in_bytes = %d \n",
+        copy_size, num_copies, copy_stride, output->size_in_bytes);
 
-  for(unsigned int i = 0; i < num_copies; i++){
+  for (unsigned int i = 0; i < num_copies; i++) {
     // FIXIT: Don't be specific to 4D tensors
     size_t copy_start = i * copy_stride;
-   
-    for(int j = 0; j < num_splits; j++){
-      struct Tensor* split = tensors[j];
-      memcpy(((char*) output->host_data + copy_start + (j * copy_size)),
-	     ((char*) split->host_data + (i * copy_size)),
-	     copy_size);   
-    }      
+
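+    // Interleave the i-th chunk of each split back into the output tensor.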
+    for (int j = 0; j < num_splits; j++) {
+      struct Tensor *split = tensors[j];
+      memcpy(((char *)output->host_data + copy_start + (j * copy_size)),
+             ((char *)split->host_data + (i * copy_size)), copy_size);
+    }
   }
 
   profileEvent("tensorConcat_end", true);
@@ -788,15 +592,13 @@ void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){
   return output;
 }
 
+void *tensorLRN(void *input_ptr, unsigned int LRN_window, double LRN_alpha,
+                double LRN_beta, double LRN_k) {
 
-
-void* tensorLRN(void* input_ptr, unsigned int LRN_window,
-		double LRN_alpha, double LRN_beta, double LRN_k){
-
-  INFO("*** TensorLRN \n");  
+  INFO("*** TensorLRN \n");
   profileEvent("tensorLRN");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   hostToDeviceCopy(input);
 
@@ -804,29 +606,28 @@ void* tensorLRN(void* input_ptr, unsigned int LRN_window,
   cudnnLRNDescriptor_t LRNDesc;
   checkCUDNN(cudnnCreateLRNDescriptor(&LRNDesc));
 
-  DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n",
-       LRN_window, LRN_alpha, LRN_beta, LRN_k);
- 
-  
-  checkCUDNN(cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k));
+  DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n", LRN_window,
+        LRN_alpha, LRN_beta, LRN_k);
+
+  checkCUDNN(
+      cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k));
 
-  size_t* dim_sizes = input->dims.dim_sizes;
-  Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-			  CUDNN_TENSOR_NCHW, dim_sizes[0], dim_sizes[1],
-			  dim_sizes[2], dim_sizes[3]);
+  size_t *dim_sizes = input->dims.dim_sizes;
+  Tensor *output = (Tensor *)create4DTensor(
+      (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, dim_sizes[0],
+      dim_sizes[1], dim_sizes[2], dim_sizes[3]);
 
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
 
   printTensorDescInfo(input);
   printTensorDescInfo(output);
-  
-  checkCUDNN(cudnnLRNCrossChannelForward(cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1,
-					 &alpha, input->tensor_desc, input->gpu_data,
-					 &beta, output->tensor_desc, output->gpu_data));
+
+  checkCUDNN(cudnnLRNCrossChannelForward(
+      cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha,
+      input->tensor_desc, input->gpu_data, &beta, output->tensor_desc,
+      output->gpu_data));
 
   profileEvent("tensorLRN_end", true);
-    
+
   return output;
 }
-
-
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index 079a9898294b01ba8dfcb575f11998790f24abfa..f6bfe700b44c88fea06c6a76267b49af4a523716 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -1,13 +1,12 @@
 //===--------------------------- tensor_utils.cu --------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file  consists of the custom implementation of utility functions
 // useful for approximated and non-approximated versions of tensor operations.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -42,18 +41,15 @@
 #include "global_data.h"
 #include "fp16_gemm.h"
 
+extern "C" {
 
-
-extern "C"{
-
-
-void freeTensor(void* tensor_ptr){
-  Tensor* tensor = (Tensor*) tensor_ptr;
+void freeTensor(void *tensor_ptr) {
+  Tensor *tensor = (Tensor *)tensor_ptr;
 
   tensors_ptr.erase(tensor->gpu_data);
   tensors_ptr.erase(tensor->gpu_half_data);
   host_ptr.erase(tensor->host_data);
-  
+
   cudaFree(tensor->gpu_data);
   tensor->gpu_data = nullptr;
   cudaFree(tensor->gpu_half_data);
@@ -62,43 +58,42 @@ void freeTensor(void* tensor_ptr){
   tensor->host_data = nullptr;
 }
 
-
 // Returns the size of the target datatype
-int getTypeSize(int data_type){
+int getTypeSize(int data_type) {
   // TODO: Add support for more data types
   switch (data_type) {
-    case float_type:
-      return 4;
-    case double_type:
-      return 8;
-    case half_type:
-      return 2;
-    case int_type:
-      return 1;
-    case float2_type:
-      return 8;
-    case half2_type:
-      return 4;
-    default:
-      ERROR("Unknown type %d\n", data_type);
+  case float_type:
+    return 4;
+  case double_type:
+    return 8;
+  case half_type:
+    return 2;
+  case int_type:
+    return 1;
+  case float2_type:
+    return 8;
+  case half2_type:
+    return 4;
+  default:
+    ERROR("Unknown type %d\n", data_type);
   }
   return 0;
 }
 
-static int getFullPrecTypeSize(int data_type){
+static int getFullPrecTypeSize(int data_type) {
   switch (data_type) {
-    case float_type:
-    case half_type:
-      return 4;
-    case double_type:
-      return 8;
-    case int_type:
-      return 1;
-    case float2_type:
-    case half2_type:
-      return 8;
-    default:
-      ERROR("Unknown type %d\n", data_type);
+  case float_type:
+  case half_type:
+    return 4;
+  case double_type:
+    return 8;
+  case int_type:
+    return 1;
+  case float2_type:
+  case half2_type:
+    return 8;
+  default:
+    ERROR("Unknown type %d\n", data_type);
   }
   return 0;
 }
@@ -107,7 +102,7 @@ static bool isFP16Compound(int data_type) {
   return data_type == half_type || data_type == half2_type;
 }
 
-void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){
+void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) {
   int type_size = getTypeSize(data_type);
   size_t size_in_bytes = type_size * num_elems;
   tensor->size_in_bytes = size_in_bytes;
@@ -115,18 +110,20 @@ void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){
   DEBUG("***--- size_in_bytes = %d \n", size_in_bytes);
 }
 
-
 // NOTE: Always allocates FP32 on Host, FP32/FP16 for Device (GPU)
-void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
+void allocateMem(struct Tensor *tensor, int data_type, size_t num_elems) {
   setSizeInBytes(tensor, data_type, num_elems);
   tensor->data_type = data_type;
-  tensor->cur_type = data_type; // type maintained for hanlding FP32 <-> FP16 conversions
+  tensor->cur_type =
+      data_type; // type maintained for handling FP32 <-> FP16 conversions
   tensor->num_elems = num_elems;
-  
-  size_t size_on_host = num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32
-  tensor->host_data = (void*) malloc(size_on_host); // Allocate memory on the host
-  tensor->data_placement = HOST; // By defaut data is on the host
-  
+
+  size_t size_on_host =
+      num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32
+  tensor->host_data =
+      (void *)malloc(size_on_host); // Allocate memory on the host
+  tensor->data_placement = HOST;    // By default data is on the host
+
   DEBUG("Attempting to Allocate = %lu \n\n\n", tensor->size_in_bytes);
 
   if (isFP16Compound(data_type)) {
@@ -142,23 +139,25 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
   }
 
   tracked_tensors[tensor] = 1; // For FP16-FP32 data handling
-  
+
   host_ptr.insert(tensor->host_data);
   obj_ptr.insert(tensor);
-  //host_ptr.push_back(tensor->host_data); 
+  // host_ptr.push_back(tensor->host_data);
 }
 
 /// Two tensor formats are supported: NCHW and NHWC.
 /// TODO: Make this more general in the future.
 ///
-void setCudnnDataFormat(struct Tensor* tensor, int data_format){
+void setCudnnDataFormat(struct Tensor *tensor, int data_format) {
 
-  switch(data_format){
+  switch (data_format) {
   case 0:
-    data_format = CUDNN_TENSOR_NCHW; break;
+    data_format = CUDNN_TENSOR_NCHW;
+    break;
   case 1:
-    data_format = CUDNN_TENSOR_NHWC; break;
-  
+    data_format = CUDNN_TENSOR_NHWC;
+    break;
+
   default:
     break;
   }
@@ -167,39 +166,31 @@ void setCudnnDataFormat(struct Tensor* tensor, int data_format){
   DEBUG("tensor->data_format = %d \n", tensor->data_format);
 }
 
-
-void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size,
-			   size_t dim2_size, size_t dim3_size, size_t dim4_size){
+void set4DFilterDescriptor(struct Tensor *tensor, int data_format,
+                           size_t dim1_size, size_t dim2_size, size_t dim3_size,
+                           size_t dim4_size) {
 
   setCudnnDataFormat(tensor, data_format);
-  
+
   checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_desc));
 
   checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_half_desc));
 
-  
-  checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_desc,
-					(cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type,
-					(cudnnTensorFormat_t) tensor->data_format,
-					dim1_size,
-					dim2_size, 
-					dim3_size,
-					dim4_size));
-
-  checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_half_desc,
-					(cudnnDataType_t) CUDNN_DATA_HALF,
-					(cudnnTensorFormat_t) tensor->data_format,
-					dim1_size,
-					dim2_size, 
-					dim3_size,
-					dim4_size));  
+  checkCUDNN(cudnnSetFilter4dDescriptor(
+      tensor->filter_desc,
+      (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type,
+      (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size,
+      dim4_size));
 
+  checkCUDNN(cudnnSetFilter4dDescriptor(
+      tensor->filter_half_desc, (cudnnDataType_t)CUDNN_DATA_HALF,
+      (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size,
+      dim4_size));
 }
 
-
-
-void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size,
-			   size_t dim2_size, size_t dim3_size, size_t dim4_size){
+void set4DTensorDescriptor(struct Tensor *tensor, int data_format,
+                           size_t dim1_size, size_t dim2_size, size_t dim3_size,
+                           size_t dim4_size) {
 
   setCudnnDataFormat(tensor, data_format);
 
@@ -207,292 +198,270 @@ void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
 
   checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_half_desc));
 
-  // For certain operations, the strides may need to change - in which case the descriptor
-  // needs to be reinitialized
-  cudnnSetTensor4dDescriptor(tensor->tensor_desc,
-			     (cudnnTensorFormat_t) tensor->data_format, // Data format
-			     (cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type, // Data type
-			     dim1_size, dim2_size, 
-			     dim3_size, dim4_size);
-
+  // For certain operations, the strides may need to change - in which case the
+  // descriptor needs to be reinitialized
+  cudnnSetTensor4dDescriptor(
+      tensor->tensor_desc,
+      (cudnnTensorFormat_t)tensor->data_format, // Data format
+      (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type, // Data type
+      dim1_size, dim2_size, dim3_size, dim4_size);
 
-  cudnnSetTensor4dDescriptor(tensor->tensor_half_desc,
-			     (cudnnTensorFormat_t) tensor->data_format, // Data format
-			     (cudnnDataType_t) CUDNN_DATA_HALF, // Data type
-			     dim1_size, dim2_size, 
-			     dim3_size, dim4_size);
+  cudnnSetTensor4dDescriptor(
+      tensor->tensor_half_desc,
+      (cudnnTensorFormat_t)tensor->data_format, // Data format
+      (cudnnDataType_t)CUDNN_DATA_HALF,         // Data type
+      dim1_size, dim2_size, dim3_size, dim4_size);
 
-  
   cudnnDataType_t dType;
   int nStride, cStride, hStride, wStride;
   int size1, size2, size3, size4;
-  cudnnGetTensor4dDescriptor(tensor->tensor_desc,
-  			     &dType,
-  			     &size1, &size2, &size3, &size4,
-  			     &nStride, &cStride, &hStride, &wStride);
-			   
-  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n",
-  	 nStride, cStride, hStride, wStride);
-}
+  cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2,
+                             &size3, &size4, &nStride, &cStride, &hStride,
+                             &wStride);
 
+  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride,
+        cStride, hStride, wStride);
+}
 
 // FIXIT: Striding still not working - hence 2D and 3D tensor support is missing
-void setTensorDescriptor(struct Tensor* tensor, int num_dims,
-			 size_t* dim_sizes){
+void setTensorDescriptor(struct Tensor *tensor, int num_dims,
+                         size_t *dim_sizes) {
 
   checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_desc));
 
-  int* strides = (int*) malloc(sizeof(int) * num_dims);
+  int *strides = (int *)malloc(sizeof(int) * num_dims);
   strides[num_dims - 1] = 1;
-  for(int i = num_dims - 2; i >= 0; i--){
-    strides[i] = strides[i+1] * dim_sizes[i+1];
+  for (int i = num_dims - 2; i >= 0; i--) {
+    strides[i] = strides[i + 1] * dim_sizes[i + 1];
   }
 
-  for(int i = 0; i < num_dims; i++){
+  for (int i = 0; i < num_dims; i++) {
     DEBUG("strides[%d] = %d \n", i, strides[i]);
   }
 
-  int* const_dims = (int*) malloc(sizeof(int) * num_dims);
-  for(int j = 0 ; j < num_dims; j++){
-    const_dims[j] = (int) dim_sizes[j];
+  int *const_dims = (int *)malloc(sizeof(int) * num_dims);
+  for (int j = 0; j < num_dims; j++) {
+    const_dims[j] = (int)dim_sizes[j];
     DEBUG("const_dim = %d \n", const_dims[j]);
   }
-  
-  DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type, CUDNN_DATA_FLOAT); 
-  // For certain operations, the strides may need to change - in which case the descriptor
-  // needs to be reinitialized
-  checkCUDNN(cudnnSetTensorNdDescriptor(tensor->tensor_desc,
-					(cudnnDataType_t) tensor->data_type, // Data type
-					num_dims,
-					(const int*) const_dims,
-					(const int*) strides));
+
+  DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type,
+        CUDNN_DATA_FLOAT);
+  // For certain operations, the strides may need to change - in which case the
+  // descriptor needs to be reinitialized
+  checkCUDNN(cudnnSetTensorNdDescriptor(
+      tensor->tensor_desc,
+      (cudnnDataType_t)tensor->data_type, // Data type
+      num_dims, (const int *)const_dims, (const int *)strides));
 }
 
+/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors.
 
-/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors. 
+void *create2DTensor(int data_type, size_t dim1_size, size_t dim2_size) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size;
+  allocateMem(tensor, data_type, num_elems);
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 2);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 2;
 
+  return tensor;
+}
 
-  void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size){
-    struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size;
-    allocateMem(tensor, data_type, num_elems);
-    // Setting the tensor dimensions  
-    size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 2);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 2;
-  
-    return tensor;
-  }
+void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
+                     size_t dim3_size) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size;
+  allocateMem(tensor, data_type, num_elems);
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 3);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 3;
+
+  return tensor;
+}
 
+void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+                     size_t dim2_size, size_t dim3_size, size_t dim4_size) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  allocateMem(tensor, data_type, num_elems);
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  dim_sizes[3] = dim4_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 4;
+  // Done setting tensor dimensions
+  // setTensorDescriptor(tensor, 4, dim_sizes);
+  set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
+                        dim4_size);
+  // FIXIT: filter descriptor should be invoked only for filters
+  set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
+                        dim4_size);
+
+  return tensor;
+}
 
-  void* create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size){
-    struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size * dim3_size;
-    allocateMem(tensor, data_type, num_elems);
-    // Setting the tensor dimensions  
-    size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 3);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    dim_sizes[2] = dim3_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 3;
-
-    return tensor;
-  }
+void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
+  Tensor *tensor = (Tensor *)tensor_ptr;
 
-  void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size, size_t dim4_size){
-    struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-    allocateMem(tensor, data_type, num_elems);
-    // Setting the tensor dimensions  
-    size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 4);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    dim_sizes[2] = dim3_size;
-    dim_sizes[3] = dim4_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 4;
-    // Done setting tensor dimensions  
-    //setTensorDescriptor(tensor, 4, dim_sizes);
-    set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size);
-    // FIXIT: filter descriptor should be invoked only for filters
-    set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size);
-  
-    return tensor;
+  size_t host_size_in_bytes = tensor->num_elems * 4;
+  // if(tensor->size_in_bytes != size_in_bytes){
+  if (host_size_in_bytes != size_in_bytes) {
+    ERROR("The destination and source sizes don't match");
   }
 
+  std::memcpy(tensor->host_data, data_ptr, size_in_bytes);
 
-  void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){
+  changeTensorPlacement(tensor, HOST);
+
+  tensor->cur_type = float_type;
+}
 
-    Tensor* tensor = (Tensor*) tensor_ptr;
+void hostToDeviceCopy(struct Tensor *tensor) {
 
-    size_t host_size_in_bytes = tensor->num_elems * 4;
-    //if(tensor->size_in_bytes != size_in_bytes){
-    if(host_size_in_bytes != size_in_bytes){
-      ERROR("The destination and source sizes don't match");
-    }
-  
-    std::memcpy(tensor->host_data, data_ptr, size_in_bytes);
+  if (tensor->data_placement != DEVICE) {
+    cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
+               cudaMemcpyHostToDevice);
+    DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
+    tensor->data_placement = DEVICE;
+  } else {
+    DEBUG("No data movement required - Data on Device \n");
+  }
+}
 
-    changeTensorPlacement(tensor, HOST);
+void deviceToHostCopy(struct Tensor *tensor) {
 
-    tensor->cur_type = float_type;
+  if (tensor->data_placement != HOST) {
+    cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
+               cudaMemcpyDeviceToHost);
+    DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
+    tensor->data_placement = HOST;
+  } else {
+    DEBUG("No data movement required - Data on Host \n");
   }
+}
 
-		      
+// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
 
-  void hostToDeviceCopy(struct Tensor* tensor){
+void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
 
-    if(tensor->data_placement != DEVICE){
-      cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
-		 cudaMemcpyHostToDevice);
-      DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
-      tensor->data_placement = DEVICE;
-    }
-    else{
-      DEBUG("No data movement required - Data on Device \n");    
-    }
-  
-  }
+  struct Tensor *srcTensor = (struct Tensor *)srcTensor_ptr;
+  struct Tensor *dstTensor = (struct Tensor *)dstTensor_ptr;
 
+  if (srcTensor->data_placement == HOST) {
+    memcpy(dstTensor->host_data, srcTensor->host_data,
+           srcTensor->size_in_bytes);
+    DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
+    dstTensor->data_placement = HOST;
+  } else if (srcTensor->data_placement == DEVICE) {
+    cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data,
+               srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice);
+    DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
+    dstTensor->data_placement = DEVICE;
+  }
+}
 
-  void deviceToHostCopy(struct Tensor* tensor){
+void hpvm_request_tensor(void *tensor_ptr, int destination) {
 
-    if(tensor->data_placement != HOST){
+  Tensor *tensor = (Tensor *)tensor_ptr;
+  // If destination is the host
+  if (destination == 0) {
+    if (tensor->data_placement != HOST) {
       cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
-		 cudaMemcpyDeviceToHost);  
+                 cudaMemcpyDeviceToHost);
       DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
       tensor->data_placement = HOST;
+    } else {
+      DEBUG("No data movement required - Data on Host \n");
     }
-    else{
-      DEBUG("No data movement required - Data on Host \n");    
-    }
-    
-  }
-
-
-  //void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
-
-  void tensorCopy(void* srcTensor_ptr, void* dstTensor_ptr){
-
-    struct Tensor* srcTensor = (struct Tensor*) srcTensor_ptr;
-    struct Tensor* dstTensor = (struct Tensor*) dstTensor_ptr;
-
-    
-    if(srcTensor->data_placement == HOST){
-      memcpy(dstTensor->host_data, srcTensor->host_data, srcTensor->size_in_bytes);  
-      DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
-      dstTensor->data_placement = HOST;
-    }
-    else if (srcTensor->data_placement == DEVICE){
-      cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data, srcTensor->size_in_bytes,
-		 cudaMemcpyDeviceToDevice);
-      DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
-      dstTensor->data_placement = DEVICE;
-    }
-    
   }
+  // If destination is the GPU
+  else if (destination == 1) {
 
-
-  void hpvm_request_tensor(void* tensor_ptr, int destination){
-
-    Tensor* tensor = (Tensor*) tensor_ptr;
-    // If destination is the host
-    if(destination == 0){  
-      if(tensor->data_placement != HOST){
-	cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
-		   cudaMemcpyDeviceToHost);  
-	DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
-	tensor->data_placement = HOST;
-      }
-      else{
-	DEBUG("No data movement required - Data on Host \n");    
-      }
-    }
-    // If destination is the GPU
-    else if(destination == 1){
-
-      if(tensor->data_placement != DEVICE){
-	cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
-		   cudaMemcpyHostToDevice);
-	DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
-	tensor->data_placement = DEVICE;
-      }
-      else{
-	DEBUG("No data movement required - Data on Device \n");    
-      }    
+    if (tensor->data_placement != DEVICE) {
+      cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
+                 cudaMemcpyHostToDevice);
+      DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
+      tensor->data_placement = DEVICE;
+    } else {
+      DEBUG("No data movement required - Data on Device \n");
     }
-  
   }
+}
 
+void convertToFP16(struct Tensor *tensor) {
 
-
- void convertToFP16(struct Tensor* tensor){
-
-  if(tensor == NULL)
+  if (tensor == NULL)
     return;
-  
+
   if (tensor->cur_type == half_type)
     return;
-    
+
   DEBUG("ConvertoFP16 \n");
 
   setSizeInBytes(tensor, half_type, tensor->num_elems);
   size_t size_in_bytes = tensor->size_in_bytes;
   DEBUG("size_in_bytes = %d \n", size_in_bytes);
-  
-  if(tensor->gpu_half_data == NULL)
-     checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU
-  // If Tensor is one of Tracked (has to free per batch) then track all data types
-  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+
+  if (tensor->gpu_half_data == NULL)
+    checkCudaErrors(cudaMalloc(&tensor->gpu_half_data,
+                               size_in_bytes)); // Allocate memory on GPU
+  // If this tensor is tracked (must be freed per batch), track this data
+  // buffer as well
+  if (tracked_tensors.find(tensor) != tracked_tensors.end())
     tensors_ptr.insert(tensor->gpu_half_data);
 
-  f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data);
+  f2h((float *)tensor->gpu_data, tensor->num_elems,
+      (half *)tensor->gpu_half_data);
 
-  tensor->cur_type = half_type;  
+  tensor->cur_type = half_type;
 }
 
+void convertToFP32(struct Tensor *tensor) {
 
-
-void convertToFP32(struct Tensor* tensor){
-
-  if(tensor == NULL)
+  if (tensor == NULL)
     return;
-  
+
   // Need this check for both offline and online profiling path
   if (tensor->cur_type == float_type)
     return;
-    
+
   DEBUG("ConvertoFP32 \n");
-  
+
   setSizeInBytes(tensor, float_type, tensor->num_elems);
   size_t size_in_bytes = tensor->size_in_bytes;
-  
+
   // If FP32 data array doesn't exist, allocate
-  if(tensor->gpu_data == NULL){
-    checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
+  if (tensor->gpu_data == NULL) {
+    checkCudaErrors(
+        cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
     DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes);
   }
-  // If Tensor is one of Tracked (has to free per batch) then track all data types
-  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+  // If this tensor is tracked (must be freed per batch), track this data
+  // buffer as well
+  if (tracked_tensors.find(tensor) != tracked_tensors.end())
     tensors_ptr.insert(tensor->gpu_data);
 
-  h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);
+  h2f((half *)tensor->gpu_half_data, tensor->num_elems,
+      (float *)tensor->gpu_data);
 
   tensor->cur_type = float_type;
-
 }
 
+void convertToFP32_offline(struct Tensor *tensor) {
 
-
-void convertToFP32_offline(struct Tensor* tensor){
-
-  if(tensor == NULL)
+  if (tensor == NULL)
     return;
 
   if (tensor->cur_type == half_type)
@@ -504,36 +473,36 @@ void convertToFP32_offline(struct Tensor* tensor){
   size_t size_in_bytes = tensor->size_in_bytes;
 
   // If FP32 data array doesn't exist, allocate
-  if(tensor->gpu_data == NULL){
-    checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
+  if (tensor->gpu_data == NULL) {
+    checkCudaErrors(
+        cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
     DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes);
   }
 
-  // If Tensor is one of Tracked (has to free per batch) then track all data types
-  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+  // If this tensor is tracked (must be freed per batch), track this data
+  // buffer as well
+  if (tracked_tensors.find(tensor) != tracked_tensors.end())
     tensors_ptr.insert(tensor->gpu_data);
 
-  h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);
+  h2f((half *)tensor->gpu_half_data, tensor->num_elems,
+      (float *)tensor->gpu_data);
 
   tensor->cur_type = float_type;
-  
+
   cudaFree(tensor->gpu_half_data);
   tensors_ptr.erase(tensor->gpu_half_data);
   tensor->gpu_half_data = NULL;
 }
 
-
-
-
-
 // Called from within the runtime to change the data placement
-// This routine is required to change the output data placements from host to device
-void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement){
+// This routine is required to change the output data placements from host to
+// device
+void changeTensorPlacement(struct Tensor *tensor,
+                           data_location_t data_placement) {
 
-  if(tensor == NULL)
+  if (tensor == NULL)
     ERROR("Tensor == NULL");
   tensor->data_placement = data_placement;
 }
 
-
 } // end of Extern"C"
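
The functions reformatted in this file form the tensor runtime's allocation and data-movement API. Below is a minimal call-side sketch; it is illustrative only and not part of this commit, and it assumes that the declarations seen above (`create4DTensor`, `initTensorData`, `hpvm_request_tensor`, `convertToFP16`) and the `float_type` enum constant are exported through `tensor_utils.h`.

```cpp
// Illustrative sketch (not part of this diff): allocate a 1x3x32x32 NCHW float
// tensor, fill it from host memory, move it to the GPU, then switch the device
// copy to FP16. Assumes the HPVM tensor runtime headers used in this file.
#include <cstddef>
#include "tensor_utils.h" // assumed to declare the functions reformatted above

void run_batch(float *host_buf) {
  // data_format 0 maps to CUDNN_TENSOR_NCHW in setCudnnDataFormat above.
  void *t = create4DTensor(float_type, /*data_format=*/0, 1, 3, 32, 32);

  // initTensorData checks that the source size equals num_elems * 4 bytes.
  initTensorData(t, host_buf, 1 * 3 * 32 * 32 * sizeof(float));

  // hpvm_request_tensor: destination 0 = host, 1 = GPU.
  hpvm_request_tensor(t, /*destination=*/1);

  // Optional FP16 path: allocates gpu_half_data and converts the FP32
  // device copy to half precision.
  convertToFP16((struct Tensor *)t);
}
```

Note that `create4DTensor` sets both the tensor and the filter descriptor (see the FIXIT above), so the same call is used for weights as well as activations.
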
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
index 5cdfdf5a55109fac66a89f544306fbe7b4b9562a..8c77234e2432bd5fe1cde144b031d42273140d42 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
@@ -1,13 +1,13 @@
 //===--------------------------- wrapper_runtime.cu -----------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-// This file contains the implementation of some of the core API to tensor runtime
-// so that runtime tuning of approximations can be done on different targets.
+//
+// This file contains the implementation of some of the core API to tensor
+// runtime so that runtime tuning of approximations can be done on different
+// targets.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include <stdio.h>
 #include <cstdio>
 #include <cstdlib>
@@ -24,7 +24,6 @@
 #include <cuda_fp16.h>
 #include <driver_types.h>
 
-
 // Tensor runtime header files
 #include "tensor_utils.h"
 #include "debug.h"
@@ -37,641 +36,580 @@
 #include "half_precision_api.h"
 
 #include "hpvm-rt-controller.h"
-#include "approxhpvm_runtime_utils.h" 
+#include "approxhpvm_runtime_utils.h"
 #include "approx_api.h"
 
-
-extern "C"{
-
-  /**** Wrapper Runtime API ***/
-
-
-  void* wrapper_ConvLayer(const char* hpvm_node_id,
-			  void* input, 
-			  void* filter, 
-			  void* bias, 
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id, int pool_size,
-			  int activation_id,
-			  // NOTE: out_min, out_max are only relevant for ClippedRelu
-			  float out_min, float out_max){
-
-    NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-
-    if (NodeConf->isGPUNodeConfiguration()) {
-	DEBUG("GPU Configuration for ConvLayer\n");
-	// Mapped to GPU - get a GPU node configuration
-	GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
-
-	std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-				std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						       int> > > > &ApproxChoices =
-	  GPUConf->getApproxChoices();
-
-	// Check for convolution as first operation
-	CUSTOM_ASSERT((ApproxChoices.size() >= 1) &&
-		      (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
-		      "Incorrect number/type of operations in provided Conv layer configuration");
-
-	void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second,
-							     input, filter, conv_pad_h, conv_pad_w,
-							     conv_stride_h, conv_stride_w);
-	void* add_out;
-	if (bias != NULL) {
-	  // Check for add as second operation
-	  CUSTOM_ASSERT((ApproxChoices.size() >= 2) &&
-			(ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
-			"Incorrect number/type of operations in provided Conv layer configuration");
-	  add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
-						       conv_out, bias);
-	} else {
-	  add_out = conv_out;
-	}
-
-	void* activation_out;
-	switch (activation_id) {
-	case -1:
-	  { // No activation
-	    //INFO("No activation Function\n");
-	    activation_out = add_out;
-	  }
-	  break;
-	case 0:
-	  { // TanH activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 1:
-	  { // ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 2:
-	  { // Clipped ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out =
-	      handleTensorClippedReluApproximationTuples(ApproxChoices[2].second,
-							 add_out, out_min, out_max);
-	  }
-	  break;
-	default:
-	  {
-	    ERROR("Activation id %d NOT supported \n", activation_id);
-	  }
-	  break;
-	}
-
-	void* pool_out;
-
-	if (pool_size > 0) {
-	  switch (pool_id) {
-	  case 0:
-	    {
-	      // If we remove the asserts, we can have all cases handled by a single call
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
-			    "Expected POOL_MAX in provided Conv layer configuration");
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size, pool_size, 0, 0,
-						       pool_size, pool_size);
-	    }
-	    break;
-	  case 1:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
-			    "Expected POOL_MEAN in provided Conv layer configuration");
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size, pool_size, 0, 0,
-						       pool_size, pool_size);
-	    }
-	    break;
-	  case 2:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
-			    "Expected POOL_MIN in provided Conv layer configuration");
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size, pool_size, 0, 0,
-						       pool_size, pool_size);
-	    }
-	    break;
-	  default:
-	    {
-	      ERROR("Pool id %d NOT supported \n", pool_id);
-	    }
-	    break;
-	  }
-	} else {
-	  pool_out = activation_out;
-	}
-	return pool_out;
+extern "C" {
+
+/**** Wrapper Runtime API ***/
+
+void *
+wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
+                  void *bias, int conv_pad_h, int conv_pad_w, int conv_stride_h,
+                  int conv_stride_w, int pool_id, int pool_size,
+                  int activation_id,
+                  // NOTE: out_min, out_max are only relevant for ClippedRelu
+                  float out_min, float out_max) {
+
+  NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
+
+  if (NodeConf->isGPUNodeConfiguration()) {
+    DEBUG("GPU Configuration for ConvLayer\n");
+    // Mapped to GPU - get a GPU node configuration
+    GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
+
+    std::vector<
+        std::pair<GPUNodeConfiguration::TENSOR_OP,
+                  std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+        &ApproxChoices = GPUConf->getApproxChoices();
+
+    // Check for convolution as first operation
+    CUSTOM_ASSERT(
+        (ApproxChoices.size() >= 1) &&
+        (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
+        "Incorrect number/type of operations in provided Conv layer "
+        "configuration");
+
+    void *conv_out = handleTensorConvApproximationTuples(
+        ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w,
+        conv_stride_h, conv_stride_w);
+    void *add_out;
+    if (bias != NULL) {
+      // Check for add as second operation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 2) &&
+          (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
+                                                   conv_out, bias);
+    } else {
+      add_out = conv_out;
+    }
+
+    void *activation_out;
+    switch (activation_id) {
+    case -1: { // No activation
+      // INFO("No activation Function\n");
+      activation_out = add_out;
+    } break;
+    case 0: { // TanH activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 1: { // ReLU activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 2: { // Clipped ReLU activation
+      CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
+                    (ApproxChoices[2].first ==
+                     GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
+                    "Incorrect number/type of operations in provided Conv "
+                    "layer configuration");
+      activation_out = handleTensorClippedReluApproximationTuples(
+          ApproxChoices[2].second, add_out, out_min, out_max);
+    } break;
+    default: {
+      ERROR("Activation id %d NOT supported \n", activation_id);
+    } break;
+    }
+
+    void *pool_out;
+
+    if (pool_size > 0) {
+      switch (pool_id) {
+      case 0: {
+        // If we remove the asserts, we can have all cases handled by a single
+        // call
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
+                      "Expected POOL_MAX in provided Conv layer configuration");
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size,
+            pool_size, 0, 0, pool_size, pool_size);
+      } break;
+      case 1: {
+        CUSTOM_ASSERT(
+            (ApproxChoices.back().first ==
+             GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
+            "Expected POOL_MEAN in provided Conv layer configuration");
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size,
+            pool_size, 0, 0, pool_size, pool_size);
+      } break;
+      case 2: {
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
+                      "Expected POOL_MIN in provided Conv layer configuration");
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size,
+            pool_size, 0, 0, pool_size, pool_size);
+      } break;
+      default: {
+        ERROR("Pool id %d NOT supported \n", pool_id);
+      } break;
       }
-      else {
-	ERROR("Unsupported Configuration");
-	abort();
-      }
-
-    return NULL;
+    } else {
+      pool_out = activation_out;
+    }
+    return pool_out;
+  } else {
+    ERROR("Unsupported Configuration");
+    abort();
   }
 
+  return NULL;
+}
 
-
-
-  
-  void* wrapper_ConvLayer2(const char* hpvm_node_id,
-			  void* input, 
-			  void* filter, 
-			  void* bias, 
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id,
-			  int pool_size_v, int pool_size_h,			 
-			  int pool_pad_v, int pool_pad_h,
-			  int pool_stride_v, int pool_stride_h,
-			  int activation_id,
-			  // NOTE: out_min, out_max are only relevant for ClippedRelu
-			  float out_min, float out_max){
-
-    INFO ("*** Conv Layer \n");
-    
-    NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-	if (NodeConf->isGPUNodeConfiguration()) {
-	DEBUG("GPU Configuration for ConvLayer\n");
-	// Mapped to GPU - get a GPU node configuration
-	GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
-
-	std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-				std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						       int> > > > &ApproxChoices =
-	GPUConf->getApproxChoices();
-
-	
-	//printf("*** Convolution \n ApproxChoice = %d \n  BatchNorm = %d \n CONV = %d \n", ApproxChoices[0].first,
-	//	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-	//       GPUNodeConfiguration::TENSOR_OP::CONV);
-
-	// Check for convolution as first operation
-	CUSTOM_ASSERT((ApproxChoices.size() >= 1) &&
-		      (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
-		      "Incorrect number/type of operations in provided Conv layer configuration");
-
-
-	
-	void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second,
-							     input, filter, conv_pad_h, conv_pad_w,
-							     conv_stride_h, conv_stride_w);
-	void* add_out;
-	if (bias != NULL) {
-	  // Check for add as second operation
-	  CUSTOM_ASSERT((ApproxChoices.size() >= 2) &&
-			(ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
-			"Incorrect number/type of operations in provided Conv layer configuration");
-	  add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
-						       conv_out, bias);
-	} else {
-	  add_out = conv_out;
-	}
-
-	void* activation_out;
-	switch (activation_id) {
-	case -1:
-	  { // No activation
-	    //INFO("No activation Function\n");
-	    activation_out = add_out;
-	  }
-	  break;
-	case 0:
-	  { // TanH activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 1:
-	  { // ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 2:
-	  { // Clipped ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out =
-	      handleTensorClippedReluApproximationTuples(ApproxChoices[2].second,
-							 add_out, out_min, out_max);
-	  }
-	  break;
-	default:
-	  {
-	    ERROR("Activation id %d NOT supported \n", activation_id);
-	  }
-	  break;
-	}
-
-	void* pool_out;
-
-	if (pool_size_v > 0) {
-	  switch (pool_id) {
-	  case 0:
-	    {
-	      // If we remove the asserts, we can have all cases handled by a single call
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
-			    "Expected POOL_MAX in provided Conv layer configuration");
-	      
-	      pool_out = handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-								activation_out, pool_id,
-								pool_size_v, pool_size_h,
-								pool_pad_v, pool_pad_h,
-								pool_stride_v, pool_stride_h);
-	      
-
-	    }
-	    break;
-	  case 1:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
-			    "Expected POOL_MEAN in provided Conv layer configuration");
-
-	      // FIXIT: POOL_MEAN still needs fixing
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size_v, pool_size_h,
-						       0, 0,
-						       pool_size_v, pool_size_h);
-	    
-	    }
-	    break;
-	  case 2:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
-			    "Expected POOL_MIN in provided Conv layer configuration");
-
-	      // FIXIT: Pool_MEAN needs fixing
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size_v, pool_size_h, 0, 0,
-						       pool_size_v, pool_size_h);
-	    }
-	    break;
-	  default:
-	    {
-	      ERROR("Pool id %d NOT supported \n", pool_id);
-	    }
-	    break;
-	  }
-	} else {
-	  pool_out = activation_out;
-	}
-	return pool_out;
-      }
-      else {
-	ERROR("Unsupported Configuration");
-	abort();
+void *wrapper_ConvLayer2(
+    const char *hpvm_node_id, void *input, void *filter, void *bias,
+    int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
+    int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
+    int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
+    // NOTE: out_min, out_max are only relevant for ClippedRelu
+    float out_min, float out_max) {
+
+  INFO("*** Conv Layer \n");
+
+  NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
+  if (NodeConf->isGPUNodeConfiguration()) {
+    DEBUG("GPU Configuration for ConvLayer\n");
+    // Mapped to GPU - get a GPU node configuration
+    GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
+
+    std::vector<
+        std::pair<GPUNodeConfiguration::TENSOR_OP,
+                  std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+        &ApproxChoices = GPUConf->getApproxChoices();
+
+    // printf("*** Convolution \n ApproxChoice = %d \n  BatchNorm = %d \n CONV =
+    // %d \n", ApproxChoices[0].first,
+    //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
+    //       GPUNodeConfiguration::TENSOR_OP::CONV);
+
+    // Check for convolution as first operation
+    CUSTOM_ASSERT(
+        (ApproxChoices.size() >= 1) &&
+        (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
+        "Incorrect number/type of operations in provided Conv layer "
+        "configuration");
+
+    void *conv_out = handleTensorConvApproximationTuples(
+        ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w,
+        conv_stride_h, conv_stride_w);
+    void *add_out;
+    if (bias != NULL) {
+      // Check for add as second operation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 2) &&
+          (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
+                                                   conv_out, bias);
+    } else {
+      add_out = conv_out;
+    }
+
+    void *activation_out;
+    switch (activation_id) {
+    case -1: { // No activation
+      // INFO("No activation Function\n");
+      activation_out = add_out;
+    } break;
+    case 0: { // TanH activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 1: { // ReLU activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 2: { // Clipped ReLU activation
+      CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
+                    (ApproxChoices[2].first ==
+                     GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
+                    "Incorrect number/type of operations in provided Conv "
+                    "layer configuration");
+      activation_out = handleTensorClippedReluApproximationTuples(
+          ApproxChoices[2].second, add_out, out_min, out_max);
+    } break;
+    default: {
+      ERROR("Activation id %d NOT supported \n", activation_id);
+    } break;
+    }
+
+    void *pool_out;
+
+    if (pool_size_v > 0) {
+      switch (pool_id) {
+      case 0: {
+        // If we remove the asserts, we can have all cases handled by a single
+        // call
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
+                      "Expected POOL_MAX in provided Conv layer configuration");
+
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size_v,
+            pool_size_h, pool_pad_v, pool_pad_h, pool_stride_v, pool_stride_h);
+
+      } break;
+      case 1: {
+        CUSTOM_ASSERT(
+            (ApproxChoices.back().first ==
+             GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
+            "Expected POOL_MEAN in provided Conv layer configuration");
+
+        // FIXIT: POOL_MEAN still needs fixing
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size_v,
+            pool_size_h, 0, 0, pool_size_v, pool_size_h);
+
+      } break;
+      case 2: {
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
+                      "Expected POOL_MIN in provided Conv layer configuration");
+
+        // FIXIT: Pool_MEAN needs fixing
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size_v,
+            pool_size_h, 0, 0, pool_size_v, pool_size_h);
+      } break;
+      default: {
+        ERROR("Pool id %d NOT supported \n", pool_id);
+      } break;
       }
-
-    return NULL;
+    } else {
+      pool_out = activation_out;
+    }
+    return pool_out;
+  } else {
+    ERROR("Unsupported Configuration");
+    abort();
   }
 
+  return NULL;
+}
 
-
-
-  
-
-  void* wrapper_FCLayer(const char* hpvm_node_id,
-			void* input, 
-			void* weights, 
-			void* bias, 
-			int activation_id,
-			// NOTE: out_min and out_max are only relevant for ClippedRelu
-			float out_min, float out_max){ 
-
-    INFO ("*** Dense Layer \n");
-    
-    NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-	if (NodeConf->isGPUNodeConfiguration()) {
-	DEBUG("GPU Configuration for FCLayer\n");
-	// Mapped to GPU - get a GPU node configuration
-	GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
-
-	std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-				std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						       int> > > > &ApproxChoices =
-	GPUConf->getApproxChoices();
-
-	// Approximation choices must be for a FC wrapper operation
-	CUSTOM_ASSERT((ApproxChoices.size() == 2 || ApproxChoices.size() == 3) &&
-		      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL &&
-		      ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
-		      "Invalid configuration generated for FC layer wrapper operation");
-
-	void* gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second,
-							    input, weights);
-	void* add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
-							   gemm_out, bias);
-
-	void* activation_out;
-	switch (activation_id) {
-	case -1:
-	  { // No activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 2) &&
-			  "Incorrect number of operations in provided FC layer configuration");
-	    //INFO("No activation Function\n");
-	    activation_out = add_out;
-	  }
-	  break;
-	case 0:
-	  { // TanH activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
-			  "Incorrect number/type of operations in provided FC layer configuration");
-	    activation_out = handleTensorTanhApproximationTuples(ApproxChoices[1].second,
-								 add_out);
-	  }
-	  break;
-	case 1:
-	  { // ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
-			  "Incorrect number/type of operations in provided FC layer configuration");
-	    activation_out = handleTensorReluApproximationTuples(ApproxChoices[1].second,
-								 add_out);
-	  }
-	  break;
-	case 2:
-	  { // Clipped ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
-			  "Incorrect number/type of operations in provided FC layer configuration");
-	    activation_out =
-	      handleTensorClippedReluApproximationTuples(ApproxChoices[1].second,
-							 add_out, out_min, out_max);
-	  }
-	  break;
-	default:
-	  {
-	    ERROR("Activation id %d NOT supported \n", activation_id);
-	  }
-	  break;
-	}
-	return activation_out;
-      }
-      else {
-	ERROR("Unsupported Configuration");
-	abort();
-      }
-
-    return NULL;
+void *
+wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
+                void *bias, int activation_id,
+                // NOTE: out_min and out_max are only relevant for ClippedRelu
+                float out_min, float out_max) {
+
+  INFO("*** Dense Layer \n");
+
+  NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
+  if (NodeConf->isGPUNodeConfiguration()) {
+    DEBUG("GPU Configuration for FCLayer\n");
+    // Mapped to GPU - get a GPU node configuration
+    GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
+
+    std::vector<
+        std::pair<GPUNodeConfiguration::TENSOR_OP,
+                  std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+        &ApproxChoices = GPUConf->getApproxChoices();
+
+    // Approximation choices must be for a FC wrapper operation
+    CUSTOM_ASSERT(
+        (ApproxChoices.size() == 2 || ApproxChoices.size() == 3) &&
+        ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL &&
+        ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
+        "Invalid configuration generated for FC layer wrapper operation");
+
+    void *gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second,
+                                                        input, weights);
+    void *add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
+                                                       gemm_out, bias);
+
+    void *activation_out;
+    switch (activation_id) {
+    case -1: { // No activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() == 2) &&
+          "Incorrect number of operations in provided FC layer configuration");
+      // INFO("No activation Function\n");
+      activation_out = add_out;
+    } break;
+    case 0: { // TanH activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() == 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
+          "Incorrect number/type of operations in provided FC layer "
+          "configuration");
+      activation_out =
+          handleTensorTanhApproximationTuples(ApproxChoices[1].second, add_out);
+    } break;
+    case 1: { // ReLU activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() == 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
+          "Incorrect number/type of operations in provided FC layer "
+          "configuration");
+      activation_out =
+          handleTensorReluApproximationTuples(ApproxChoices[1].second, add_out);
+    } break;
+    case 2: { // Clipped ReLU activation
+      CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
+                    (ApproxChoices[2].first ==
+                     GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
+                    "Incorrect number/type of operations in provided FC layer "
+                    "configuration");
+      activation_out = handleTensorClippedReluApproximationTuples(
+          ApproxChoices[1].second, add_out, out_min, out_max);
+    } break;
+    default: {
+      ERROR("Activation id %d NOT supported \n", activation_id);
+    } break;
+    }
+    return activation_out;
+  } else {
+    ERROR("Unsupported Configuration");
+    abort();
   }
 
+  return NULL;
+}
 
+void *wrapper_tensorRelu(const char *hpvm_node_id, void *input_ptr) {
 
+  INFO("*** Relu Operation \n");
 
-  void* wrapper_tensorRelu(const char* hpvm_node_id, void* input_ptr){
-
-    INFO("*** Relu Operation \n");
-    
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
-    // Approximation choices must be for a relu operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU &&
-		  "Invalid configuration generated for tensor relu wrapper operation");
+  // Approximation choices must be for a relu operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU &&
+      "Invalid configuration generated for tensor relu wrapper operation");
 
-    return handleTensorReluApproximationTuples(ApproxChoices[0].second,
-					       input_ptr);
-
-  }
+  return handleTensorReluApproximationTuples(ApproxChoices[0].second,
+                                             input_ptr);
+}
 
-  void* wrapper_tensorClippedRelu(const char* hpvm_node_id,
-				  void* input_ptr,
-				  float out_min, float out_max){
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+void *wrapper_tensorClippedRelu(const char *hpvm_node_id, void *input_ptr,
+                                float out_min, float out_max) {
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a relu operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU &&
-		  "Invalid configuration generated for tensor clipped relu wrapper operation");
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
-    return handleTensorClippedReluApproximationTuples(ApproxChoices[0].second,
-						      input_ptr, out_min, out_max);
+  // Approximation choices must be for a relu operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+                ApproxChoices[0].first ==
+                    GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU &&
+                "Invalid configuration generated for tensor clipped relu "
+                "wrapper operation");
 
-  }
+  return handleTensorClippedReluApproximationTuples(
+      ApproxChoices[0].second, input_ptr, out_min, out_max);
+}
 
-  void* wrapper_tensorTanh(const char* hpvm_node_id, void* input_ptr){
-    //  return tensorTanh(input_ptr);
+void *wrapper_tensorTanh(const char *hpvm_node_id, void *input_ptr) {
+  //  return tensorTanh(input_ptr);
 
-    GPUNodeConfiguration *GPUConf =
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
-    // Approximation choices must be for a tanh operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH &&
-		  "Invalid configuration generated for tensor tanh wrapper operation");
-
-    return handleTensorTanhApproximationTuples(ApproxChoices[0].second,
-					       input_ptr);
-
-  }
+  // Approximation choices must be for a tanh operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH &&
+      "Invalid configuration generated for tensor tanh wrapper operation");
 
+  return handleTensorTanhApproximationTuples(ApproxChoices[0].second,
+                                             input_ptr);
+}
 
-  void* wrapper_tensorBatchNorm(const char* hpvm_node_id,
-				void* input_ptr, void* gamma_ptr, void* beta_ptr,
-				void* mean_ptr, void* variance_ptr, double epsilon){
+void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
+                              void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+                              void *variance_ptr, double epsilon) {
 
-    INFO("*** BatchNorm Operation \n");
+  INFO("*** BatchNorm Operation \n");
 
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices =
 
-    GPUConf->getApproxChoices();
+          GPUConf->getApproxChoices();
 
-    // printf("*** BatchNorm \n ApproxChoice = %d \n  BatchNorm = %d \n CONV = %d \n", ApproxChoices[0].first,
-    //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-    //	       GPUNodeConfiguration::TENSOR_OP::CONV);
+  // printf("*** BatchNorm \n ApproxChoice = %d \n  BatchNorm = %d \n CONV = %d
+  // \n", ApproxChoices[0].first,
+  //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
+  //	       GPUNodeConfiguration::TENSOR_OP::CONV);
 
-    // Approximation choices must be for a batchnorm operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM &&
-		  "Invalid configuration generated for tensor batchnorm wrapper operation");
-
-    return handleTensorBatchNormApproximationTuples(ApproxChoices[0].second,
-						    input_ptr, gamma_ptr, beta_ptr,
-						    mean_ptr, variance_ptr, epsilon);
-
-  }
+  // Approximation choices must be for a batchnorm operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM &&
+      "Invalid configuration generated for tensor batchnorm wrapper operation");
 
+  return handleTensorBatchNormApproximationTuples(
+      ApproxChoices[0].second, input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+      variance_ptr, epsilon);
+}
 
-  void* wrapper_tensorAdd(const char* hpvm_node_id, void* input_ptr, void* bias_ptr){
+void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
+                        void *bias_ptr) {
 
-   
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices =
 
-    GPUConf->getApproxChoices();
+          GPUConf->getApproxChoices();
 
-    // Approximation choices must be for an add operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
-		  "Invalid configuration generated for tensor add wrapper operation");
+  // Approximation choices must be for an add operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
+      "Invalid configuration generated for tensor add wrapper operation");
 
-    return handleTensorAddApproximationTuples(ApproxChoices[0].second,
-					      input_ptr, bias_ptr);
-
-  }
+  return handleTensorAddApproximationTuples(ApproxChoices[0].second, input_ptr,
+                                            bias_ptr);
+}
 
+void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr,
+                            int poolFunction, int window_height,
+                            int window_width, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride) {
 
-  void* wrapper_tensorPooling(const char* hpvm_node_id,
-			      void* input_ptr,
-			      int poolFunction,
-			      int window_height, int window_width,
-			      int vertical_pad, int horizontal_pad,
-			      int vertical_stride, int horizontal_stride){
+  INFO("*** TensorPooling Operation \n");
 
-    INFO("*** TensorPooling Operation \n");
-    
-    //  return tensorPooling(input_ptr, poolFunction, window_height, window_width,
-    //		       vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+  //  return tensorPooling(input_ptr, poolFunction, window_height, window_width,
+  //		       vertical_pad, horizontal_pad, vertical_stride,
+  // horizontal_stride);
 
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a single operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  "Invalid configuration generated for tensor pool wrapper operation");
-    enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first;
-    // Approximation choices must be for a pool operation
-    CUSTOM_ASSERT((top == GPUNodeConfiguration::TENSOR_OP::POOL_MAX  ||
-		   top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN ||
-		   top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
-		  "Invalid configuration generated for tensor pool wrapper operation");
-
-    return handleTensorPoolingApproximationTuples(ApproxChoices[0].second,
-						  input_ptr, poolFunction,
-						  window_height, window_width,
-						  vertical_pad, horizontal_pad,
-						  vertical_stride, horizontal_stride);
-
-  }
-
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices =
+
+          GPUConf->getApproxChoices();
+
+  // Approximation choices must be for a single operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      "Invalid configuration generated for tensor pool wrapper operation");
+  enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first;
+  // Approximation choices must be for a pool operation
+  CUSTOM_ASSERT(
+      (top == GPUNodeConfiguration::TENSOR_OP::POOL_MAX ||
+       top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN ||
+       top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
+      "Invalid configuration generated for tensor pool wrapper operation");
+
+  return handleTensorPoolingApproximationTuples(
+      ApproxChoices[0].second, input_ptr, poolFunction, window_height,
+      window_width, vertical_pad, horizontal_pad, vertical_stride,
+      horizontal_stride);
+}
 
-  void* wrapper_tensorGroupConvolution(const char* hpvm_node_id,
-				       void* input, void* filter,
-				       int vertical_pad, int horizontal_pad,
-				       int vertical_stride, int horizontal_stride,
-				       int conv_mode, int conv_groups){
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input,
+                                     void *filter, int vertical_pad,
+                                     int horizontal_pad, int vertical_stride,
+                                     int horizontal_stride, int conv_mode,
+                                     int conv_groups) {
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a group_conv operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::GROUP_CONV &&
-		  "Invalid configuration generated for tensor group_conv wrapper operation");
-
-    return handleTensorGroupConvApproximationTuples(ApproxChoices[0].second,
-						    input, filter,
-						    vertical_pad, horizontal_pad,
-						    vertical_stride, horizontal_stride,
-						    conv_mode, conv_groups);
-
-  }
-
-
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
+
+  // Approximation choices must be for a group_conv operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+                ApproxChoices[0].first ==
+                    GPUNodeConfiguration::TENSOR_OP::GROUP_CONV &&
+                "Invalid configuration generated for tensor group_conv wrapper "
+                "operation");
+
+  return handleTensorGroupConvApproximationTuples(
+      ApproxChoices[0].second, input, filter, vertical_pad, horizontal_pad,
+      vertical_stride, horizontal_stride, conv_mode, conv_groups);
+}
 
-  void* wrapper_tensorSoftmax(const char* hpvm_node_id, void* input_ptr){
-    //  return tensorSoftmax(input_ptr);
+void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr) {
+  //  return tensorSoftmax(input_ptr);
 
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a softmax operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX &&
-		  "Invalid configuration generated for tensor softmax wrapper operation");
-
-    return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second, input_ptr);
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
+  // Approximation choices must be for a softmax operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX &&
+      "Invalid configuration generated for tensor softmax wrapper operation");
 
-  }
-
-
-
-  void* tensor_set_node_id(unsigned int node_id){
+  return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second,
+                                                input_ptr);
+}
 
-    currentTensorID = node_id;
+void *tensor_set_node_id(unsigned int node_id) {
 
-    return NULL;
-  }
+  currentTensorID = node_id;
 
+  return NULL;
+}
 }
diff --git a/hpvm/projects/keras/README.md b/hpvm/projects/keras/README.md
index 4abb5563fb1e0c0749c9bc67c9d7debe5adce93e..e3cd3b8b19f0df790867f403fbf9a2770c0fee89 100644
--- a/hpvm/projects/keras/README.md
+++ b/hpvm/projects/keras/README.md
@@ -43,6 +43,18 @@ python setup.py install
 ```
 **NOTE:** This step must be performed each time (for each shell process) the frontend is to be used.
 
+
+## Download CNN Model Files
+
+The weight (model) and data files used with the CNN benchmarks are hosted on Git LFS and need to be downloaded separately. This can be done using:
+
+```
+git lfs fetch
+git lfs checkout
+```
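+
+With a reasonably recent Git LFS client, `git lfs pull` should be equivalent to running the two commands above in a single step:
+
+```
+git lfs pull
+```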
+
+**NOTE:** Downloading the model and data files is necessary before running the benchmarks.
+
 ## Running Benchmaks
 
 Benchmarks under `./src/` 
diff --git a/hpvm/projects/keras/docs/Support.md b/hpvm/projects/keras/docs/Support.md
index a31d012d0bbed679445cacd0760fd7295a8e7088..e5e7b1a1a2125940cd0749e9c957c43bf2205aa3 100644
--- a/hpvm/projects/keras/docs/Support.md
+++ b/hpvm/projects/keras/docs/Support.md
@@ -1,5 +1,4 @@
 
-
 ## Supported Keras Operators 
 
 The Keras frontend supports `Sequential()` Keras models.
@@ -23,7 +22,19 @@ The list of supported operations is as follows:
 
 ## Limitations 
 
-* We support convolutional neural networks that include the supported operators above - RNNs/LSTMS are not supported
+* Currently, we support Convolutional Neural Networks (CNNs) composed of the supported operators listed above; RNNs/LSTMs are not supported
 * We currently only support models in NCHW format (NHWC is not supported)
 * Softmax operator should be the last operation in the CNN pipeline 
-* Softmax operation must be a separate operator (not specified as activation to another type of Keras operator)
+* Softmax operation must be a separate operator (not specified as an activation of another Keras operator). Example of what works:
+
+```
+Activation("softmax")
+```
+
+Example of what is NOT supported:
+
+```
+Dense(num_classes, activation="softmax")
+```
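+
+For reference, a minimal sketch of a model tail that satisfies both constraints (softmax as a separate, final operator); `num_classes` and the omitted earlier layers are placeholders, not part of any benchmark:
+
+```
+from keras.models import Sequential
+from keras.layers import Dense, Activation
+
+num_classes = 10  # placeholder number of output classes
+
+model = Sequential()
+# ... convolutional/pooling layers (NCHW) would go here ...
+model.add(Dense(num_classes))      # last Dense layer carries no fused activation
+model.add(Activation("softmax"))   # softmax added as a separate, final operator
+```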
+
+
diff --git a/hpvm/projects/pred_tuner/.gitignore b/hpvm/projects/pred_tuner/.gitignore
deleted file mode 100644
index 23e6d258015162d516c02fecb0a4f87acf4fb73d..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/.gitignore
+++ /dev/null
@@ -1,28 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Opentuner
-opentuner.db/
-opentuner.log
-
-# Custom
-.idea/
-.vscode/
-/data/
-results/
-tuner_results
-tuner_results/
-*.sh
-*.ipynb
-logistics/
-autotuner/
diff --git a/hpvm/projects/pred_tuner/LICENSE b/hpvm/projects/pred_tuner/LICENSE
deleted file mode 100644
index 2e229faa39851c4ddf71b0284c7e56a02dfd577a..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2017 liukuang
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/hpvm/projects/pred_tuner/README.md b/hpvm/projects/pred_tuner/README.md
deleted file mode 100644
index 8d7a6db2bdc622e6cac73c56e443e8d3e797133c..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/README.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Autotuning with Error-predictive Proxy
-
-Performs autotuning on program approximation knobs using an error-predictive proxy in place of the original
-program, to greatly speedup autotuning while getting results comparable in quality.
-
-Work in progress.
-
-## Getting Started
-
-After finishing this readme, go to [./proxy_tuner.py](./proxy_tuner.py) to try tuning one
-model. Use this set of arguments for a start:
-
-```bash
-python proxy_tuner.py --test-limit 1000 --accuracy-drop 1.5 --accuracy-slack 2.1 \
--o tuner_output alexnet2 autotuner/data/alexnet2
-```
-
-## Supported Programs & Approximations
-
-### Programs
-
-Currently DNN only. Support for several image processing benchmarks are in progress.
-
-Supported DNNs:
-
-- `LeNet @ MNIST`
-
-- `AlexNet @ CIFAR-10`
-
-- `AlexNet2 @ CIFAR-10`
-
-- `VGG16 @ CIFAR-10`
-
-- `ResNet18 @ CIFAR-10`
-
-- `MobileNet @ CIFAR-10`
-
-- `VGG16 @ CIFAR-100`
-
-- `VGG16 @ ImageNet`
-
-- `ResNet50 @ ImageNet`
-
-### Approximations
-
-Currently _hardware-independent_ approximations only. Hardware-reliant approximations are in progress.
-
-Approximations: (output) perforation for convolution, kernel sampling for convolution.
-
-## Proxy Model
-
-TODO: add working principle of proxy modeling.
-
-## Autotuner
-
-We use [opentuner](http://opentuner.org/) for autontuning tasks.
-
-## Project Structure
-
-### Library
-
-- `models`: PyTorch definition for DNN models
-
-  - `models/dataset`: Dataset loaders for both HPVM and PyTorch-standard DNN models
-
-  - `models/hpvm`: Definition for HPVM-ported models, with customized convolution layers
-
-- `toolkit`: core code of project, including DNN indexing / transformations / approximations. See
-  the code for details.
-
-### Entry Point
-
-- `./proxy_tuner.py`: perform autotuning for a given model, accuracy threshold, and a number of iterations,
-  using a proxy model that predicts the accuracy of approximated DNN (instead of running an inference, which
-  can be slow).
-
-- `./run_proxy_tuner.py`: run autotuning for all models defined in `utils/tuner_postprocess/benchmarks.py` on
-  a set of 3 accuracy thresholds, and perform postprocessing such as computing pareto curve.
-  
-  This is the right end-to-end script to use for obtaining a comprehensive set of autotuner results.
-
-### Other Code
-
-- `tests`: runnable scripts that can be used as tests (and other actual functionalities)
-
-- `utils`: helper functions for library and autotuner that are generally standalone, except
-
-  - `utils/utils.py` contains some convenient wrapper for model training, etc. that depends on the library.
-
-### Data
-
-- `autotuner/data`: descriptions for each DNN model, such as listing of layers, tunable
-  knobs, etc.
diff --git a/hpvm/projects/pred_tuner/bin/benchmark.py b/hpvm/projects/pred_tuner/bin/benchmark.py
deleted file mode 100644
index 92c8b2de5262469d9b752b5a2acd28db55e464a5..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/benchmark.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import gc
-from time import time
-from typing import Dict, Iterator, List
-
-import numpy
-from tqdm import tqdm
-
-from exp import Benchmark, bench_tuner_data
-from toolkit import ConfigT, LinearCombEstimator, LinearEstimator, LinearQoSEstimator, ModuleIndexer, \
-    NetApproxSelector
-from utils import gpu_mem_mb, init_by_name, nn_to_output, tensor_to_accuracy
-
-
-def generate_random_configs(layer_approxes: Dict[int, List[int]], n_configs: int) -> Iterator[ConfigT]:
-    from numpy.random import choice
-    from random import randrange
-    all_layers = [k for k, ns in layer_approxes.items() if ns]
-    for _ in range(n_configs):
-        config = {}
-        n_approx_layers_ = randrange(len(all_layers) + 1)
-        approx_layers = choice(all_layers, n_approx_layers_, replace=False)
-        for layer_idx in approx_layers:
-            config[layer_idx] = choice(layer_approxes[layer_idx], 1)[0]
-        yield config
-
-
-def time_action(action):
-    tt0 = time()
-    action()
-    tt1 = time()
-    return tt1 - tt0
-
-
-def mean_std_str(np_array):
-    return f"{np_array.mean():.7f} +- {np_array.std():.7f}"
-
-
-def main_loop(bench, baseline_dag, testloader):
-    _t_baseline_inf = time()
-    baseline_output = nn_to_output(baseline_dag.module, testloader)
-    baseline_acc = tensor_to_accuracy(baseline_output, testloader)
-    print(f"Model accuracy: {baseline_acc}; test set size: {baseline_output.size(0)}")
-    t_baseline_inf = time() - _t_baseline_inf
-    nas = NetApproxSelector(baseline_dag)
-
-    def acc_crit(inputs_):
-        return tensor_to_accuracy(inputs_, testloader)
-
-    def threshold_eval(inputs_):
-        import numpy as np
-        accs = np.array([acc_crit(x) for x in inputs_])
-        return baseline_acc - accs.mean() < 3.0
-
-    def run_model(net):
-        return nn_to_output(net, testloader)
-
-    _t_profile = time()
-    pickle_path = bench.result_dir / 'proxy.pkl'
-    f1 = LinearCombEstimator(
-        nas, run_model, acc_crit, threshold_eval, 0.95, independent_init=False
-    )
-    f2 = LinearQoSEstimator(
-        nas, run_model, acc_crit, threshold_eval, 0.95, independent_init=False
-    )
-    LinearEstimator.coinit_estimators(nas, run_model, threshold_eval, f1, f2, storage=pickle_path)
-    t_profile = time() - _t_profile
-    print(
-        f"Baseline inference time: {t_baseline_inf:.3f} sec, predictor init time: {t_profile:.3f} sec; "
-        f"Predictor init time is {t_profile / t_baseline_inf:.3f} times of inference time"
-    )
-    configs = generate_random_configs(nas.net_approxes, 30)
-    pbar = tqdm(configs)
-    times = []
-    for config in pbar:
-        pbar.set_postfix(mem=gpu_mem_mb())
-        approx = nas.apply_approx_by_config(config).module
-        t_inf = time_action(lambda: nn_to_output(approx, testloader))
-        t_f1 = time_action(lambda: f1.estimate(config))
-        t_f2 = time_action(lambda: f2.estimate(config))
-        pbar.write(
-            f"Inference time: {t_inf:.3f} sec, predictors time: {t_f1:.3f} | {t_f2:.3f} sec"
-        )
-        times.append([t_inf, t_f1, t_f2])
-        gc.collect()
-    times = numpy.array(times)
-    s_inf, s0, s1 = numpy.apply_along_axis(mean_std_str, 0, times)
-    print(f"Result: inference time {s_inf}, predictor time: {s0} | {s1}")
-    print("Timing raw data:", times)
-
-
-def main():
-    for network in (
-            'alexnet_hpvm', 'alexnet2_hpvm',
-            'vgg16_cifar10_hpvm', 'vgg16_cifar100_hpvm',
-            'mobilenet_hpvm',
-            'resnet18_hpvm',
-            'lenet_hpvm',
-            'vgg16_imagenet_hpvm',
-            'alexnet_imagenet_hpvm',
-            # 'resnet50_imagenet_hpvm',
-    ):
-        bench: Benchmark = bench_tuner_data[network]
-        print(f"{network}: ")
-        baseline, testloader, _, shapes = init_by_name(network)
-        baseline_dag = ModuleIndexer(baseline)
-        main_loop(bench, baseline_dag, testloader)
-        gc.collect()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/bin/discrepancy.py b/hpvm/projects/pred_tuner/bin/discrepancy.py
deleted file mode 100644
index 8be92df66ae3a2bcb2d33088bb20064404d37913..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/discrepancy.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-from pathlib import Path
-from typing import Optional
-
-import matplotlib.pyplot as plt
-import seaborn
-import torch
-from tqdm import tqdm
-
-from toolkit import ModuleIndexer, NetApproxSelector, StateCapturer
-from utils import device, init_by_name
-
-
-def run_concat_output_at(net_index: ModuleIndexer, testloader, layer: int) -> Optional[torch.Tensor]:
-    snet = StateCapturer(net_index, lambda i, x: x.clone().detach() if i == layer else None)
-    for inputs, targets in testloader:
-        inputs, targets = inputs.to(device), targets.to(device)
-        snet(inputs)
-    outputs = snet.net_state[layer]
-    return torch.cat(outputs) if outputs else None
-
-
-def get_discrepancy_for(baseline, approxed, testloader, changed_layer):
-    baseline_output = run_concat_output_at(baseline, testloader, changed_layer)
-    approxed_output = run_concat_output_at(approxed, testloader, changed_layer)
-    assert baseline_output.shape == approxed_output.shape
-    tqdm.write(f"{baseline_output.size()}")
-    diff = baseline_output - approxed_output
-    diff_rel = torch.abs(diff / baseline_output).cpu()
-    diff_rel[torch.isnan(diff_rel)] = 0
-    diff_rel[diff_rel > 10] = 10
-    return diff_rel
-
-
-def main():
-    prefix = Path('results/discrepancy/resnet50_imagenet_hpvm')
-    os.makedirs(prefix, exist_ok=True)
-    baseline, testloader, _, shapes = init_by_name('resnet50_imagenet_hpvm')
-    net_index = ModuleIndexer(baseline)
-    nas = NetApproxSelector(net_index)
-    total = sum(len(ns) for ns in nas.net_approxes.values())
-    for layer, approx, approxed_net_dag in tqdm(nas.apply_indep_approx(), total=total):
-        if approx == 11:
-            continue
-        diff_rel = get_discrepancy_for(net_index, approxed_net_dag, testloader, layer)
-        fig, ax = plt.subplots()
-        seaborn.heatmap(diff_rel.mean(0).mean(0).numpy(), ax=ax)
-        fig.savefig((prefix / f'{layer}_{approx}.png').open('wb'), dpi=200)
-        plt.close(fig)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/bin/filter_configs.py b/hpvm/projects/pred_tuner/bin/filter_configs.py
deleted file mode 100644
index bf23668b81ff0bdf071d27d9e010932ab07e6eea..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/filter_configs.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from typing import List, Tuple
-
-from exp import Benchmark, ExpState, bench_tuner_data
-from utils.config import Config
-
-
-def filter_configs(
-        validation: List[Config], test: List[Config],
-        vali_threshold: float, test_threshold: float = 3.0
-) -> Tuple[List[Config], List[Config]]:
-    # Filter validation and test set by their respective thresholds
-    filtered_validation = [
-        c for c in validation if c.avg_loss <= vali_threshold
-    ]
-    filtered_test = [
-        c for c in test if c.avg_loss <= test_threshold
-    ]
-    # Test configs also need to be a subset of validation configs.
-    name_to_filtered = {x.fname: x for x in filtered_test}
-    intersect_names = set(list(name_to_filtered.keys())).intersection(
-        set((x.fname for x in filtered_validation))
-    )
-    filtered_test_ = [name_to_filtered[fname] for fname in intersect_names]
-    assert set([id(x) for x in filtered_test_]).issubset(set([id(x) for x in filtered_test]))
-    return filtered_validation, filtered_test_
-
-
-def process_configs(bench: Benchmark, calib_slack: float, states: ExpState):
-    validated_configs = states.validated_configs.configs
-    tested_configs = states.tested_configs.configs
-    old_len = len(validated_configs)
-    valid_configs, test_configs = filter_configs(
-        validated_configs, tested_configs, calib_slack
-    )
-    states.valid_configs.finalize_dump(valid_configs)
-    states.test_configs.finalize_dump(test_configs)
-    print(f"{bench.model_name}: {old_len} -> {len(validated_configs)}, {len(tested_configs)}")
-    # Finalize data input and plot everything.
-    states.finalize_plot()
-
-
-def main():
-    for bench in bench_tuner_data.values():
-        bench: Benchmark
-        try:
-            states = ExpState(bench)
-        except ValueError:
-            print(f"Model {bench.model_name} has incomplete experiment data; skipping")
-            continue
-        process_configs(bench, 2.1, states)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/bin/inferences.py b/hpvm/projects/pred_tuner/bin/inferences.py
deleted file mode 100644
index 065abfd223f0a5c234dd36cc8aca7324415ac96f..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/inferences.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from tqdm import tqdm
-
-from models import BaselineInfo, networks
-from utils import device
-
-if __name__ == '__main__':
-    for net_name in networks:
-        baseline_info = BaselineInfo.init_by_name(net_name, device)
-        tqdm.write(f"{net_name}: {baseline_info.val_qos} (validation) {baseline_info.test_qos} (test")
diff --git a/hpvm/projects/pred_tuner/bin/mock_autotuner.py b/hpvm/projects/pred_tuner/bin/mock_autotuner.py
deleted file mode 100644
index ec12e1643ab319e0120f2e95c7801825f04484bb..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/mock_autotuner.py
+++ /dev/null
@@ -1,230 +0,0 @@
-import gc
-import json
-import os
-from pathlib import Path
-from sys import argv
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple
-
-import matplotlib.pyplot as plt
-import numpy as np
-from tqdm import tqdm, trange
-
-from exp import Benchmark, bench_tuner_data
-from toolkit import ConfigT, LinearCombEstimator, LinearEstimator, \
-    LinearQoSEstimator, ModuleIndexer, NetApproxSelector, WeightedLinearCombEstimator
-from toolkit.estimators import WeightedLinearQoSEstimator
-from utils import config_pylogger, gpu_mem_mb, init_by_name, nn_to_accuracy, nn_to_output, qos_stats, tensor_to_accuracy
-
-msg_logger = config_pylogger(output_dir=Path('tuner_results/logs'), verbose=True)
-
-
-class Evaluator:
-    def __init__(
-            self, nas: NetApproxSelector, n_approx_layers: Optional[int],
-            n_configs: int, testloader, threshold: Optional[float]
-    ):
-        self.nas = nas
-        self.layer_approxes = nas.net_approxes
-        self.n_approx_layers = n_approx_layers
-        self.n_configs = n_configs
-        self.testloader = testloader
-        self.threshold = threshold
-        self.config_accs = None
-
-    def generate_random_configs(self) -> Iterator[ConfigT]:
-        from numpy.random import choice
-        from random import randrange
-        all_layers = [k for k, ns in self.layer_approxes.items() if ns]
-        for _ in range(self.n_configs):
-            config = {}
-            if self.n_approx_layers is None:
-                n_approx_layers_ = randrange(len(all_layers) + 1)
-            else:
-                n_approx_layers_ = min(self.n_approx_layers, len(all_layers))
-            approx_layers = choice(all_layers, n_approx_layers_, replace=False)
-            for layer_idx in approx_layers:
-                config[layer_idx] = choice(self.layer_approxes[layer_idx], 1)[0]
-            yield config
-
-    def evaluate_config(self, config: ConfigT) -> Tuple[float, float]:
-        deterministic = self.nas.is_deterministic(config)
-        n_runs = 1 if deterministic else 30
-        approxed = self.nas.apply_approx_by_config(config).module
-        accs = []
-        for _ in trange(n_runs, leave=None):
-            acc = nn_to_accuracy(approxed, self.testloader)
-            accs.append(acc)
-        mean, confident_acc, _ = qos_stats(accs, 0.95)
-        return mean, confident_acc
-
-    def sort_configs_by_mean_acc(self):
-        sorted_ = sorted(self.config_accs, key=lambda p: p[1], reverse=True)
-        from itertools import takewhile
-        if self.threshold is not None:
-            sorted_ = list(takewhile(lambda p: p[1] > self.threshold, sorted_))
-        self.config_accs = np.array(sorted_)
-
-    @staticmethod
-    def calculate_perm_dist(pred_order):
-        n = len(pred_order)
-        actual_order = np.arange(n)
-        return np.linalg.norm(actual_order - pred_order, ord=1) / ((n ** 2 - 1) / 3)
-
-    def use_predictors(self, predictors: Iterable[LinearEstimator]) -> \
-            Optional[List[Tuple[np.ndarray, np.ndarray]]]:
-        self.sort_configs_by_mean_acc()
-        if len(self.config_accs) == 0:
-            return None
-        configs = self.config_accs[:, 0]
-        raw_prediction = []
-        for predictor in predictors:
-            # N * 2 array: avg acc, 95% confidence acc
-            pred_accs = np.array([
-                predictor.estimate(config) for config in configs
-            ])
-            pred_order = (-pred_accs[:, 0]).argsort(kind='stable')
-            raw_prediction.append((pred_accs, pred_order))
-        return raw_prediction
-
-    def run_configs(self):
-        configs = self.generate_random_configs()
-        pbar = tqdm(configs)
-        config_accs = []
-        for config in pbar:
-            pbar.set_postfix(mem=gpu_mem_mb())
-            mean_acc, confident_acc = self.evaluate_config(config)
-            config_accs.append([config, mean_acc, confident_acc])
-            gc.collect()
-        self.config_accs = np.array(config_accs)
-
-
-class NumpyEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.ndarray):
-            return obj.tolist()
-        return json.JSONEncoder.default(self, obj)
-
-
-class DataPlotStorage:
-    def __init__(self, save_to_prefix: Path):
-        self.save_to = save_to_prefix
-        os.makedirs(self.save_to.parent, exist_ok=True)
-        self.args = []
-        self.fig, self.axes = plt.subplots()
-
-    def plot(self, *args, **kwargs):
-        self.args.append({'args': args, 'kwargs': kwargs})
-        self.axes.plot(*args, **kwargs)
-
-    def errorbar(self, *args, **kwargs):
-        self.args.append({'args': args, 'kwargs': kwargs})
-        self.axes.errorbar(*args, **kwargs)
-
-    def save_and_close(self):
-        self.fig.savefig(self.save_to.with_suffix('.png'), dpi=200)
-        with self.save_to.with_suffix('.json').open('w') as f:
-            json.dump(self.args, f, cls=NumpyEncoder)
-        plt.close(self.fig)
-
-
-def compare_estimators(
-        eva: Evaluator, predictors: Dict[str, LinearEstimator], n_runs: int, st: DataPlotStorage
-):
-    all_dists = []
-    for _ in trange(n_runs):
-        eva.run_configs()
-        raw_predictions = eva.use_predictors(predictors.values())
-        dists = [eva.calculate_perm_dist(order) for _, order in raw_predictions]
-        all_dists.append(dists)
-    dists_t = zip(*all_dists)
-    for vs, label in zip(dists_t, predictors.keys()):
-        st.plot(sorted(vs), label=label)
-    st.axes.set_ylim(bottom=0)
-    st.fig.legend()
-    st.save_and_close()
-
-
-def plot_acc_estm_discrepancy(
-        eva: Evaluator, predictors: Dict[str, LinearEstimator], st: DataPlotStorage
-):
-    eva.run_configs()
-    raw_predictions = eva.use_predictors(predictors.values())
-    if not raw_predictions:
-        return
-    measured_mean_accs = eva.config_accs[:, 1]
-    yerr = measured_mean_accs - eva.config_accs[:, 2]
-    st.errorbar(
-        measured_mean_accs, measured_mean_accs, fmt='.', yerr=yerr, uplims=True, label='baseline'
-    )
-    for (pred_accs, _), label in zip(raw_predictions, predictors.keys()):
-        pred_accs = pred_accs
-        yerr = pred_accs[:, 0] - pred_accs[:, 1]
-        st.errorbar(
-            measured_mean_accs, pred_accs[:, 0],
-            fmt='.', yerr=yerr, uplims=True, label=label
-        )
-    min_x, max_x = np.min(measured_mean_accs), np.max(measured_mean_accs)
-    diag_x = np.linspace(min_x, max_x, 500)
-    st.errorbar(diag_x, diag_x, linewidth=1)
-    st.axes.set_xlabel('Measured accuracy (%)')
-    st.axes.set_ylabel('Predicted accuracy (%)')
-    st.fig.legend()
-    st.save_and_close()
-
-
-def train_predictors(eva: Evaluator, *predictors: LinearEstimator):
-    for conf in eva.generate_random_configs():
-        for p in predictors:
-            p.estimate(conf)
-
-
-def main():
-    base_path = Path(argv[1]) if len(argv) > 1 else Path('results/mock_autotuner')
-
-    for network in (
-            'alexnet2_hpvm', 'vgg16_cifar10_hpvm', 'vgg16_cifar100_hpvm',
-            'mobilenet_hpvm',
-            'resnet18_hpvm',
-            'vgg16_imagenet_hpvm', 'resnet50_imagenet_hpvm'
-    ):
-        bench: Benchmark = bench_tuner_data[network]
-        print(f"{bench.model_name}: ")
-        baseline, testloader, _, shapes = init_by_name(bench.model_name)
-        baseline_dag = ModuleIndexer(baseline)
-        baseline_acc = nn_to_accuracy(baseline_dag.module, testloader)
-        nas = NetApproxSelector(baseline_dag)
-
-        def acc_crit(inputs_):
-            return tensor_to_accuracy(inputs_, testloader)
-
-        def threshold_eval(inputs_):
-            accs = np.array([acc_crit(x) for x in inputs_])
-            return baseline_acc - accs.mean() < 3.0
-
-        def run_model(net):
-            return nn_to_output(net, testloader)
-
-        f1 = LinearCombEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
-        f2 = LinearQoSEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
-        f3 = WeightedLinearCombEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
-        f4 = WeightedLinearQoSEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False)
-        LinearEstimator.coinit_estimators(
-            nas, run_model, threshold_eval, f1, f2, f3, f4,
-            storage=Path('model_params/pickles') / Path(bench.base_dir).name / 'proxy_dev.pkl'
-        )
-        train_predictors(Evaluator(nas, None, 700, testloader, baseline_acc), f3, f4)
-        st = DataPlotStorage(base_path / "cmp_acc_diff" / f"{bench.model_name}")
-        plot_acc_estm_discrepancy(
-            Evaluator(nas, None, 200, testloader, baseline_acc - 10),
-            {'f1': f1, 'f2': f2, 'f3': f3, 'f4': f4}, st
-        )
-        st = DataPlotStorage(base_path / 'cmp_ordering' / f"{bench.model_name}" / "n_none")
-        compare_estimators(
-            Evaluator(nas, None, 20, testloader, None),
-            {'f1': f1, 'f2': f2, 'f3': f3, 'f4': f4}, 10, st
-        )
-        gc.collect()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/bin/print_approxes.py b/hpvm/projects/pred_tuner/bin/print_approxes.py
deleted file mode 100644
index c95d080326ad2e806d772454c15bed68c573ca17..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/print_approxes.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from collections import defaultdict
-
-import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn
-from tqdm import tqdm
-
-from models.domains import Accuracy
-from models import BaselineInfo
-from toolkit import NetApproxSelector
-from utils import device
-
-
-def main():
-    baseline_info = BaselineInfo.init_by_name('mobilenet_hpvm', device)
-    nas = NetApproxSelector(baseline_info.baseline_net, dev_time_only=True, ignore_fp32=False)
-    table = defaultdict(dict)
-    pbar = tqdm(nas.list_single_approxes())
-    for layer, approx, _ in pbar:
-        pbar.set_postfix(k=layer, i=approx)
-        approxed_net = nas.apply_approx_by_config({layer: approx}).module
-        acc: Accuracy = baseline_info.get_qos(approxed_net, baseline_info.val_loader)
-        table[layer][approx] = acc.to_scalar()
-    df = pd.DataFrame(
-        [pd.Series(list(d.values()), index=d.keys()) for d in table.values()],
-        index=list(table.keys())
-    )
-    with open('accuracy.json', 'w') as f:
-        df.to_json(f)
-    seaborn.heatmap(df.to_numpy())
-    plt.savefig('accuracy.png', dpi=200)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/bin/progress_graph.py b/hpvm/projects/pred_tuner/bin/progress_graph.py
deleted file mode 100644
index 0d7d0d5526f708e8049e3f185ebceebe68f4b778..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/progress_graph.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from itertools import groupby
-from operator import itemgetter
-from pathlib import Path
-from typing import Tuple
-
-import matplotlib.pyplot as plt
-
-from exp import Benchmark, ExpState, batch_id, bench_tuner_data
-from utils import Config
-
-
-def finalize_figs(filename, ax, fig):
-    ax.legend()
-    ax.set_ylim(bottom=1.0)
-    fig.savefig(filename, dpi=200)
-    plt.close(fig)
-
-
-def process_configs(bench: Benchmark, states: ExpState, shared_ax):
-    def get_features(c: Config) -> Tuple[int, int, float]:
-        *_, run_s, iter_s = c.fname.split('_')
-        return int(run_s), int(iter_s), c.speedup
-
-    def get_max_speedup(group):
-        group = sorted(list(group), key=itemgetter(1))
-        iter_max_speedup = []
-        max_speedup = 0
-        for _, i, speedup in group:
-            max_speedup = max(max_speedup, speedup)
-            iter_max_speedup.append((i, max_speedup))
-        return iter_max_speedup
-
-    run_iter_speedup = sorted(
-        [get_features(c) for c in states.all_configs.configs], key=itemgetter(0)
-    )
-    run_groups = groupby(run_iter_speedup, key=itemgetter(0))
-    fig, ax = plt.subplots()
-    for run, run_group in run_groups:
-        iter_max_speedup = get_max_speedup(run_group)
-        iters, max_speedups = zip(*iter_max_speedup)
-        ax.plot(iters, max_speedups, label=f"loss={run + 1}%")
-        if run + 1 == 3:
-            shared_ax.plot(iters, max_speedups, label=f"{bench.model_name.replace('_hpvm', '')}")
-    finalize_figs(bench.result_dir / f"tuner_progress.png", ax, fig)
-
-
-def main():
-    fig, ax = plt.subplots()
-    for bench in bench_tuner_data.values():
-        bench: Benchmark
-        try:
-            states = ExpState(bench)
-        except ValueError:
-            print(f"Model {bench.model_name} has incomplete experiment data; skipping")
-            continue
-        process_configs(bench, states, ax)
-    finalize_figs(Path("results") / f"{batch_id}_tuner_progress.png", ax, fig)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/bin/train_model.py b/hpvm/projects/pred_tuner/bin/train_model.py
deleted file mode 100644
index d3d0d80725f5784c42ec8f6a26b65ff183df1649..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/bin/train_model.py
+++ /dev/null
@@ -1,186 +0,0 @@
-"""Train CIFAR10 with PyTorch."""
-import argparse
-import os
-from typing import List
-
-import numpy as np
-import torch
-from torch import optim
-from torch.nn import CrossEntropyLoss, Module
-from torch.optim.lr_scheduler import ReduceLROnPlateau
-from tqdm import tqdm
-
-from models.torch import ResNet18
-from models.datasets import get_cifar10_train_dataloader, get_cifar10_test_dataloader
-from utils import device
-
-
-class RunningStats:
-    def __init__(self, criterion):
-        self.criterion = criterion
-        self.all_outputs = None
-        self.all_targets = np.zeros([0])
-        self.avg_loss, self.correct, self.total = 0, 0, 0
-        self.conf_mat = None
-        self.n_batches = 0
-
-    @property
-    def n_classes(self):
-        if self.all_outputs is None:
-            raise RuntimeError("Num of classes is unknown before seeing first input")
-        return self.all_outputs.shape[1]
-
-    def setup_for_first_output(self, outputs):
-        n_classes = outputs.shape[1]
-        self.all_outputs = np.zeros([0, n_classes])
-        self.conf_mat = np.zeros([n_classes, n_classes])
-
-    def add_output(self, outputs, targets):
-        if self.all_outputs is None:
-            self.setup_for_first_output(outputs)
-        loss = self.criterion(outputs, targets)
-        _, predicted = outputs.max(1)
-        self.avg_loss = (self.avg_loss * self.n_batches + loss.item()) / (self.n_batches + 1)
-        self.total += targets.size(0)
-        self.correct += predicted.eq(targets).sum().item()
-        for t, p in zip(targets, predicted):
-            self.conf_mat[int(t), p] += 1
-        self.n_batches += 1
-        outputs = outputs.clone().cpu().detach()
-        targets = targets.clone().cpu().detach()
-        self.all_outputs = np.vstack([self.all_outputs, outputs])
-        self.all_targets = np.hstack([self.all_targets, targets])
-        return loss
-
-    def classwise_outputs(self) -> List[np.ndarray]:
-        class_outputs = [np.zeros([0, self.n_classes]) for _ in range(self.n_classes)]
-        for output, label_class in zip(self.all_outputs, self.all_targets):
-            co = class_outputs[int(label_class)]
-            class_outputs[int(label_class)] = np.vstack([co, output])
-        return class_outputs
-
-    @property
-    def acc(self):
-        return 100. * self.correct / self.total
-
-    @property
-    def classwise_acc(self) -> List[float]:
-        return [self.conf_mat[i, i] / self.conf_mat[i].sum() for i in range(self.n_classes)]
-
-
-def test(net, testloader, criterion):
-    net.eval()
-    rs = RunningStats(criterion)
-    with torch.no_grad():
-        pbar = tqdm(enumerate(testloader), total=len(testloader))
-        for batch_idx, (inputs, targets) in pbar:
-            inputs, targets = inputs.to(device), targets.to(device)
-            outputs = net(inputs)
-            rs.add_output(outputs, targets)
-            pbar.set_postfix_str(
-                f"Loss: {rs.avg_loss:.3f} | Acc: {rs.acc:.3f}% ({rs.correct}/{rs.total})"
-            )
-    return rs
-
-
-def load_torch_checkpoint(net: Module, chpt_path: str):
-    print('==> Loading checkpoint..')
-    checkpoint = torch.load(chpt_path)
-    net.load_state_dict(checkpoint['net'])
-    start_epoch = checkpoint['epoch']
-    return start_epoch
-
-
-def get_optimizer(net, lr):
-    return optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
-
-
-class EarlyStopping:
-    """Early stops the training if validation loss doesn't improve after a given patience."""
-
-    def __init__(self, path, patience=7, delta=0):
-        """
-        Args:
-            patience (int): How long to wait after last time validation loss improved.
-                            Default: 7
-            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
-                            Default: 0
-            path (str): Path for the checkpoint to be saved to.
-                            Default: 'checkpoint.pt'
-        """
-        self.patience = patience
-        self.counter = 0
-        self.min_loss = None
-        self.delta = delta
-        self.path = path
-
-    def __call__(self, val_loss, model, epoch):
-        if self.min_loss is None or val_loss < self.min_loss - self.delta:
-            # Improved
-            self.min_loss = val_loss
-            self.save_checkpoint(model, epoch)
-            self.counter = 0
-        else:
-            self.counter += 1
-            if self.counter >= self.patience:
-                return True
-        return False
-
-    def save_checkpoint(self, model, epoch):
-        tqdm.write('Saving..')
-        state = {
-            'net': model.state_dict(),
-            'epoch': epoch,
-        }
-        if not os.path.isdir(os.path.dirname(self.path)):
-            os.makedirs(os.path.dirname(self.path))
-        torch.save(state, self.path)
-
-
-def train_one_epoch(net, trainloader, optimizer, criterion):
-    net.train()
-    rs = RunningStats(criterion)
-    pbar = tqdm(trainloader)
-    for inputs, targets in pbar:
-        optimizer.zero_grad()
-        inputs, targets = inputs.to(device), targets.to(device)
-        outputs = net(inputs)
-        loss = rs.add_output(outputs, targets)
-        loss.backward()
-        optimizer.step()
-        pbar.set_postfix_str(
-            f"Loss: {rs.avg_loss:.3f} | Acc: {rs.acc:.3f}% ({rs.correct}/{rs.total})"
-        )
-
-
-def train(net, checkpoint, output, lr):
-    start_epoch = load_torch_checkpoint(net, checkpoint) if checkpoint else 0
-    trainloader = get_cifar10_train_dataloader('./data', 128)
-    testloader = get_cifar10_test_dataloader('./data', 100)
-    criterion = CrossEntropyLoss()
-    optimizer = get_optimizer(net, lr)
-    es = EarlyStopping(output, patience=5)
-    reduce_lr = ReduceLROnPlateau(optimizer, factor=0.2, patience=3, verbose=True)
-    for epoch in range(start_epoch + 1, start_epoch + 200):
-        print('\nEpoch: %d' % epoch)
-        train_one_epoch(net, trainloader, optimizer, criterion)
-        rs = test(net, testloader, criterion)
-        if es(rs.avg_loss, net, epoch):
-            print(f"Early stopped at {epoch}")
-            break
-        reduce_lr.step(rs.avg_loss)
-
-
-def main():
-    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
-    parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
-    parser.add_argument('--resume', '-r', type=str, help='resume from checkpoint')
-    parser.add_argument(
-        '--output', '-o', type=str, required=True, help='path to save checkpoint to'
-    )
-    args = parser.parse_args()
-    train(ResNet18().to(device), args.resume, args.output, args.lr)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/exp.py b/hpvm/projects/pred_tuner/exp.py
deleted file mode 100644
index e7457d5b475d53f7a6c05fcea28f8b1cc4507c93..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/exp.py
+++ /dev/null
@@ -1,438 +0,0 @@
-import abc
-import json
-import os
-from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Tuple, Type
-
-from torch.nn import Linear, Module
-from torch.utils.data import DataLoader
-
-from models.domains import QoS, qos_stats
-from models.hpvm import HPVMConvBundle
-from models import BaselineInfo
-from toolkit import LinearEstimator, NetApproxSelector
-from utils import config_pylogger, get_knob_config_file, get_tensorrt_dir, device
-from utils.config import Config, dump_rt_format_to, load_configs_from_dir, plot_configs
-
-batch_id = "batch405"
-is_dev_time = False
-ConfigT = Dict[int, int]
-msg_logger = config_pylogger(output_dir=Path('tuner_results/logs'), verbose=True)
-
-
-def get_layer_desc(path: Path) -> List[List[str]]:
-    with path.open() as f:
-        return [x.split() for x in f]
-
-
-def get_layer_desc_in_pytorch(layer_desc: List[List[str]]) -> \
-        Tuple[List[Optional[Module]], Dict[int, int]]:
-    desc = []
-    remapping = {}
-    for ext_i, vals in enumerate(layer_desc):
-        if vals and 'conv' == vals[0]:
-            remapping[ext_i] = len(remapping)
-            desc.append(HPVMConvBundle)
-        elif vals and 'dense' == vals[0]:
-            remapping[ext_i] = len(remapping)
-            desc.append(Linear)
-        else:
-            desc.append(None)
-    return desc, remapping
-
-
-def read_cost_file(layer_desc: List[List[str]], path: Path) -> List[float]:
-    with path.open() as f:
-        raw_costs = [float(x.strip()) for x in f]
-    costs = []
-    raw_cost_it = 0
-    for layer in layer_desc:
-        if 'conv' in layer or 'dense' in layer:
-            costs.append(raw_costs[raw_cost_it])
-            raw_cost_it += 1
-        else:
-            costs.append(0)
-    assert len(layer_desc) == len(costs)
-    return costs
-
-
-def read_global_knobs_speedup(path: Path):
-    knobs_speedup = {}
-    with path.open() as f:
-        for x in f:
-            toks = x.split("\t")
-            ID = int(toks[0].split(",")[1])
-            speedup = float(toks[2])
-            knobs_speedup[ID] = speedup
-    return knobs_speedup
-
-
-class Benchmark:
-    def __init__(self, json_data: dict):
-        self.json_data = json_data
-        self.model_name: str = self.model_name  # RHS from json data
-        # Use baseline configuration as seed to aid the autotuner
-        # TODO: put this as a field in benchmarks.json
-        self.use_seed = self.model_name == 'resnet50_imagenet_hpvm'
-        tensorrt = get_tensorrt_dir()
-        self.cost_file = tensorrt / self.cost_file
-        self.layer_file = tensorrt / self.layer_file
-        self.knobs_config_file = tensorrt / "autotuner/data/global_knobs.txt"
-        self.batch_dir = tensorrt / self.base_dir / "loss_123" / batch_id
-        self.result_dir = self.batch_dir / ("dev_tuner" if is_dev_time else "inst_tuner")
-
-        self.layer_desc = get_layer_desc(self.layer_file)
-        self.pytorch_layer_desc, self.layer_remap = get_layer_desc_in_pytorch(self.layer_desc)
-        msg_logger.debug(f"HPVM order to neutral order remapping, model {self.model_name}: {self.layer_remap}")
-        self.layer_costs = read_cost_file(self.layer_desc, self.cost_file)
-        self.knobs_speedup = read_global_knobs_speedup(get_knob_config_file())
-
-    def set_batch_id(self, batch_id_: str = batch_id, is_dev_time_: bool = is_dev_time):
-        tensorrt = get_tensorrt_dir()
-        self.batch_dir = tensorrt / self.base_dir / "loss_123" / batch_id_
-        self.result_dir = self.batch_dir / ("dev_tuner" if is_dev_time_ else "inst_tuner")
-
-    def __getattr__(self, item: str):
-        return self.json_data[item]
-
-    def translate_config(self, autotuner: ConfigT) -> ConfigT:
-        ret = {}
-        for x, v in autotuner.items():
-            if x not in self.layer_remap:
-                assert v == 11
-                continue
-            ret[self.layer_remap[x]] = v
-        return ret
-
-    def get_baseline_config(self, is_fp16: bool) -> ConfigT:
-        conf = {}
-        for layer_id, layer in enumerate(self.pytorch_layer_desc):
-            knob = 12 if layer is not None and is_fp16 else 11
-            conf[layer_id] = knob
-        return conf
-
-    def pattern_match_layer_knobs(self, module_to_knobs: Dict[Module, List[int]]) -> Dict[int, List[int]]:
-        conv_knobs = [knobs for m, knobs in module_to_knobs.items() if isinstance(m, HPVMConvBundle)]
-        linear_knobs = [knobs for m, knobs in module_to_knobs.items() if isinstance(m, Linear)]
-        assert len(conv_knobs) + len(linear_knobs) == len(module_to_knobs)
-        conv_knobs_idx, linear_knobs_idx = 0, 0
-        ret = {}
-        for layer_id, module_ty in enumerate(self.pytorch_layer_desc):
-            if module_ty is HPVMConvBundle:
-                # PROMISE does not apply to first layer of LeNet.
-                if self.model_name == "lenet_hpvm" and layer_id == 0:
-                    this_conv_knobs = [x for x in conv_knobs[conv_knobs_idx] if x >= 11]
-                else:
-                    this_conv_knobs = conv_knobs[conv_knobs_idx]
-                ret[layer_id] = this_conv_knobs + [11]
-                conv_knobs_idx += 1
-            elif module_ty is Linear:
-                ret[layer_id] = linear_knobs[linear_knobs_idx] + [11]
-                linear_knobs_idx += 1
-            else:
-                ret[layer_id] = [11]
-        assert conv_knobs_idx == len(conv_knobs)
-        return ret
-
-    def compute_config_cost(self, cfg: ConfigT) -> Tuple[float, float]:
-        orig_cost = 0.0
-        total_cost = 0.0
-        for layer, knob in cfg.items():
-            op_cost = self.layer_costs[layer]
-            speedup = self.knobs_speedup[knob]
-            total_cost += (op_cost * 1.0 / speedup * 1.0)
-            orig_cost += op_cost
-        speedup = (orig_cost * 1.0) / (total_cost * 1.0)
-        return total_cost, speedup
-
-    def get_n_layers(self) -> int:
-        return len(self.layer_desc)
-
-
-class ConfigMeasurer(BaselineInfo):
-    def __init__(
-            self, net: Module, val_loader: DataLoader, test_loader: DataLoader,
-            non_tensor_output: bool, qos_class: Type[QoS],
-            nas: NetApproxSelector, bench: Benchmark
-    ):
-        super().__init__(net, val_loader, test_loader, non_tensor_output, qos_class)
-        self.nas = nas
-        self.bench_translate_config = bench.translate_config
-        self.layer_remap = {k: v for k, v in enumerate(list(self.nas.net_approxes.keys()))}
-        msg_logger.debug(f"Neutral order to module scanning order remapping: {self.layer_remap}")
-        self.bench = bench
-        msg_logger.info(
-            f"Model {bench.model_name} baseline accuracy = "
-            f"{self.val_qos} ({self.test_qos} test)"
-        )
-
-    def translate_config(self, autotuner_cfg: ConfigT):
-        autotuner_cfg = self.bench_translate_config(autotuner_cfg)
-        # Translate layer index from autotuner format (0, 1, 2...)
-        # to proxy format (actual layer index)
-        cfg = {self.layer_remap[k]: v for k, v in autotuner_cfg.items() if v != 11}
-        return cfg
-
-    @classmethod
-    def init_from_bench(cls, bench: Benchmark) -> 'ConfigMeasurer':
-        bi = BaselineInfo.init_by_name(bench.model_name, device)
-        nas = NetApproxSelector(bi.baseline_net, dev_time_only=is_dev_time, ignore_fp32=not is_dev_time)
-        return cls(
-            bi.baseline_net, bi.val_loader, bi.test_loader,
-            bi.non_tensor_output, bi.qos_class, nas, bench
-        )
-
-    def proxy_estimate(self, cfg: ConfigT, proxy: LinearEstimator) -> Tuple[QoS, QoS]:
-        cfg = self.translate_config(cfg)
-        mean_acc, confident_acc = proxy.estimate(cfg)
-        return mean_acc, confident_acc
-
-    def actual_measure(
-            self, cfg: ConfigT, n_runs: int, is_test_set: bool, threshold: QoS = None
-    ) -> Tuple[QoS, Optional[float]]:
-        cfg = self.translate_config(cfg)
-        approx = self.nas.apply_approx_by_config(cfg).module
-        dataloader = self.test_loader if is_test_set else self.val_loader
-        from tqdm import trange
-        qoses = []
-        for _ in trange(n_runs, leave=None):
-            qoses.append(self.get_qos(approx, dataloader))
-        mean, _, confidence = qos_stats(qoses, threshold=threshold)
-        return mean, confidence
-
-    def get_knobs(self):
-        # Delaying computing knobs because nas can be modified externally (knobs filtered)
-        ext_layer_to_knobs = self.bench.pattern_match_layer_knobs(self.nas.get_layer_approxes())
-        msg_logger.debug(f"Getting knobs:")
-        for layer, knobs in ext_layer_to_knobs.items():
-            msg_logger.debug(f"  {layer}: {knobs}")
-        return ext_layer_to_knobs
-
-
-class PersistentState(abc.ABC):
-    def __init__(self):
-        self._substates: Dict[str, PersistentState] = {}
-
-    def __setattr__(self, name, value):
-        if isinstance(value, PersistentState):
-            self._substates[name] = value
-        super().__setattr__(name, value)
-
-    def dump(self):
-        self._dump_self()
-        for v in self._substates.values():
-            v.dump()
-
-    def load(self):
-        if self.filled():
-            return
-        try:
-            self._load_self()
-        except (ValueError, RuntimeError, FileNotFoundError) as e:
-            msg_logger.info(f"Exception {e} when loading state")
-        for k, v in self._substates.items():
-            v.load()
-
-    def filled(self):
-        return self._self_is_initialized() and all((v.filled() for v in self._substates.values()))
-
-    @abc.abstractmethod
-    def _dump_self(self):
-        pass
-
-    @abc.abstractmethod
-    def _load_self(self):
-        pass
-
-    @abc.abstractmethod
-    def _self_is_initialized(self) -> bool:
-        pass
-
-
-class PersistentConfigs(PersistentState):
-    def __init__(self, bench: Benchmark, prefix: str, baseline_acc: QoS, rt_cpu: bool, rt_gpu: bool):
-        super().__init__()
-        self._data = []
-        self._filled = False
-        self.bench = bench
-        self.prefix = prefix
-        self.baseline_qos = baseline_acc
-        self.rt_cpu_path = self.bench.result_dir / f"{prefix}_cpu.txt" if rt_cpu else None
-        self.rt_gpu_path = self.bench.result_dir / f"{prefix}_fp16.txt" if rt_gpu else None
-
-    @property
-    def config_folder(self) -> Path:
-        return self.bench.result_dir / self.prefix
-
-    @property
-    def configs(self) -> List[Config]:
-        return self._data
-
-    def _load_self(self):
-        # Try reading autotuner configs and hpvm-rt configs
-        self._data = load_configs_from_dir(self.config_folder, self.baseline_qos)
-        # If hpvm-rt is not present, dump it.
-        # TODO: check rt format integrity
-        if (
-                (self.rt_cpu_path and not self.rt_cpu_path.is_file()) or
-                (self.rt_cpu_path and not self.rt_cpu_path.is_file())
-        ):
-            self.finalize_dump()
-        self._filled = True
-
-    def _dump_self(self):
-        for conf in self._data:
-            self._dump_one(conf)
-        self.finalize_dump()
-
-    def _self_is_initialized(self) -> bool:
-        return self._filled
-
-    def _dump_one(self, config: Config):
-        if not self.config_folder.is_dir():
-            os.mkdir(self.config_folder.as_posix())
-        config_path = self.config_folder / config.fname
-        with config_path.open('w') as f:
-            f.write(config.to_tuner_format())
-
-    def append(self, config: Config):
-        self._data.append(config)
-        self._dump_one(config)
-
-    def extend(self, configs: Iterable[Config]):
-        confs = []
-        for conf in configs:
-            self._dump_one(conf)
-            confs.append(conf)
-        self._data.extend(confs)
-
-    def finalize_dump(self, with_configs: Iterable[Config] = None):
-        if with_configs is not None:
-            self.extend(with_configs)
-        self._filled = True
-        dump_rt_format_to(
-            self.bench.layer_desc, self._data, self.baseline_qos,
-            self.rt_cpu_path, self.rt_gpu_path
-        )
-
-
-class TuningTime(PersistentState):
-    def __init__(self, path: Path):
-        super().__init__()
-        self.timers = {}
-        self.path = path
-
-    def _load_self(self):
-        import re
-        with self.path.open() as f:
-            lines = f.readlines()
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-            match = re.match(r'Timer ([^=]+) = ([0-9.]+) hours', line)
-            if not match:
-                raise RuntimeError(f"File {self.path} malformed")
-            self.timers[match.group(1)] = float(match.group(2))
-
-    def _dump_self(self):
-        for k, v in self.timers.items():
-            self._dump_one(k, v)
-
-    def _self_is_initialized(self) -> bool:
-        return bool(self.timers)
-
-    def _dump_one(self, key: str, value: float):
-        time_hrs = value / (60 * 60)
-        msg_logger.info(f"Timer {key} = {time_hrs:.3f} hours")
-        with self.path.open('a') as f:
-            f.write(f"Timer {key} = {time_hrs} hours\n")
-
-    def add_timer(self, key: str, value: float):
-        self.timers[key] = value
-        self._dump_one(key, value)
-
-
-class AccPair(PersistentState):
-    def __init__(self, path: Path, qos_class: Type[QoS]):
-        super().__init__()
-        self.path = path
-        self.qos_class = qos_class
-        self._data = None
-
-    @property
-    def accs(self) -> Tuple[QoS, QoS]:
-        if self._data is None:
-            raise AttributeError("Accuracy not init'ed yet")
-        return self._data
-
-    @accs.setter
-    def accs(self, value: Tuple[QoS, QoS]):
-        self._data = value
-        self._dump_self()
-
-    def _load_self(self):
-        with self.path.open() as f:
-            acc_val, acc_test = [self.qos_class.parse(s) for s in f.read().split('\n')]
-        self._data = acc_val, acc_test
-
-    def _dump_self(self):
-        with self.path.open('w') as f:
-            f.write(f"{self._data[0]}\n{self._data[1]}")
-
-    def _self_is_initialized(self) -> bool:
-        return self._data is not None
-
-
-class ExpState(PersistentState):
-    def __init__(self, bench: Benchmark, qos_class: Type[QoS], accs: Tuple[QoS, QoS] = None):
-        super().__init__()
-        self.bench = bench
-        self.baseline_accs = AccPair(bench.result_dir / 'baseline_acc.txt', qos_class)
-        self.baseline_accs.load()
-        if not self.baseline_accs.filled():
-            if accs is None:
-                raise ValueError("Provide model baseline accuracy")
-            self.baseline_accs.accs = accs
-        acc_val, acc_test = self.baseline_accs.accs
-        self.all_configs = PersistentConfigs(bench, 'all', acc_val, False, False)
-        self.filtered_configs = PersistentConfigs(bench, 'filtered', acc_val, False, False)
-        self.validated_configs = PersistentConfigs(bench, 'validated', acc_val, False, False)
-        self.tested_configs = PersistentConfigs(bench, 'tested', acc_test, False, False)
-        self.valid_configs = PersistentConfigs(bench, 'valid', acc_val, True, True)
-        self.test_configs = PersistentConfigs(bench, 'test', acc_test, True, True)
-        self.timers = TuningTime(bench.result_dir / 'tuning_time.txt')
-        super().load()
-
-    def _load_self(self):
-        pass
-
-    def _dump_self(self):
-        pass
-
-    def _self_is_initialized(self) -> bool:
-        return True
-
-    def finalize_plot(self):
-        if not self.filled():
-            raise RuntimeError("Cannot finalize before data slots are all filled")
-        plot_configs(
-            self.bench.result_dir / "all_plot.png",
-            all=self.all_configs.configs
-        )
-        plot_configs(
-            self.bench.result_dir / "validated_tested_plot.png",
-            filtered=self.filtered_configs.configs,
-            validated=self.validated_configs.configs,
-            tested=self.tested_configs.configs
-        )
-        plot_configs(
-            self.bench.result_dir / "filtered_plot.png",
-            valid=self.valid_configs.configs,
-            test=self.test_configs.configs
-        )
-
-
-with (Path(__file__).parent / 'utils/benchmarks.json').open() as f_:
-    benchmark_data = json.load(f_)
-bench_tuner_data = {k: Benchmark(v) for k, v in benchmark_data.items()}
diff --git a/hpvm/projects/pred_tuner/model_params b/hpvm/projects/pred_tuner/model_params
deleted file mode 120000
index 90aaa403fdbec5110e1c02431a7df3f31fed0dbf..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/model_params
+++ /dev/null
@@ -1 +0,0 @@
-../hpvm-tensor-rt/model_params
\ No newline at end of file
diff --git a/hpvm/projects/pred_tuner/models/__init__.py b/hpvm/projects/pred_tuner/models/__init__.py
deleted file mode 100644
index 192f4b5bea17503603ba8f1208a22cea78af2897..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .networks import networks
-from .inference import get_all_output, move_to_device_recursively, BaselineInfo
-from .domains import QoS
diff --git a/hpvm/projects/pred_tuner/models/datasets/__init__.py b/hpvm/projects/pred_tuner/models/datasets/__init__.py
deleted file mode 100644
index 1a1e35fcea0e29482abbace082f825aac6c8d608..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/datasets/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .hpvm import CIFAR, CIFARImage, HPVMDataset, ImageNet, MNIST
-from .torch import get_cifar10_test_dataset, get_cifar10_test_dataloader, get_cifar10_train_dataloader
diff --git a/hpvm/projects/pred_tuner/models/datasets/hpvm.py b/hpvm/projects/pred_tuner/models/datasets/hpvm.py
deleted file mode 100644
index aa871d89d85493a0c8ad1237ed9e5e8b0b34ac49..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/datasets/hpvm.py
+++ /dev/null
@@ -1,163 +0,0 @@
-import logging
-from pathlib import Path
-from typing import Iterator, List, Tuple, TypeVar
-
-import numpy as np
-import torch
-from torch.utils.data.dataset import IterableDataset
-
-from models.hpvm import read_tensor_from_file
-
-RetT = Tuple[torch.Tensor, torch.Tensor]
-T = TypeVar('T', bound='HPVMDataset')
-msg_logger = logging.getLogger()
-
-
-class HPVMDataset(IterableDataset):
-    def __init__(self, inputs: torch.Tensor, outputs: torch.Tensor):
-        self.inputs, self.outputs = inputs, outputs
-
-    @classmethod
-    def from_file(cls, *args, **kwargs):
-        pass
-
-    @property
-    def sample_input(self):
-        inputs, outputs = next(iter(self))
-        return inputs
-
-    def __len__(self) -> int:
-        return len(self.inputs)
-
-    def __getitem__(self, idx) -> RetT:
-        if idx >= len(self):
-            raise IndexError("Dataset index out of range")
-        return self.inputs[idx], self.outputs[idx]
-
-    def __iter__(self) -> Iterator[RetT]:
-        for i in range(len(self)):
-            yield self[i]
-
-
-class HPVMDNNDataset(HPVMDataset):
-    @classmethod
-    def _from_file(
-            cls, input_file: Path, labels_file: Path, is_uint8_label: bool,
-            count: int, offset: int, *item_shapes: int
-    ):
-        # NOTE: assuming (N, *) ordering of inputs (such as NCHW, NHWC)
-        channel_size = np.prod(np.array(item_shapes))
-        if count != -1:
-            count *= channel_size
-        offset *= channel_size
-        inputs = read_tensor_from_file(
-            input_file, -1, *item_shapes, count=count, offset=offset,
-            use_progress_bar=True
-        )
-        label_read_ty = np.int8 if is_uint8_label else np.int32
-        labels = read_tensor_from_file(
-            labels_file, -1, read_ty=label_read_ty, cast_ty=np.long,
-            count=count, offset=offset
-        )
-        if inputs.size(0) != labels.size(0):
-            raise ValueError("Input and output have different number of data points")
-        msg_logger.info(f"{inputs.shape[0]} entries loaded from dataset.")
-        return cls(inputs, labels)
-
-    @classmethod
-    def from_default_file(cls, prefix: str):
-        prefix = Path(prefix)
-        return cls.from_file(
-            prefix / 'input.bin', prefix / 'labels.bin'
-        )
-
-
-class MNIST(HPVMDNNDataset):
-    @classmethod
-    def from_file(
-            cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0
-    ):
-        return cls._from_file(
-            input_file, labels_file, True, count, offset, 1, 28, 28
-        )
-
-
-class CIFAR(HPVMDNNDataset):
-    @classmethod
-    def from_file(
-            cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0
-    ):
-        return cls._from_file(
-            input_file, labels_file, True, count, offset, 3, 32, 32
-        )
-
-
-class ImageNet(HPVMDNNDataset):
-    @classmethod
-    def from_file(
-            cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0
-    ):
-        return cls._from_file(
-            input_file, labels_file, False, count, offset, 3, 224, 224
-        )
-
-
-class HPVMImageDataset(HPVMDataset):
-    @classmethod
-    def _from_file(
-            cls, input_file: Path, output_file: Path,
-            count: int, offset: int, input_shape: List[int], output_shape: List[int]
-    ):
-        # NOTE: assuming (N, *) ordering of inputs (such as NCHW, NHWC)
-        channel_size = np.prod(np.array(input_shape))
-        if count != -1:
-            count *= channel_size
-        offset *= channel_size
-        inputs = read_tensor_from_file(
-            input_file, -1, *input_shape, count=count, offset=offset,
-            use_progress_bar=True
-        )
-        outputs = read_tensor_from_file(
-            output_file, -1, *output_shape, count=count, offset=offset,
-            use_progress_bar=True
-        )
-        print(f"(input={inputs.shape[0]}, output={outputs.shape[0]}) entries loaded from dataset.")
-        return cls(inputs, outputs)
-
-    @classmethod
-    def from_default_file(cls, prefix: str):
-        prefix = Path(prefix)
-        return cls.from_file(
-            prefix / 'input.bin', prefix / 'canny_input.bin',
-            prefix / 'labels.bin', prefix / 'output.bin'
-        )
-
-
-class CIFARImage(HPVMImageDataset):
-    def __init__(
-            self, inputs: torch.Tensor, outputs: torch.Tensor, cifar: CIFAR
-    ):
-        super().__init__(inputs, outputs)
-        self.cifar = cifar
-
-    @classmethod
-    def from_file(
-            cls, dnn_input_file: Path, image_input_file: Path,
-            labels_file: Path, output_file: Path,
-            batch_size: int = 100, count: int = -1, offset: int = 0
-    ):
-        classifier = CIFAR.from_file(dnn_input_file, labels_file)
-        dataset = HPVMImageDataset._from_file(
-            image_input_file, output_file, count, offset,
-            [3, 128, 128], [1, 128, 128]
-        )
-        return cls(dataset.inputs, dataset.outputs, classifier)
-
-    def sample(self: 'CIFARImage', ratio: float) -> 'CIFARImage':
-        raise NotImplementedError()
-
-    def __getitem__(self, idx):
-        if idx >= len(self):
-            raise IndexError("Dataset index out of range")
-        cifar_in, cifar_out = self.cifar[idx]
-        return (cifar_in, self.inputs[idx]), (cifar_out, self.outputs[idx])
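For orientation, here is a minimal, hypothetical sketch of how these dataset wrappers were consumed; the imports assume the removed `models.datasets` package, and the directory and batch size mirror the `alexnet_hpvm` entry in `networks.py` further below.

```python
from torch.utils.data import DataLoader

from models.datasets import CIFAR

# 'model_params/alexnet_cifar10' and batch size 2000 are taken from networks.py;
# the directory is expected to contain input.bin and labels.bin.
dataset = CIFAR.from_default_file("model_params/alexnet_cifar10")
loader = DataLoader(dataset, batch_size=2000)
inputs, labels = next(iter(loader))  # inputs: (2000, 3, 32, 32) float32, labels: (2000,) int64
```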
diff --git a/hpvm/projects/pred_tuner/models/datasets/torch.py b/hpvm/projects/pred_tuner/models/datasets/torch.py
deleted file mode 100644
index 1b07bd17c744df733158dc5d84da3f1934e7cd3c..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/datasets/torch.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import logging
-
-from torch.utils.data import DataLoader
-from torchvision.datasets import CIFAR10
-from torchvision.transforms import transforms
-
-msg_logger = logging.getLogger()
-
-
-def get_cifar10_train_dataloader(root: str, batchsize: int) -> DataLoader:
-    transform_train = transforms.Compose([
-        transforms.RandomCrop(32, padding=4),
-        transforms.RandomHorizontalFlip(),
-        transforms.ToTensor(),
-        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
-    ])
-    dl = DataLoader(
-        CIFAR10(root=root, train=True, download=True, transform=transform_train),
-        batch_size=batchsize, shuffle=True
-    )
-    msg_logger.info(f"{len(dl)} entries loaded from training dataset.")
-    return dl
-
-
-def get_cifar10_test_dataset(root: str) -> CIFAR10:
-    transform_test = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
-    ])
-    dataset = CIFAR10(root=root, train=False, download=True, transform=transform_test)
-    msg_logger.info(f"{len(dataset)} entries loaded from training dataset.")
-    return dataset
-
-
-def get_cifar10_test_dataloader(root: str, batchsize: int) -> DataLoader:
-    dl = DataLoader(get_cifar10_test_dataset(root), batch_size=batchsize)
-    return dl
diff --git a/hpvm/projects/pred_tuner/models/domains/__init__.py b/hpvm/projects/pred_tuner/models/domains/__init__.py
deleted file mode 100644
index abe6c13a378fe61f9dee7b1c7a60950c1a58226a..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/domains/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .qoses import QoS, Accuracy, qos_stats
diff --git a/hpvm/projects/pred_tuner/models/domains/qoses.py b/hpvm/projects/pred_tuner/models/domains/qoses.py
deleted file mode 100644
index 0a1e7f2eb1050f5adcc4e25d7b65100e3141ae8a..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/domains/qoses.py
+++ /dev/null
@@ -1,317 +0,0 @@
-import abc
-from typing import Iterable, List, Optional, Tuple
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader
-
-
-class QoS(abc.ABC):
-    @abc.abstractmethod
-    def __sub__(self, other: 'QoS') -> 'QoS':
-        pass
-
-    @abc.abstractmethod
-    def __add__(self, other: 'QoS') -> 'QoS':
-        pass
-
-    @abc.abstractmethod
-    def __truediv__(self, other: float) -> 'QoS':
-        pass
-
-    @abc.abstractmethod
-    def __lt__(self, other: 'QoS') -> bool:
-        pass
-
-    @abc.abstractmethod
-    def __eq__(self, other: 'QoS') -> bool:
-        pass
-
-    def __gt__(self, other: 'QoS') -> bool:
-        return not self <= other
-
-    def __le__(self, other: 'QoS') -> bool:
-        return self < other or self == other
-
-    def __ge__(self, other: 'QoS') -> bool:
-        return not self < other
-
-    @abc.abstractmethod
-    def __hash__(self):
-        pass
-
-    @abc.abstractmethod
-    def __repr__(self) -> str:
-        pass
-
-    @abc.abstractmethod
-    def to_scalar(self, relative_to=None) -> float:
-        pass
-
-    @abc.abstractmethod
-    def numpy(self) -> np.ndarray:
-        pass
-
-    @abc.abstractmethod
-    def null(self) -> 'QoS':
-        pass
-
-    @staticmethod
-    @abc.abstractmethod
-    def parse(string: str) -> 'QoS':
-        pass
-
-    @abc.abstractmethod
-    def min_positive_loss(self) -> 'QoS':
-        pass
-
-    @staticmethod
-    @abc.abstractmethod
-    def suggested_tuner_thresholds(baseline: 'QoS') -> List['QoS']:
-        pass
-
-    @staticmethod
-    @abc.abstractmethod
-    def suggested_val_threshold(baseline: 'QoS') -> 'QoS':
-        pass
-
-    @staticmethod
-    @abc.abstractmethod
-    def suggested_test_threshold(baseline: 'QoS') -> 'QoS':
-        pass
-
-    @staticmethod
-    @abc.abstractmethod
-    def from_output(output, ground_truth) -> 'QoS':
-        pass
-
-    @classmethod
-    def combine_qoses(cls, qoses: Iterable['QoS']) -> 'QoS':
-        qoses = np.array(qoses)
-        return qoses.mean()
-
-    @classmethod
-    def from_all_output(cls, outputs: List, dataloader: DataLoader) -> 'QoS':
-        if not outputs:
-            raise ValueError("Empty output has no QoS value")  # Probably can result cls.null()
-        qoses = []
-        for (_, gt_output), output in zip(dataloader, outputs):
-            qoses.append(cls.from_output(output, gt_output))
-        return cls.combine_qoses(qoses)
-
-
-class ScalarQoS(QoS, abc.ABC):
-    def __init__(self, value: float):
-        self.value = value
-
-    def __sub__(self, other: 'ScalarQoS') -> 'ScalarQoS':
-        return self.__class__(self.value - other.value)
-
-    def __add__(self, other: 'ScalarQoS') -> 'ScalarQoS':
-        return self.__class__(self.value + other.value)
-
-    def __truediv__(self, other: float):
-        return self.__class__(self.value / other)
-
-    def __lt__(self, other: 'ScalarQoS') -> bool:
-        return self.value < other.value
-
-    def __eq__(self, other: 'ScalarQoS') -> bool:
-        return self.value == other.value
-
-    def __hash__(self):
-        return hash(self.value)
-
-    def __repr__(self) -> str:
-        return repr(self.value)
-
-    def null(self) -> 'ScalarQoS':
-        return self.__class__(0.0)
-
-    def to_scalar(self, relative_to=None) -> float:
-        return self.value
-
-    def numpy(self) -> np.ndarray:
-        return np.array([self.value])
-
-    @classmethod
-    def parse(cls, string: str) -> 'ScalarQoS':
-        return cls(float(string))
-
-
-class Accuracy(ScalarQoS):
-    def __init__(self, accuracy: float):
-        super().__init__(accuracy)
-
-    def min_positive_loss(self) -> 'Accuracy':
-        return Accuracy(0.05) if self.value < 0 else self
-
-    @staticmethod
-    def suggested_tuner_thresholds(baseline: 'Accuracy') -> List['Accuracy']:
-        return [baseline - Accuracy(0.8), baseline - Accuracy(1.5), baseline - Accuracy(2.1)]
-
-    @staticmethod
-    def suggested_val_threshold(baseline: 'Accuracy') -> 'Accuracy':
-        return baseline - Accuracy(2.1)
-
-    @staticmethod
-    def suggested_test_threshold(baseline: 'Accuracy') -> 'Accuracy':
-        return baseline - Accuracy(3.0)
-
-    @staticmethod
-    def from_output(output: torch.Tensor, ground_truth: torch.Tensor) -> 'Accuracy':
-        ground_truth = ground_truth.to(output.device)
-        correct = output.argmax(dim=1).eq(ground_truth).sum().item()
-        acc = correct / ground_truth.shape[0]
-        return Accuracy(acc * 100)
-
-
-class PSNR(ScalarQoS):
-    artificial_max = 100
-
-    def __init__(self, psnr: float):
-        super().__init__(psnr)
-
-    def min_positive_loss(self) -> 'PSNR':
-        return PSNR(1) if self.value < 0 else self
-
-    @staticmethod
-    def suggested_tuner_thresholds(baseline: 'PSNR') -> List['PSNR']:
-        return [PSNR(30), PSNR(25), PSNR(20)]
-
-    @staticmethod
-    def suggested_val_threshold(baseline: 'PSNR') -> 'PSNR':
-        return PSNR(20)
-
-    @staticmethod
-    def suggested_test_threshold(baseline: 'PSNR') -> 'PSNR':
-        return PSNR(20)
-
-    @staticmethod
-    def from_output(output: torch.Tensor, ground_truth: torch.Tensor) -> 'PSNR':
-        ground_truth = ground_truth.to(output.device)
-        if ground_truth.shape[0] != 0:
-            max_i = ground_truth.max()
-            mse = torch.sum((output - ground_truth) ** 2) / output.nelement()
-            psnr = (20 * torch.log10(max_i) - 10 * torch.log10(mse)).item()
-        else:
-            psnr = PSNR.artificial_max
-        return PSNR(psnr)
-
-
-class MultiQoS(QoS, abc.ABC):
-    def __init__(self, *qoses: ScalarQoS):
-        self.qoses = qoses
-
-    def __sub__(self, other: 'MultiQoS') -> 'MultiQoS':
-        assert type(self) == type(other)
-        return self.__class__(*(x - y for x, y in zip(self.qoses, other.qoses)))
-
-    def __add__(self, other: 'MultiQoS') -> 'MultiQoS':
-        assert type(self) == type(other)
-        return self.__class__(*(x + y for x, y in zip(self.qoses, other.qoses)))
-
-    def __truediv__(self, other: int):
-        return self.__class__(*(x / other for x in self.qoses))
-
-    def __lt__(self, other: 'MultiQoS') -> bool:
-        assert type(self) == type(other)
-        return all((x < y for x, y in zip(self.qoses, other.qoses)))
-
-    def __eq__(self, other: 'MultiQoS') -> bool:
-        assert type(self) == type(other)
-        return all((x == y for x, y in zip(self.qoses, other.qoses)))
-
-    def __hash__(self):
-        return hash(self.qoses)
-
-    def __repr__(self) -> str:
-        return ','.join(repr(q) for q in self.qoses)
-
-    def null(self) -> 'MultiQoS':
-        # Use self.__class__ so concrete subclasses (e.g. AccuracyPSNR) are constructed,
-        # not the abstract MultiQoS itself.
-        return self.__class__(*(q.null() for q in self.qoses))
-
-    def numpy(self) -> np.ndarray:
-        return np.array([q.to_scalar() for q in self.qoses])
-
-    def min_positive_loss(self) -> 'MultiQoS':
-        return self.__class__(*(q.min_positive_loss() for q in self.qoses))
-
-
-PairT = Tuple[torch.Tensor, torch.Tensor]
-TripleT = Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
-
-
-class AccuracyPSNR(MultiQoS):
-    def __init__(self, acc: Accuracy, psnr: PSNR):
-        super().__init__(acc, psnr)
-
-    def to_scalar(self, relative_to: 'AccuracyPSNR' = None) -> float:
-        acc, psnr = self.qoses
-        if relative_to is not None:
-            thres_acc, thres_psnr = relative_to.qoses
-            punishment = (-1 if acc < thres_acc else 0) + (-1 if psnr < thres_psnr else 0)
-        else:
-            punishment = 0
-        max_psnr = PSNR.artificial_max
-        normed_psnr = min(psnr.value, max_psnr) / max_psnr  # [0, 1], higher better
-        acc = acc.value / 100  # [0, 1], higher better
-        combined = (acc + normed_psnr) / 2  # [0, 1], higher better
-        assert 0 <= combined <= 1
-        return combined + punishment
-
-    @staticmethod
-    def parse(string: str) -> 'AccuracyPSNR':
-        acc, psnr = string.split(',')
-        return AccuracyPSNR(Accuracy.parse(acc), PSNR.parse(psnr))
-
-    # noinspection PyTypeChecker
-    @staticmethod
-    def suggested_tuner_thresholds(baseline: 'AccuracyPSNR') -> List['AccuracyPSNR']:
-        ret = []
-        for acc in Accuracy.suggested_tuner_thresholds(baseline.qoses[0]):
-            for psnr in PSNR.suggested_tuner_thresholds(baseline.qoses[1]):
-                ret.append(AccuracyPSNR(acc, psnr))
-        return ret
-
-    # noinspection PyTypeChecker
-    @staticmethod
-    def suggested_val_threshold(baseline: 'AccuracyPSNR') -> 'AccuracyPSNR':
-        return AccuracyPSNR(
-            Accuracy.suggested_val_threshold(baseline.qoses[0]),
-            PSNR.suggested_val_threshold(baseline.qoses[1])
-        )
-
-    # noinspection PyTypeChecker
-    @staticmethod
-    def suggested_test_threshold(baseline: 'AccuracyPSNR') -> 'AccuracyPSNR':
-        return AccuracyPSNR(
-            Accuracy.suggested_test_threshold(baseline.qoses[0]),
-            PSNR.suggested_test_threshold(baseline.qoses[1])
-        )
-
-    @staticmethod
-    def from_output(output: TripleT, ground_truth: PairT) -> 'AccuracyPSNR':
-        gt_labels, gt_images = ground_truth
-        labels, image_selection, images = output
-        gt_labels = gt_labels.to(labels.device)
-        gt_images = gt_images.to(images.device)
-        acc = Accuracy.from_output(labels, gt_labels)
-        gt_images = gt_images[image_selection]
-        psnr = PSNR.from_output(images, gt_images)
-        return AccuracyPSNR(acc, psnr)
-
-
-def qos_stats(qoses: List[QoS], confidence: float = None, threshold: QoS = None) -> \
-        Tuple[QoS, Optional[QoS], Optional[float]]:
-    qoses = np.array(qoses)
-    n_runs = len(qoses)
-    confidence_at_thres = np.count_nonzero(qoses > threshold) / n_runs if threshold else None
-    if confidence is None:
-        qos_at_confidence = None
-    else:
-        index = int((1 - confidence) * n_runs)
-        # Sort so that indexing yields the (1 - confidence) quantile; index into a plain
-        # Python sequence, otherwise it's np.float64 and causes trouble with opentuner
-        qos_at_confidence = sorted(qoses)[index]
-    mean_acc = qoses.mean()
-    return mean_acc, qos_at_confidence, confidence_at_thres
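As a quick illustration (toy tensors, not data from the repository), this is how the QoS classes above compose; the import assumes the removed `models.domains` package.

```python
import torch

from models.domains import Accuracy

# Accuracy.from_output takes the argmax over dim=1 and reports the match rate as a percentage.
logits = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
labels = torch.tensor([1, 0, 0])
acc = Accuracy.from_output(logits, labels)                  # ~66.67 (2 of 3 correct)
mean = Accuracy.combine_qoses([acc, Accuracy(80.0)])        # element-wise mean via __add__/__truediv__
threshold = Accuracy.suggested_val_threshold(Accuracy(84))  # baseline minus 2.1
print(acc, mean, threshold)
```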
diff --git a/hpvm/projects/pred_tuner/models/hpvm/__init__.py b/hpvm/projects/pred_tuner/models/hpvm/__init__.py
deleted file mode 100644
index 337738c0bf41002f910acfb98b9e8073ebc10052..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from .alexnet import AlexNet, AlexNet2, AlexNetImageNet
-from .alexnet_canny import AlexNet2Canny
-from .layers import HPVMConvBundle, HPVMDNN, HPVMDefaultModule, read_tensor_from_file
-from .lenet import LeNet
-from .mobilenet import MobileNet
-from .resnet import ResNet18, ResNet50
-from .vgg16 import VGG16Cifar10, VGG16Cifar100, VGG16ImageNet
diff --git a/hpvm/projects/pred_tuner/models/hpvm/alexnet.py b/hpvm/projects/pred_tuner/models/hpvm/alexnet.py
deleted file mode 100644
index b7c9b6c3cae1e86ac699913b3f1d09af28c52705..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/alexnet.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from torch.nn import Linear, ReLU, Sequential, Tanh
-
-from .layers import HPVMConvBundle, HPVMDNN
-
-
-class AlexNet(HPVMDNN):
-    def __init__(self):
-        convs = Sequential(
-            HPVMConvBundle(3, 64, 11, Tanh, pool_size=2, padding=5),
-            HPVMConvBundle(64, 192, 5, Tanh, pool_size=2, padding=2),
-            HPVMConvBundle(192, 384, 3, Tanh, padding=1),
-            HPVMConvBundle(384, 256, 3, Tanh, padding=1),
-            HPVMConvBundle(256, 256, 3, Tanh, pool_size=2, padding=1)
-        )
-        linears = Sequential(Linear(4096, 10))
-        super().__init__(convs, linears)
-
-
-class AlexNet2(HPVMDNN):
-    def __init__(self):
-        convs = Sequential(
-            HPVMConvBundle(3, 32, 3, Tanh, padding=1),
-            HPVMConvBundle(32, 32, 3, Tanh, pool_size=2, padding=1),
-            HPVMConvBundle(32, 64, 3, Tanh, padding=1),
-            HPVMConvBundle(64, 64, 3, Tanh, pool_size=2, padding=1),
-            HPVMConvBundle(64, 128, 3, Tanh, padding=1),
-            HPVMConvBundle(128, 128, 3, Tanh, pool_size=2, padding=1)
-        )
-        linears = Sequential(Linear(2048, 10))
-        super().__init__(convs, linears)
-
-
-class AlexNetImageNet(HPVMDNN):
-    def __init__(self):
-        convs = Sequential(
-            HPVMConvBundle(3, 64, 11, ReLU, padding=2, stride=4, pool_size=3, pool_stride=2),
-            HPVMConvBundle(64, 192, 5, ReLU, padding=2, pool_size=3, pool_stride=2),
-            HPVMConvBundle(192, 384, 3, ReLU, padding=1),
-            HPVMConvBundle(384, 256, 3, ReLU, padding=1),
-            HPVMConvBundle(256, 256, 3, ReLU, padding=1, pool_size=3, pool_stride=2)
-        )
-        linears = Sequential(
-            Linear(9216, 4096),
-            ReLU(),
-            Linear(4096, 4096),
-            ReLU(),
-            Linear(4096, 1000),
-        )
-        super().__init__(convs, linears)
diff --git a/hpvm/projects/pred_tuner/models/hpvm/alexnet_canny.py b/hpvm/projects/pred_tuner/models/hpvm/alexnet_canny.py
deleted file mode 100644
index 5e610279121a5b368f4cdf64b72e0a2d6fe9289a..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/alexnet_canny.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import Iterable, Tuple
-
-import torch
-from torch.nn import Softmax
-
-from .alexnet import AlexNet2
-from .layers import HPVMConvBundle, HPVMDefaultModule, ReduceKind, TensorReduce
-
-
-class AlexNet2Canny(HPVMDefaultModule):
-    def __init__(self, on_classes: Iterable[int]):
-        super().__init__()
-        prototype = AlexNet2()
-        self.on_classes = list(on_classes)
-        self.convs = prototype.convs
-        self.linears = prototype.linears
-        self.softmax = Softmax(1)
-        self.reduce_1 = TensorReduce(1, ReduceKind.sum)
-        self.gaussian = HPVMConvBundle(1, 1, 5, padding=2, bias=False)
-        self.sobel_x = HPVMConvBundle(1, 1, 3, padding=1, bias=False)
-        self.sobel_y = HPVMConvBundle(1, 1, 3, padding=1, bias=False)
-        self.reduce_2 = TensorReduce(2, ReduceKind.max)
-        self.reduce_3 = TensorReduce(2, ReduceKind.max)
-
-    def canny(self, images: torch.Tensor) -> torch.Tensor:
-        assert len(images.shape) == 4  # Assuming NCHW
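-        # A simplified Canny-style edge pipeline as tensor ops: grayscale reduction, Gaussian
-        # blur, Sobel gradients, then per-image normalization by the maximum gradient magnitude.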
-        grayscale = self.reduce_1(images)
-        grayscale = grayscale.unsqueeze(1)
-        denoised = self.gaussian(grayscale)
-        grad_x = self.sobel_x(denoised)
-        grad_y = self.sobel_y(denoised)
-        grad_mag = torch.sqrt(grad_x ** 2 + grad_y ** 2)
-        grad_max_1D = self.reduce_2(grad_mag)
-        grad_max = self.reduce_3(grad_max_1D)
-        grad_max = grad_max.unsqueeze(2).unsqueeze(3)
-        grad_mag_norm = grad_mag / grad_max
-        return grad_mag_norm
-
-    def forward(self, inputs) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        from functools import reduce
-        from operator import ior
-        dnn_input, canny_input = inputs
-        conv_outputs = self.convs(dnn_input)
-        dnn_outputs = self.softmax(self.linears(conv_outputs.view(conv_outputs.shape[0], -1)))
-        classes = dnn_outputs.argmax(dim=1)
-        selection = reduce(ior, (classes == i for i in self.on_classes))
-        selected_inputs = canny_input[selection]
-        return dnn_outputs, selection, self.canny(selected_inputs)
diff --git a/hpvm/projects/pred_tuner/models/hpvm/layers.py b/hpvm/projects/pred_tuner/models/hpvm/layers.py
deleted file mode 100644
index fed66e7b1507ac4ca309de0dc0599dde9a926a8a..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/layers.py
+++ /dev/null
@@ -1,223 +0,0 @@
-from enum import Enum
-from pathlib import Path
-from typing import Callable, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import torch
-from torch.nn import AvgPool2d, BatchNorm2d, Conv2d, Linear, MaxPool2d, Module, Parameter, ReLU, Sequential, Softmax, \
-    Tanh
-
-
-def rsetattr(obj, attr, val):
-    pre, _, post = attr.rpartition('.')
-    return setattr(rgetattr(obj, pre) if pre else obj, post, val)
-
-
-def rgetattr(obj, attr, *args):
-    def _getattr(obj_, attr_):
-        return getattr(obj_, attr_, *args)
-
-    import functools
-    return functools.reduce(_getattr, attr.split('.'), obj)
-
-
-def read_tensor_from_file(
-        filename: Union[str, Path], *shape: int,
-        read_ty=np.float32, cast_ty=np.float32,
-        count: int = -1, offset: int = 0,
-        use_progress_bar: bool = False
-) -> torch.Tensor:
-    from tqdm import trange
-    block_size = 102400
-    offset = offset * read_ty().itemsize
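-    # Memory-map the file and copy it into RAM in fixed-size blocks, so a progress bar
-    # can report loading progress on large tensor dumps.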
-    mmap = np.memmap(filename, dtype=read_ty, mode='r', offset=offset)
-    n_entries = min(mmap.shape[0], count) if count != -1 else mmap.shape[0]
-    # Allocate only the entries we will actually read, so that `count` truncates the result
-    raw = np.empty(n_entries, dtype=read_ty)
-    n_blocks = int(np.ceil(n_entries / block_size))
-    iterable = trange(n_blocks) if use_progress_bar else range(n_blocks)
-    for block in iterable:
-        l, r = block * block_size, min(n_entries, (block + 1) * block_size)
-        raw[l:r] = mmap[l:r]
-    del mmap
-    if cast_ty != read_ty:
-        raw = raw.astype(cast_ty)
-    loaded_np = raw.reshape(shape)
-    return torch.from_numpy(loaded_np)
-
-
-ActivT = Optional[Callable[[], Module]]
-ArgsT = Union[List, Dict]
-RangeT = Tuple[float, float]
-RangeOT = Optional[RangeT]
-
-
-class HPVMConvBundle(Module):
-    def __init__(
-            self, in_channels: int, out_channels: int, kernel_size: int,
-            activation: ActivT = None,
-            pool_size: Optional[int] = None, pool_stride: Optional[int] = None,
-            **conv_kwargs
-    ):
-        super().__init__()
-        self.conv = Conv2d(in_channels, out_channels, kernel_size, **conv_kwargs)
-        if pool_size is None:
-            self.pooling = Sequential()
-        else:
-            pool_stride = pool_stride or pool_size
-            self.pooling = MaxPool2d(pool_size, stride=pool_stride)
-        self.activation = Sequential() if activation is None else activation()
-        self.conv_ranges_ = None
-
-    def forward(self, input_: torch.Tensor) -> torch.Tensor:
-        return self.activation(self.pooling(self.conv(input_)))
-
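-    # The two helpers below split the bundle into a bias-free convolution stage and a
-    # "bias + pooling + activation" stage, so callers can intercept the raw convolution output.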
-    def input_to_conv(self, input_: torch.Tensor) -> torch.Tensor:
-        bias = self.conv.bias
-        self.conv.bias = None
-        conv_out = self.conv(input_)
-        self.conv.bias = bias
-        return conv_out
-
-    def conv_to_output(self, conv_output: torch.Tensor) -> torch.Tensor:
-        if self.conv.bias is not None:
-            broadcast_bias = self.conv.bias.reshape(1, -1, 1, 1)
-            return self.activation(self.pooling(conv_output + broadcast_bias))
-        else:
-            return self.activation(self.pooling(conv_output))
-
-    def __getattr__(self, item):
-        if item in ('weight', 'bias'):
-            return getattr(self.conv, item)
-        return super(HPVMConvBundle, self).__getattr__(item)
-
-    def __setattr__(self, key, value):
-        if key in ('weight', 'bias'):
-            setattr(self.conv, key, value)
-        else:
-            super(HPVMConvBundle, self).__setattr__(key, value)
-
-
-class ReduceKind(Enum):
-    sum = 1
-    max = 2
-
-
-class TensorReduce(Module):
-    def __init__(self, dim: int, kind: ReduceKind, skip_ratio: float = 0.0):
-        super().__init__()
-        self.dim = dim
-        self.kind = kind  # kept so change_skip_ratio() can rebuild the reducer
-        self.skip_ratio = skip_ratio
-        if kind == ReduceKind.sum:
-            self.reducer = lambda x: x.sum(dim=0)  # Because we transpose the input
-            self.normalizer = lambda x: x / (1 - self.skip_ratio)
-        elif kind == ReduceKind.max:
-            self.reducer = lambda x: x.max(dim=0)[0]
-            self.normalizer = lambda x: x
-
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        from math import ceil
-        inputs_t = inputs.transpose(0, self.dim)
-        if len(inputs) == 0:
-            dim_reduced = torch.zeros_like(inputs_t)[0]
-        else:
-            reduce_dim_size = inputs_t.size(0)
-            approxed_dim_size = int(ceil((1 - self.skip_ratio) * reduce_dim_size))
-            # Take a contiguous chunk and reduce over it, ignore the rest
-            dim_reduced: torch.Tensor = self.normalizer(self.reducer(inputs_t[:approxed_dim_size]))
-        return dim_reduced.unsqueeze(0).transpose(0, self.dim).squeeze(self.dim)
-
-    def change_skip_ratio(self, skip_ratio: float) -> 'TensorReduce':
-        return TensorReduce(self.dim, self.kind, skip_ratio)
-
-
-def read_quant_ranges(prefix: Path):
-    range_file = prefix / 'quant_ranges.txt'
-    if not range_file.is_file():
-        return None
-    with range_file.open() as f:
-        return [[float(field) for field in line.strip().split()] for line in f.readlines()]
-
-
-class HPVMDefaultModule(Module):
-    @staticmethod
-    def load_into_layer(
-            layer: Module, attr_name: str, filename: str, prefix: Path,
-            is_linear_weight: bool = False
-    ):
-        tensor = rgetattr(layer, attr_name)
-        if is_linear_weight:
-            n_out, n_in = tensor.shape
-            loaded = read_tensor_from_file(prefix / filename, n_in, n_out).T
-        else:
-            loaded = read_tensor_from_file(prefix / filename, *tensor.shape)
-        if type(tensor) is Parameter:
-            loaded = Parameter(loaded, requires_grad=True)
-        rsetattr(layer, attr_name, loaded)
-
-    @staticmethod
-    def install_quant_range(module: Module, values: List[float]):
-        in_min, in_max, w_min, w_max, b_min, b_max, out_min, out_max = values
-        module.conv_ranges = (in_min, in_max), (w_min, w_max), (b_min, b_max), (out_min, out_max)
-
-    def default_load_hpvm_weights(self, prefix: str):
-        # TODO: this is probably better done with help of ModuleDAG
-        prefix = Path(prefix)
-        convs, group_convs, linears, bns = [], [], [], []
-        weightless_types = AvgPool2d, MaxPool2d, ReLU, Tanh, Softmax, TensorReduce
-        container_types = (Sequential,)
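-        # Walk the module tree and bucket layers by kind; the weight files on disk are
-        # numbered per kind (conv2d_1_w.bin, dense_1_w.bin, ...) in the same traversal order.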
-        for module in self.modules():
-            if isinstance(module, HPVMConvBundle):
-                convs.append(module)
-            elif isinstance(module, Conv2d):
-                if module.groups != 1:
-                    group_convs.append(module)
-            elif isinstance(module, Linear):
-                linears.append(module)
-            elif isinstance(module, BatchNorm2d):
-                bns.append(module)
-            elif type(module) in weightless_types:
-                pass
-            elif type(module) in container_types or len(list(module.children())) != 0:
-                continue
-            else:
-                raise RuntimeError(f"Layer type {type(module)} not understood")
-        load = self.load_into_layer
-        quant_ranges = read_quant_ranges(prefix)
-        quant_ranges_idx = 0
-        for i, conv in enumerate(convs):
-            conv: HPVMConvBundle
-            load(conv, 'weight', f"conv2d_{i + 1}_w.bin", prefix)
-            if conv.bias is not None:
-                load(conv, 'bias', f"conv2d_{i + 1}_b.bin", prefix)
-            if quant_ranges is not None:
-                self.install_quant_range(conv, quant_ranges[quant_ranges_idx])
-                quant_ranges_idx += 1
-        for i, gconv in enumerate(group_convs):
-            load(gconv, 'weight', f"depthwise_conv2d_{i + 1}_w.bin", prefix)
-            if gconv.bias is not None:
-                load(gconv, 'bias', f"depthwise_conv2d_{i + 1}_b.bin", prefix)
-        for i, bn in enumerate(bns):
-            bn: BatchNorm2d
-            load(bn, 'weight', f"batch_normalization_{i + 1}_gamma.bin", prefix)
-            load(bn, 'bias', f"batch_normalization_{i + 1}_beta.bin", prefix)
-            load(bn, 'running_mean', f"batch_normalization_{i + 1}_mean.bin", prefix)
-            load(bn, 'running_var', f"batch_normalization_{i + 1}_variance.bin", prefix)
-        for i, linear in enumerate(linears):
-            load(linear, 'weight', f"dense_{i + 1}_w.bin", prefix, True)
-            load(linear, 'bias', f"dense_{i + 1}_b.bin", prefix)
-            if quant_ranges is not None:
-                self.install_quant_range(linear, quant_ranges[quant_ranges_idx])
-                quant_ranges_idx += 1
-        assert quant_ranges is None or len(quant_ranges) == quant_ranges_idx
-
-
-class HPVMDNN(HPVMDefaultModule):
-    def __init__(self, convs: Sequential, linears: Sequential):
-        super().__init__()
-        self.convs = convs
-        self.linears = linears
-        self.softmax = Softmax(1)
-
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        outputs = self.convs(inputs)
-        return self.softmax(self.linears(outputs.view(outputs.shape[0], -1)))
diff --git a/hpvm/projects/pred_tuner/models/hpvm/lenet.py b/hpvm/projects/pred_tuner/models/hpvm/lenet.py
deleted file mode 100644
index 0802b5f78d2c73d352afe68b16df74689e9aec68..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/lenet.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from torch.nn import Linear, Sequential, Tanh
-
-from .layers import HPVMConvBundle, HPVMDNN
-
-
-class LeNet(HPVMDNN):
-    def __init__(self):
-        convs = Sequential(
-            HPVMConvBundle(1, 32, 5, Tanh, 2, padding=2),
-            HPVMConvBundle(32, 64, 5, Tanh, 2, padding=2)
-        )
-        linears = Sequential(
-            Linear(7 * 7 * 64, 1024), Tanh(),
-            Linear(1024, 10), Tanh()
-        )
-        super().__init__(convs, linears)
diff --git a/hpvm/projects/pred_tuner/models/hpvm/mobilenet.py b/hpvm/projects/pred_tuner/models/hpvm/mobilenet.py
deleted file mode 100644
index f48a214fc9c1d7ec52cd5a24ec0e8d82d38aaa6e..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/mobilenet.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from torch.nn import AvgPool2d, BatchNorm2d, Conv2d, Linear, ReLU, Sequential
-
-from .layers import HPVMDNN, HPVMConvBundle
-
-
-def _make_seq(in_channels, out_channels, c_kernel_size, gc_stride, gc_kernel_size=3):
-    return Sequential(
-        HPVMConvBundle(
-            in_channels, out_channels, c_kernel_size,
-            bias=False, padding=(c_kernel_size - 1) // 2
-        ),
-        BatchNorm2d(out_channels, eps=0.001),
-        ReLU(),
-        Conv2d(
-            out_channels, out_channels, gc_kernel_size,
-            bias=False, stride=gc_stride, padding=(gc_kernel_size - 1) // 2, groups=out_channels
-        ),
-        BatchNorm2d(out_channels, eps=0.001),
-        ReLU()
-    )
-
-
-class MobileNet(HPVMDNN):
-    def __init__(self):
-        convs = Sequential(
-            _make_seq(3, 32, 3, 1),
-            _make_seq(32, 64, 1, 2),
-            _make_seq(64, 128, 1, 1),
-            _make_seq(128, 128, 1, 2),
-            _make_seq(128, 256, 1, 1),
-            _make_seq(256, 256, 1, 2),
-            _make_seq(256, 512, 1, 1),
-            _make_seq(512, 512, 1, 1),
-            _make_seq(512, 512, 1, 1),
-            _make_seq(512, 512, 1, 1),
-            _make_seq(512, 512, 1, 1),
-            _make_seq(512, 512, 1, 2),
-            _make_seq(512, 1024, 1, 1),
-            HPVMConvBundle(1024, 1024, 1, padding=0, bias=False),
-            BatchNorm2d(1024, eps=0.001),
-            ReLU(),
-            AvgPool2d(2)
-        )
-        linears = Sequential(Linear(1024, 10))
-        super().__init__(convs, linears)
diff --git a/hpvm/projects/pred_tuner/models/hpvm/resnet.py b/hpvm/projects/pred_tuner/models/hpvm/resnet.py
deleted file mode 100644
index fc42a00001792b59b593b668f6cf4e8a5a230d9d..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/resnet.py
+++ /dev/null
@@ -1,96 +0,0 @@
-from torch.nn import AvgPool2d, BatchNorm2d, Linear, Module, ReLU, Sequential
-
-from .layers import HPVMConvBundle, HPVMDNN
-
-
-class BasicBlock(Module):
-    def __init__(self, ins, outs, shortcut=False):
-        super().__init__()
-        stride = 2 if shortcut else 1
-        self.mainline = Sequential(
-            HPVMConvBundle(ins, outs, 3, ReLU, padding=1, stride=stride),
-            HPVMConvBundle(outs, outs, 3, padding=1)
-        )
-        self.relu1 = ReLU()
-        self.shortcut = HPVMConvBundle(ins, outs, 1, stride=stride) \
-            if shortcut else Sequential()
-
-    def forward(self, input_):
-        return self.relu1(self.mainline(input_) + self.shortcut(input_))
-
-
-class ResNet18(HPVMDNN):
-    def __init__(self):
-        convs = Sequential(
-            HPVMConvBundle(3, 16, 3, ReLU, padding=1),
-            BasicBlock(16, 16),
-            BasicBlock(16, 16),
-            BasicBlock(16, 16),
-            BasicBlock(16, 32, True),
-            BasicBlock(32, 32),
-            BasicBlock(32, 32),
-            BasicBlock(32, 64, True),
-            BasicBlock(64, 64),
-            BasicBlock(64, 64),
-            AvgPool2d(8)
-        )
-        linears = Sequential(Linear(64, 10))
-        super().__init__(convs, linears)
-
-
-class Bottleneck(Module):
-    expansion = 4
-
-    def __init__(self, in_planes, planes, stride=1):
-        super(Bottleneck, self).__init__()
-        self.mainline = Sequential(
-            HPVMConvBundle(in_planes, planes, 1, stride=stride),
-            BatchNorm2d(planes, eps=0.001),
-            ReLU(),
-            HPVMConvBundle(planes, planes, 3, padding=1),
-            BatchNorm2d(planes, eps=0.001),
-            ReLU(),
-            HPVMConvBundle(planes, self.expansion * planes, 1),
-            BatchNorm2d(self.expansion * planes, eps=0.001)
-        )
-        self.relu1 = ReLU()
-        if stride != 1 or in_planes != self.expansion * planes:
-            self.shortcut = Sequential(
-                HPVMConvBundle(in_planes, self.expansion * planes, 1, stride=stride),
-                BatchNorm2d(self.expansion * planes, eps=0.001)
-            )
-        else:
-            self.shortcut = Sequential()
-
-    def forward(self, input_):
-        return self.relu1(self.mainline(input_) + self.shortcut(input_))
-
-
-class ResNet50(HPVMDNN):
-    def __init__(self):
-        convs = Sequential(
-            HPVMConvBundle(3, 64, 7, ReLU, pool_size=3, pool_stride=2, padding=3, stride=2),
-            BatchNorm2d(64, eps=0.001),
-            Bottleneck(64, 64),
-            Bottleneck(256, 64),
-            Bottleneck(256, 64),
-
-            Bottleneck(256, 128, stride=2),
-            Bottleneck(512, 128),
-            Bottleneck(512, 128),
-            Bottleneck(512, 128),
-
-            Bottleneck(512, 256, stride=2),
-            Bottleneck(1024, 256),
-            Bottleneck(1024, 256),
-            Bottleneck(1024, 256),
-            Bottleneck(1024, 256),
-            Bottleneck(1024, 256),
-
-            Bottleneck(1024, 512, stride=2),
-            Bottleneck(2048, 512),
-            Bottleneck(2048, 512),
-            AvgPool2d(7)
-        )
-        linears = Sequential(Linear(2048, 1000))
-        super().__init__(convs, linears)
diff --git a/hpvm/projects/pred_tuner/models/hpvm/vgg16.py b/hpvm/projects/pred_tuner/models/hpvm/vgg16.py
deleted file mode 100644
index b31c0d47ca43118cbc1f7ad43b517d6dc02dd223..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/hpvm/vgg16.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from typing import Iterable
-
-from torch.nn import Linear, ReLU, Sequential
-
-from .layers import HPVMConvBundle, HPVMDNN
-
-
-class _VGG16(HPVMDNN):
-    def __init__(self, linear_inouts: Iterable[int]):
-        convs = Sequential(
-            HPVMConvBundle(3, 64, 3, ReLU, padding=1),
-            HPVMConvBundle(64, 64, 3, ReLU, 2, padding=1),
-            HPVMConvBundle(64, 128, 3, ReLU, padding=1),
-            HPVMConvBundle(128, 128, 3, ReLU, 2, padding=1),
-            HPVMConvBundle(128, 256, 3, ReLU, padding=1),
-            HPVMConvBundle(256, 256, 3, ReLU, padding=1),
-            HPVMConvBundle(256, 256, 3, ReLU, 2, padding=1),
-            HPVMConvBundle(256, 512, 3, ReLU, padding=1),
-            HPVMConvBundle(512, 512, 3, ReLU, padding=1),
-            HPVMConvBundle(512, 512, 3, ReLU, 2, padding=1),
-            HPVMConvBundle(512, 512, 3, ReLU, padding=1),
-            HPVMConvBundle(512, 512, 3, ReLU, padding=1),
-            HPVMConvBundle(512, 512, 3, ReLU, 2, padding=1)
-        )
-        linear_layers = [Linear(in_, out) for in_, out in zip(linear_inouts, linear_inouts[1:])]
-        linear_relus = [ReLU() for _ in range(2 * len(linear_layers) - 1)]
-        linear_relus[::2] = linear_layers
-        linears = Sequential(*linear_relus)
-        super().__init__(convs, linears)
-
-
-class VGG16Cifar10(_VGG16):
-    def __init__(self):
-        super().__init__([512, 512, 10])
-
-
-class VGG16Cifar100(_VGG16):
-    def __init__(self):
-        super().__init__([512, 512, 100])
-
-
-class VGG16ImageNet(_VGG16):
-    def __init__(self):
-        super().__init__([25088, 4096, 4096, 1000])
diff --git a/hpvm/projects/pred_tuner/models/inference.py b/hpvm/projects/pred_tuner/models/inference.py
deleted file mode 100644
index d797e9e605d8c3363d20f09fb52eb4a78195a9ac..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/inference.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import logging
-from typing import Type, Union
-
-import torch
-from torch.nn import Module
-from torch.utils.data import DataLoader, IterableDataset, Subset
-
-from .domains import QoS
-from .hpvm import HPVMDNN, HPVMDefaultModule
-from .networks import networks
-
-msg_logger = logging.getLogger(__name__)
-
-
-def move_to_device_recursively(data: object, device_: Union[torch.device, str]):
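-    """Recursively moves tensors inside (possibly nested) lists, tuples and plain objects to the given device."""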
-    if isinstance(data, torch.Tensor):
-        return data.to(device_)
-    if not hasattr(data, '__dict__'):
-        if isinstance(data, list):
-            return [move_to_device_recursively(x, device_) for x in data]
-        elif isinstance(data, tuple):
-            return tuple([move_to_device_recursively(x, device_) for x in data])
-        else:
-            raise RuntimeError(f"Don't know how to manipulate {type(data)}")
-    for key, value in data.__dict__.items():
-        data.__dict__[key] = move_to_device_recursively(value, device_)
-    return data
-
-
-def _infer_net_device(net: Module):
-    return next(iter(net.parameters())).device
-
-
-def get_all_output(net: Module, dataloader: DataLoader):
-    outputs = []
-    device = _infer_net_device(net)
-    with torch.no_grad():
-        for inputs, targets in dataloader:
-            inputs = move_to_device_recursively(inputs, device)
-            outputs.append(net(inputs))
-    return outputs
-
-
-def load_torch_checkpoint(net: Module, chpt_path: str):
-    msg_logger.info('==> Loading checkpoint..')
-    checkpoint = torch.load(chpt_path)
-    net.load_state_dict(checkpoint.pop('net'))
-    return checkpoint
-
-
-class BaselineInfo:
-    def __init__(
-            self, net: Module, val_loader: DataLoader, test_loader: DataLoader,
-            non_tensor_output: bool, qos_class: Type[QoS]
-    ):
-        self.baseline_net = net
-        self.val_loader = val_loader
-        self.test_loader = test_loader
-        self.non_tensor_output = non_tensor_output
-        self.qos_class = qos_class
-        self.val_qos = self.get_qos(net, val_loader)
-        self.test_qos = self.get_qos(net, test_loader)
-
-    def get_qos(self, net: Module, dataloader: DataLoader):
-        return self.qos_class.from_all_output(get_all_output(net, dataloader), dataloader)
-
-    @staticmethod
-    def _split_dataset(dataset: IterableDataset, split_at: int):
-        return Subset(dataset, torch.arange(0, split_at)), \
-               Subset(dataset, torch.arange(split_at, len(dataset)))
-
-    @classmethod
-    def init_by_name(cls, model_name: str, device) -> 'BaselineInfo':
-        msg_logger.info('==> Building model..')
-        network_factory, dataset_factory, batchsize, prefix, qos_class = networks[model_name]
-        net = network_factory()
-        # 1. Load network weights
-        msg_logger.info('==> Loading checkpoint..')
-        if isinstance(net, HPVMDefaultModule):
-            net.default_load_hpvm_weights(prefix)
-        else:
-            load_torch_checkpoint(net, prefix)
-        net = net.eval().to(device)
-        # 2. Load dataset
-        msg_logger.info('==> Loading dataset...')
-        if isinstance(net, HPVMDNN):
-            dataset = dataset_factory(prefix)
-            non_tensor_output = False
-        elif isinstance(net, HPVMDefaultModule):  # Is image benchmark
-            dataset = dataset_factory(prefix)
-            non_tensor_output = True
-        else:
-            dataset = dataset_factory('./data')
-            non_tensor_output = False
-        # 3. Split dataset
-        test_set, val_set = cls._split_dataset(dataset, 5000)
-        test_loader = DataLoader(test_set, batch_size=batchsize)
-        val_loader = DataLoader(val_set, batch_size=batchsize)
-        return cls(net, val_loader, test_loader, non_tensor_output, qos_class)
diff --git a/hpvm/projects/pred_tuner/models/networks.py b/hpvm/projects/pred_tuner/models/networks.py
deleted file mode 100644
index a5611bcb3e681c618cc5f8d8d188e9afc2fb5687..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/networks.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from . import hpvm
-from .datasets import CIFAR, CIFARImage, ImageNet, MNIST, get_cifar10_test_dataset
-from .domains import Accuracy
-from .domains.qoses import AccuracyPSNR
-from .torch import ResNet18, VGG
-
-
-networks = {
-    'lenet_hpvm': (
-        hpvm.LeNet, MNIST.from_default_file, 5000,
-        'model_params/lenet_mnist', Accuracy
-    ),
-    'alexnet_hpvm': (
-        hpvm.AlexNet, CIFAR.from_default_file, 2000,
-        'model_params/alexnet_cifar10', Accuracy
-    ),
-    'alexnet2_hpvm': (
-        hpvm.AlexNet2, CIFAR.from_default_file, 2000,
-        'model_params/alexnet2_cifar10', Accuracy
-    ),
-    'vgg16_cifar10_hpvm': (
-        hpvm.VGG16Cifar10, CIFAR.from_default_file, 500,
-        'model_params/vgg16_cifar10', Accuracy
-    ),
-    'vgg16_cifar100_hpvm': (
-        hpvm.VGG16Cifar100, CIFAR.from_default_file, 500,
-        'model_params/vgg16_cifar100', Accuracy
-    ),
-    'mobilenet_hpvm': (
-        hpvm.MobileNet, CIFAR.from_default_file, 1000,
-        'model_params/mobilenet', Accuracy
-    ),
-    'resnet18_hpvm': (
-        hpvm.ResNet18, CIFAR.from_default_file, 1000,
-        'model_params/resnet18_cifar10', Accuracy
-    ),
-    'alexnet_imagenet_hpvm': (
-        hpvm.AlexNetImageNet, ImageNet.from_default_file, 100,
-        'model_params/alexnet_imagenet', Accuracy
-    ),
-    'vgg16_imagenet_hpvm': (
-        hpvm.VGG16ImageNet, ImageNet.from_default_file, 50,
-        'model_params/vgg16_imagenet', Accuracy
-    ),
-    'resnet50_imagenet_hpvm': (
-        hpvm.ResNet50, ImageNet.from_default_file, 25,
-        'model_params/resnet50_imagenet', Accuracy
-    ),
-    'alexnet2_canny_hpvm': (
-        lambda: hpvm.AlexNet2Canny(on_classes=[1, 2, 3, 4, 5]),
-        CIFARImage.from_default_file, 50,
-        'model_params/alexnet2_canny', AccuracyPSNR
-    )
-}
diff --git a/hpvm/projects/pred_tuner/models/torch/__init__.py b/hpvm/projects/pred_tuner/models/torch/__init__.py
deleted file mode 100644
index aff98ce114a9f0797ed08e74db1184d727f94f2e..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from .vgg import *
-from .dpn import *
-from .lenet import *
-from .senet import *
-from .pnasnet import *
-from .densenet import *
-from .googlenet import *
-from .shufflenet import *
-from .shufflenetv2 import *
-from .resnet import *
-from .resnext import *
-from .preact_resnet import *
-from .mobilenet import *
-from .mobilenetv2 import *
-from .efficientnet import *
diff --git a/hpvm/projects/pred_tuner/models/torch/densenet.py b/hpvm/projects/pred_tuner/models/torch/densenet.py
deleted file mode 100644
index 47ebbbe08e40503d6785711acd8bd7dd2cdba768..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/densenet.py
+++ /dev/null
@@ -1,107 +0,0 @@
-'''DenseNet in PyTorch.'''
-import math
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Bottleneck(nn.Module):
-    def __init__(self, in_planes, growth_rate):
-        super(Bottleneck, self).__init__()
-        self.bn1 = nn.BatchNorm2d(in_planes)
-        self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(4*growth_rate)
-        self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
-
-    def forward(self, x):
-        out = self.conv1(F.relu(self.bn1(x)))
-        out = self.conv2(F.relu(self.bn2(out)))
-        out = torch.cat([out,x], 1)
-        return out
-
-
-class Transition(nn.Module):
-    def __init__(self, in_planes, out_planes):
-        super(Transition, self).__init__()
-        self.bn = nn.BatchNorm2d(in_planes)
-        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
-
-    def forward(self, x):
-        out = self.conv(F.relu(self.bn(x)))
-        out = F.avg_pool2d(out, 2)
-        return out
-
-
-class DenseNet(nn.Module):
-    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
-        super(DenseNet, self).__init__()
-        self.growth_rate = growth_rate
-
-        num_planes = 2*growth_rate
-        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)
-
-        self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
-        num_planes += nblocks[0]*growth_rate
-        out_planes = int(math.floor(num_planes*reduction))
-        self.trans1 = Transition(num_planes, out_planes)
-        num_planes = out_planes
-
-        self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
-        num_planes += nblocks[1]*growth_rate
-        out_planes = int(math.floor(num_planes*reduction))
-        self.trans2 = Transition(num_planes, out_planes)
-        num_planes = out_planes
-
-        self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
-        num_planes += nblocks[2]*growth_rate
-        out_planes = int(math.floor(num_planes*reduction))
-        self.trans3 = Transition(num_planes, out_planes)
-        num_planes = out_planes
-
-        self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3])
-        num_planes += nblocks[3]*growth_rate
-
-        self.bn = nn.BatchNorm2d(num_planes)
-        self.linear = nn.Linear(num_planes, num_classes)
-
-    def _make_dense_layers(self, block, in_planes, nblock):
-        layers = []
-        for i in range(nblock):
-            layers.append(block(in_planes, self.growth_rate))
-            in_planes += self.growth_rate
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = self.conv1(x)
-        out = self.trans1(self.dense1(out))
-        out = self.trans2(self.dense2(out))
-        out = self.trans3(self.dense3(out))
-        out = self.dense4(out)
-        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-def DenseNet121():
-    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32)
-
-def DenseNet169():
-    return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32)
-
-def DenseNet201():
-    return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32)
-
-def DenseNet161():
-    return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48)
-
-def densenet_cifar():
-    return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12)
-
-def test():
-    net = densenet_cifar()
-    x = torch.randn(1,3,32,32)
-    y = net(x)
-    print(y)
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/dpn.py b/hpvm/projects/pred_tuner/models/torch/dpn.py
deleted file mode 100644
index d334367fcc9876b104a94b7ae333362ea0a64469..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/dpn.py
+++ /dev/null
@@ -1,98 +0,0 @@
-'''Dual Path Networks in PyTorch.'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Bottleneck(nn.Module):
-    def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer):
-        super(Bottleneck, self).__init__()
-        self.out_planes = out_planes
-        self.dense_depth = dense_depth
-
-        self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(in_planes)
-        self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False)
-        self.bn2 = nn.BatchNorm2d(in_planes)
-        self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(out_planes+dense_depth)
-
-        self.shortcut = nn.Sequential()
-        if first_layer:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(out_planes+dense_depth)
-            )
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = self.bn3(self.conv3(out))
-        x = self.shortcut(x)
-        d = self.out_planes
-        out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1)
-        out = F.relu(out)
-        return out
-
-
-class DPN(nn.Module):
-    def __init__(self, cfg):
-        super(DPN, self).__init__()
-        in_planes, out_planes = cfg['in_planes'], cfg['out_planes']
-        num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth']
-
-        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(64)
-        self.last_planes = 64
-        self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1)
-        self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2)
-        self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2)
-        self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2)
-        self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10)
-
-    def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride):
-        strides = [stride] + [1]*(num_blocks-1)
-        layers = []
-        for i,stride in enumerate(strides):
-            layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0))
-            self.last_planes = out_planes + (i+2) * dense_depth
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def DPN26():
-    cfg = {
-        'in_planes': (96,192,384,768),
-        'out_planes': (256,512,1024,2048),
-        'num_blocks': (2,2,2,2),
-        'dense_depth': (16,32,24,128)
-    }
-    return DPN(cfg)
-
-def DPN92():
-    cfg = {
-        'in_planes': (96,192,384,768),
-        'out_planes': (256,512,1024,2048),
-        'num_blocks': (3,4,20,3),
-        'dense_depth': (16,32,24,128)
-    }
-    return DPN(cfg)
-
-
-def test():
-    net = DPN92()
-    x = torch.randn(1,3,32,32)
-    y = net(x)
-    print(y)
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/efficientnet.py b/hpvm/projects/pred_tuner/models/torch/efficientnet.py
deleted file mode 100644
index 6a10a97468b5a505d5ea4bf1b5b53859dacef233..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/efficientnet.py
+++ /dev/null
@@ -1,99 +0,0 @@
-'''EfficientNet in PyTorch.
-
-Paper: "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks".
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Block(nn.Module):
-    '''expand + depthwise + pointwise + squeeze-excitation'''
-
-    def __init__(self, in_planes, out_planes, expansion, stride):
-        super(Block, self).__init__()
-        self.stride = stride
-
-        planes = expansion * in_planes
-        self.conv1 = nn.Conv2d(
-            in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
-                               stride=stride, padding=1, groups=planes, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv3 = nn.Conv2d(
-            planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn3 = nn.BatchNorm2d(out_planes)
-
-        self.shortcut = nn.Sequential()
-        if stride == 1 and in_planes != out_planes:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, out_planes, kernel_size=1,
-                          stride=1, padding=0, bias=False),
-                nn.BatchNorm2d(out_planes),
-            )
-
-        # SE layers
-        self.fc1 = nn.Conv2d(out_planes, out_planes//16, kernel_size=1)
-        self.fc2 = nn.Conv2d(out_planes//16, out_planes, kernel_size=1)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = self.bn3(self.conv3(out))
-        shortcut = self.shortcut(x) if self.stride == 1 else out
-        # Squeeze-Excitation
-        w = F.avg_pool2d(out, out.size(2))
-        w = F.relu(self.fc1(w))
-        w = self.fc2(w).sigmoid()
-        out = out * w + shortcut
-        return out
-
-
-class EfficientNet(nn.Module):
-    def __init__(self, cfg, num_classes=10):
-        super(EfficientNet, self).__init__()
-        self.cfg = cfg
-        self.conv1 = nn.Conv2d(3, 32, kernel_size=3,
-                               stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(32)
-        self.layers = self._make_layers(in_planes=32)
-        self.linear = nn.Linear(cfg[-1][1], num_classes)
-
-    def _make_layers(self, in_planes):
-        layers = []
-        for expansion, out_planes, num_blocks, stride in self.cfg:
-            strides = [stride] + [1]*(num_blocks-1)
-            for stride in strides:
-                layers.append(Block(in_planes, out_planes, expansion, stride))
-                in_planes = out_planes
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layers(out)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def EfficientNetB0():
-    # (expansion, out_planes, num_blocks, stride)
-    cfg = [(1,  16, 1, 2),
-           (6,  24, 2, 1),
-           (6,  40, 2, 2),
-           (6,  80, 3, 2),
-           (6, 112, 3, 1),
-           (6, 192, 4, 2),
-           (6, 320, 1, 2)]
-    return EfficientNet(cfg)
-
-
-def test():
-    net = EfficientNetB0()
-    x = torch.randn(2, 3, 32, 32)
-    y = net(x)
-    print(y.shape)
-
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/googlenet.py b/hpvm/projects/pred_tuner/models/torch/googlenet.py
deleted file mode 100644
index 8ed8f6eb236d966f206f457e1637e11fecd44408..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/googlenet.py
+++ /dev/null
@@ -1,106 +0,0 @@
-"""GoogLeNet with PyTorch."""
-import torch
-import torch.nn as nn
-
-
-class Inception(nn.Module):
-    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
-        super(Inception, self).__init__()
-        # 1x1 conv branch
-        self.b1 = nn.Sequential(
-            nn.Conv2d(in_planes, n1x1, kernel_size=1),
-            nn.BatchNorm2d(n1x1),
-            nn.ReLU(True),
-        )
-
-        # 1x1 conv -> 3x3 conv branch
-        self.b2 = nn.Sequential(
-            nn.Conv2d(in_planes, n3x3red, kernel_size=1),
-            nn.BatchNorm2d(n3x3red),
-            nn.ReLU(True),
-            nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1),
-            nn.BatchNorm2d(n3x3),
-            nn.ReLU(True),
-        )
-
-        # 1x1 conv -> 5x5 conv branch
-        self.b3 = nn.Sequential(
-            nn.Conv2d(in_planes, n5x5red, kernel_size=1),
-            nn.BatchNorm2d(n5x5red),
-            nn.ReLU(True),
-            nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1),
-            nn.BatchNorm2d(n5x5),
-            nn.ReLU(True),
-            nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1),
-            nn.BatchNorm2d(n5x5),
-            nn.ReLU(True),
-        )
-
-        # 3x3 pool -> 1x1 conv branch
-        self.b4 = nn.Sequential(
-            nn.MaxPool2d(3, stride=1, padding=1),
-            nn.Conv2d(in_planes, pool_planes, kernel_size=1),
-            nn.BatchNorm2d(pool_planes),
-            nn.ReLU(True),
-        )
-
-    def forward(self, x):
-        y1 = self.b1(x)
-        y2 = self.b2(x)
-        y3 = self.b3(x)
-        y4 = self.b4(x)
-        return torch.cat([y1, y2, y3, y4], 1)
-
-
-class GoogLeNet(nn.Module):
-    def __init__(self):
-        super(GoogLeNet, self).__init__()
-        self.pre_layers = nn.Sequential(
-            nn.Conv2d(3, 192, kernel_size=3, padding=1),
-            nn.BatchNorm2d(192),
-            nn.ReLU(True),
-        )
-
-        self.a3 = Inception(192, 64, 96, 128, 16, 32, 32)
-        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)
-
-        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)
-
-        self.a4 = Inception(480, 192, 96, 208, 16, 48, 64)
-        self.b4 = Inception(512, 160, 112, 224, 24, 64, 64)
-        self.c4 = Inception(512, 128, 128, 256, 24, 64, 64)
-        self.d4 = Inception(512, 112, 144, 288, 32, 64, 64)
-        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)
-
-        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
-        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)
-
-        self.avgpool = nn.AvgPool2d(8, stride=1)
-        self.linear = nn.Linear(1024, 10)
-
-    def forward(self, x):
-        out = self.pre_layers(x)
-        out = self.a3(out)
-        out = self.b3(out)
-        out = self.maxpool(out)
-        out = self.a4(out)
-        out = self.b4(out)
-        out = self.c4(out)
-        out = self.d4(out)
-        out = self.e4(out)
-        out = self.maxpool(out)
-        out = self.a5(out)
-        out = self.b5(out)
-        out = self.avgpool(out)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def test():
-    net = GoogLeNet()
-    x = torch.randn(1, 3, 32, 32)
-    y = net(x)
-    print(y.size())
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/lenet.py b/hpvm/projects/pred_tuner/models/torch/lenet.py
deleted file mode 100644
index d657b7482a75a3058e5795f367dfbb32e948b9d5..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/lenet.py
+++ /dev/null
@@ -1,23 +0,0 @@
-'''LeNet in PyTorch.'''
-import torch.nn as nn
-import torch.nn.functional as F
-
-class LeNet(nn.Module):
-    def __init__(self):
-        super(LeNet, self).__init__()
-        self.conv1 = nn.Conv2d(3, 6, 5)
-        self.conv2 = nn.Conv2d(6, 16, 5)
-        self.fc1   = nn.Linear(16*5*5, 120)
-        self.fc2   = nn.Linear(120, 84)
-        self.fc3   = nn.Linear(84, 10)
-
-    def forward(self, x):
-        out = F.relu(self.conv1(x))
-        out = F.max_pool2d(out, 2)
-        out = F.relu(self.conv2(out))
-        out = F.max_pool2d(out, 2)
-        out = out.view(out.size(0), -1)
-        out = F.relu(self.fc1(out))
-        out = F.relu(self.fc2(out))
-        out = self.fc3(out)
-        return out
diff --git a/hpvm/projects/pred_tuner/models/torch/mobilenet.py b/hpvm/projects/pred_tuner/models/torch/mobilenet.py
deleted file mode 100644
index 497ef1e867d2a597b9b444ebc7a6f30cd5219777..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/mobilenet.py
+++ /dev/null
@@ -1,61 +0,0 @@
-'''MobileNet in PyTorch.
-
-See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications"
-for more details.
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Block(nn.Module):
-    '''Depthwise conv + Pointwise conv'''
-    def __init__(self, in_planes, out_planes, stride=1):
-        super(Block, self).__init__()
-        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
-        self.bn1 = nn.BatchNorm2d(in_planes)
-        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn2 = nn.BatchNorm2d(out_planes)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = F.relu(self.bn2(self.conv2(out)))
-        return out
-
-
-class MobileNet(nn.Module):
-    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
-    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]
-
-    def __init__(self, num_classes=10):
-        super(MobileNet, self).__init__()
-        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(32)
-        self.layers = self._make_layers(in_planes=32)
-        self.linear = nn.Linear(1024, num_classes)
-
-    def _make_layers(self, in_planes):
-        layers = []
-        for x in self.cfg:
-            out_planes = x if isinstance(x, int) else x[0]
-            stride = 1 if isinstance(x, int) else x[1]
-            layers.append(Block(in_planes, out_planes, stride))
-            in_planes = out_planes
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layers(out)
-        out = F.avg_pool2d(out, 2)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def test():
-    net = MobileNet()
-    x = torch.randn(1,3,32,32)
-    y = net(x)
-    print(y.size())
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/mobilenetv2.py b/hpvm/projects/pred_tuner/models/torch/mobilenetv2.py
deleted file mode 100644
index 17e5823ef4426ceceae462782a267f89b1ecbc76..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/mobilenetv2.py
+++ /dev/null
@@ -1,86 +0,0 @@
-'''MobileNetV2 in PyTorch.
-
-See the paper "Inverted Residuals and Linear Bottlenecks:
-Mobile Networks for Classification, Detection and Segmentation" for more details.
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Block(nn.Module):
-    '''expand + depthwise + pointwise'''
-    def __init__(self, in_planes, out_planes, expansion, stride):
-        super(Block, self).__init__()
-        self.stride = stride
-
-        planes = expansion * in_planes
-        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn3 = nn.BatchNorm2d(out_planes)
-
-        self.shortcut = nn.Sequential()
-        if stride == 1 and in_planes != out_planes:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False),
-                nn.BatchNorm2d(out_planes),
-            )
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = self.bn3(self.conv3(out))
-        out = out + self.shortcut(x) if self.stride==1 else out
-        return out
-
-
-class MobileNetV2(nn.Module):
-    # (expansion, out_planes, num_blocks, stride)
-    cfg = [(1,  16, 1, 1),
-           (6,  24, 2, 1),  # NOTE: change stride 2 -> 1 for CIFAR10
-           (6,  32, 3, 2),
-           (6,  64, 4, 2),
-           (6,  96, 3, 1),
-           (6, 160, 3, 2),
-           (6, 320, 1, 1)]
-
-    def __init__(self, num_classes=10):
-        super(MobileNetV2, self).__init__()
-        # NOTE: change conv1 stride 2 -> 1 for CIFAR10
-        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(32)
-        self.layers = self._make_layers(in_planes=32)
-        self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn2 = nn.BatchNorm2d(1280)
-        self.linear = nn.Linear(1280, num_classes)
-
-    def _make_layers(self, in_planes):
-        layers = []
-        for expansion, out_planes, num_blocks, stride in self.cfg:
-            strides = [stride] + [1]*(num_blocks-1)
-            for stride in strides:
-                layers.append(Block(in_planes, out_planes, expansion, stride))
-                in_planes = out_planes
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layers(out)
-        out = F.relu(self.bn2(self.conv2(out)))
-        # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def test():
-    net = MobileNetV2()
-    x = torch.randn(2,3,32,32)
-    y = net(x)
-    print(y.size())
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/pnasnet.py b/hpvm/projects/pred_tuner/models/torch/pnasnet.py
deleted file mode 100644
index de8c4d51f2667f84eab86f29be9a00ea7d0ad1c3..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/pnasnet.py
+++ /dev/null
@@ -1,125 +0,0 @@
-'''PNASNet in PyTorch.
-
-Paper: Progressive Neural Architecture Search
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class SepConv(nn.Module):
-    '''Separable Convolution.'''
-    def __init__(self, in_planes, out_planes, kernel_size, stride):
-        super(SepConv, self).__init__()
-        self.conv1 = nn.Conv2d(in_planes, out_planes,
-                               kernel_size, stride,
-                               padding=(kernel_size-1)//2,
-                               bias=False, groups=in_planes)
-        self.bn1 = nn.BatchNorm2d(out_planes)
-
-    def forward(self, x):
-        return self.bn1(self.conv1(x))
-
-
-class CellA(nn.Module):
-    def __init__(self, in_planes, out_planes, stride=1):
-        super(CellA, self).__init__()
-        self.stride = stride
-        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
-        if stride==2:
-            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
-            self.bn1 = nn.BatchNorm2d(out_planes)
-
-    def forward(self, x):
-        y1 = self.sep_conv1(x)
-        y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
-        if self.stride==2:
-            y2 = self.bn1(self.conv1(y2))
-        return F.relu(y1+y2)
-
-class CellB(nn.Module):
-    def __init__(self, in_planes, out_planes, stride=1):
-        super(CellB, self).__init__()
-        self.stride = stride
-        # Left branch
-        self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride)
-        self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride)
-        # Right branch
-        self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride)
-        if stride==2:
-            self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
-            self.bn1 = nn.BatchNorm2d(out_planes)
-        # Reduce channels
-        self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn2 = nn.BatchNorm2d(out_planes)
-
-    def forward(self, x):
-        # Left branch
-        y1 = self.sep_conv1(x)
-        y2 = self.sep_conv2(x)
-        # Right branch
-        y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1)
-        if self.stride==2:
-            y3 = self.bn1(self.conv1(y3))
-        y4 = self.sep_conv3(x)
-        # Concat & reduce channels
-        b1 = F.relu(y1+y2)
-        b2 = F.relu(y3+y4)
-        y = torch.cat([b1,b2], 1)
-        return F.relu(self.bn2(self.conv2(y)))
-
-class PNASNet(nn.Module):
-    def __init__(self, cell_type, num_cells, num_planes):
-        super(PNASNet, self).__init__()
-        self.in_planes = num_planes
-        self.cell_type = cell_type
-
-        self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(num_planes)
-
-        self.layer1 = self._make_layer(num_planes, num_cells=6)
-        self.layer2 = self._downsample(num_planes*2)
-        self.layer3 = self._make_layer(num_planes*2, num_cells=6)
-        self.layer4 = self._downsample(num_planes*4)
-        self.layer5 = self._make_layer(num_planes*4, num_cells=6)
-
-        self.linear = nn.Linear(num_planes*4, 10)
-
-    def _make_layer(self, planes, num_cells):
-        layers = []
-        for _ in range(num_cells):
-            layers.append(self.cell_type(self.in_planes, planes, stride=1))
-            self.in_planes = planes
-        return nn.Sequential(*layers)
-
-    def _downsample(self, planes):
-        layer = self.cell_type(self.in_planes, planes, stride=2)
-        self.in_planes = planes
-        return layer
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = self.layer5(out)
-        out = F.avg_pool2d(out, 8)
-        out = self.linear(out.view(out.size(0), -1))
-        return out
-
-
-def PNASNetA():
-    return PNASNet(CellA, num_cells=6, num_planes=44)
-
-def PNASNetB():
-    return PNASNet(CellB, num_cells=6, num_planes=32)
-
-
-def test():
-    net = PNASNetB()
-    x = torch.randn(1,3,32,32)
-    y = net(x)
-    print(y)
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/preact_resnet.py b/hpvm/projects/pred_tuner/models/torch/preact_resnet.py
deleted file mode 100644
index abb1bc313c011d2ee650c353c515e2cd404503f3..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/preact_resnet.py
+++ /dev/null
@@ -1,118 +0,0 @@
-'''Pre-activation ResNet in PyTorch.
-
-Reference:
-[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-    Identity Mappings in Deep Residual Networks. arXiv:1603.05027
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class PreActBlock(nn.Module):
-    '''Pre-activation version of the BasicBlock.'''
-    expansion = 1
-
-    def __init__(self, in_planes, planes, stride=1):
-        super(PreActBlock, self).__init__()
-        self.bn1 = nn.BatchNorm2d(in_planes)
-        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
-
-        if stride != 1 or in_planes != self.expansion*planes:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
-            )
-
-    def forward(self, x):
-        out = F.relu(self.bn1(x))
-        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
-        out = self.conv1(out)
-        out = self.conv2(F.relu(self.bn2(out)))
-        out += shortcut
-        return out
-
-
-class PreActBottleneck(nn.Module):
-    '''Pre-activation version of the original Bottleneck module.'''
-    expansion = 4
-
-    def __init__(self, in_planes, planes, stride=1):
-        super(PreActBottleneck, self).__init__()
-        self.bn1 = nn.BatchNorm2d(in_planes)
-        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(planes)
-        self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False)
-
-        if stride != 1 or in_planes != self.expansion*planes:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False)
-            )
-
-    def forward(self, x):
-        out = F.relu(self.bn1(x))
-        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
-        out = self.conv1(out)
-        out = self.conv2(F.relu(self.bn2(out)))
-        out = self.conv3(F.relu(self.bn3(out)))
-        out += shortcut
-        return out
-
-
-class PreActResNet(nn.Module):
-    def __init__(self, block, num_blocks, num_classes=10):
-        super(PreActResNet, self).__init__()
-        self.in_planes = 64
-
-        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
-        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
-        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
-        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
-        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
-        self.linear = nn.Linear(512*block.expansion, num_classes)
-
-    def _make_layer(self, block, planes, num_blocks, stride):
-        strides = [stride] + [1]*(num_blocks-1)
-        layers = []
-        for stride in strides:
-            layers.append(block(self.in_planes, planes, stride))
-            self.in_planes = planes * block.expansion
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = self.conv1(x)
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def PreActResNet18():
-    return PreActResNet(PreActBlock, [2,2,2,2])
-
-def PreActResNet34():
-    return PreActResNet(PreActBlock, [3,4,6,3])
-
-def PreActResNet50():
-    return PreActResNet(PreActBottleneck, [3,4,6,3])
-
-def PreActResNet101():
-    return PreActResNet(PreActBottleneck, [3,4,23,3])
-
-def PreActResNet152():
-    return PreActResNet(PreActBottleneck, [3,8,36,3])
-
-
-def test():
-    net = PreActResNet18()
-    y = net((torch.randn(1,3,32,32)))
-    print(y.size())
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/resnet.py b/hpvm/projects/pred_tuner/models/torch/resnet.py
deleted file mode 100644
index d7c03ed134293e2a6a1dd373556e83978ef3d560..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/resnet.py
+++ /dev/null
@@ -1,122 +0,0 @@
-"""ResNet in PyTorch.
-
-For Pre-activation ResNet, see 'preact_resnet.py'.
-
-Reference:
-[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
-    Deep Residual Learning for Image Recognition. arXiv:1512.03385
-"""
-import torch.nn as nn
-import torch.nn.functional as F
-
-from models.hpvm import HPVMConvBundle
-
-
-class BasicBlock(nn.Module):
-    expansion = 1
-
-    def __init__(self, in_planes, planes, stride=1):
-        super(BasicBlock, self).__init__()
-        self.conv1 = HPVMConvBundle(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.relu1 = nn.ReLU()
-        self.conv2 = HPVMConvBundle(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-
-        self.shortcut = nn.Sequential()
-        if stride != 1 or in_planes != self.expansion * planes:
-            self.shortcut = nn.Sequential(
-                HPVMConvBundle(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(self.expansion * planes)
-            )
-        self.relu2 = nn.ReLU()
-
-    def forward(self, x):
-        out = self.relu1(self.bn1(self.conv1(x)))
-        out = self.bn2(self.conv2(out))
-        out += self.shortcut(x)
-        out = self.relu2(out)
-        return out
-
-
-class Bottleneck(nn.Module):
-    expansion = 4
-
-    def __init__(self, in_planes, planes, stride=1):
-        super(Bottleneck, self).__init__()
-        self.conv1 = HPVMConvBundle(in_planes, planes, kernel_size=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.conv2 = HPVMConvBundle(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv3 = HPVMConvBundle(planes, self.expansion * planes, kernel_size=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(self.expansion * planes)
-
-        self.shortcut = nn.Sequential()
-        if stride != 1 or in_planes != self.expansion * planes:
-            self.shortcut = nn.Sequential(
-                HPVMConvBundle(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(self.expansion * planes)
-            )
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = self.bn3(self.conv3(out))
-        out += self.shortcut(x)
-        out = F.relu(out)
-        return out
-
-
-class ResNet(nn.Module):
-    def __init__(self, block, num_blocks, num_classes=10):
-        super(ResNet, self).__init__()
-        self.in_planes = 64
-
-        self.conv1 = HPVMConvBundle(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(64)
-        self.relu = nn.ReLU()
-        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
-        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
-        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
-        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
-        self.avg_pool2d = nn.AvgPool2d(4)
-        self.linear = nn.Linear(512 * block.expansion, num_classes)
-
-    def _make_layer(self, block, planes, num_blocks, stride):
-        strides = [stride] + [1] * (num_blocks - 1)
-        layers = []
-        for stride in strides:
-            layers.append(block(self.in_planes, planes, stride))
-            self.in_planes = planes * block.expansion
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = self.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = self.avg_pool2d(out)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def ResNet18():
-    return ResNet(BasicBlock, [2, 2, 2, 2])
-
-
-def ResNet34():
-    return ResNet(BasicBlock, [3, 4, 6, 3])
-
-
-def ResNet50():
-    return ResNet(Bottleneck, [3, 4, 6, 3])
-
-
-def ResNet101():
-    return ResNet(Bottleneck, [3, 4, 23, 3])
-
-
-def ResNet152():
-    return ResNet(Bottleneck, [3, 8, 36, 3])
diff --git a/hpvm/projects/pred_tuner/models/torch/resnext.py b/hpvm/projects/pred_tuner/models/torch/resnext.py
deleted file mode 100644
index 7a08f3e7d9fdf3b65aad5b773d4d113c6b796423..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/resnext.py
+++ /dev/null
@@ -1,95 +0,0 @@
-'''ResNeXt in PyTorch.
-
-See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details.
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class Block(nn.Module):
-    '''Grouped convolution block.'''
-    expansion = 2
-
-    def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1):
-        super(Block, self).__init__()
-        group_width = cardinality * bottleneck_width
-        self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(group_width)
-        self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False)
-        self.bn2 = nn.BatchNorm2d(group_width)
-        self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(self.expansion*group_width)
-
-        self.shortcut = nn.Sequential()
-        if stride != 1 or in_planes != self.expansion*group_width:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(self.expansion*group_width)
-            )
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = self.bn3(self.conv3(out))
-        out += self.shortcut(x)
-        out = F.relu(out)
-        return out
-
-
-class ResNeXt(nn.Module):
-    def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10):
-        super(ResNeXt, self).__init__()
-        self.cardinality = cardinality
-        self.bottleneck_width = bottleneck_width
-        self.in_planes = 64
-
-        self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(64)
-        self.layer1 = self._make_layer(num_blocks[0], 1)
-        self.layer2 = self._make_layer(num_blocks[1], 2)
-        self.layer3 = self._make_layer(num_blocks[2], 2)
-        # self.layer4 = self._make_layer(num_blocks[3], 2)
-        self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes)
-
-    def _make_layer(self, num_blocks, stride):
-        strides = [stride] + [1]*(num_blocks-1)
-        layers = []
-        for stride in strides:
-            layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride))
-            self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width
-        # Double bottleneck_width after each stage.
-        self.bottleneck_width *= 2
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        # out = self.layer4(out)
-        out = F.avg_pool2d(out, 8)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def ResNeXt29_2x64d():
-    return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64)
-
-def ResNeXt29_4x64d():
-    return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64)
-
-def ResNeXt29_8x64d():
-    return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64)
-
-def ResNeXt29_32x4d():
-    return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4)
-
-def test_resnext():
-    net = ResNeXt29_2x64d()
-    x = torch.randn(1,3,32,32)
-    y = net(x)
-    print(y.size())
-
-# test_resnext()
diff --git a/hpvm/projects/pred_tuner/models/torch/senet.py b/hpvm/projects/pred_tuner/models/torch/senet.py
deleted file mode 100644
index 98bfa0ca51dcd07b586432c9f9460be8d1f0b745..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/senet.py
+++ /dev/null
@@ -1,121 +0,0 @@
-'''SENet in PyTorch.
-
-SENet is the winner of the ImageNet-2017 classification challenge. See the paper "Squeeze-and-Excitation Networks" (arXiv:1709.01507) for more details.
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class BasicBlock(nn.Module):
-    def __init__(self, in_planes, planes, stride=1):
-        super(BasicBlock, self).__init__()
-        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-
-        self.shortcut = nn.Sequential()
-        if stride != 1 or in_planes != planes:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False),
-                nn.BatchNorm2d(planes)
-            )
-
-        # SE layers
-        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)  # Use nn.Conv2d instead of nn.Linear
-        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.bn2(self.conv2(out))
-
-        # Squeeze
-        w = F.avg_pool2d(out, out.size(2))
-        w = F.relu(self.fc1(w))
-        w = F.sigmoid(self.fc2(w))
-        # Excitation
-        out = out * w  # New broadcasting feature from v0.2!
-
-        out += self.shortcut(x)
-        out = F.relu(out)
-        return out
-
-
-class PreActBlock(nn.Module):
-    def __init__(self, in_planes, planes, stride=1):
-        super(PreActBlock, self).__init__()
-        self.bn1 = nn.BatchNorm2d(in_planes)
-        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(planes)
-        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
-
-        if stride != 1 or in_planes != planes:
-            self.shortcut = nn.Sequential(
-                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False)
-            )
-
-        # SE layers
-        self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1)
-        self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(x))
-        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
-        out = self.conv1(out)
-        out = self.conv2(F.relu(self.bn2(out)))
-
-        # Squeeze
-        w = F.avg_pool2d(out, out.size(2))
-        w = F.relu(self.fc1(w))
-        w = F.sigmoid(self.fc2(w))
-        # Excitation
-        out = out * w
-
-        out += shortcut
-        return out
-
-
-class SENet(nn.Module):
-    def __init__(self, block, num_blocks, num_classes=10):
-        super(SENet, self).__init__()
-        self.in_planes = 64
-
-        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(64)
-        self.layer1 = self._make_layer(block,  64, num_blocks[0], stride=1)
-        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
-        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
-        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
-        self.linear = nn.Linear(512, num_classes)
-
-    def _make_layer(self, block, planes, num_blocks, stride):
-        strides = [stride] + [1]*(num_blocks-1)
-        layers = []
-        for stride in strides:
-            layers.append(block(self.in_planes, planes, stride))
-            self.in_planes = planes
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = self.layer4(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def SENet18():
-    return SENet(PreActBlock, [2,2,2,2])
-
-
-def test():
-    net = SENet18()
-    y = net(torch.randn(1,3,32,32))
-    print(y.size())
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/shufflenet.py b/hpvm/projects/pred_tuner/models/torch/shufflenet.py
deleted file mode 100644
index acff6f78266c55bb93f5b12a6306a5647ebb0769..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/shufflenet.py
+++ /dev/null
@@ -1,109 +0,0 @@
-'''ShuffleNet in PyTorch.
-
-See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class ShuffleBlock(nn.Module):
-    def __init__(self, groups):
-        super(ShuffleBlock, self).__init__()
-        self.groups = groups
-
-    def forward(self, x):
-        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
-        N,C,H,W = x.size()
-        g = self.groups
-        return x.view(N,g,C//g,H,W).permute(0,2,1,3,4).reshape(N,C,H,W)
-
-
-class Bottleneck(nn.Module):
-    def __init__(self, in_planes, out_planes, stride, groups):
-        super(Bottleneck, self).__init__()
-        self.stride = stride
-
-        mid_planes = out_planes // 4
-        g = 1 if in_planes==24 else groups
-        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
-        self.bn1 = nn.BatchNorm2d(mid_planes)
-        self.shuffle1 = ShuffleBlock(groups=g)
-        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
-        self.bn2 = nn.BatchNorm2d(mid_planes)
-        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
-        self.bn3 = nn.BatchNorm2d(out_planes)
-
-        self.shortcut = nn.Sequential()
-        if stride == 2:
-            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.shuffle1(out)
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = self.bn3(self.conv3(out))
-        res = self.shortcut(x)
-        out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res)
-        return out
-
-
-class ShuffleNet(nn.Module):
-    def __init__(self, cfg):
-        super(ShuffleNet, self).__init__()
-        out_planes = cfg['out_planes']
-        num_blocks = cfg['num_blocks']
-        groups = cfg['groups']
-
-        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(24)
-        self.in_planes = 24
-        self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups)
-        self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups)
-        self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups)
-        self.linear = nn.Linear(out_planes[2], 10)
-
-    def _make_layer(self, out_planes, num_blocks, groups):
-        layers = []
-        for i in range(num_blocks):
-            stride = 2 if i == 0 else 1
-            cat_planes = self.in_planes if i == 0 else 0
-            layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups))
-            self.in_planes = out_planes
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-def ShuffleNetG2():
-    cfg = {
-        'out_planes': [200,400,800],
-        'num_blocks': [4,8,4],
-        'groups': 2
-    }
-    return ShuffleNet(cfg)
-
-def ShuffleNetG3():
-    cfg = {
-        'out_planes': [240,480,960],
-        'num_blocks': [4,8,4],
-        'groups': 3
-    }
-    return ShuffleNet(cfg)
-
-
-def test():
-    net = ShuffleNetG2()
-    x = torch.randn(1,3,32,32)
-    y = net(x)
-    print(y)
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/shufflenetv2.py b/hpvm/projects/pred_tuner/models/torch/shufflenetv2.py
deleted file mode 100644
index eefcda32059f0b8575148098c78ff5d84effd388..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/shufflenetv2.py
+++ /dev/null
@@ -1,162 +0,0 @@
-'''ShuffleNetV2 in PyTorch.
-
-See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details.
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class ShuffleBlock(nn.Module):
-    def __init__(self, groups=2):
-        super(ShuffleBlock, self).__init__()
-        self.groups = groups
-
-    def forward(self, x):
-        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
-        N, C, H, W = x.size()
-        g = self.groups
-        return x.view(N, g, C//g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W)
-
-
-class SplitBlock(nn.Module):
-    def __init__(self, ratio):
-        super(SplitBlock, self).__init__()
-        self.ratio = ratio
-
-    def forward(self, x):
-        c = int(x.size(1) * self.ratio)
-        return x[:, :c, :, :], x[:, c:, :, :]
-
-
-class BasicBlock(nn.Module):
-    def __init__(self, in_channels, split_ratio=0.5):
-        super(BasicBlock, self).__init__()
-        self.split = SplitBlock(split_ratio)
-        in_channels = int(in_channels * split_ratio)
-        self.conv1 = nn.Conv2d(in_channels, in_channels,
-                               kernel_size=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(in_channels)
-        self.conv2 = nn.Conv2d(in_channels, in_channels,
-                               kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False)
-        self.bn2 = nn.BatchNorm2d(in_channels)
-        self.conv3 = nn.Conv2d(in_channels, in_channels,
-                               kernel_size=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(in_channels)
-        self.shuffle = ShuffleBlock()
-
-    def forward(self, x):
-        x1, x2 = self.split(x)
-        out = F.relu(self.bn1(self.conv1(x2)))
-        out = self.bn2(self.conv2(out))
-        out = F.relu(self.bn3(self.conv3(out)))
-        out = torch.cat([x1, out], 1)
-        out = self.shuffle(out)
-        return out
-
-
-class DownBlock(nn.Module):
-    def __init__(self, in_channels, out_channels):
-        super(DownBlock, self).__init__()
-        mid_channels = out_channels // 2
-        # left
-        self.conv1 = nn.Conv2d(in_channels, in_channels,
-                               kernel_size=3, stride=2, padding=1, groups=in_channels, bias=False)
-        self.bn1 = nn.BatchNorm2d(in_channels)
-        self.conv2 = nn.Conv2d(in_channels, mid_channels,
-                               kernel_size=1, bias=False)
-        self.bn2 = nn.BatchNorm2d(mid_channels)
-        # right
-        self.conv3 = nn.Conv2d(in_channels, mid_channels,
-                               kernel_size=1, bias=False)
-        self.bn3 = nn.BatchNorm2d(mid_channels)
-        self.conv4 = nn.Conv2d(mid_channels, mid_channels,
-                               kernel_size=3, stride=2, padding=1, groups=mid_channels, bias=False)
-        self.bn4 = nn.BatchNorm2d(mid_channels)
-        self.conv5 = nn.Conv2d(mid_channels, mid_channels,
-                               kernel_size=1, bias=False)
-        self.bn5 = nn.BatchNorm2d(mid_channels)
-
-        self.shuffle = ShuffleBlock()
-
-    def forward(self, x):
-        # left
-        out1 = self.bn1(self.conv1(x))
-        out1 = F.relu(self.bn2(self.conv2(out1)))
-        # right
-        out2 = F.relu(self.bn3(self.conv3(x)))
-        out2 = self.bn4(self.conv4(out2))
-        out2 = F.relu(self.bn5(self.conv5(out2)))
-        # concat
-        out = torch.cat([out1, out2], 1)
-        out = self.shuffle(out)
-        return out
-
-
-class ShuffleNetV2(nn.Module):
-    def __init__(self, net_size):
-        super(ShuffleNetV2, self).__init__()
-        out_channels = configs[net_size]['out_channels']
-        num_blocks = configs[net_size]['num_blocks']
-
-        self.conv1 = nn.Conv2d(3, 24, kernel_size=3,
-                               stride=1, padding=1, bias=False)
-        self.bn1 = nn.BatchNorm2d(24)
-        self.in_channels = 24
-        self.layer1 = self._make_layer(out_channels[0], num_blocks[0])
-        self.layer2 = self._make_layer(out_channels[1], num_blocks[1])
-        self.layer3 = self._make_layer(out_channels[2], num_blocks[2])
-        self.conv2 = nn.Conv2d(out_channels[2], out_channels[3],
-                               kernel_size=1, stride=1, padding=0, bias=False)
-        self.bn2 = nn.BatchNorm2d(out_channels[3])
-        self.linear = nn.Linear(out_channels[3], 10)
-
-    def _make_layer(self, out_channels, num_blocks):
-        layers = [DownBlock(self.in_channels, out_channels)]
-        for i in range(num_blocks):
-            layers.append(BasicBlock(out_channels))
-            self.in_channels = out_channels
-        return nn.Sequential(*layers)
-
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        # out = F.max_pool2d(out, 3, stride=2, padding=1)
-        out = self.layer1(out)
-        out = self.layer2(out)
-        out = self.layer3(out)
-        out = F.relu(self.bn2(self.conv2(out)))
-        out = F.avg_pool2d(out, 4)
-        out = out.view(out.size(0), -1)
-        out = self.linear(out)
-        return out
-
-
-configs = {
-    0.5: {
-        'out_channels': (48, 96, 192, 1024),
-        'num_blocks': (3, 7, 3)
-    },
-
-    1: {
-        'out_channels': (116, 232, 464, 1024),
-        'num_blocks': (3, 7, 3)
-    },
-    1.5: {
-        'out_channels': (176, 352, 704, 1024),
-        'num_blocks': (3, 7, 3)
-    },
-    2: {
-        'out_channels': (224, 488, 976, 2048),
-        'num_blocks': (3, 7, 3)
-    }
-}
-
-
-def test():
-    net = ShuffleNetV2(net_size=0.5)
-    x = torch.randn(3, 3, 32, 32)
-    y = net(x)
-    print(y.shape)
-
-
-# test()
diff --git a/hpvm/projects/pred_tuner/models/torch/vgg.py b/hpvm/projects/pred_tuner/models/torch/vgg.py
deleted file mode 100644
index 2650d2f4859bedcef0de53a60c58c36b706148af..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/models/torch/vgg.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""VGG11/13/16/19 in Pytorch."""
-import torch.nn as nn
-from models.hpvm import HPVMConvBundle
-
-
-cfg = {
-    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
-    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
-    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
-    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
-}
-
-
-class VGG(nn.Module):
-    def __init__(self, vgg_name):
-        super(VGG, self).__init__()
-        self.features = self._make_layers(cfg[vgg_name])
-        self.classifier = nn.Linear(512, 10)
-
-    def forward(self, x):
-        out = self.features(x)
-        out = out.view(out.size(0), -1)
-        out = self.classifier(out)
-        return out
-
-    @staticmethod
-    def _make_layers(config):
-        layers = []
-        in_channels = 3
-        for x in config:
-            if x == 'M':
-                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
-            else:
-                layers += [HPVMConvBundle(in_channels, x, kernel_size=3, padding=1),
-                           nn.BatchNorm2d(x),
-                           nn.ReLU(inplace=True)]
-                in_channels = x
-        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
-        return nn.Sequential(*layers)
diff --git a/hpvm/projects/pred_tuner/run_tuner.py b/hpvm/projects/pred_tuner/run_tuner.py
deleted file mode 100644
index 5470763ae01b73b51702c413bd18254f4c5b0d2f..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/run_tuner.py
+++ /dev/null
@@ -1,305 +0,0 @@
-#!/usr/bin/env python
-#
-# Development-time Tuner with Algorithmic Approximations:
-# Approximations: Perforation, Sampling with varying knobs for rate, skip offset
-import copy
-import logging
-import os
-import shutil
-import time
-from pathlib import Path
-from typing import List, Tuple
-
-import numpy as np
-import opentuner
-from opentuner import ConfigurationManipulator, EnumParameter, MeasurementInterface
-from opentuner.measurement.inputmanager import FixedInputManager
-from opentuner.search.objective import ThresholdAccuracyMinimizeTime
-from opentuner.tuningrunmain import TuningRunMain
-from torch.nn import Module
-from tqdm import tqdm
-
-from exp import Benchmark, ConfigMeasurer, ExpState, TuningTime, batch_id, bench_tuner_data, is_dev_time
-from models import get_all_output, networks, QoS
-from toolkit import ConfigT
-from toolkit.estimators import WeightedLinearQoSEstimator
-from utils import Config, config, reapply_last_config
-
-msg_logger = logging.getLogger(__name__)
-use_proxy = False
-n_promise_valid_runs = 30
-confidence_level = 0.95
-
-
-def init_proxy(ni: ConfigMeasurer, pickle_path: Path):
-    def acc_crit(inputs_):
-        return ni.get_qos(inputs_, ni.val_loader)
-
-    def threshold_eval(inputs_):
-        accs = np.array([acc_crit(x) for x in inputs_])
-        return ni.val_qos - accs.mean() < 3.0
-
-    def run_model(net: Module):
-        return get_all_output(net, ni.val_loader)
-
-    return WeightedLinearQoSEstimator(
-        ni.nas, run_model, acc_crit, threshold_eval, confidence_level, storage=pickle_path
-    )
-
-
-class Timer:
-    def __init__(self, timer_state: TuningTime, timer_name: str):
-        self.timer_state = timer_state
-        self.name = timer_name
-        self.start = None
-
-    def __enter__(self):
-        self.start = time.time()
-        return self
-
-    def __exit__(self, *args):
-        end = time.time()
-        interval = end - self.start
-        self.timer_state.add_timer(self.name, interval)
-
-
-class TunerDriver:
-    def __init__(self, bench: Benchmark):
-        self.bench = bench
-        msg_logger.info(f"Tuning for model {self.bench.model_name}")
-        # Initialize folder.
-        self._init_folder(bench)
-        # Take a snapshot of current code.
-        self.take_code_snapshot()
-        # Initialize network information and qos thresholds
-        self.net_info = ConfigMeasurer.init_from_bench(self.bench)
-        qoses = self.net_info.val_qos, self.net_info.test_qos
-        qos_type = self.net_info.val_qos.__class__
-        self.tuner_thres = qos_type.suggested_tuner_thresholds(self.net_info.val_qos)
-        self.val_thres = qos_type.suggested_val_threshold(self.net_info.val_qos)
-        self.test_thres = qos_type.suggested_test_threshold(self.net_info.test_qos)
-        # Tuner states.
-        self.states = ExpState(bench, qos_type, qoses)
-        # Current # of iteration. `ProxyTuner` will use this.
-        self.run_id, self.iter = 0, 0
-        # Initialize proxy.
-        if use_proxy:
-            self.proxy = init_proxy(self.net_info, self.bench.result_dir / 'proxy.pkl')
-        else:
-            self.proxy = None
-
-    @staticmethod
-    def _init_folder(bench: Benchmark):
-        def remove_file_or_folder(path: Path):
-            if path.is_dir():
-                shutil.rmtree(path)
-            elif path.is_file():
-                path.unlink()  # Removes file despite the surprising name
-
-        pickle_path = bench.result_dir / 'proxy.pkl'
-        # Remove everything in result folder except pickle file
-        if bench.result_dir.is_dir():
-            msg_logger.warning(f"!Cleaning existing result dir = {bench.result_dir}")
-            for child in bench.result_dir.glob('*'):
-                if child == pickle_path:
-                    continue
-                msg_logger.info(f"  !Removing {child}")
-                remove_file_or_folder(child)
-        # Create result folder if it doesn't exist
-        if not bench.result_dir.is_dir():
-            msg_logger.info(f"Creating output directory = {bench.result_dir}")
-            os.makedirs(bench.result_dir)
-
-    def get_default_args(self):
-        args = opentuner.default_argparser().parse_args()
-        args.database = f"opentuner.db/{batch_id}.db"
-        args.test_limit = self.bench.autotuner_runs
-        parent = Path(args.database).parent
-        if not parent.is_dir():
-            os.makedirs(parent, exist_ok=True)
-        return args
-
-    def tuner_exec(self):
-        # Get default opentuner args
-        args = self.get_default_args()
-        # Start tuning for each threshold
-        for i, thres in enumerate(self.tuner_thres):
-            with Timer(self.states.timers, f"tuning_{i}"):
-                msg_logger.info(
-                    f"Tuning goal: qos >= {thres}; keeping configs with qos >= {self.val_thres}"
-                )
-                tuner = ProxyTuner(args, self, thres, self.val_thres)
-                # TuningRunMain.__init__ initializes its own logger, so we'll reapply our settings.
-                tuning_main = TuningRunMain(tuner, args)
-                reapply_last_config()
-                # Unleash the tuner!
-                tuning_main.main()
-                # Remove tuner progress bar
-                tuner.pbar.close()
-                self.run_id += 1
-                self.iter = 0
-        # Postprocess configs
-        self.process_configs()
-
-    def calibrate_write_configs(self, configs: List[Config], is_test_set: bool):
-        write_to = self.states.tested_configs if is_test_set else self.states.validated_configs
-        gold_acc = self.net_info.test_qos if is_test_set else self.net_info.val_qos
-        for cfg in tqdm(configs, leave=False):
-            cfg = copy.deepcopy(cfg)
-            cfg: Config
-            flags = {k: v for k, v in enumerate(cfg.flags)}
-            measured_acc, confidence = self.net_info.actual_measure(
-                flags, cfg.total_runs, is_test_set, threshold=self.val_thres
-            )
-            prev_acc = cfg.avg_qos
-            cfg.update_acc(measured_acc, confidence, gold_acc)
-            new_acc = cfg.avg_qos
-            msg_logger.debug(f"{prev_acc} (mean) -> {new_acc} (mean)")
-            write_to.append(cfg)
-        write_to.finalize_dump()
-
-    @staticmethod
-    def filter_configs(
-            validation: List[Config], test: List[Config],
-            vali_threshold: QoS, test_threshold: QoS
-    ) -> Tuple[List[Config], List[Config]]:
-        # Filter validation and test set by their respective thresholds
-        filtered_validation = [
-            c for c in validation if c.avg_loss <= vali_threshold
-        ]
-        filtered_test = [
-            c for c in test if c.avg_loss <= test_threshold
-        ]
-        # Test configs also need to be a subset of validation configs.
-        name_to_filtered = {x.fname: x for x in filtered_test}
-        intersect_names = set(list(name_to_filtered.keys())).intersection(
-            set((x.fname for x in filtered_validation))
-        )
-        filtered_test_ = [name_to_filtered[fname] for fname in intersect_names]
-        return filtered_validation, filtered_test_
-
-    def process_configs(self):
-        # Finalize all configs because tuning is done.
-        # (this may not do anything now but will in the future)
-        self.states.all_configs.finalize_dump()
-        all_configs = self.states.all_configs.configs
-        # Pre-filter configs by a wide Pareto margin
-        filtered_configs = config.is_pareto_efficient(all_configs, ratio=0.05, n_min=50, n_max=50)
-        msg_logger.info(f"Prefilter yields {len(filtered_configs)} configs from {len(all_configs)}")
-        self.states.filtered_configs.finalize_dump(with_configs=filtered_configs)
-        # Calibrate prefiltered configs (validation step)
-        with Timer(self.states.timers, "validate"):
-            self.calibrate_write_configs(filtered_configs, is_test_set=False)
-            validated_configs = self.states.validated_configs.configs
-        # Calibrate prefiltered configs on test set (test step)
-        with Timer(self.states.timers, "test"):
-            self.calibrate_write_configs(filtered_configs, is_test_set=True)
-            tested_configs = self.states.tested_configs.configs
-        # Filter valid and test set configs by thresholds
-        valid_configs, test_configs = self.filter_configs(
-            validated_configs, tested_configs, self.val_thres, self.test_thres
-        )
-        self.states.valid_configs.finalize_dump(valid_configs)
-        self.states.test_configs.finalize_dump(test_configs)
-        # Finalize data input and plot everything.
-        self.states.finalize_plot()
-
-    def take_code_snapshot(self):
-        import git
-        msg_logger.info(f"Taking git snapshot")
-        ref_dir = self.bench.result_dir / "references"
-        os.mkdir(ref_dir)
-        # Write current git commit (SHA id)
-        repo = git.Repo(search_parent_directories=True)
-        sha = repo.head.object.hexsha
-        msg_logger.info(f"Current code is at commit {sha}")
-        with (ref_dir / 'git_commit.txt').open('w') as f:
-            f.write(sha)
-        # Also put all outstanding code changes in a diff file.
-        # This way changes in all git-tracked files are captured.
-        t = repo.head.commit.tree
-        with (ref_dir / 'diff.txt').open('w') as f:
-            f.write(repo.git.diff(t))
-
-    def make_config_name(self) -> str:
-        return f"{self.bench.model_name}_{self.run_id}_{self.iter}"
-
-    def get_accuracy(self, cfg: ConfigT) -> Tuple[QoS, QoS, int]:
-        has_promise_flags = set(cfg.values()).intersection(set(range(1, 7 + 1)))
-        config_validation_runs = n_promise_valid_runs if has_promise_flags else 1
-        if use_proxy:
-            mean_acc, confidence_acc = self.net_info.proxy_estimate(cfg, self.proxy)
-            assert has_promise_flags or (mean_acc == confidence_acc)
-        else:
-            mean_acc, _ = self.net_info.actual_measure(cfg, 1, is_test_set=False)
-            confidence_acc = mean_acc
-        return mean_acc, confidence_acc, config_validation_runs
-
-
-class ProxyTuner(MeasurementInterface):
-    def __init__(self, args, driver: TunerDriver, tuner_thres: QoS, accept_thres: QoS):
-        self.tuner_driver = driver
-        self.model_info = driver.net_info
-        self.bench = driver.bench
-        self.tuner_thres = tuner_thres
-        self.all_configs = driver.states.all_configs
-        self.pbar = tqdm(total=args.test_limit, leave=False)
-        objective = ThresholdAccuracyMinimizeTime(tuner_thres.to_scalar())
-        input_manager = FixedInputManager(size=driver.bench.get_n_layers())
-        super(ProxyTuner, self).__init__(
-            args, program_name=self.bench.model_name,
-            input_manager=input_manager, objective=objective
-        )
-        self.accept_thres = accept_thres
-
-    def manipulator(self) -> ConfigurationManipulator:
-        """Define the search space by creating a ConfigurationManipulator."""
-        manipulator = ConfigurationManipulator()
-        for ext_layer_id, knobs in self.model_info.get_knobs().items():
-            manipulator.add_parameter(EnumParameter(ext_layer_id, knobs))
-        return manipulator
-
-    def seed_configurations(self):
-        """Provide baseline config as seed if model uses seed."""
-        return [self.bench.get_baseline_config(not is_dev_time)] if self.bench.use_seed else []
-
-    def run(self, desired_result, input_, limit):
-        """Run a given configuration then return performance and accuracy."""
-        cfg: ConfigT = desired_result.configuration.data
-        # get_accuracy gives estimates of the mean accuracy and the 95%-confidence accuracy
-        mean_acc, confident_acc, n_runs = self.tuner_driver.get_accuracy(cfg)
-        # compute_config_cost returns the cost associated with the selected configuration
-        total_comps, speedup = self.bench.compute_config_cost(cfg)
-        Result = opentuner.resultsdb.models.Result()
-        Result.time = total_comps
-        # Convert QoS to scalar, because opentuner does not support custom comparable datatype
-        Result.accuracy = confident_acc.to_scalar(relative_to=self.tuner_thres)
-
-        # If accuracy is acceptable, write this config
-        if confident_acc > self.accept_thres:
-            config_name = self.tuner_driver.make_config_name()
-            cfg_values = [cfg[layer] for layer in sorted(cfg.keys())]
-            writing_config = Config(
-                mean_acc, self.model_info.val_qos, config_name, cfg_values,
-                n_runs, 95.0, total_comps, speedup
-            )
-            self.all_configs.append(writing_config)
-            msg_logger.debug(
-                f"Config chosen with accuracy (mean) = {mean_acc}, (95%) = {confident_acc} "
-                f"and speedup = {speedup}"
-            )
-        self.tuner_driver.iter += 1
-        self.pbar.update()
-        return Result
-
-    def save_final_config(self, configuration):
-        """Print final configuration."""
-        msg_logger.info(f"Final configuration {configuration.data}")
-        msg_logger.info("Done with Autotuning run")
-
-
-if __name__ == '__main__':
-    assert set(networks.keys()).issubset(set(bench_tuner_data.keys()))
-    for network in ('alexnet2_hpvm',):
-        bench_: Benchmark = bench_tuner_data[network]
-        TunerDriver(bench_).tuner_exec()
diff --git a/hpvm/projects/pred_tuner/tests/data/1_1_output.json b/hpvm/projects/pred_tuner/tests/data/1_1_output.json
deleted file mode 100644
index 3892ae9622a1af68e92b11408372e3d88278ed6a..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/tests/data/1_1_output.json
+++ /dev/null
@@ -1,98 +0,0 @@
-{
-  "('0', '0', '1', '1', '2', '0')": {
-    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvSampSim": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
-    "ConvApprox": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
-    "ConvApproxHalf2": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,"
-  },
-  "('0', '0', '1', '1', '2', '1')": {
-    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvSampSim": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,",
-    "ConvApprox": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,",
-    "ConvApproxHalf2": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,"
-  },
-  "('0', '0', '1', '1', '3', '0')": {
-    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvApprox": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvApproxHalf2": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,"
-  },
-  "('0', '0', '1', '1', '3', '1')": {
-    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvApprox": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvApproxHalf2": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,"
-  },
-  "('0', '0', '1', '1', '4', '0')": {
-    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvSampSim": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
-    "ConvApprox": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,",
-    "ConvApproxHalf2": "31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,"
-  },
-  "('0', '0', '1', '1', '4', '1')": {
-    "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,",
-    "ConvSampSim": "37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,",
-    "ConvApprox": "37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,",
-    "ConvApproxHalf2": "37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,"
-  },
-  "('1', '1', '1', '1', '2', '0')": {
-    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
-  },
-  "('1', '1', '1', '1', '2', '1')": {
-    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,40.000000,40.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
-  },
-  "('1', '1', '1', '1', '3', '0')": {
-    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
-  },
-  "('1', '1', '1', '1', '3', '1')": {
-    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,"
-  },
-  "('1', '1', '1', '1', '4', '0')": {
-    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,32.000000,32.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,"
-  },
-  "('1', '1', '1', '1', '4', '1')": {
-    "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "FP16_Baseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApprox": "0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,",
-    "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,"
-  }
-}
diff --git a/hpvm/projects/pred_tuner/tests/data/3_3_output.json b/hpvm/projects/pred_tuner/tests/data/3_3_output.json
deleted file mode 100644
index 2ccb23c01c7faff1e1c296f5d5bb667633327687..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/tests/data/3_3_output.json
+++ /dev/null
@@ -1,146 +0,0 @@
-{
-  "('0', '0', '1', '1', '2', '0')": {
-    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
-    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "ConvSampSim": "26.000000,26.000000,26.000000,26.000000,",
-    "ConvApprox": "26.000000,26.000000,26.000000,26.000000,",
-    "ConvApproxHalf2": "26.000000,26.000000,26.000000,26.000000,"
-  },
-  "('0', '0', '1', '1', '2', '1')": {
-    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
-    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "ConvSampSim": "56.000000,56.000000,56.000000,56.000000,",
-    "ConvApprox": "56.000000,56.000000,56.000000,56.000000,",
-    "ConvApproxHalf2": "56.000000,56.000000,56.000000,56.000000,"
-  },
-  "('0', '0', '1', '1', '3', '0')": {
-    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
-    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "ConvSampSim": "39.000000,39.000000,39.000000,39.000000,",
-    "ConvApprox": "39.000000,39.000000,39.000000,39.000000,",
-    "ConvApproxHalf2": "39.000000,39.000000,39.000000,39.000000,"
-  },
-  "('0', '0', '1', '1', '3', '1')": {
-    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
-    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "ConvSampSim": "42.000000,42.000000,42.000000,42.000000,",
-    "ConvApprox": "42.000000,42.000000,42.000000,42.000000,",
-    "ConvApproxHalf2": "42.000000,42.000000,42.000000,42.000000,"
-  },
-  "('0', '0', '1', '1', '4', '0')": {
-    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
-    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,",
-    "ConvApprox": "36.000000,36.000000,36.000000,36.000000,",
-    "ConvApproxHalf2": "35.968750,35.968750,35.968750,35.968750,"
-  },
-  "('0', '0', '1', '1', '4', '1')": {
-    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
-    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
-    "ConvSampSim": "45.333336,45.333336,45.333336,45.333336,",
-    "ConvApprox": "45.333336,45.333336,45.333336,45.333336,",
-    "ConvApproxHalf2": "45.312500,45.312500,45.312500,45.312500,"
-  },
-  "('1', '1', '1', '1', '2', '0')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvSampSim": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,",
-    "ConvApprox": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,",
-    "ConvApproxHalf2": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,"
-  },
-  "('1', '1', '1', '1', '2', '1')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvSampSim": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,",
-    "ConvApprox": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,",
-    "ConvApproxHalf2": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,"
-  },
-  "('1', '1', '1', '1', '3', '0')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvApprox": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,"
-  },
-  "('1', '1', '1', '1', '3', '1')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvApprox": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,"
-  },
-  "('1', '1', '1', '1', '4', '0')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvSampSim": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,",
-    "ConvApprox": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,",
-    "ConvApproxHalf2": "16.000000,22.671875,22.671875,13.328125,25.328125,35.968750,35.968750,22.656250,25.328125,35.968750,35.968750,22.656250,18.671875,25.328125,25.328125,16.000000,"
-  },
-  "('1', '1', '1', '1', '4', '1')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
-    "ConvSampSim": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,",
-    "ConvApprox": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,",
-    "ConvApproxHalf2": "18.656250,29.343750,29.343750,20.000000,29.328125,45.312500,45.312500,29.343750,29.328125,45.312500,45.312500,29.343750,20.000000,29.328125,29.328125,18.656250,"
-  },
-  "('1', '1', '2', '2', '2', '0')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "ConvSampSim": "12.000000,18.000000,18.000000,26.000000,",
-    "ConvApprox": "12.000000,18.000000,18.000000,26.000000,",
-    "ConvApproxHalf2": "12.000000,18.000000,18.000000,26.000000,"
-  },
-  "('1', '1', '2', '2', '2', '1')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "ConvSampSim": "24.000000,36.000000,36.000000,56.000000,",
-    "ConvApprox": "24.000000,36.000000,36.000000,56.000000,",
-    "ConvApproxHalf2": "24.000000,36.000000,36.000000,56.000000,"
-  },
-  "('1', '1', '2', '2', '3', '0')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "ConvSampSim": "18.000000,27.000000,25.500000,39.000000,",
-    "ConvApprox": "18.000000,27.000000,25.500000,39.000000,",
-    "ConvApproxHalf2": "18.000000,27.000000,25.500000,39.000000,"
-  },
-  "('1', '1', '2', '2', '3', '1')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "ConvSampSim": "18.000000,27.000000,28.500000,42.000000,",
-    "ConvApprox": "18.000000,27.000000,28.500000,42.000000,",
-    "ConvApproxHalf2": "18.000000,27.000000,28.500000,42.000000,"
-  },
-  "('1', '1', '2', '2', '4', '0')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "ConvSampSim": "16.000000,22.666666,25.333334,36.000000,",
-    "ConvApprox": "16.000000,22.666666,25.333334,36.000000,",
-    "ConvApproxHalf2": "16.000000,22.671875,25.328125,35.968750,"
-  },
-  "('1', '1', '2', '2', '4', '1')": {
-    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
-    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
-    "ConvSampSim": "18.666668,29.333332,29.333332,45.333336,",
-    "ConvApprox": "18.666668,29.333332,29.333332,45.333336,",
-    "ConvApproxHalf2": "18.656250,29.343750,29.328125,45.312500,"
-  }
-}
\ No newline at end of file
diff --git a/hpvm/projects/pred_tuner/tests/data/promise.json b/hpvm/projects/pred_tuner/tests/data/promise.json
deleted file mode 100644
index 331ff8527a17a4ff26965e7252cc49a4c409375a..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/tests/data/promise.json
+++ /dev/null
@@ -1,121 +0,0 @@
-{
-  "1": [
-    [
-      -0.980938,
-      -1.976522,
-      -2.999873,
-      -4.095768,
-      -5.115182,
-      0.0,
-      5.075658,
-      3.972848,
-      2.912783,
-      2.051733,
-      1.004169,
-      1.002379
-    ],
-    45.213196
-  ],
-  "2": [
-    [
-      -1.017428,
-      -2.01491,
-      -2.951011,
-      -4.042611,
-      -4.954911,
-      0.0,
-      5.05412,
-      3.951638,
-      2.94989,
-      1.99723,
-      1.001167,
-      0.98796
-    ],
-    12.535809
-  ],
-  "3": [
-    [
-      -1.003108,
-      -2.006269,
-      -3.00263,
-      -3.97216,
-      -4.969401,
-      0.0,
-      5.012199,
-      4.028375,
-      2.950729,
-      2.004691,
-      1.004823,
-      0.991805
-    ],
-    4.886813
-  ],
-  "4": [
-    [
-      -1.006497,
-      -1.975768,
-      -3.031142,
-      -4.02248,
-      -5.061712,
-      0.0,
-      5.017349,
-      3.992676,
-      2.998843,
-      2.002693,
-      0.997514,
-      1.00649
-    ],
-    3.129643
-  ],
-  "5": [
-    [
-      -1.001629,
-      -1.976943,
-      -2.982565,
-      -3.964559,
-      -4.99636,
-      0.0,
-      4.992359,
-      3.984341,
-      2.990126,
-      2.005831,
-      1.000539,
-      1.003548
-    ],
-    2.181237
-  ],
-  "6": [
-    [
-      -1.003159,
-      -1.985892,
-      -3.005964,
-      -4.008651,
-      -4.992874,
-      0.0,
-      4.996098,
-      4.012099,
-      3.001986,
-      2.001431,
-      0.996138,
-      0.997394
-    ],
-    1.362949
-  ],
-  "7": [
-    [
-      -1.003133,
-      -1.99733,
-      -3.00755,
-      -4.007799,
-      -5.003314,
-      0.0,
-      5.000926,
-      3.993208,
-      2.988745,
-      2.00329,
-      0.99986,
-      0.995669
-    ],
-    0.6926
-  ]
-}
\ No newline at end of file
diff --git a/hpvm/projects/pred_tuner/tests/data/quantization.json b/hpvm/projects/pred_tuner/tests/data/quantization.json
deleted file mode 100644
index 723eaa2b55bc067689beae34829d27d478a0c727..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/tests/data/quantization.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "(-4, 6)": [
-    -0.132812,
-    -4.0,
-    0.179688,
-    -0.40625,
-    1.664062,
-    -2.90625,
-    0.6875,
-    0.960938,
-    6.0,
-    6.0,
-    2.484375,
-    2.992188
-  ],
-  "(-2, 2)": [
-    -0.109375,
-    -2.0,
-    0.1875,
-    -0.40625,
-    1.6875,
-    -2.0,
-    0.6875,
-    0.984375,
-    2.0,
-    2.0,
-    2.0,
-    2.0
-  ],
-  "(-25, 8)": [
-    -0.121094,
-    -25.0,
-    0.136719,
-    -0.507812,
-    1.683594,
-    -2.957031,
-    0.652344,
-    0.910156,
-    6.96875,
-    7.097656,
-    2.457031,
-    2.972656
-  ],
-  "(-10, 10)": [
-    -0.15625,
-    -10.0,
-    0.15625,
-    -0.46875,
-    1.640625,
-    -2.96875,
-    0.625,
-    0.9375,
-    6.953125,
-    7.1875,
-    2.5,
-    2.96875
-  ]
-}
\ No newline at end of file
diff --git a/hpvm/projects/pred_tuner/tests/promise.py b/hpvm/projects/pred_tuner/tests/promise.py
deleted file mode 100644
index 59506d94251bfac4909b2236dc9480eb17b9ed70..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/tests/promise.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import json
-from pathlib import Path
-
-import torch
-
-from toolkit import ModuleIndexer, NetApproxSelector
-from toolkit.approxdnn import PromiseSim, quantize_256
-from utils import compute_accuracy, init_by_name, run_concat_output
-
-eps = 1e-5
-delta = 0.05  # Allow for some variance in promise testing
-
-
-def gt_eps(tensor: torch.Tensor) -> bool:
-    return torch.any(tensor.abs() > eps).item()
-
-
-def compare_quant(groundtruth: dict):
-    input_tensor = torch.tensor([-0.1, -25, 0.2, -0.4, 1.7, -2.9, 0.7, 0.99, 7, 7.2, 2.5, 3])
-    for k, v in groundtruth.items():
-        from ast import literal_eval as make_tuple
-        gt = torch.tensor(v)
-        ours = quantize_256(input_tensor, *make_tuple(k))
-        if gt_eps(gt - ours):
-            print(
-                f"Quantization results differ by more than eps = {eps};\n"
-                f"parameters = {k}\ngroundtruth = {gt}\nours = {ours}"
-            )
-            raise RuntimeError
-
-
-def compare_promise(groundtruth: dict):
-    input_tensor = torch.tensor([-1, -2, -3, -4, -5, 0, 5, 4, 3, 2, 1, 1], dtype=torch.float)
-    N = 1000
-    for k, (gt_avg, gt_error) in groundtruth.items():
-        gt_avg = torch.tensor(gt_avg)
-        sum_, our_error = torch.zeros_like(input_tensor, dtype=torch.float), 0
-        for _ in range(N):
-            out = PromiseSim.add_promise_noise(input_tensor, int(k))
-            sum_ += out
-            our_error += torch.sum((out - input_tensor) ** 2).item()
-        our_avg = sum_ / N
-        our_error = our_error / N
-        print(gt_avg, our_avg)
-        if abs(our_error - gt_error) > delta * max(our_error, gt_error):
-            print(
-                f"Promise results differ by more than delta = {delta * 100:.1f}%;\n"
-                f"swing = {k}, groundtruth error = {gt_error}\nours = {our_error}"
-            )
-            raise RuntimeError
-
-
-def is_in_range(mean1: float, std1: float, mean2: float) -> bool:
-    return mean1 - 3.0 * std1 < mean2 < mean1 + 3.0 * std1
-
-
-def compare_accuracy():
-    baseline, testloader, _, shapes = init_by_name('lenet_hpvm')
-    baseline_dag = ModuleIndexer(baseline)
-    nas = NetApproxSelector(baseline_dag, dev_time_only=False)
-    # {0: 1} -> 98.4808 0.1195
-    approx1 = nas.apply_approx_by_config({3: 1})
-    acc1 = compute_accuracy(run_concat_output(approx1.module, testloader), testloader)
-    assert is_in_range(0.984808, 0.001195, acc1)
-    # {0: 2} -> 99.5933 0.0519
-    approx2 = nas.apply_approx_by_config({3: 2})
-    acc2 = compute_accuracy(run_concat_output(approx2.module, testloader), testloader)
-    assert is_in_range(0.995933, 0.000519, acc2)
-    # {0: 3} -> 99.6723 0.0347
-    approx3 = nas.apply_approx_by_config({3: 3})
-    acc3 = compute_accuracy(run_concat_output(approx3.module, testloader), testloader)
-    assert is_in_range(0.996723, 0.000347, acc3)
-    print("Accuracy test passed.")
-
-
-def main():
-    data_folder = Path(__file__).parent / 'data'
-    with open(data_folder / 'quantization.json') as f:
-        compare_quant(json.load(f))
-    with open(data_folder / 'promise.json') as f:
-        compare_promise(json.load(f))
-    compare_accuracy()
-    print("Tests passed.")
-
-
-if __name__ == '__main__':
-    main()
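As a sanity check on the statistics `compare_promise` verifies, the expected error can be derived in closed form: the PROMISE noise model is `out = x + n * x` with `n ~ N(0, scale)`, so the expected sum of squared errors is `scale^2 * sum(x^2)`. A minimal illustrative sketch (the swing level and its `scale` value come from `PromiseSim.scaling_values` in `toolkit/approxdnn.py` below; everything else is made up for demonstration):

```python
import torch

# Illustrative sketch: PROMISE noise is out = x + n * x with n ~ N(0, scale),
# so the expected sum of squared errors per run is scale**2 * sum(x**2).
scale = 0.14  # PromiseSim.scaling_values[5], i.e. one particular swing level
x = torch.tensor([-1., -2., -3., -4., -5., 0., 5., 4., 3., 2., 1., 1.])
expected_err = scale ** 2 * torch.sum(x ** 2)             # analytic expectation
noise = torch.normal(0.0, scale, size=(1000, x.numel()))  # 1000 simulated runs
empirical_err = torch.mean(torch.sum((noise * x) ** 2, dim=1))
print(expected_err.item(), empirical_err.item())  # agree to within a few percent
```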
diff --git a/hpvm/projects/pred_tuner/tests/resnet50.py b/hpvm/projects/pred_tuner/tests/resnet50.py
deleted file mode 100644
index 71711fbfd099d47ba047471ddde3423b297d0f56..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/tests/resnet50.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from toolkit import ModuleIndexer, NetApproxSelector
-from utils import compute_accuracy, init_by_name, run_concat_output
-
-
-def float_eq(f1, f2):
-    return abs(f1 - f2) < 1e-5
-
-
-def main():
-    baseline, testloader, _, shapes = init_by_name('resnet50_imagenet_hpvm')
-    baseline_dag = ModuleIndexer(baseline)
-    nas = NetApproxSelector(baseline_dag)
-    # baseline
-    baseline_output = run_concat_output(baseline_dag.module, testloader)
-    baseline_acc = compute_accuracy(baseline_output, testloader)
-    assert float_eq(baseline_acc, 0.773)
-    # {13: 242} -> 75.5
-    approx1 = nas.apply_approx_by_config({82: 242})
-    acc1 = compute_accuracy(run_concat_output(approx1.module, testloader), testloader)
-    assert float_eq(acc1, 0.755)
-    # {13: 242, 17: 247} -> 74.6
-    approx2 = nas.apply_approx_by_config({82: 242, 108: 247})
-    acc2 = compute_accuracy(run_concat_output(approx2.module, testloader), testloader)
-    assert float_eq(acc2, 0.746)
-    # {9: 237, 13: 242, 17: 247} -> 74.1
-    approx3 = nas.apply_approx_by_config({55: 237, 82: 242, 108: 247})
-    acc3 = compute_accuracy(run_concat_output(approx3.module, testloader), testloader)
-    assert float_eq(acc3, 0.741)
-    print("Accuracy test passed.")
-
-
-if __name__ == '__main__':
-    main()
diff --git a/hpvm/projects/pred_tuner/tests/sampling.py b/hpvm/projects/pred_tuner/tests/sampling.py
deleted file mode 100644
index 707506ef7b8312fda02ca646bd04d034c3eff6ea..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/tests/sampling.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import json
-from copy import deepcopy
-from pathlib import Path
-from typing import Tuple
-
-import torch
-
-from models.hpvm import HPVMConvBundle
-from toolkit import Conv2dSampling, Conv2dSamplingFP16, FP16Approx
-
-eps = 1e-5, 0.05
-
-
-def sampling_3_3_consts() -> Tuple[torch.Tensor, torch.Tensor]:
-    input_tensor = torch.ones(1, 3, 4, 4)
-    # Filter has value [2, 1, 2, 1, 2, 1...]
-    filter_tensor = torch.ones(1, 3, 3, 3)
-    filter_tensor.view(-1)[::2] = 2
-    return input_tensor, filter_tensor
-
-
-def sampling_1_1_consts() -> Tuple[torch.Tensor, torch.Tensor]:
-    input_tensor = torch.ones(1, 9, 2, 2) * 2
-    filter_tensor = torch.ones(4, 9, 1, 1) * 2
-    return input_tensor, filter_tensor
-
-
-def parse_tensor_str(string: str) -> torch.Tensor:
-    # String has an extra ',' at the end, so skipping an empty string after split
-    entries = [float(s) for s in string.split(',')[:-1]]
-    return torch.tensor(entries).cuda()
-
-
-def compare_to_groundtruth(groundtruth: dict, const_func):
-    input_tensor, filter_tensor = const_func()
-    input_tensor = input_tensor.cuda()
-    o_ch, i_ch, h, w = filter_tensor.size()
-    assert h == w
-    for k, v in groundtruth.items():
-        def compare(groundtruth_t: torch.Tensor, ours_t: torch.Tensor, is_fp16: bool):
-            diff = groundtruth_t - ours_t
-            eps_ = eps[1] if is_fp16 else eps[0]
-            is_diff = torch.any(diff.abs() > eps_).item()
-            if is_diff:
-                print(
-                    f"Results differ by more than eps = {eps};\n"
-                    f"parameters = {k}\n"
-                    f"groundtruth = {groundtruth_t}\n"
-                    f"ours = {ours_t}"
-                )
-                raise RuntimeError
-
-        from ast import literal_eval as make_tuple
-        pad_h, pad_w, stride_h, stride_w, skip_every, offset = [int(s) for s in make_tuple(k)]
-        conv_layer = HPVMConvBundle(
-            i_ch, o_ch, h, stride=(stride_h, stride_w), padding=(pad_h, pad_w)
-        )
-        conv_layer.weight.data = filter_tensor
-        conv_layer.bias.data = torch.zeros_like(conv_layer.bias.data)
-        conv_layer = conv_layer.cuda()
-        our_baseline = conv_layer(input_tensor).flatten()
-        fp16 = FP16Approx(deepcopy(conv_layer))
-        our_fp16 = fp16(input_tensor).flatten()
-        sampling = Conv2dSampling(skip_every, offset, 1.0, deepcopy(conv_layer))
-        our_sampled = sampling(input_tensor).flatten()
-        sampling_fp16 = Conv2dSamplingFP16(skip_every, offset, 1.0, deepcopy(conv_layer))
-        our_sampled_fp16 = sampling_fp16(input_tensor).float().flatten()
-        groundtruth_baseline = parse_tensor_str(v['Baseline'])
-        compare(groundtruth_baseline, our_baseline, False)
-        groundtruth_sampled1 = parse_tensor_str(v['ConvApprox'])
-        compare(groundtruth_sampled1, our_sampled, False)
-        groundtruth_sampled2 = parse_tensor_str(v['ConvSampSim'])
-        compare(groundtruth_sampled2, our_sampled, False)
-        groundtruth_baseline_fp16 = parse_tensor_str(v['FP16_Baseline'])
-        compare(groundtruth_baseline_fp16, our_fp16, True)
-        groundtruth_sampled_fp16 = parse_tensor_str(v['ConvApproxHalf2'])
-        compare(groundtruth_sampled_fp16, our_sampled_fp16, True)
-
-
-def main():
-    data_folder = Path(__file__).parent / 'data'
-    with open(data_folder / '1_1_output.json') as f:
-        compare_to_groundtruth(json.load(f), sampling_1_1_consts)
-    with open(data_folder / '3_3_output.json') as f:
-        compare_to_groundtruth(json.load(f), sampling_3_3_consts)
-    print("Tests passed.")
-
-
-if __name__ == '__main__':
-    main()
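The sampling scheme these tests exercise can be reproduced in isolation: zero out every `skip_every`-th kernel weight, then scale the remaining weights up to compensate. A minimal sketch of the generic (non-3x3) path of `Conv2dSampling.sample_conv_weight` from `toolkit/approxdnn.py`; the tiny weight tensor is hypothetical:

```python
import torch

# Minimal sketch of the generic sampling path: zero every skip_every-th weight
# per output channel, then multiply the rest by 1 + 1/(skip_every - 1).
w = torch.ones(1, 2, 1, 3)                 # (out_ch, in_ch, h, w) = 6 weights
skip_every, skip_offset, interp_rate = 2, 0, 1.0
flat = w.reshape(1, -1).clone()
flat[:, torch.arange(skip_offset, flat.shape[1], skip_every)] = 0  # drop half
flat *= 1 + (1 / (skip_every - 1) * interp_rate)                   # compensate by 2x
print(flat.reshape_as(w))
# channel 0 -> [[0., 2., 0.]], channel 1 -> [[2., 0., 2.]]
```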
diff --git a/hpvm/projects/pred_tuner/toolkit/__init__.py b/hpvm/projects/pred_tuner/toolkit/__init__.py
deleted file mode 100644
index 892b8c154269c99b7446c70182886b2ee92fc499..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/toolkit/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .approxdnn import Approximation, AvailableApproximations, Conv2dSampling, FP16Approx, \
-    PerforateConv2dStride, PromiseSim
-from .estimators import LinearCombEstimator, LinearEstimator, LinearQoSEstimator, WeightedLinearCombEstimator
-from .indexing import ModuleIndexer
-from .transform import ConfigT, NetApproxSelector, StateCapturer
diff --git a/hpvm/projects/pred_tuner/toolkit/approxdnn.py b/hpvm/projects/pred_tuner/toolkit/approxdnn.py
deleted file mode 100644
index 06abca85d521326749902e0058b8a88e3571a611..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/toolkit/approxdnn.py
+++ /dev/null
@@ -1,442 +0,0 @@
-"""All approximation techniques for torch.nn layers."""
-import abc
-from typing import Dict, Iterable, List, Optional, Type
-
-import torch
-from torch.nn import Linear, Module
-
-from models.hpvm import HPVMConvBundle
-from utils import get_tensorrt_dir
-
-
-def interpolate_first_dim(tensor: torch.Tensor, interp_indices: Iterable[int]):
-    def tensor_at(idx_: int):
-        if idx_ in interp_indices:
-            raise IndexError
-        if idx_ < 0 or idx_ >= tensor.size()[0]:
-            return torch.zeros_like(tensor[0])
-        return tensor[idx_]
-
-    for idx in interp_indices:
-        if idx < 0 or idx >= tensor.size()[0]:
-            raise IndexError
-        elif idx == 0:  # First row
-            tensor[idx] = tensor_at(1)
-        elif idx == tensor.size()[0] - 1:  # Last row
-            tensor[idx] = tensor_at(idx - 1)
-        else:  # Middle rows
-            tensor[idx] = (tensor_at(idx - 1) + tensor_at(idx + 1)) / 2.0
-    return tensor
-
-
-class Approximation(abc.ABC):
-    @property
-    @abc.abstractmethod
-    def deterministic(self) -> bool:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def devtime(self) -> bool:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def fp32(self) -> bool:
-        pass
-
-    @abc.abstractmethod
-    def apply(self, module: Module) -> Module:
-        pass
-
-    @abc.abstractmethod
-    def is_less_approx(self, other: 'Approximation') -> Optional[bool]:
-        pass
-
-    def __repr__(self):
-        return f"{self.__class__}({self.__dict__})"
-
-
-class PerforateConv2dStride(Approximation):
-    r"""Simulation of strided perforated convolution for `torch.nn.Conv2d`.
-
-        Perforated convolution skips computing some entries in the output and instead interpolates
-        these values, to reduce the number of float-ops needed to complete a convolution op.
-        In this implementation, selected rows or columns of the output are discarded and replaced
-        with linearly interpolated values from the neighboring rows or columns. Each channel is
-        considered independently.
-        This implementation gives the same output as actual perforated convolution but without the
-        performance benefit.
-
-        Parameters
-        ----------
-        direction_is_row : bool
-            If True, discard and interpolate rows, otherwise columns.
-        stride : int :math:`\in [2, +\infty)`
-            Skip 1 row/column of the convolution output per `stride` elements.
-        offset : int :math:`\in [0, stride)`
-            The first skipped row/column is at index `offset`.
-
-        Attributes
-        ----------
-        interp_axis : int :math:`\in \{2, 3\}`
-            The axis that will be perforated over. As the input is an NCHW tensor, if
-            `direction_is_row` then `interp_axis = 2`, otherwise `interp_axis = 3`.
-        stride : int :math:`\in [2, +\infty)`
-            Equal to parameter `stride`.
-        offset : int :math:`\in [0, stride)`
-            Equal to parameter `offset`.
-        """
-
-    def __init__(self, direction_is_row: bool, stride: int, offset: int, use_fp16: bool):
-        assert stride >= 2
-        assert 0 <= offset < stride
-        self.interp_axis = 2 if direction_is_row else 3
-        self.stride = stride
-        self.offset = offset
-        self.fp16 = use_fp16
-
-    @property
-    def deterministic(self) -> bool:
-        return True
-
-    @property
-    def devtime(self) -> bool:
-        return not self.fp16
-
-    @property
-    def fp32(self) -> bool:
-        return not self.fp16
-
-    def is_less_approx(self, other: Approximation) -> Optional[bool]:
-        return None
-
-    class PerforateConv2dStrideModule(Module):
-        def __init__(self, conv: HPVMConvBundle, approx: 'PerforateConv2dStride'):
-            super().__init__()
-            self.conv = conv
-            self.approx = approx
-            if self.approx.fp16:
-                self.conv = self.conv.half()
-
-        def forward(self, x: torch.Tensor):
-            if self.approx.fp16:
-                x = x.half()
-            x = self.conv.input_to_conv(x)
-            assert x.dim() == 4
-            # Put self.approx.interp_axis to first axis temporarily
-            x = x.transpose(0, self.approx.interp_axis)
-            interp_indices = torch.tensor(range(self.approx.offset, x.size(0), self.approx.stride))
-            x = interpolate_first_dim(x, interp_indices)
-            # Putting axes back
-            x = x.transpose(0, self.approx.interp_axis)
-            x = self.conv.conv_to_output(x)
-            if self.approx.fp16:
-                assert x.dtype == torch.float16
-            return x.float()
-
-    def apply(self, module: HPVMConvBundle) -> PerforateConv2dStrideModule:
-        return self.PerforateConv2dStrideModule(module, self)
-
-
-class Conv2dSampling(Approximation):
-    r"""Simulation of sampled convolution for `torch.nn.Conv2d`.
-
-    Skips some elements of the convolution kernel in a uniform, strided manner,
-    to reduce the amount of float-ops needed to compute each output entry.
-    This implementation gives the same output as actual sampled convolution but without the
-    performance benefit.
-
-    Parameters
-    ----------
-    skip_every: int
-        Skip 1 element in the convolution kernel per `skip_every` elements.
-    skip_offset : int :math:`\in [0, +\infty)`
-        Index of first element to be skipped.
-        For example, if `skip_every = 3` and `skip_offset = 1`, then indices skipped
-        will be [1, 4, 7, ...]
-    interp_rate : float
-        The weight will be compensated ("interpolated") with a ratio after skipping elements,
-        which is naturally equal to :math:`1 + 1 / (skip_every - 1)`.
-        `interp_rate` modifies this rate to :math:`1 + (1 / (skip_every - 1)) \times interp_rate`.
-    use_fp16 : bool
-        Whether to use fp16 weight/input or not.
-    """
-
-    def __init__(
-            self, skip_every: int, skip_offset: int, interp_rate: float, use_fp16: bool
-    ):
-        assert skip_every >= 2 and skip_offset >= 0
-        self.skip_every = skip_every
-        self.skip_offset = skip_offset
-        self.interp_rate = interp_rate
-        self.fp16 = use_fp16
-
-    @property
-    def deterministic(self) -> bool:
-        return True
-
-    @property
-    def devtime(self) -> bool:
-        return not self.fp16
-
-    @property
-    def fp32(self) -> bool:
-        return not self.fp16
-
-    def is_less_approx(self, other: Approximation) -> Optional[bool]:
-        return None
-
-    @staticmethod
-    def sample_conv_weight(
-            interp_rate: float, skip_every: int, skip_offset: int, weight: torch.Tensor
-    ):
-        r"""Samples (skips & interpolates) convolution kernel according to parameters.
-
-        For a given `weight` tensor of shape `(C1, C2, H, W)`, sample each output channel
-        (on axis 0) independently.
-        Flatten each output channel tensor into 1 dim.
-        In normal cases, set elements at indices ``range(skip_offset, C_2 * H * W, skip_every)``
-        to 0.
-        However, if `skip_every` == `h` == `w` == 3, we may end up skipping the same whole rows for
-        each input channel, which is undesirable.
-        Instead, increment the offset by 1 for each input channel.
-        Finally, multiply the kernel by a compensation ("interpolation") ratio to make up for the dropped elements.
-        """
-        if len(weight.shape) != 4:
-            raise ValueError("Conv2d weight should be 4-dimensional")
-        c1, c2, h, w = weight.shape
-        if skip_every == h == w == 3:
-            # Indices (0..h*w) to skip for each input channel
-            per_chan_skip_indices = [
-                range((i_chan + skip_offset) % skip_every, h * w, skip_every)
-                for i_chan in range(c2)
-            ]
-            # Indices (0..c2*h*w) for each output channel, created by adding i*h*w for ith channel.
-            skip_indices = torch.tensor([
-                x + i * h * w for i, per_chan in enumerate(per_chan_skip_indices)
-                for x in per_chan
-            ])
-        else:
-            # Indices (0..c2*h*w) to skip for each output channel
-            skip_indices = torch.arange(skip_offset, c2 * h * w, skip_every)
-        flat_weight = weight.reshape(c1, -1)
-        flat_weight[:, skip_indices] = 0
-        interp_rate = 1 + (1 / (skip_every - 1) * interp_rate)
-        flat_weight *= interp_rate
-        return flat_weight.reshape_as(weight)
-
-    def apply(self, module: HPVMConvBundle) -> HPVMConvBundle:
-        # Not copying weight tensor leads to memory leak
-        cloned_conv_w = module.weight.clone().detach()
-        module.weight.data = self.sample_conv_weight(
-            self.interp_rate, self.skip_every, self.skip_offset, cloned_conv_w
-        )
-        return module
-
-
-def quantize_256(tensor: torch.Tensor, range_min: float, range_max: float) -> torch.Tensor:
-    """Quantize a tensor so that only 256 unique float value exists."""
-    quantize_range = 256
-    input_range = range_max - range_min
-    mul = input_range / quantize_range
-    # Map tensor into [0, 256] range.
-    affined = (tensor - range_min) / mul
-    # Convert tensor to int and back to float so it will have
-    # 256 (actually 257, following the HPVM implementation) unique float values in [0, 256].
-    # Then affine it back to the original range.
-    quanted = torch.floor(affined).to(torch.int).to(torch.float)
-    quanted_float = quanted * mul + range_min
-    # Clip tensor
-    return torch.clamp(quanted_float, range_min, range_max)
-
-
-class PromiseSim(Approximation):
-    scaling_values = [0.75, 0.64, 0.336, 0.21, 0.168, 0.14, 0.11, 0.0784, 0.005]
-
-    def __init__(self, noise_level: int):
-        super().__init__()
-        self.noise_level = noise_level
-
-    @property
-    def deterministic(self) -> bool:
-        return False
-
-    @property
-    def devtime(self) -> bool:
-        return False
-
-    @property
-    def fp32(self) -> bool:
-        return False
-
-    def is_less_approx(self, other: Approximation) -> Optional[bool]:
-        if isinstance(other, PromiseSim):
-            return self.noise_level > other.noise_level
-        return None
-
-    def add_promise_noise(self, tensor: torch.Tensor):
-        scale = self.scaling_values[self.noise_level]
-        noise = torch.normal(
-            mean=0.0, std=scale, size=tensor.size(), device=tensor.device
-        )
-        return noise * tensor + tensor
-
-    class PromiseSimModule(Module):
-        def __init__(self, module: HPVMConvBundle, approx: 'PromiseSim'):
-            super().__init__()
-            self.input_r, weight_r, bias_r, self.output_r = module.conv_ranges
-            module.weight.data = quantize_256(module.weight, *weight_r)
-            if module.bias is not None:
-                module.bias.data = quantize_256(module.bias, *bias_r)
-            self.module = module
-            self.approx = approx
-
-        def forward(self, input_: torch.Tensor) -> torch.Tensor:
-            # Quantize input, weight, bias (see __init__), and add noise to input.
-            input_ = quantize_256(input_, *self.input_r)
-            input_ = self.approx.add_promise_noise(input_)
-            output = self.module(input_)
-            # Then again, quantize output.
-            return quantize_256(output, *self.output_r)
-
-    def apply(self, module: HPVMConvBundle) -> PromiseSimModule:
-        return self.PromiseSimModule(module, self)
-
-
-class FP16Approx(Approximation):
-    def __init__(self):
-        super().__init__()
-
-    @property
-    def deterministic(self) -> bool:
-        return True
-
-    @property
-    def devtime(self) -> bool:
-        return False
-
-    @property
-    def fp32(self) -> bool:
-        return False
-
-    def is_less_approx(self, other: Approximation) -> Optional[bool]:
-        return None
-
-    class FP16ApproxModule(Module):
-        def __init__(self, module: Module):
-            super().__init__()
-            self.module = module.half()
-
-        def forward(self, x: torch.Tensor) -> torch.Tensor:
-            x: torch.Tensor = self.module(x.half())
-            assert x.dtype == torch.float16
-            return x.float()
-
-    def apply(self, module: Module) -> FP16ApproxModule:
-        return self.FP16ApproxModule(module)
-
-
-AllApproxesT = Dict[int, Approximation]
-TypeApproxesT = Dict[Type[Module], List[int]]
-
-
-class AvailableApproximations:
-    r"""Holds a list of all available "approximation info": approximation + properties.
-
-        For properties see `Approximation`.
-
-        Parameters
-        ----------
-        all_knobs: Dict[int, Approximation]
-            A dict from integer index to the corresponding `Approximation` instance.
-            Also see class function `from_global_knobs_file`.
-
-        Attributes
-        ----------
-        all_knobs : Dict[int, Approximation]
-            A mapping from approximation index to the corresponding `Approximation` instance.
-        type_to_knobs : Dict[Type[Module], List[int]]
-            A mapping from network layer type (subtype of `torch.nn.Module`) to a list of indexes of
-            applicable approximations. Values of `type_to_knobs` are always valid keys in `all_knobs`.
-        """
-
-    def __init__(self, all_knobs: Dict[int, Approximation], type_to_knobs: TypeApproxesT):
-        self.all_knobs = all_knobs
-        self.type_to_knobs = type_to_knobs
-
-    @classmethod
-    def from_global_knobs_file(cls) -> 'AvailableApproximations':
-        """Read and parse global_knobs.txt to provide all knobs supported and their indexes.
-
-        Returns two things:
-        * Dict of indexes to (approximations, is_dev_time). Approximation is in the form of functions
-        with a layer input; see `ModuleReplacerT`.
-        * Dict of type of torch.nn.Module to a list of approximation indexes that can be applied to this
-        type of layer.
-        """
-        with (get_tensorrt_dir() / 'autotuner/data/global_knobs.txt').open() as f:
-            lines = f.readlines()
-        all_knobs = {}
-        promise_and_fp16 = []
-        for line in lines:
-            desc, knobs, _, _, _, _, _ = line.rstrip().split()
-            category, index = desc.split(',')
-            index = int(index)
-            if category in ('perf', 'perf_fp16'):
-                row, col, offset = [int(s) for s in knobs.split(',')]
-                if row > 1 and col > 1:
-                    raise ValueError("Perforation on both row and column is not supported")
-                if col == 1:
-                    direction_is_row, stride = True, row
-                else:
-                    direction_is_row, stride = False, col
-                all_knobs[index] = PerforateConv2dStride(
-                    direction_is_row, stride, offset, 'fp16' in category
-                )
-            elif category in ('samp', 'samp_fp16'):
-                stride, offset, interp_rate = knobs.split(',')
-                stride, offset, interp_rate = int(stride), int(offset), float(interp_rate)
-                all_knobs[index] = Conv2dSampling(
-                    stride, offset, interp_rate, 'fp16' in category
-                )
-            elif category == 'swing_level':
-                all_knobs[index] = PromiseSim(index)
-                promise_and_fp16.append(index)
-            elif category == 'fp16':
-                all_knobs[index] = FP16Approx()
-                promise_and_fp16.append(index)
-        type_to_knobs = {
-            HPVMConvBundle: list(all_knobs.keys()),
-            Linear: promise_and_fp16
-        }
-        return cls(all_knobs, type_to_knobs)
-
-    def items(self, dev_time: bool, ignore_fp32: bool) -> Dict[Type[Module], List[int]]:
-        """Give a list of applicable approximations for each layer type.
-
-        If dev_time is True, returns only devtime approximations, otherwise all approximations.
-        """
-
-        def remove_non_dev(type_to_knobs: TypeApproxesT) -> TypeApproxesT:
-            return {
-                k: [v for v in vs if self.all_knobs[v].devtime]
-                for k, vs in type_to_knobs.items()
-            }
-
-        def remove_fp32(type_to_knobs: TypeApproxesT) -> TypeApproxesT:
-            return {
-                k: [v for v in vs if not self.all_knobs[v].fp32]
-                for k, vs in type_to_knobs.items()
-            }
-
-        type_to_knobs_ = self.type_to_knobs
-        if dev_time:
-            type_to_knobs_ = remove_non_dev(type_to_knobs_)
-        if ignore_fp32:
-            type_to_knobs_ = remove_fp32(type_to_knobs_)
-        return type_to_knobs_
-
-    def __getitem__(self, item: int) -> Approximation:
-        """Returns the approximation info for given approximation index."""
-        return self.all_knobs[item]
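The 256-level quantization above is easy to check by hand. The sketch below reimplements the same mapping (affine into [0, 256], floor, map back, clamp) and reproduces the `(-10, 10)` groundtruth values from the `quantization.json` fragment earlier in this diff:

```python
import torch

def quantize_256(tensor: torch.Tensor, range_min: float, range_max: float) -> torch.Tensor:
    # Same mapping as toolkit.approxdnn.quantize_256: affine into [0, 256],
    # floor onto an integer grid, map back, then clamp to the original range.
    mul = (range_max - range_min) / 256
    quanted = torch.floor((tensor - range_min) / mul)
    return torch.clamp(quanted * mul + range_min, range_min, range_max)

x = torch.tensor([-0.1, 0.2, 1.7])
print(quantize_256(x, -10.0, 10.0))
# Step size is 20 / 256 = 0.078125, so -0.1 -> -0.15625, 0.2 -> 0.15625,
# 1.7 -> 1.640625 -- matching the "(-10, 10)" entries in the groundtruth JSON.
```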
diff --git a/hpvm/projects/pred_tuner/toolkit/estimators.py b/hpvm/projects/pred_tuner/toolkit/estimators.py
deleted file mode 100644
index acd35331693c706df336a6e3a33d1c6098a6cb50..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/toolkit/estimators.py
+++ /dev/null
@@ -1,383 +0,0 @@
-import abc
-import gc
-import logging
-import pickle
-from math import sqrt
-from pathlib import Path
-from typing import Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar
-
-import numpy as np
-import torch
-from torch.nn import Module
-from tqdm import tqdm, trange
-
-from models.domains import QoS, qos_stats
-from .transform import ConfigT, NetApproxSelector
-
-ProfT = TypeVar('ProfT')
-NetOutputT = TypeVar('NetOutputT')
-QoST = Callable[[NetOutputT], QoS]
-ThresholdEvalT = Callable[[NetOutputT], bool]
-ExeT = Callable[[Module], NetOutputT]
-KeyT = Tuple[int, int]
-KVT = Tuple[KeyT, NetOutputT]
-EstmT = Tuple[QoS, QoS]
-
-msg_logger = logging.getLogger(__name__)
-
-
-class LinearEstimator(abc.ABC):
-    """Estimate QoS of a config by linearly adding "something" from each approximation of config, and
-    then applying QoS metric.
-
-    That "something" could be QoS itself (see `LinearQoSEstimator`), or the direct tensor output from
-    the model (see `LinearCombEstimator`).
-    In initialization phase, run the model for each 1-approximation config and store the quantity to
-    be linearly summed in a table.
-
-    Parameters
-    ----------
-    nas: NetApproxSelector
-        `NetApproxSelector` instance is used to select all 1-approximation configs and evaluate them.
-    qos: Callable[[torch.Tensor], float]
-        Quality of Service measure (such as accuracy). Takes model output tensor and returns QoS value.
-    independent_init: bool
-        If False, don't initialize self.profile_table, and wait for `coinit_estimators` to fill in
-        the profile. `coinit_estimators` must be manually called if `independent_init` is False.
-
-    Attributes
-    ----------
-    qos : Callable[[torch.Tensor], float]
-        Same as parameter `qos`.
-    baseline_profile : T
-        Profile value of the baseline model.
-    profile_table : Dict[KeyT, T]
-        A mapping from (`layer_idx`, `approx_idx`) to the profile value, with only this approximation
-        applied (in other words, with configuration ``{layer_idx: approx_idx}`` applied).
-    """
-
-    n_nondeterm_runs = 10
-
-    def __init__(
-            self, nas: NetApproxSelector, executor: ExeT, qos: QoST,
-            threshold_eval: ThresholdEvalT, confidence_level: float,
-            independent_init: bool = True, storage: Path = None
-    ):
-        self.nas = nas
-        self.qos = qos
-        self.executor = executor
-        self.storage = storage
-        self.baseline_profile: ProfT = self.get_baseline_profile()
-        self.profile_table: Dict[KeyT, ProfT] = {}
-        self.confidence_level = confidence_level
-        if independent_init:
-            for (k, i), output in self._get_all_outputs(nas, self.executor, threshold_eval, storage):
-                self.profile_table[k, i] = self.handle_output(output)
-
-    @staticmethod
-    def _load_from_pickle(storage: Path) -> Iterator[KVT]:
-        if not storage.is_file():
-            return
-        msg_logger.info(f"Found pickle at {storage}")
-        with storage.open('rb') as f:
-            while True:
-                try:
-                    key, tensor = pickle.load(f)
-                    yield key, tensor
-                except EOFError:
-                    return
-
-    @classmethod
-    def run_model(cls, nas: NetApproxSelector, config: ConfigT, executor: ExeT) -> torch.Tensor:
-        is_deterministic = nas.is_deterministic(config)
-        model = nas.apply_approx_by_config(config).module
-        if is_deterministic:
-            ret = executor(model).unsqueeze(0).cpu()
-        else:
-            assert cls.n_nondeterm_runs > 0
-            ret = torch.stack([
-                executor(model)
-                for _ in trange(cls.n_nondeterm_runs, leave=False)
-            ]).cpu()
-        gc.collect()
-        return ret
-
-    @classmethod
-    def _get_all_outputs(
-            cls, nas: NetApproxSelector, executor: ExeT,
-            threshold_eval: ThresholdEvalT, storage: Path = None
-    ) -> Iterator[KVT]:
-        preloaded_acceptable = {}
-        if storage is not None:
-            bar = tqdm(cls._load_from_pickle(storage))
-            for key, tensor in bar:
-                bar.set_postfix(key=key)
-                preloaded_acceptable[key] = threshold_eval(tensor)
-                yield key, tensor
-
-        def evaluate(k: int, i: int) -> Tuple[bool, Optional[KVT]]:
-            if (k, i) in preloaded_acceptable:
-                msg_logger.debug(f"Key {(k, i)} is preloaded.")
-                return preloaded_acceptable[(k, i)], None
-            outputs = cls.run_model(nas, {k: i}, executor)
-            if storage is not None:
-                with storage.open('ab') as f:
-                    pickle.dump(((k, i), outputs), f)
-            return threshold_eval(outputs), ((k, i), outputs)
-
-        for key_outputs in nas.filter_approxes(evaluate):
-            # key_outputs is None means corresponding key has been preloaded (we can't see the key)
-            if key_outputs is None:
-                continue
-            yield key_outputs
-
-    @classmethod
-    def coinit_estimators(
-            cls, nas: NetApproxSelector, executor: ExeT, threshold_eval: ThresholdEvalT,
-            *estm_insts: 'LinearEstimator', storage: Path = None
-    ):
-        for (k, i), output in cls._get_all_outputs(nas, executor, threshold_eval, storage):
-            for inst in estm_insts:
-                inst.profile_table[(k, i)] = inst.handle_output(output)
-
-    @abc.abstractmethod
-    def get_baseline_profile(self) -> ProfT:
-        pass
-
-    @abc.abstractmethod
-    def handle_output(self, outputs: torch.Tensor) -> ProfT:
-        pass
-
-    @abc.abstractmethod
-    def estimate(self, config: ConfigT) -> EstmT:
-        pass
-
-
-class LinearQoSEstimator(LinearEstimator):
-    """Estimate QoS of a config by linearly adding QoS value. See `LinearEstimator`.
-
-    ProfT = Tuple[QoS(mean), QoS(std)]
-    NetOutputT = torch.Tensor
-    """
-
-    def estimate(self, config: ConfigT) -> EstmT:
-        baseline_mean: QoS = self.baseline_profile[0]
-        if not config:
-            return baseline_mean, baseline_mean
-        # N * 2 array
-        profiles = np.array([self.profile_table[kv] for kv in config.items()])
-        profiles[:, 0] -= baseline_mean
-        estm_qos = profiles[:, 0].sum() + baseline_mean
-        estm_std = sqrt(np.sum(profiles[:, 1] ** 2))
-        # We're hardcoding 95% confidence interval here.
-        assert self.confidence_level == 0.95
-        normal_dist_95 = 1.644854
-        r1, r2 = estm_qos, estm_qos - normal_dist_95 * estm_std
-        return float(r1), float(r2)
-
-    def handle_output(self, outputs: torch.Tensor) -> Tuple[QoS, QoS]:
-        qoses = np.array([self.qos(o) for o in outputs])
-        msg_logger.debug(f"Handled {qoses.mean(), qoses.std()}")
-        return qoses.mean(), qoses.std()
-
-    def get_baseline_profile(self) -> Tuple[QoS, QoS]:
-        mean_qos = self.qos(self.run_model(self.nas, {}, self.executor)[0])
-        return mean_qos, mean_qos.null()
-
-
-class LinearCombEstimator(LinearEstimator):
-    """Estimate QoS of a config by linearly adding tensor output from network. See `LinearEstimator`.
-
-    On estimation, sums over the delta in tensor output (compared to baseline output) for each
-    approximation, and then the baseline tensor output is added back.
-    This works as an estimation of tensor output for this configuration, which is then sent to QoS
-    metric to get the final QoS.
-
-    QoST = float
-    ProfT = torch.Tensor (2 * n_inputs * n_classes)
-    NetOutputT = torch.Tensor (n_inputs * n_classes)
-    """
-
-    def estimate(self, config) -> EstmT:
-        if not config:
-            baseline_qos = self.qos(self.baseline_profile)
-            return baseline_qos, baseline_qos
-        # 4D tensor: n_approx * 2 * n_inputs * n_classes
-        profiles = torch.stack([self.profile_table[kv] for kv in config.items()])
-        profiles -= self.baseline_profile
-        mean_tensor, confidence_tensor = profiles.sum(dim=0) + self.baseline_profile
-        estm_mean_qos = self.qos(mean_tensor)
-        estm_confidence_qos = self.qos(confidence_tensor)
-        return estm_mean_qos, estm_confidence_qos
-
-    def handle_output(self, outputs: torch.Tensor) -> torch.Tensor:
-        if len(outputs) == 1:
-            return torch.stack((outputs[0], outputs[0]))
-        qoses = np.array([self.qos(o) for o in outputs])
-        percentile_pos = int(self.n_nondeterm_runs * (1 - self.confidence_level))
-        assert 0 <= percentile_pos < self.n_nondeterm_runs
-        mean_pos = np.searchsorted(qoses, qoses.mean(), 'right')
-        assert 0 <= mean_pos <= self.n_nondeterm_runs
-        if mean_pos == self.n_nondeterm_runs:
-            mean_pos = self.n_nondeterm_runs - 1
-        return torch.stack((outputs[mean_pos], outputs[percentile_pos]))
-
-    def get_baseline_profile(self) -> torch.Tensor:
-        return self.run_model(self.nas, {}, self.executor)[0]
-
-
-class TrainableEstimator(LinearEstimator, abc.ABC):
-    """
-    QoST = float
-    ProfT = ProfT
-    NetOutputT = torch.Tensor (n_inputs * n_classes)
-    """
-    n_train_confs = 50
-    weight_range = 0.8, 1.2, 20
-    n_cold_start = 500
-    accept_threshold = 5
-    penalize_overestm = 1.0
-
-    def __init__(
-            self, nas: NetApproxSelector, executor: ExeT, qos: QoST,
-            threshold_eval: ThresholdEvalT, confidence_level: float,
-            independent_init: bool = True, storage: Path = None
-    ):
-        super().__init__(nas, executor, qos, threshold_eval, confidence_level, independent_init, storage)
-        self.r_cands = np.linspace(*self.weight_range)
-        self.r_error = np.zeros((len(self.r_cands), self.n_train_confs))
-        self.r = self.weight_range[1]
-        self.trained_iters = 0
-        self.cold_start = 0
-
-    def update_r(self):
-        mean_error = np.mean(self.r_error, axis=1)
-        best_idx = np.argmin(mean_error)
-        self.r = self.r_cands[best_idx]
-        if best_idx == len(mean_error) - 1 or best_idx == 0:
-            msg_logger.warning(f"Parameter value r = {self.r} has reached the boundary. Consider a larger range.")
-
-    def get_qos_for_config(self, config: ConfigT) -> EstmT:
-        is_deterministic = self.nas.is_deterministic(config)
-        net = self.nas.apply_approx_by_config(config).module
-        n_runs = 1 if is_deterministic else self.n_nondeterm_runs
-        qoses = [self.qos(self.executor(net)) for _ in trange(n_runs, leave=False)]
-        mean_qos, qos_at_confidence, _ = qos_stats(qoses, confidence=self.confidence_level)
-        return mean_qos, qos_at_confidence
-
-    @abc.abstractmethod
-    def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]:
-        pass
-
-    def estimate(self, config) -> EstmT:
-        estm = self.real_estimate(config)[0]
-        if self.cold_start < self.n_cold_start:
-            self.cold_start += 1
-            if self.cold_start % 50 == 0:
-                msg_logger.info(f"WeightedLinearCombEstimator cold start {self.cold_start} / {self.n_cold_start}")
-            return estm
-        if self.trained_iters >= self.n_train_confs:
-            return estm
-        log_info_freq = 10
-        log_level = logging.INFO if self.trained_iters % log_info_freq == 0 else logging.DEBUG
-        msg_logger.log(
-            log_level,
-            f"{self.__class__} train iter {self.trained_iters} / {self.n_train_confs}"
-        )
-        mean_qos, qos_at_confidence = self.get_qos_for_config(config)
-        estm_conf_qoses = np.array(self.real_estimate(config, rs=self.r_cands))[:, 1]
-        diff_conf_qoses = qos_at_confidence - estm_conf_qoses
-        old_r = self.r
-        self.r_error[:, self.trained_iters] = np.where(
-            diff_conf_qoses > 0, diff_conf_qoses * self.penalize_overestm,
-            -diff_conf_qoses
-        )
-        self.trained_iters += 1
-        self.update_r()
-        msg_logger.debug(
-            f"{self.__class__} real mean qos = {mean_qos}, real conf qos = {qos_at_confidence}, "
-            f"estm conf qos = {estm[1]}, r: {old_r} -> {self.r}"
-        )
-        return mean_qos, qos_at_confidence
-
-
-class WeightedLinearCombEstimator(TrainableEstimator, LinearCombEstimator):
-    """
-    QoST = float
-    ProfT = torch.Tensor
-    NetOutputT = torch.Tensor (n_inputs * n_classes), logged
-    """
-
-    def __init__(
-            self, nas: NetApproxSelector, executor: ExeT, qos: QoST,
-            threshold_eval: ThresholdEvalT, confidence_level: float,
-            independent_init: bool = True, storage: Path = None
-    ):
-        log_qos = lambda x: qos(torch.exp(x))
-        super().__init__(nas, executor, log_qos, threshold_eval, confidence_level, independent_init, storage)
-
-    @staticmethod
-    def tensor_log(tensor: torch.Tensor) -> torch.Tensor:
-        # TODO: don't take log if there's no SoftMax layer.
-        eps = torch.ones_like(tensor) * 1e-10
-        return torch.log(torch.max(tensor, eps))
-
-    def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]:
-        # 3D tensor: 2 * n_inputs * n_classes
-        if config:
-            estm_delta_output = torch.sum(
-                torch.stack([self.profile_table[kv] for kv in config.items()]) - self.baseline_profile,
-                dim=0
-            )
-        else:
-            n_in, n_out = self.baseline_profile.shape
-            estm_delta_output = torch.zeros(2, n_in, n_out)
-        rets = []
-        rs = rs if rs is not None else [self.r]
-        for r in rs:
-            mean_tensor, confidence_tensor = estm_delta_output * r + self.baseline_profile
-            rets.append((self.qos(mean_tensor), self.qos(confidence_tensor)))
-        return rets
-
-    def handle_output(self, outputs: torch.Tensor) -> torch.Tensor:
-        return LinearCombEstimator.handle_output(self, self.tensor_log(outputs))
-
-    def get_baseline_profile(self) -> torch.Tensor:
-        return self.tensor_log(LinearCombEstimator.get_baseline_profile(self))
-
-
-class WeightedLinearQoSEstimator(TrainableEstimator, LinearQoSEstimator):
-    """
-    QoST = float
-    ProfT = torch.Tensor
-    NetOutputT = torch.Tensor (n_inputs * n_classes), logged
-    """
-
-    weight_range = 0.5, 5, 50
-
-    def estimate(self, config) -> EstmT:
-        ret = super().estimate(config)
-        msg_logger.debug(f"Config {config} -> estimation {ret}")
-        return ret
-
-    def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]:
-        baseline_mean_qos = self.baseline_profile[0]
-        if config:
-            # N * 2 array
-            profiles = np.array([self.profile_table[kv] for kv in config.items()])
-            profiles[:, 0] -= baseline_mean_qos
-            profiles[:, 0][profiles[:, 0] > 0] = 0
-            estm_mean_qos_delta = profiles[:, 0].sum()
-            estm_std = sqrt(np.sum(profiles[:, 1] ** 2))
-        else:
-            estm_mean_qos_delta = estm_std = 0.0
-        rets = []
-        rs = rs if rs is not None else [self.r]
-        for r in rs:
-            estm_mean_qos = float(estm_mean_qos_delta * r + baseline_mean_qos)
-            # We're hardcoding 95% confidence interval here.
-            assert self.confidence_level == 0.95
-            normal_dist_95 = 1.644854
-            estm_conf_qos = estm_mean_qos - normal_dist_95 * estm_std
-            rets.append((estm_mean_qos, estm_conf_qos))
-        return rets
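To make the linear composition in `LinearQoSEstimator.estimate` concrete: the estimated mean QoS is the baseline plus the sum of per-knob QoS deltas, and the confidence bound subtracts 1.644854 (the one-sided 95% normal quantile) times the quadrature-combined standard deviation. A small numeric sketch; the profile numbers are made up for illustration:

```python
import numpy as np

# Hypothetical per-knob profiles: QoS (mean, std) measured with only that knob applied.
baseline_mean = 84.5
profile_table = {(3, 242): (83.9, 0.20), (7, 247): (83.1, 0.35)}

def estimate(config, z95=1.644854):
    # Sum per-knob mean deltas on top of the baseline; combine stds in quadrature.
    deltas = [profile_table[kv][0] - baseline_mean for kv in config.items()]
    stds = [profile_table[kv][1] for kv in config.items()]
    mean = baseline_mean + sum(deltas)
    std = np.sqrt(sum(s ** 2 for s in stds))
    return mean, mean - z95 * std   # (estimated mean QoS, one-sided 95% lower bound)

print(estimate({3: 242, 7: 247}))   # -> (82.5, ~81.84)
```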
diff --git a/hpvm/projects/pred_tuner/toolkit/indexing.py b/hpvm/projects/pred_tuner/toolkit/indexing.py
deleted file mode 100644
index 27500c152ac5130f6df787f16f53e84c3099bcf6..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/toolkit/indexing.py
+++ /dev/null
@@ -1,55 +0,0 @@
-from typing import Callable, Iterator, Optional, Set
-
-import torch
-from torch.nn import Module, Sequential
-
-UnaryForwardT = Callable[[torch.Tensor], torch.Tensor]
-ReplacedForwardT = Callable[[Module, UnaryForwardT, torch.Tensor], torch.Tensor]
-
-
-class ModuleIndexer:
-    def __init__(self, module: Module, ignore_module: Callable[[Module], bool]):
-        self.module_to_index = {}
-        for i, submodule in enumerate(module.modules()):
-            if ignore_module(submodule):
-                continue
-            self.module_to_index[submodule] = i
-        self.index_to_module = {i: m for m, i in self.module_to_index.items()}
-        self.module = module
-        self.layer_parents = self.find_layers_parent_info(module, set(self.all_modules))
-
-    @staticmethod
-    def find_layers_parent_info(net: Module, layers: Set[Module]):
-        ret = {}
-        for name, submodule in net.named_children():
-            if submodule in layers:
-                ret[submodule] = net, name
-            ret = {**ret, **ModuleIndexer.find_layers_parent_info(submodule, layers)}
-        return ret
-
-    @property
-    def all_modules(self) -> Iterator[Module]:
-        return iter(self.module_to_index.keys())
-
-    def find(self, module: Module) -> Optional[int]:
-        return self.module_to_index.get(module, None)
-
-    def __getitem__(self, item: int) -> Module:
-        return self.index_to_module[item]
-
-    def __setitem__(self, key: int, value: Module):
-        old = self.index_to_module[key]
-        if value != old:
-            self.index_to_module[key] = value
-            self.module_to_index[value] = self.module_to_index[old]
-            self.module_to_index.pop(old)
-            parent, name = self.layer_parents[old]
-            self.layer_parents[value] = parent, name
-            self.layer_parents.pop(old)
-            parent.__setattr__(name, value)
-
-    def __iter__(self) -> Iterator[Module]:
-        return self.all_modules
-
-    def __len__(self):
-        return len(self.module_to_index)
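`ModuleIndexer.__setitem__` swaps a layer in place by looking up its `(parent, attribute name)` pair and re-assigning the attribute, which re-registers the new module under the same name. The same mechanism in plain PyTorch (the toy network and `NoOpApprox` wrapper are hypothetical):

```python
import torch
from torch import nn

# Hypothetical toy network; nn.Sequential names its children "0", "1", "2".
net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 8, 3))

class NoOpApprox(nn.Module):
    """Stand-in for an approximated replacement layer."""
    def __init__(self, wrapped: nn.Module):
        super().__init__()
        self.wrapped = wrapped

    def forward(self, x):
        return self.wrapped(x)

# Replace child "2" the way ModuleIndexer.__setitem__ does: assign through the
# parent's attribute name, which re-registers the module under the same name.
parent, name = net, "2"
setattr(parent, name, NoOpApprox(net[2]))
print(net)
```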
diff --git a/hpvm/projects/pred_tuner/toolkit/transform.py b/hpvm/projects/pred_tuner/toolkit/transform.py
deleted file mode 100644
index f19554181a9bb9ac10ee9261cd908c2003f18d48..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/toolkit/transform.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import copy
-import logging
-from collections import defaultdict
-from typing import Callable, Dict, Generic, Iterator, List, Tuple, TypeVar
-
-from torch.nn import Module
-
-from .approxdnn import Approximation, AvailableApproximations
-from .indexing import ModuleIndexer
-
-msg_logger = logging.getLogger(__name__)
-
-
-T1 = TypeVar('T1')
-T2 = TypeVar('T2')
-TransformerCT = Callable[[int, T1], T2]
-
-
-class StateCapturer(Module, Generic[T2]):
-    @staticmethod
-    def _id(_, x):
-        return x.clone().cpu().detach()
-
-    def __init__(self, net_index: ModuleIndexer, state_transformer: TransformerCT = None):
-        super().__init__()
-        self.net_state: Dict[int, List[T2]] = defaultdict(list)
-        self.state_transformer = state_transformer or self._id
-        self.net_index = net_index
-        for submodule in net_index.module.modules():
-            submodule.register_forward_hook(self.forward_hook)
-        self._output = None
-
-    @property
-    def module(self):
-        return self.net_index.module
-
-    @property
-    def output(self):
-        if self._output is None:
-            raise RuntimeError("Cannot get output before inference happens")
-        return self._output
-
-    def forward_hook(self, module: Module, _, outputs):
-        module_idx = self.net_index.find(module)
-        if module_idx is None:
-            raise RuntimeError("Cannot find module; module may have changed externally")
-        self.net_state[module_idx].append(self.state_transformer(module_idx, outputs))
-
-    def forward(self, *args, **kwargs):
-        self._output = self.module.forward(*args, **kwargs)
-        return self._output
-
-    def get_output_state(self) -> List[T2]:
-        return self.net_state[self.injected.output_loc()]
-
-
-T = TypeVar('T')
-ConfigT = Dict[int, int]
-EvaluatorT = Callable[[int, int], Tuple[bool, T]]
-
-
-class NetApproxSelector:
-    r"""List all 1-approximation configurations, and apply configurations to a `ModuleDAG` network.
-
-    Computes a list of available approximations for each layer of the network, given info on available
-    approximations in the system (in the form of an `AvailableApproximations` instance).
-    Capable of listing all single-approximation configurations, and apply a given configuration to the network.
-    A configuration is a dict from layer indices to approximation for these layers, one for each.
-    See `ConfigT`.
-
-    Parameters
-    ----------
-    net : Module
-        The network to be approximated.
-    dev_time_only : bool
-        If True, use only devtime approximations; otherwise use all available approximations.
-    aa : AvailableApproximations
-        A container with information of available approximations, and the type of layer each approximation
-        applies to, etc.
-
-    Attributes
-    ----------
-    net : Module
-        The network to be approximated (parameter `net`).
-    net_approxes: Dict[int, List[int]]
-        A list of available approximation indexes per layer index.
-    available_approx: AvailableApproximations
-        Available approximations (parameter `aa`).
-    """
-
-    class ApproximationGraph:
-        """Naive O(n^2) sort for a list of partially-ordered approximations."""
-
-        def __init__(self, approx_indices: List[int], aa: AvailableApproximations):
-            import networkx as nx
-            self.dep_graph = nx.DiGraph()
-            self.dep_graph.add_nodes_from(approx_indices)
-            for i, x in enumerate(approx_indices):
-                for y in approx_indices[i + 1:]:
-                    approx_x, approx_y = aa[x], aa[y]
-                    cmp = approx_x.is_less_approx(approx_y)
-                    if cmp is None:  # Not comparable
-                        continue
-                    if cmp:
-                        self.dep_graph.add_edge(x, y)
-                    else:
-                        self.dep_graph.add_edge(y, x)
-            self.sorted_indices = list(nx.algorithms.topological_sort(self.dep_graph))
-
-        def __len__(self) -> int:
-            return len(self.sorted_indices)
-
-        def __iter__(self) -> Iterator[int]:
-            return iter(self.sorted_indices)
-
-    def __init__(
-            self, net: Module, dev_time_only: bool = True, ignore_fp32: bool = False,
-            aa: AvailableApproximations = None
-    ):
-        self.available_approx = aa or AvailableApproximations.from_global_knobs_file()
-        self.type_approxes = self.available_approx.items(dev_time=dev_time_only, ignore_fp32=ignore_fp32)
-        approximable_types = tuple(self.type_approxes.keys())
-        self.net_index = ModuleIndexer(net, lambda m: not isinstance(m, approximable_types))
-        self.dev_time_only = dev_time_only
-        self.net_approxes: Dict[int, List[int]] = defaultdict(list)
-        for i, layer in self.net_index.index_to_module.items():
-            for t, approxes in self.type_approxes.items():
-                if isinstance(layer, t):
-                    self.net_approxes[i].extend(approxes)
-
-    def apply_approx_by_config(self, config: ConfigT) -> ModuleIndexer:
-        """Applies given `config` to network."""
-        new_dag = copy.deepcopy(self.net_index)
-        for layer_idx, config_idx in config.items():
-            layer = new_dag[layer_idx]
-            new_dag[layer_idx] = self.available_approx[config_idx].apply(layer)
-        return new_dag
-
-    def list_single_approxes(self) -> Iterator[Tuple[int, int, Approximation]]:
-        for k, vs in self.net_approxes.items():
-            for v in vs:
-                yield k, v, self.available_approx[v]
-
-    def filter_approxes(self, evaluator: EvaluatorT) -> Iterator[T]:
-        """Enumerate through and apply each single-approximation configuration."""
-        net_approxes_graph: Dict[int, NetApproxSelector.ApproximationGraph] = {
-            k: self.ApproximationGraph(vs, self.available_approx) for k, vs in self.net_approxes.items()
-        }
-        from tqdm import tqdm
-        from utils import gpu_mem_mb
-        bar1 = tqdm(net_approxes_graph.items(), total=len(net_approxes_graph))
-        for k, graph in bar1:
-            bar1.set_postfix(layer=k)
-            bar2 = tqdm(graph, leave=None)
-            unacceptable_approx = None
-            filtered_layer_approxes = []
-            for approx_id in bar2:
-                approx = self.available_approx[approx_id]
-                if unacceptable_approx is not None:
-                    cmp = unacceptable_approx.is_less_approx(approx)
-                    if cmp:
-                        msg_logger.debug(f"{approx} is worse than unacceptable approx {unacceptable_approx}")
-                        continue
-                    else:
-                        unacceptable_approx = None
-                bar2.set_postfix(approx_id=approx_id, mem=gpu_mem_mb())
-                acceptable, ret_val = evaluator(k, approx_id)
-                if not acceptable:
-                    unacceptable_approx = approx
-                    msg_logger.debug(f"{approx} is unacceptable")
-                    continue
-                filtered_layer_approxes.append(approx_id)
-                yield ret_val
-            self.net_approxes[k] = filtered_layer_approxes
-
-    def get_baseline(self) -> Module:
-        return self.net_index.module
-
-    def get_layer_approxes(self) -> Dict[Module, List[int]]:
-        """Expose available knobs for autotuner usage."""
-        return {
-            self.net_index[layer_k]: approxes
-            for layer_k, approxes in self.net_approxes.items()
-        }
-
-    def is_deterministic(self, config: ConfigT):
-        return all(self.available_approx[knob_id].deterministic for knob_id in config.values())
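`filter_approxes` prunes using the partial order defined by `is_less_approx`: once a knob is found unacceptable for a layer, any strictly more aggressive knob is skipped without being evaluated. A simplified sketch of that pruning for PROMISE swing levels, where a lower level means more noise; the `acceptable` predicate is hypothetical:

```python
def filter_knobs(swing_levels, acceptable):
    # Visit milder knobs first (higher swing level = less noise), and prune any
    # knob strictly harsher than one that has already been rejected.
    kept, worst_rejected = [], None
    for level in sorted(swing_levels, reverse=True):
        if worst_rejected is not None and level < worst_rejected:
            continue                     # more aggressive than a rejected knob
        if acceptable(level):
            kept.append(level)
        else:
            worst_rejected = level
    return kept

# Hypothetical threshold: levels 4 and above keep QoS acceptable.
print(filter_knobs(range(1, 8), lambda level: level >= 4))   # -> [7, 6, 5, 4]
```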
diff --git a/hpvm/projects/pred_tuner/utils/__init__.py b/hpvm/projects/pred_tuner/utils/__init__.py
deleted file mode 100644
index 1f06b4ae222c3a8a56d4ab4516031e4c91dfa0d2..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/utils/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .config import Config
-from .logging import config_pylogger, reapply_last_config
-from .utils import device, get_knob_config_file, get_tensorrt_dir, gpu_mem_mb
diff --git a/hpvm/projects/pred_tuner/utils/benchmarks.json b/hpvm/projects/pred_tuner/utils/benchmarks.json
deleted file mode 100644
index 57184872a07de661c1c9ee4064ec01652e9966ff..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/utils/benchmarks.json
+++ /dev/null
@@ -1,100 +0,0 @@
-{
-  "lenet_hpvm": {
-    "model_name": "lenet_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/lenet_keras/",
-    "layer_file": "autotuner/data/lenet/lenet_layers.txt",
-    "cost_file": "autotuner/data/lenet/op_cost.txt"
-  },
-  "alexnet_hpvm": {
-    "model_name": "alexnet_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/alexnet_cifar10/",
-    "layer_file": "autotuner/data/alexnet/alexnet_layers.txt",
-    "cost_file": "autotuner/data/alexnet/op_cost.txt"
-  },
-  "alexnet2_hpvm": {
-    "model_name": "alexnet2_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/alexnet2_cifar10/",
-    "layer_file": "autotuner/data/alexnet2/alexnet2_layers.txt",
-    "cost_file": "autotuner/data/alexnet2/op_cost.txt"
-  },
-  "vgg16_cifar10_hpvm": {
-    "model_name": "vgg16_cifar10_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/vgg16_cifar10/",
-    "layer_file": "autotuner/data/vgg16_cifar10/vgg16_layers.txt",
-    "cost_file": "autotuner/data/vgg16_cifar10/op_cost.txt"
-  },
-  "vgg16_cifar100_hpvm": {
-    "model_name": "vgg16_cifar100_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/vgg16_cifar100/",
-    "layer_file": "autotuner/data/vgg16_cifar100/vgg16_layers.txt",
-    "cost_file": "autotuner/data/vgg16_cifar100/op_cost.txt"
-  },
-  "vgg16_imagenet_hpvm": {
-    "model_name": "vgg16_imagenet_hpvm",
-    "autotuner_runs": 20000,
-    "base_dir": "tuner_results/vgg16_imagenet/",
-    "layer_file": "autotuner/data/vgg16_imagenet/vgg16_layers.txt",
-    "cost_file": "autotuner/data/vgg16_imagenet/op_cost.txt"
-  },
-  "resnet18_hpvm": {
-    "model_name": "resnet18_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/resnet18_cifar10/",
-    "layer_file": "autotuner/data/resnet/resnet_layers.txt",
-    "cost_file": "autotuner/data/resnet/op_cost.txt"
-  },
-  "resnet50_imagenet_hpvm": {
-    "model_name": "resnet50_imagenet_hpvm",
-    "autotuner_runs": 30000,
-    "base_dir": "tuner_results/resnet50_imagenet/",
-    "layer_file": "autotuner/data/resnet50_imagenet/resnet50_layers.txt",
-    "cost_file": "autotuner/data/resnet50_imagenet/op_cost.txt"
-  },
-  "mobilenet_hpvm": {
-    "model_name": "mobilenet_hpvm",
-    "autotuner_runs": 20000,
-    "base_dir": "tuner_results/mobilenet/",
-    "layer_file": "autotuner/data/mobilenet/mobilenet_layer_comp.txt",
-    "cost_file": "autotuner/data/mobilenet/op_cost.txt"
-  },
-  "__unused_mobilenet_shallow": {
-    "model_name": "mobilenet_shallow_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/mobilenet_shallow/",
-    "layer_file": "autotuner/data/mobilenet_shallow/mobilenet_shallow_layer_comp.txt",
-    "cost_file": "autotuner/data/mobilenet_shallow/op_cost.txt"
-  },
-  "alexnet_imagenet_hpvm": {
-    "model_name": "alexnet_imagenet_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/alexnet_imagenet/",
-    "layer_file": "autotuner/data/alexnet_imagenet/layer_composition.txt",
-    "cost_file": "autotuner/data/alexnet_imagenet/op_cost.txt"
-  },
-  "alexnet2_canny_hpvm": {
-    "model_name": "alexnet2_canny_hpvm",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/alexnet2_canny_hpvm/",
-    "layer_file": "autotuner/data/alexnet2_canny_hpvm/layers.txt",
-    "cost_file": "autotuner/data/alexnet2_canny_hpvm/op_cost.txt"
-  },
-  "resnet18_torch": {
-    "model_name": "resnet18_torch",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/resnet18_cifar10_torch/",
-    "layer_file": "autotuner/data/resnet18_torch/resnet_layers.txt",
-    "cost_file": "autotuner/data/resnet18_torch/op_cost.txt"
-  },
-  "vgg16_torch": {
-    "model_name": "vgg16_torch",
-    "autotuner_runs": 10000,
-    "base_dir": "tuner_results/resnet18_cifar10_torch/",
-    "layer_file": "autotuner/data/resnet/resnet_layers.txt",
-    "cost_file": "autotuner/data/resnet/op_cost.txt"
-  }
-}
\ No newline at end of file
diff --git a/hpvm/projects/pred_tuner/utils/config.py b/hpvm/projects/pred_tuner/utils/config.py
deleted file mode 100644
index fced1a4d462ad9bb4c828f2bbc264bb4b4755081..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/utils/config.py
+++ /dev/null
@@ -1,318 +0,0 @@
-from pathlib import Path
-from typing import Dict, Iterable, List, Union
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-from models.domains import QoS
-from models.domains.qoses import Accuracy, AccuracyPSNR
-from .utils import get_knob_config_file
-
-op_mapping = {
-    "conv": "conv", "depthwise_conv": "group_conv", "dense": "mul", "batchnorm": "batchnorm",
-    "pool": "pool_max", "pool_mean": "pool_mean", "activation": "relu", "tanh": "tanh", "add": "add",
-    "reduce": "red_samp"
-}
-
-approx_map = {}
-PathLike = Union[str, Path]
-
-
-def initializeApproxMap(knobs_file_path):
-    with open(knobs_file_path, "r") as f:
-        for x in f:
-            toks = x.split("\t")
-            approx_type = toks[0].split(",")[0]
-            knob_id = toks[0].split(",")[1]
-            approx_str = approx_type + " " + knob_id
-            approx_map[knob_id] = approx_str
-
-
-initializeApproxMap(get_knob_config_file())
-
-# TODO: fix hardcoding
-fp32_to_fp16 = {
-    **{k: k + 30 for k in range(121, 138 + 1)},
-    **{k: k + 30 for k in range(231, 248 + 1)},
-    11: 12
-}
-fp16_to_fp32 = {v: k for k, v in fp32_to_fp16.items()}
-
-
-class Config:
-    def __init__(
-            self, avg_accuracy: QoS, baseline_accuracy: QoS, fname: str, flags: List[int],
-            total_runs: int, confidence: float, config_cost: float, speedup: float
-    ):
-        self.total_runs = total_runs
-        self.confidence = confidence
-        self.config_cost = config_cost
-        self.speedup = speedup
-        self.avg_qos = avg_accuracy
-        self.baseline_qos = baseline_accuracy
-        self.fname = fname
-        self.flags = flags
-        self.avg_loss = self.avg_loss.min_positive_loss()
-
-    @property
-    def avg_loss(self):
-        return self.baseline_qos - self.avg_qos
-
-    @avg_loss.setter
-    def avg_loss(self, value: QoS):
-        self.avg_qos = self.baseline_qos - value
-
-    def __repr__(self):
-        return repr((self.fname, self.speedup, self.avg_qos, self.avg_loss, self.flags))
-
-    @staticmethod
-    def qos_speedup_points(configs: Iterable['Config']) -> np.ndarray:
-        return np.array([[*conf.avg_qos.numpy(), conf.speedup] for conf in configs])
-
-    def update_acc(self, acc: QoS, confidence: float, baseline_acc: QoS = None):
-        if baseline_acc:
-            self.baseline_qos = baseline_acc
-        self.avg_qos = acc
-        self.avg_loss = self.avg_loss.min_positive_loss()
-        self.confidence = confidence
-
-    def to_fp16(self) -> 'Config':
-        import copy
-        fp16_conf = copy.copy(self)
-        fp16_conf.flags = [fp32_to_fp16.get(x, x) for x in self.flags]
-        return fp16_conf
-
-    def to_fp32(self) -> 'Config':
-        import copy
-        fp32_conf = copy.copy(self)
-        fp32_conf.flags = [fp16_to_fp32.get(x, x) for x in self.flags]
-        return fp32_conf
-
-    def to_rt_format(self, idx: int, bench_layer_composition, hardware_target: str):
-        config_str = build_config_str(self.flags, bench_layer_composition, hardware_target)
-        return (
-            "+++++\n"
-            f"conf{idx} {self.speedup} 0 {self.avg_qos} {self.avg_loss}\n"
-            f"{config_str}"
-            "-----\n"
-        )
-
-    def to_tuner_format(self):
-        topline = (
-            f"total_runs={self.total_runs}\tconfidence={self.confidence}\t"
-            f"avg_accuracy={self.avg_qos}\tconfig_cost={self.config_cost}\tspeedup={self.speedup}"
-        )
-        flags_lines = [str(x) for x in self.flags]
-        return '\n'.join([topline] + flags_lines)
-
-    @classmethod
-    def from_tuner_format(cls, lines: List[str], fname: str, baseline_accuracy: QoS):
-        def parseTopLine(x: str) -> Dict[str, str]:
-            toks = x.split()
-            fields = {}
-            for tok in toks:
-                field, value = tok.split('=')
-                fields[field] = value
-            return fields
-
-        top_line = parseTopLine(lines[0])
-        total_runs = int(top_line['total_runs'])
-        confidence = float(top_line['confidence'])
-        avg_accuracy = baseline_accuracy.parse(top_line['avg_accuracy'])
-        config_cost = float(top_line['config_cost'])
-        speedup = float(top_line['speedup'])
-        flags = [int(line.strip()) for line in lines[1:] if line.strip()]
-        return cls(avg_accuracy, baseline_accuracy, fname, flags, total_runs, confidence, config_cost, speedup)
-
-
-def genScatterPlotFromConfigs(configs, file_path):
-    speedups, accuracy_losses = [c.speedup for c in configs], [c.avg_loss for c in configs]
-    plt.scatter(accuracy_losses, speedups)
-    plt.xlabel("accuracy_loss")
-    plt.ylabel("speedup")
-    plt.xlim(left=-0.05)
-    plt.ylim(bottom=1)
-    plt.savefig(file_path)
-    plt.close()
-
-
-def _find_distance_to(points: np.ndarray, ref_points: np.ndarray) -> np.ndarray:
-    n_ref = len(ref_points)
-    if n_ref == 0:
-        return np.zeros(0)
-    if n_ref == 1:
-        return np.linalg.norm(points - ref_points, axis=1)
-    ref_points = np.array(sorted(ref_points, key=lambda p: p[0]))
-    px = points.T[0]
-    rx = ref_points.T[0]
-    local_unit_vecs = ref_points[1:] - ref_points[:-1]
-    dists = []
-    bins = np.digitize(px, rx) - 1
-    for point, left_ref_p in zip(points, bins):
-        if left_ref_p == -1:
-            left_ref_p = 0
-        to_left_ref = ref_points[left_ref_p] - point
-        local_unit_vec = local_unit_vecs[-1] if left_ref_p >= n_ref - 1 else local_unit_vecs[left_ref_p]
-        projection = np.dot(local_unit_vec, to_left_ref) / np.linalg.norm(local_unit_vec)
-        dist = np.sqrt(np.linalg.norm(to_left_ref) ** 2 - projection ** 2)
-        dists.append(dist)
-    return np.array(dists)
-
-
-def is_pareto_efficient(
-        configs: List[Config], margin: float = None,
-        ratio: float = None, n_min: int = None, n_max: int = None
-) -> List[Config]:
-    configs = np.array(configs)
-    acc_speedup = Config.qos_speedup_points(configs)
-    is_efficient = np.ones(acc_speedup.shape[0], dtype=bool)
-    for idx, c in enumerate(acc_speedup):
-        if is_efficient[idx]:
-            # Keep any point with a higher value
-            is_efficient[is_efficient] = np.any(acc_speedup[is_efficient] > c, axis=1)
-            is_efficient[idx] = True  # And keep self
-    pareto_acc_speedup = acc_speedup[is_efficient]
-    pareto_configs = configs[is_efficient]
-    non_pareto_acc_speedup = acc_speedup[np.logical_not(is_efficient)]
-    non_pareto_configs = configs[np.logical_not(is_efficient)]
-    dist_to_pareto = _find_distance_to(non_pareto_acc_speedup, pareto_acc_speedup)
-    if margin is not None:
-        marginal_accepted = non_pareto_configs[dist_to_pareto < margin]
-    elif ratio is not None:
-        dist_order = np.argsort(dist_to_pareto)
-        take_n = int(len(dist_to_pareto) * ratio)
-        if n_min is not None:
-            take_n = max(take_n, n_min)
-        if n_max is not None:
-            take_n = min(take_n, n_max)
-        take_n -= len(pareto_configs)
-        marginal_accepted = non_pareto_configs[dist_order[:take_n]]
-    else:
-        raise ValueError("Must provide margin or ratio")
-    return pareto_configs.tolist() + marginal_accepted.tolist()
-
-
-def print_layer_info(flag: int, hardware_target: str, layer_comp):
-    approx_tech = approx_map[str(flag)]
-    if flag <= 7:
-        # If is PROMISE
-        return f"promise {approx_tech}"
-    # If is GPU / CPU
-    op0 = op_mapping[layer_comp[0]]
-    config_str = f"{hardware_target} {op0} {approx_tech} "
-    for op in layer_comp[1:]:
-        op_name = op_mapping[op]
-        fp = "fp32" if is_fp32(flag) else "fp16"
-        config_str += f"{op_name} {fp} 1 "
-    return config_str
-
-
-def build_config_str(flags: List[int], layer_desc: List[List[str]], hardware_target: str):
-    lines = []
-    assert len(flags) == len(layer_desc)
-    for index, (flag, layer_comp) in enumerate(zip(flags, layer_desc), start=1):
-        layer_str = print_layer_info(flag, hardware_target, layer_comp)
-        config_str = f"{index} {layer_str}"
-        lines.append(config_str)
-    lines.append(f"{len(layer_desc) + 1} {hardware_target} softmax fp32 1\n")
-    return '\n'.join(lines)
-
-
-def is_fp32(flag: int):
-    return flag in fp32_to_fp16
-
-
-def dump_configs_to_rt(
-        layer_desc, configs: List[Config],
-        config_out_path: PathLike, baseline_acc: QoS, hardware_target: str
-):
-    baseline_flag = 11
-    baseline_config = Config(
-        baseline_acc, baseline_acc, '', [baseline_flag for _ in layer_desc],
-        1, 100.0, 0.0, 1.0
-    )
-    baseline_str = baseline_config.to_rt_format(1, layer_desc, hardware_target)
-    with config_out_path.open("w") as f:
-        f.write(baseline_str)
-        for it, config in enumerate(configs, start=2):
-            f.write(config.to_rt_format(it, layer_desc, hardware_target))
-
-
-# Public Interfaces
-def dump_rt_format_to(
-        layer_desc, configs: List[Config], gold_acc: QoS,
-        rt_cpu_path: PathLike = None, rt_gpu_path: PathLike = None
-):
-    if configs:
-        assert len(set([conf.baseline_qos for conf in configs])) == 1
-    # Sort configs
-    sorted_configs = sorted(configs, key=lambda conf: (conf.avg_loss, conf.speedup, conf.flags))
-    if rt_gpu_path is not None:
-        # Remap to fp16 for gpu.
-        fp16_configs = [conf.to_fp16() for conf in sorted_configs]
-        dump_configs_to_rt(
-            layer_desc, fp16_configs, rt_gpu_path, gold_acc, 'gpu'
-        )
-    if rt_cpu_path is not None:
-        # Remap to fp32 for cpu.
-        fp32_configs = [conf.to_fp32() for conf in sorted_configs]
-        dump_configs_to_rt(
-            layer_desc, fp32_configs, rt_cpu_path, gold_acc, 'cpu'
-        )
-
-
-def plot_configs(file_path: Path, **kw_configs: List[Config]):
-    from mpl_toolkits.mplot3d import Axes3D
-    # Decide 2D or 3D plot:
-    qos_type = None
-    for label, confs in kw_configs.items():
-        if not confs:
-            continue
-        if not qos_type:
-            qos_type = type(confs[0].avg_qos)
-        else:
-            assert qos_type == type(confs[0].avg_qos)
-    if qos_type is None:
-        return
-    if qos_type is AccuracyPSNR:
-        fig: plt.Figure = plt.figure()
-        ax: Axes3D = fig.add_subplot(111, projection='3d')
-        for label, confs in kw_configs.items():
-            data = np.array([
-                [c.avg_loss.qoses[0].to_scalar(), c.avg_qos.qoses[1].to_scalar(), c.speedup]
-                for c in confs]
-            )
-            x, y, z = data.T
-            ax.scatter(x, y, z, label=label)
-        ax.set_xlabel("accuracy_loss")
-        ax.set_ylabel("psnr")
-        ax.set_zlabel("speedup")
-        ax.set_xlim(left=-0.05)
-        ax.set_zlim(bottom=1)
-    elif qos_type is Accuracy:
-        fig, ax = plt.subplots()
-        fig: plt.Figure
-        ax: plt.Axes
-        for label, confs in kw_configs.items():
-            data = np.array([[c.avg_loss.to_scalar(), c.speedup] for c in confs])
-            x, y = data.T
-            ax.scatter(x, y, label=label)
-        ax.set_xlabel("accuracy_loss")
-        ax.set_ylabel("speedup")
-        ax.set_xlim(left=-0.05)
-        ax.set_ylim(bottom=1)
-    else:
-        raise ValueError(f"QoS type {qos_type} unsupported in plotting.")
-    ax.legend()
-    fig.savefig(file_path)
-    plt.close(fig)
-
-
-def load_configs_from_dir(result_dir: PathLike, baseline_accuracy: QoS):
-    config_arr = []
-    for path in Path(result_dir).glob('*'):
-        with path.open() as f:
-            lines = f.readlines()
-        config_arr.append(Config.from_tuner_format(lines, path.name, baseline_accuracy))
-    return config_arr
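
The Pareto-frontier selection and HPVM-RT config dumping that lived in this file now come from the `predtuner` submodule added later in this patch. As a reference for what `is_pareto_efficient` computed, here is a minimal, self-contained sketch of the same 2-D frontier filter (larger QoS and larger speedup are both better); it is an illustration only, not the predtuner API, and it omits the margin/ratio relaxation.

```python
# Minimal sketch of the 2-D Pareto filter the deleted is_pareto_efficient() used.
# Points are rows of (qos, speedup), where larger is better in both dimensions.
import numpy as np

def pareto_mask(points: np.ndarray) -> np.ndarray:
    """Return a boolean mask that is True for points on the Pareto frontier."""
    efficient = np.ones(len(points), dtype=bool)
    for i, p in enumerate(points):
        if not efficient[i]:
            continue
        # Among still-kept points, keep only those better than p in some dimension...
        efficient[efficient] = np.any(points[efficient] > p, axis=1)
        # ...and always keep p itself.
        efficient[i] = True
    return efficient

if __name__ == "__main__":
    pts = np.array([[0.90, 1.0], [0.85, 1.5], [0.80, 1.2], [0.90, 2.0]])
    print(pareto_mask(pts))  # [False False False  True]
```
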
diff --git a/hpvm/projects/pred_tuner/utils/logging.py b/hpvm/projects/pred_tuner/utils/logging.py
deleted file mode 100644
index 6b6904bd2e0a0683ccc6905994f645fa6856ad4d..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/utils/logging.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import logging
-from logging import config
-import os
-from pathlib import Path
-
-import tqdm
-
-
-class TqdmStreamHandler(logging.Handler):
-    """tqdm-friendly logging handler. Uses tqdm.write instead of print for logging."""
-
-    def __init__(self, level=logging.NOTSET):
-        super().__init__(level)
-
-    def emit(self, record):
-        try:
-            msg = self.format(record)
-            tqdm.tqdm.write(msg)
-            self.flush()
-        except (KeyboardInterrupt, SystemExit, RecursionError):
-            raise
-        except:
-            self.handleError(record)
-
-
-_last_applied_config = None
-
-
-def config_pylogger(filename: str = None, output_dir: Path = None, verbose: bool = False) -> logging.Logger:
-    """Configure the Python logger.
-
-    For each execution of the application, we'd like to create a unique log file.
-    By default this file is named using the date and time of day, so that it can be sorted by recency.
-    You can also name your filename or choose the log directory.
-    """
-    import time
-    timestr = time.strftime("%Y.%m.%d-%H%M%S")
-    filename = filename or timestr
-    output_dir = output_dir or Path('.')
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-    file_path = output_dir / filename
-
-    global _last_applied_config
-    _last_applied_config = d = {
-        'version': 1,
-        'disable_existing_loggers': False,
-        'formatters': {
-            'simple': {
-                'format': '%(levelname)s %(name)s: '
-                          '%(message)s'
-            },
-            'detailed': {
-                'format': '[%(asctime)-15s] '
-                          '%(levelname)7s %(name)s: '
-                          '%(message)s '
-                          '@%(filename)s:%(lineno)d'
-            }
-        },
-        'handlers': {
-            'console': {
-                '()': TqdmStreamHandler,
-                'level': 'INFO',
-                'formatter': 'simple'
-            },
-            'file': {
-                'class': 'logging.FileHandler',
-                'filename': file_path.as_posix(),
-                'mode': 'a',  # Because we may apply this config again, want to keep existing content
-                'formatter': 'detailed',
-            },
-        },
-        'root': {
-            'level': 'DEBUG' if verbose else 'INFO',
-            'handlers': ['console', 'file']
-        },
-    }
-    config.dictConfig(d)
-
-    msglogger = logging.getLogger()
-    msglogger.info(f"Log file for this run: {file_path}")
-    return msglogger
-
-
-def reapply_last_config():
-    if _last_applied_config is not None:
-        config.dictConfig(_last_applied_config)
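
The tqdm-aware logging setup above is superseded by `predtuner.config_pylogger` (used in `test_tuning.py` later in this patch). For reference, the sketch below shows the core idea the deleted handler relied on, assuming only that `tqdm` is installed: routing log records through `tqdm.write()` so messages do not corrupt an active progress bar.

```python
# Minimal sketch of the tqdm-friendly handler idea: tqdm.write() clears and
# redraws the progress bar around each log message, unlike a plain print().
import logging
import time

import tqdm


class TqdmHandler(logging.Handler):
    def emit(self, record: logging.LogRecord) -> None:
        tqdm.tqdm.write(self.format(record))


logger = logging.getLogger("demo")
logger.addHandler(TqdmHandler())
logger.setLevel(logging.INFO)

for i in tqdm.trange(3):
    logger.info("step %d", i)  # appears above the bar; the bar stays intact
    time.sleep(0.1)
```
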
diff --git a/hpvm/projects/pred_tuner/utils/utils.py b/hpvm/projects/pred_tuner/utils/utils.py
deleted file mode 100644
index 16165574662ca91320784f827468002fbae21fa8..0000000000000000000000000000000000000000
--- a/hpvm/projects/pred_tuner/utils/utils.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import logging
-import os
-from pathlib import Path
-
-import torch
-
-device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available() else 'cpu'
-n_cpu_threads = 12 if device == 'cuda:0' else 35
-torch.set_num_threads(n_cpu_threads)
-
-msg_logger = logging.getLogger(__name__)
-
-
-def gpu_mem_mb():
-    # noinspection PyTypeChecker
-    return torch.cuda.memory_allocated(device) / 1024 ** 2
-
-
-def get_tensorrt_dir() -> Path:
-    if 'LLVM_SRC_ROOT' not in os.environ:
-        return Path('.')
-    return Path(os.environ['LLVM_SRC_ROOT']) / "projects/hpvm-tensor-rt"
-
-
-def get_knob_config_file() -> Path:
-    return get_tensorrt_dir() / "autotuner/data/global_knobs.txt"
diff --git a/hpvm/projects/predtuner b/hpvm/projects/predtuner
new file mode 160000
index 0000000000000000000000000000000000000000..70ead4a70536ec7af29a99658a9e207b6e16d230
--- /dev/null
+++ b/hpvm/projects/predtuner
@@ -0,0 +1 @@
+Subproject commit 70ead4a70536ec7af29a99658a9e207b6e16d230
diff --git a/hpvm/projects/torch2hpvm/setup.py b/hpvm/projects/torch2hpvm/setup.py
index ae103a2cdf0c0872278c147ddac5774ce79da452..f0cd851e586cf4d35c856ead11915f97c7654901 100644
--- a/hpvm/projects/torch2hpvm/setup.py
+++ b/hpvm/projects/torch2hpvm/setup.py
@@ -7,6 +7,8 @@ setup(
     author="Yuanjing Shi, Yifan Zhao",
     author_email="ys26@illinois.edu, yifanz16@illinois.edu",
     packages=["torch2hpvm"],
-    install_requires=["jinja2>=2.11", "networkx>=2.5", "onnx>=1.8.0", "torch"],
+    install_requires=[
+        "jinja2>=2.11", "networkx>=2.5", "onnx>=1.8.0", "torch", "onnx-simplifier>=0.2.27"
+    ],
     entry_points={"console_scripts": ["torch2hpvm=torch2hpvm:main"]},
 )
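
`onnx-simplifier` becomes a hard dependency because `compile.py` (below) now simplifies the ONNX graph during model loading rather than in the exporter constructor. A hedged sketch of that call sequence, with `model.onnx` standing in for any real model file:

```python
# Sketch of the simplify-then-infer-shapes step that _load_model now performs;
# "model.onnx" is a placeholder path.
import onnx
from onnxsim import simplify

onnx_model = onnx.load("model.onnx")
onnx_model, ok = simplify(onnx_model)
assert ok, "Simplified ONNX model could not be validated"
onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
```
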
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json b/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json
index 974b536c48cd1d5ab96120cfd0c5e9510846df17..9d7cb28a8b3fcc2301735c21e99119beb5a89907 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json
@@ -2,7 +2,8 @@
     {
         "name": "11",
         "speedup": 1.0,
-        "applies_to": null
+        "applies_to": null,
+        "is_baseline": true
     },
     {
         "name": "12",
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py b/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py
index cdba5f327f54f1b77889f34d51c6df54ad86786a..6f6b71eae0deda9176c3dcb32c76c99bccbf5f07 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py
@@ -6,10 +6,10 @@ import jinja2
 from .graph_builder import DFG
 from .graph_ir import DFGNode, TensorNode, WeightTensor
 
-TEMPLATE_FILE = "template_hpvm.cpp.in"
+PLAIN_TEMPLATE_FILE = "template_hpvm.cpp.in"
+INSPECT_TEMPLATE_FILE = "template_hpvm_inspect.cpp.in"
 loader = jinja2.FileSystemLoader(searchpath=Path(__file__).parent)
 template_env = jinja2.Environment(loader=loader, trim_blocks=True)
-template = template_env.get_template(TEMPLATE_FILE)
 
 PathLike = Union[str, Path]
 
@@ -69,11 +69,22 @@ class HpvmCodeGen(CodeGen):
     # Variable indicator is always int for hpvm gen
     variables: Dict[DFGNode, Tuple[int, bool]]
 
-    def __init__(self, dfg: DFG, prefix: PathLike, input_size: int, target: str):
+    def __init__(
+        self,
+        dfg: DFG,
+        prefix: PathLike,
+        input_size: int,
+        target: str,
+        inspectable: Optional[dict],
+    ):
         super().__init__(dfg, prefix, input_size)
         if target not in ("tensor", "cudnn"):
             raise ValueError(f"Unsupported target {target}")
         self.target = target
+        self.template = template_env.get_template(
+            PLAIN_TEMPLATE_FILE if inspectable is None else INSPECT_TEMPLATE_FILE
+        )
+        self.inspect_vars = inspectable or {}
 
     def _emit_hpvm_node_edges(self, input_vars: List[DFGNode]) -> List[dict]:
         ret = []
@@ -133,7 +144,7 @@ class HpvmCodeGen(CodeGen):
         weights = self.emit_weights(self.weights)
         with Path(output).open("w") as f:
             f.write(
-                template.render(
+                self.template.render(
                     nodes=nodes,
                     input_name=self.input_name,
                     input_size=self.input_size,
@@ -144,5 +155,6 @@ class HpvmCodeGen(CodeGen):
                     weights=weights,
                     prefix=self.prefix,
                     target=self.target,
+                    **self.inspect_vars
                 )
             )
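
`HpvmCodeGen` now selects its jinja2 template at construction time: the plain template when no inspection parameters are given, and the FIFO-driven inspect template otherwise, with the extra variables splatted into `render()`. A self-contained sketch of the pattern, using inline stand-in templates instead of the real `.cpp.in` files:

```python
# Hedged sketch of the two-template pattern; the inline templates are stand-ins.
from typing import Optional

import jinja2

env = jinja2.Environment(
    loader=jinja2.DictLoader({
        "plain.in": "// plain codegen for {{ target }}",
        "inspect.in": "// inspect codegen for {{ target }}, fifo={{ fifo_path_r }}",
    }),
    trim_blocks=True,
)

def render(target: str, inspectable: Optional[dict]) -> str:
    template = env.get_template("plain.in" if inspectable is None else "inspect.in")
    return template.render(target=target, **(inspectable or {}))

print(render("tensor", None))
print(render("tensor", {"fifo_path_r": "/tmp/hpvm_fifo_r"}))
```
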
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
index cc2a670dad75661a296dcb4465a8de56358630b5..f0d8c3b131231d637429c40b7e68a94627ebd6bf 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py
@@ -34,6 +34,9 @@ class ModelExporter:
     weight_dir_name = "weights"
     source_file_name = "hpvm_c.cpp"
     metadata_file_name = "ops.json"
+    config_file_name = "tuner_confs.txt"
+    fifo_file_name_r = "hpvm_fifo_r"
+    fifo_file_name_w = "hpvm_fifo_w"
 
     def __init__(
         self,
@@ -43,39 +46,61 @@ class ModelExporter:
         output_dir: PathLike,
         target: str = "hpvm_tensor",
         opset: Optional[int] = None,
+        config_file: PathLike = None,
     ):
-        from onnxsim import simplify
-
         self.tune_dataset, self.test_dataset = tune_dataset, test_dataset
         self.dataset_shape = self._check_datasets(tune_dataset, test_dataset)
         self.dataset_size = self.dataset_shape[0]
-        onnx_model = self._load_model(model, self.dataset_shape)
-        if opset is not None:
-            onnx_model = check_onnx_version(onnx_model, opset)
-        onnx_model, check = simplify(onnx_model)
-        assert check, "Simplified ONNX model could not be validated"
-        onnx_model = onnx.shape_inference.infer_shapes(onnx_model)
-
+        onnx_model = self._load_model(model, self.dataset_shape, opset)
         self.dfg = DFG(onnx_model.graph)
-        self.output_dir = Path(output_dir)
+
+        output_dir = Path(output_dir).absolute()
         os.makedirs(output_dir, exist_ok=True)
-        self.weight_dir = self.output_dir / self.weight_dir_name
+        self.weight_dir = output_dir / self.weight_dir_name
         self.weight_dir.mkdir(exist_ok=True)
+        self.codefile = output_dir / self.source_file_name
+        self.metafile = output_dir / self.metadata_file_name
 
+        args3 = self.dfg, self.weight_dir, self.dataset_size
+        self.compile_args = None
+        self.path_params = {}
         if target == "hpvm_tensor":
-            self.codegen = HpvmCodeGen(self.dfg, self.weight_dir, self.dataset_size, "tensor")
+            if config_file is None:
+                raise ValueError(
+                    "A config file must be given for the hpvm_tensor target"
+                )
+            self.path_params = {"config_file": Path(config_file)}
+            self.compile_args = ["-t", "tensor", "--conf-file", str(config_file)]
+            self.codegen = HpvmCodeGen(*args3, "tensor", None)
+        elif target == "hpvm_tensor_inspect":
+            if config_file is None:
+                config_file = output_dir / self.config_file_name
+            else:
+                config_file = Path(config_file).absolute()
+            self.path_params = {
+                "tune_labels_path": (self.weight_dir / self.tuneset_name[1]).as_posix(),
+                "conf_path": config_file.as_posix(),
+                "fifo_path_r": (output_dir / self.fifo_file_name_r).as_posix(),
+                "fifo_path_w": (output_dir / self.fifo_file_name_w).as_posix()
+            }
+            self.compile_args = ["-t", "tensor", "--conf-file", str(config_file)]
+            self.codegen = HpvmCodeGen(*args3, "tensor", self.path_params)
         elif target == "hpvm_cudnn":
-            self.codegen = HpvmCodeGen(self.dfg, self.weight_dir, self.dataset_size, "cudnn")
+            self.compile_target = "cudnn"
+            self.compile_args = ["-t", "cudnn"]
+            self.codegen = HpvmCodeGen(*args3, "cudnn", None)
         elif target == "tensor":
-            self.codegen = TensorCodeGen(self.dfg, self.weight_dir, self.dataset_size)
+            self.codegen = TensorCodeGen(*args3)
         else:
             raise ValueError(f"Target {target} not recognized")
 
     def export_source_code(self, output: PathLike, batch_size: Optional[int] = None):
         self.codegen.compile(output, batch_size)
+        return self
 
     def export_weights(self):
         self.dfg.dump_weights(self.weight_dir)
+        return self
 
     def export_datasets(self):
         input_, labels = self.tuneset_name
@@ -86,6 +111,7 @@ class ModelExporter:
         self._dump_dataset(
             self.test_dataset, self.weight_dir / input_, self.weight_dir / labels
         )
+        return self
 
     def export_metadata(
         self, output: PathLike, approx_knobs_file: PathLike = def_approx_knobs_file
@@ -98,14 +124,21 @@ class ModelExporter:
         KnobInfoT = Tuple[str, float]
         ty_knobs: Dict[str, List[KnobInfoT]] = defaultdict(list)
         default_knobs: List[KnobInfoT] = []
+        baseline_knob = None
         for k in knobs:
-            applies_to = k.pop("applies_to")
-            k = k["name"], k["speedup"]
+            kp = k["name"], k["speedup"]
+            if "is_baseline" in k:
+                if baseline_knob:
+                    raise ValueError("Multiple baseline knobs")
+                baseline_knob = k["name"]
+            applies_to = k["applies_to"]
             if applies_to is None:
-                default_knobs.append(k)
+                default_knobs.append(kp)
                 continue
             for ty in applies_to:
-                ty_knobs[ty].append(k)
+                ty_knobs[ty].append(kp)
+        if not baseline_knob:
+            raise ValueError("No baseline knob given")
         idx = 0
         op_cost: Dict[str, int] = {}
         op_knobs: Dict[str, List[str]] = {}
@@ -127,18 +160,39 @@ class ModelExporter:
                     "op_cost": op_cost,
                     "knob_speedup": knob_speedup,
                     "op_knobs": op_knobs,
+                    "baseline_knob": baseline_knob,
+                    **self.path_params
                 },
                 f,
                 indent=2,
             )
-
-    def export_all(self, output: PathLike = None, batch_size: Optional[int] = None):
-        default_codefile = self.output_dir / self.source_file_name
-        self.export_source_code(output or default_codefile, batch_size)
-        default_metafile = self.output_dir / self.metadata_file_name
-        self.export_metadata(default_metafile)
+        return self
+
+    def compile(self, output_binary: PathLike, working_dir: Optional[PathLike] = None):
+        from subprocess import run
+
+        args = [
+            "approxhpvm.py",
+            str(self.codefile),
+            str(output_binary),
+            *self.compile_args,
+        ]
+        if working_dir is not None:
+            args.extend(["-d", str(working_dir)])
+        run(args, check=True)
+        return self
+
+    def generate(
+        self, output_code_file: PathLike = None, batch_size: Optional[int] = None
+    ):
+        self.codefile = (
+            self.codefile if output_code_file is None else Path(output_code_file)
+        )
+        self.export_source_code(self.codefile, batch_size)
+        self.export_metadata(self.metafile)
         self.export_weights()
         self.export_datasets()
+        return self
 
     @staticmethod
     def _dump_dataset(dataset: DatasetTy, input_filename: Path, labels_filename: Path):
@@ -216,7 +270,11 @@ class ModelExporter:
         return dataset.shape
 
     @staticmethod
-    def _load_model(model: ModelTy, dataset_shape: Sequence[int]) -> onnx.ModelProto:
+    def _load_model(
+        model: ModelTy, dataset_shape: Sequence[int], opset: Optional[int]
+    ) -> onnx.ModelProto:
+        from onnxsim import simplify
+
         if isinstance(model, Module):
             # Export to ONNX and load back.
             sample_input_shape = 1, *dataset_shape[1:]
@@ -224,10 +282,16 @@ class ModelExporter:
             with NamedTemporaryFile("w+b") as tmp:
                 torch_to_onnx(model, (sample_input,), tmp)
                 tmp.seek(0)
-                return onnx.load_model(tmp)
-        if isinstance(model, onnx.ModelProto):
-            return model
-        return onnx.load(Path(model).as_posix())
+                onnx_model = onnx.load_model(tmp)
+        elif isinstance(model, onnx.ModelProto):
+            onnx_model = model
+        else:
+            raise ValueError(f"Cannot accept model of type {type(model)}")
+        if opset is not None:
+            onnx_model = check_onnx_version(onnx_model, opset)
+        onnx_model, check = simplify(onnx_model)
+        assert check, "Simplified ONNX model could not be validated"
+        return onnx.shape_inference.infer_shapes(onnx_model)
 
 
 def check_onnx_version(model, new_version):
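
`ModelExporter` now exposes a fluent API: `generate()` writes source, metadata, weights and datasets, and `compile()` shells out to `approxhpvm.py`; the new `hpvm_tensor_inspect` target additionally wires FIFO and config paths into both the generated code and `ops.json`. A hedged usage sketch (every path and the toy model are placeholders; see `test_frontend.py` and `test_tuning.py` later in this patch for the real invocations):

```python
# Hedged usage sketch of the reworked exporter; paths and the model are placeholders.
import torch
from torch2hpvm import BinDataset, ModelExporter

shape = 5000, 1, 28, 28  # (num_images, channels, height, width)
tuneset = BinDataset("weights/tune_input.bin", "weights/tune_labels.bin", shape)
testset = BinDataset("weights/test_input.bin", "weights/test_labels.bin", shape)
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10))

exporter = ModelExporter(
    model, tuneset, testset, "codegen_dir",
    target="hpvm_tensor_inspect",  # emits the FIFO-driven binary for the tuner
)
exporter.generate(batch_size=500).compile("codegen_dir/build/model_bin", "codegen_dir/build")
```
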
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
index d7fd6c88840962b87a973c5d2d7b7aeff800ca52..0c1db9b1ff9d71cb9a8c8bbf3a2c64cec8331476 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
@@ -99,8 +99,7 @@ int main(int argc, char *argv[]){
     void *result = static_cast<RootIn*>(args)->r.tensor;
     hpvm_request_tensor(result, 0);
 
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(), start, end);
-    computeAccuracy3(labels, result);
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
     freeBatchMemory();
   }
   __hpvm__cleanup();
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in
new file mode 100644
index 0000000000000000000000000000000000000000..94a8e0a534c04b323b4b66f369ab2d624a2a745f
--- /dev/null
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in
@@ -0,0 +1,168 @@
+#include <fstream>
+#include <string>
+#include <array>
+#include <hpvm.h>
+#include <tensorTypes.h>
+#include <tensorUtils.h>
+
+// For writing binary to file descriptors
+#include <cstdio>
+// For printing errors
+#include <cstring>
+#include <iostream>
+#include <errno.h>
+#include <unistd.h>
+
+const int batch_size = {{batch_size}}, input_size = {{input_size}}, batch_count = input_size / batch_size;
+
+/**** Routines for Handling Piped Execution ***/
+
+FILE *open_fifo(const char *path, const char *mode) {
+  auto* fd = fopen(path, mode);
+  if (!fd) {
+    std::cerr << "Error opening FIFO file: " << strerror(errno) << '\n';
+    abort();
+  }
+  return fd;
+}
+
+int fifo_wait() {
+  auto* fp = open_fifo("{{fifo_path_r}}", "r");
+  const int maxn = 100;
+  char linebuf[maxn];
+  fgets(linebuf, maxn, fp);
+  fclose(fp);
+  std::string line(linebuf);
+  if (line == "test")
+    return 1;
+  if (line == "tune")
+    return 2;
+  if (line == "stop")
+    return 0;
+  std::cerr << "Invalid fifo file content \"" << line << "\"\n";
+  abort();
+}
+
+void fifo_write_batch(FILE *fp, void *output_ptr) {
+  auto *output = (Tensor *) output_ptr;
+  const auto &dim = output->dims;
+  size_t num_dims = dim.num_dims;
+  fwrite(&num_dims, sizeof(size_t), 1, fp);
+  fwrite(dim.dim_sizes, sizeof(size_t), dim.num_dims, fp);
+  fwrite(output->host_data, 1, output->size_in_bytes, fp);
+}
+
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
+
+{% for node in nodes %}
+void var_{{node.idx}}_node(
+{%- for n in range(node.input_size) -%}
+void *t{{n}}, size_t bytes_t{{n}}{{", " if not loop.last}}
+{%- endfor %}) {
+  __hpvm__hint(hpvm::{{target.upper()}}_TARGET);
+  __hpvm__attributes({{node.input_size}}, {% for n in range(node.input_size) -%}
+t{{n}}{{", " if not loop.last}}
+{%- endfor %}, 0);
+  __hpvm__node_id({{node.idx + 1}});
+  void *r = {{node.call_name}}({% for n in range(node.input_size) -%}
+t{{n}}{{", " if not loop.last}}
+{%- endfor %}{{", " if node.call_args}}{{node.call_args|join(", ")}});
+  __hpvm__return(2, r, (size_t) 0);
+}
+
+{% endfor -%}
+
+void root({%- for n in root_inputs -%}
+void *{{n}}, size_t {{n}}_bytes{{", " if not loop.last}}
+{%- endfor %}) {
+  __hpvm__hint(hpvm::CPU_TARGET);
+  __hpvm__attributes({{root_inputs|length}}, {% for n in root_inputs -%}
+{{n}}{{", " if not loop.last}}
+{%- endfor %}, 0);
+
+{% for node in nodes %}
+  void* var_{{node.idx}} = __hpvm__createNodeND(0, var_{{node.idx}}_node);
+{% for edge in node.edges %}
+{% if edge.is_bindin %}
+  __hpvm__bindIn(var_{{node.idx}}, {{edge.input_idx * 2}}, {{edge.edge_idx * 2}}, 0);
+  __hpvm__bindIn(var_{{node.idx}}, {{edge.input_idx * 2 + 1}}, {{edge.edge_idx * 2 + 1}}, 0);
+{% else %}
+  __hpvm__edge(var_{{edge.input_node}}, var_{{node.idx}}, 1, 0, {{edge.edge_idx * 2}}, 0);
+  __hpvm__edge(var_{{edge.input_node}}, var_{{node.idx}}, 1, 1, {{edge.edge_idx * 2 + 1}}, 0);
+{% endif %}
+{% endfor %}
+
+{% endfor %}
+  __hpvm__bindOut(var_{{root_output_idx}}, 0, 0, 0);
+  __hpvm__bindOut(var_{{root_output_idx}}, 1, 1, 0);
+}
+
+struct ret_t {
+  void* tensor;
+  size_t bytes;
+};
+
+typedef struct __attribute__((__packed__)) {
+{% for n in root_inputs %}
+  void *{{n}};
+  size_t {{n}}_bytes;
+{% endfor %}
+  struct ret_t r;
+} RootIn;
+
+int main(){
+  std::string dir_prefix = "{{prefix}}/";
+  std::string test_input = dir_prefix + "test_input.bin";
+  std::string test_labels = dir_prefix + "test_labels.bin";
+  std::string tune_input = dir_prefix + "tune_input.bin";
+  std::string tune_labels = dir_prefix + "tune_labels.bin";
+
+{% for w in weights %}
+  std::string {{w.name}}_path = dir_prefix + "{{w.filename}}";
+  void* {{w.name}} = readTrainedWeights({{w.name}}_path.c_str(), 0, {{w.shape|join(', ')}});
+{% endfor %}
+
+  RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn)));
+{% for n in root_inputs %}
+{% if n != input_name %}
+  args->{{n}} = {{n}};
+  args->{{n}}_bytes = 0;
+{% endif %}
+{% endfor %}
+
+  int ret = 0;
+  while ((ret = fifo_wait())) {
+    __hpvm__init();
+    startMemTracking();
+    const auto *input_pth = (ret == 1 ? test_input : tune_input).c_str();
+    const auto *labels_pth = (ret == 1 ? test_labels : tune_labels).c_str();
+
+    // Keep this open so the other side knows we have more batches to write
+    auto* fp = open_fifo("{{fifo_path_w}}", "wb");
+    float total_accuracy = 0;
+    for (int i = 0; i < batch_count; i++){
+      int start = i * batch_size, end = start + batch_size;
+      void *{{input_name}} = readInputBatch(input_pth, 0, start, end, {{input_shape|join(', ')}});
+      args->{{input_name}} = {{input_name}};
+      args->{{input_name}}_bytes = 0;
+
+      void* dfg = __hpvm__launch(0, root, (void*) args);
+      __hpvm__wait(dfg);
+      void *result = static_cast<RootIn*>(args)->r.tensor;
+      hpvm_request_tensor(result, 0);
+
+      uint32_t* labels = readLabelsBatch3(labels_pth, start, end);
+      float accuracy = computeAccuracy3(labels, result);
+      total_accuracy += accuracy * batch_size;
+
+      fifo_write_batch(fp, result);
+      freeBatchMemory();
+    }
+    fclose(fp);
+    write_accuracy(total_accuracy / input_size);
+    __hpvm__cleanup();
+  }
+
+  return 0;
+}
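
The generated binary loops on a simple FIFO protocol: it blocks on `fifo_path_r` for a `test`, `tune`, or `stop` command, then for every batch writes the output tensor to `fifo_path_w` as `num_dims`, the dimension sizes, and the raw tensor bytes, and finally records the aggregate accuracy in `final_accuracy`. The real driver is predtuner's `PipedBinaryApp`; the sketch below only illustrates the wire format from the Python side, assuming 64-bit `size_t`, float32 data, and placeholder FIFO paths.

```python
# Hedged sketch of the driver side of the FIFO protocol (not the predtuner API).
# Assumes native 64-bit size_t and float32 tensor data; paths are placeholders.
import struct
import numpy as np

FIFO_R = "/tmp/model/hpvm_fifo_r"  # the binary reads commands from here
FIFO_W = "/tmp/model/hpvm_fifo_w"  # the binary writes batch outputs here

def request(command: str) -> None:
    """Send "test", "tune", or "stop" (no trailing newline, to match fgets/==)."""
    with open(FIFO_R, "w") as f:
        f.write(command)

def read_batch(fp) -> np.ndarray:
    """Read one batch: num_dims, then the dims, then the raw tensor data."""
    size_t = struct.calcsize("N")
    (num_dims,) = struct.unpack("N", fp.read(size_t))
    dims = struct.unpack(f"{num_dims}N", fp.read(num_dims * size_t))
    data = np.frombuffer(fp.read(int(np.prod(dims)) * 4), dtype=np.float32)
    return data.reshape(dims)
```
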
diff --git a/hpvm/scripts/llvm_installer.sh b/hpvm/scripts/llvm_installer.sh
index 3ed7fd3a951d27dedc9b84adf82835a0eedbd1e2..a8fa022047fb7983c466b618863a7b2a66a50f92 100755
--- a/hpvm/scripts/llvm_installer.sh
+++ b/hpvm/scripts/llvm_installer.sh
@@ -184,6 +184,7 @@ if [ ! -d $HPVM_DIR ]; then
   echo Adding HPVM sources to tree
   mkdir -p $HPVM_DIR
   ln -s $CURRENT_DIR/CMakeLists.txt $HPVM_DIR
+  ln -s $CURRENT_DIR/cmake $HPVM_DIR/
   ln -s $CURRENT_DIR/include $HPVM_DIR/
   ln -s $CURRENT_DIR/lib $HPVM_DIR/
   ln -s $CURRENT_DIR/projects $HPVM_DIR/
@@ -208,7 +209,7 @@ if ! $AUTOMATE ; then
   echo "To complete installation, follow these instructions:"
   echo "  - Create and navigate to a folder \"./build\" "
   echo "  - Run \"cmake ../llvm [options]\". Find potential options in README.md."
-  echo "  - Run \"make -j<number of threads>\" and then \"make install\""
+  echo "  - Run \"make -j<number of threads> approxhpvm.py\" and then \"make install\""
   echo "For more details refer to README.md."
   echo 
   echo "Exiting."
@@ -237,8 +238,8 @@ cd $BUILD_DIR
 echo cmake ../$LLVM_SRC -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLLVM_TARGETS_TO_BUILD=$TARGET  -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR
 cmake ../$LLVM_SRC -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLLVM_TARGETS_TO_BUILD=$TARGET  -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR
 
-echo make -j$NUM_THREADS
-make -j$NUM_THREADS
+echo make -j$NUM_THREADS approxhpvm.py
+make -j$NUM_THREADS approxhpvm.py
 #make install
 
 if [ -f $BUILD_DIR/tools/hpvm/projects/$HPVM_RT ]; then
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
index 76d6910d2d43d641f5a2dfff1d48b39fe25686a4..37a856123d1ea9ee074a5ac2844b223a78c56e16 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt
@@ -1,5 +1,5 @@
 # First get approxhpvm.py which we then use to compile benchmarks.
-get_filename_component(APPROXHPVM_PY ${PROJECT_BINARY_DIR}/bin/approxhpvm.py REALPATH)
+get_filename_component(APPROXHPVM_PY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py REALPATH)
 
 # Configure config.h which tells the benchmarks where's the model parameter directory.
 # We can also use the one in tensor_runtime, but we're avoiding that so as to 
diff --git a/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py b/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py
index 5918d960a745e5d245410acfac7c827b5b011f14..6ddc1b8ea35df8a098e98a74cfa313cd9bf9e7a8 100644
--- a/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py
+++ b/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py
@@ -16,10 +16,10 @@ def make_conv_pool_activ(
     **conv_kwargs
 ):
     layers = [Conv2d(in_channels, out_channels, kernel_size, **conv_kwargs)]
-    if pool_size is not None:
-        layers.append(MaxPool2d(pool_size, stride=pool_stride))
     if activation:
         layers.append(activation())
+    if pool_size is not None:
+        layers.append(MaxPool2d(pool_size, stride=pool_stride))
     return layers
 
 
diff --git a/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py b/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py
index 7395136eb5f19adc2ad3450c34b60c911f72747e..19f17366459a7684c6df8a940438b661cf7f6029 100644
--- a/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py
+++ b/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py
@@ -42,15 +42,11 @@ for model_cls, nch, img_size, batch_size, pathname in benchmarks:
     checkpoint = self_folder / "../model_params" / f"{pathname}.pth.tar"
     model.load_state_dict(torch.load(checkpoint.as_posix()))
 
-    exporter = ModelExporter(model, bin_tuneset, bin_testset, codegen_dir)
-    exporter.export_all(batch_size=batch_size)
-
-    conf_file = self_folder / "../hpvm-c/benchmarks" / pathname / "data/tuner_confs.txt"
     build_dir = codegen_dir / "build"
     target_binary = build_dir / pathname
-    run([
-        "approxhpvm.py", str(codegen_dir / ModelExporter.source_file_name), str(target_binary),
-        "-d", str(build_dir),
-        "-t", "tensor", "--conf-file", str(conf_file)
-    ], check=True)
+    conf_file = self_folder / "../hpvm-c/benchmarks" / pathname / "data/tuner_confs.txt"
+    exporter = ModelExporter(
+        model, bin_tuneset, bin_testset, codegen_dir, config_file=conf_file
+    )
+    exporter.generate(batch_size=batch_size).compile(target_binary, build_dir)
     run([str(target_binary), "test"], check=True)
diff --git a/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py b/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0451b70b44325a355345ad95ab9bf85154002c5
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py
@@ -0,0 +1,80 @@
+import os
+import shutil
+import site
+from pathlib import Path
+
+import torch
+from predtuner import config_pylogger
+from predtuner.pipedbin import PipedBinaryApp
+from torch2hpvm import BinDataset, ModelExporter
+from torch.nn import Module
+
+site.addsitedir(os.path.dirname(__file__))
+import dnn
+
+# Set up logger to put log file in /tmp
+msg_logger = config_pylogger(output_dir="/tmp", verbose=True)
+
+
+benchmarks = [
+    (dnn.LeNet, 1, 28, 500, "lenet_mnist"),
+    (dnn.AlexNet, 3, 32, 500, "alexnet_cifar10"),
+    (dnn.AlexNet2, 3, 32, 500, "alexnet2_cifar10"),
+    (dnn.AlexNetImageNet, 3, 224, 100, "alexnet_imagenet"),
+    (dnn.MobileNet, 3, 32, 500, "mobilenet_cifar10"),
+    (dnn.ResNet18, 3, 32, 500, "resnet18_cifar10"),
+    (dnn.ResNet50, 3, 224, 50, "resnet50_imagenet"),
+    (dnn.VGG16Cifar10, 3, 32, 500, "vgg16_cifar10"),
+    (dnn.VGG16Cifar100, 3, 32, 500, "vgg16_cifar100"),
+    (dnn.VGG16ImageNet, 3, 224, 50, "vgg16_imagenet"),
+]
+model_param = Path(__file__).parent / "../model_params"
+
+
+def generate(model_cls, nch, img_size, batch_size, pathname):
+    codegen_dir = Path(f"/tmp/{pathname}_tune")
+    build_dir = codegen_dir / "build"
+    metadata_file = codegen_dir / "ops.json"
+    binary_file = build_dir / pathname
+    # if binary_file.is_file() and metadata_file.is_file():
+    #     return binary_file, metadata_file
+
+    print(f"Generating {pathname} to {codegen_dir}")
+    if codegen_dir.exists():
+        shutil.rmtree(codegen_dir)
+    params = model_param / pathname
+    dataset_shape = 5000, nch, img_size, img_size
+    bin_tuneset = BinDataset(
+        params / "tune_input.bin", params / "tune_labels.bin", dataset_shape
+    )
+    bin_testset = BinDataset(
+        params / "test_input.bin", params / "test_labels.bin", dataset_shape
+    )
+    model: Module = model_cls()
+    checkpoint = model_param / f"{pathname}.pth.tar"
+    model.load_state_dict(torch.load(checkpoint.as_posix()))
+    exporter = ModelExporter(
+        model, bin_tuneset, bin_testset, codegen_dir, target="hpvm_tensor_inspect"
+    )
+    exporter.generate(batch_size=batch_size).compile(binary_file, build_dir)
+    return binary_file, metadata_file
+
+
+def main():
+    for model_cls, nch, img_size, batch_size, pathname in benchmarks:
+        print(f"Testing {pathname}")
+        binary_file, metadata_file = generate(
+            model_cls, nch, img_size, batch_size, pathname
+        )
+        app = PipedBinaryApp("test", binary_file, metadata_file)
+        tuner = app.get_tuner()
+        tuner.tune(100, 3.0, 3.0, True, 50, cost_model="cost_linear")
+        tuner.dump_configs("configs.json")
+        fig = tuner.plot_configs(show_qos_loss=True)
+        fig.savefig("configs.png", dpi=300)
+        app.dump_hpvm_configs(tuner.best_configs, "hpvm_confs.txt")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/hpvm/tools/py-approxhpvm/CMakeLists.txt b/hpvm/tools/py-approxhpvm/CMakeLists.txt
index e46c45623f13034e1cb4c5b1ed2434ec40d4c12c..60fbc66aadd362e6aceb507dec5f1bec1223c418 100644
--- a/hpvm/tools/py-approxhpvm/CMakeLists.txt
+++ b/hpvm/tools/py-approxhpvm/CMakeLists.txt
@@ -1,9 +1,9 @@
 # This file is very tightly coupled with main.py.in.
 # Watch out and keep them in sync.
 
-set(LLVM_PROJECT_DIR ${PROJECT_SOURCE_DIR})
-set(LLVM_BUILD_DIR ${PROJECT_BINARY_DIR})
-set(LIB_DIR ${PROJECT_BINARY_DIR}/lib)
+set(LLVM_PROJECT_DIR ${CMAKE_SOURCE_DIR})
+set(LLVM_BUILD_DIR ${CMAKE_BINARY_DIR})
+set(LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})
 # The hpvm-rt runtime
 # This has to be explicitly set as hpvm-rt.bc is created in a custom_target
 # and does not export its file location.
@@ -26,7 +26,8 @@ set(
     LLVMClearDFG
     LLVMGenHPVM
 )
-# CUDA_TOOLKIT_ROOT_DIR is already defined
+# CUDA_TOOLKIT_ROOT_DIR and CUDNN_LIBRARY_PATH have already been defined globally
+set(CUDNN_DIR ${CUDNN_LIBRARY_PATH})
 # First resolve all `@symbol@` by configuring the file
 configure_file(main.py.in ${CMAKE_CURRENT_BINARY_DIR}/main.py.conf)
 # Then resolve all generator expressions we configured into the previous file
@@ -51,9 +52,9 @@ set(
     clang opt llvm-link
 )
 add_custom_command(
-    OUTPUT ${PROJECT_BINARY_DIR}/bin/approxhpvm.py
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/main.py ${PROJECT_BINARY_DIR}/bin/approxhpvm.py
-    COMMAND chmod +x ${PROJECT_BINARY_DIR}/bin/approxhpvm.py
+    OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/main.py ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py
+    COMMAND chmod +x ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py
     DEPENDS ${DEPS} ${CMAKE_CURRENT_BINARY_DIR}/main.py
 )
-add_custom_target(approxhpvm.py ALL DEPENDS ${PROJECT_BINARY_DIR}/bin/approxhpvm.py)
+add_custom_target(approxhpvm.py ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py)
diff --git a/hpvm/tools/py-approxhpvm/main.py.in b/hpvm/tools/py-approxhpvm/main.py.in
index 752a7609ca0831838949b037ac7b8c0323ac8871..fdbbaec1ccc070f87bedcd0f0c646e12531d99fe 100644
--- a/hpvm/tools/py-approxhpvm/main.py.in
+++ b/hpvm/tools/py-approxhpvm/main.py.in
@@ -12,6 +12,7 @@ CUDA_TOOLKIT_ROOT_DIR = Path("@CUDA_TOOLKIT_ROOT_DIR@")
 TENSOR_RUNTIME_LIBS = "@TENSOR_RUNTIME_LIBS@".split(";")
 AVAILABLE_PASSES = "@AVAILABLE_PASSES@".split(";")
 HPVM_RT_PATH = "@HPVM_RT_PATH@"
+CUDNN_DIR = "@CUDNN_DIR@"
 
 # Directories to include
 INCLUDE_DIRS = [
@@ -21,7 +22,7 @@ INCLUDE_DIRS = [
     HPVM_PROJECT_DIR / "test/dnn_benchmarks/hpvm-c/include",  # hpvm-c intrinsics decl dir
     CUDA_TOOLKIT_ROOT_DIR / "include",  # CUDA include dir
 ]
-LINK_DIRS = [CUDA_TOOLKIT_ROOT_DIR / "lib64"]
+LINK_DIRS = [CUDA_TOOLKIT_ROOT_DIR / "lib64", CUDNN_DIR]
 LINK_LIBS = [
     "pthread", "cudart", "curand", "cudnn", "cublas", "cufft", "OpenCL", "stdc++fs", "omp", "m"
 ]