diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt index efffcb0bb9aa466215fd825c9254ea4d91fe8b6a..0eab861beac699407114d0cae4e063f41dbdab95 100644 --- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -30,6 +30,8 @@ endif() include_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/include) include_directories(./tensor_runtime/include) +include_directories(../gpu_profiler/include) +include_directories(../soc_simulator/include) link_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64) @@ -40,6 +42,16 @@ cuda_add_cublas_to_target(tensor_runtime) # Adding new rule for building a cuDNN runtime library cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc) +find_library(GPU_PROFILER_LIB + NAMES libgpu_profiler.a + HINTS ../gpu_profiler/lib +) + +find_library(SOC_SIMULATOR_LIB + NAMES libpromise_profiler.a + HINTS ../soc_simulator/lib +) + if(USE_GFLAGS) target_link_libraries(tensor_runtime gflags cudnn -lcurand) @@ -51,94 +63,94 @@ target_link_libraries(tensor_cpu_runtime) # Adding rule for the debugging source add_executable(test_ops dnn_sources/src/test_ops.cc) -target_link_libraries(test_ops tensor_runtime) +target_link_libraries(test_ops tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #**** CPU sources add_executable(fc2_cpu dnn_sources/src/fc2_cpu.cc) -target_link_libraries(fc2_cpu tensor_cpu_runtime) +target_link_libraries(fc2_cpu tensor_cpu_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # Full-Precision versions add_executable(lenet_tanh dnn_sources/src/lenet2_tanh.cc) -target_link_libraries(lenet_tanh tensor_runtime) +target_link_libraries(lenet_tanh tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(lenet_keras dnn_sources/src/lenet_keras.cc) -target_link_libraries(lenet_keras tensor_runtime) +target_link_libraries(lenet_keras tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(lenet_int32 dnn_sources/src/lenet_int32.cc) -target_link_libraries(lenet_int32 tensor_runtime) +target_link_libraries(lenet_int32 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10 dnn_sources/src/alexnet_cifar10_front.cc) -target_link_libraries(alexnet_cifar10 tensor_runtime) +target_link_libraries(alexnet_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10_tuner dnn_sources/src/alexnet_cifar10_tuner.cc) -target_link_libraries(alexnet_cifar10_tuner tensor_runtime) +target_link_libraries(alexnet_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10_approx dnn_sources/src/alexnet_cifar10_approx.cc) -target_link_libraries(alexnet_cifar10_approx tensor_runtime) +target_link_libraries(alexnet_cifar10_approx tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_cifar10 dnn_sources/src/alexnet2_cifar10.cc) -target_link_libraries(alexnet2_cifar10 tensor_runtime) +target_link_libraries(alexnet2_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_cifar10_tuner dnn_sources/src/alexnet2_cifar10_tuner.cc) -target_link_libraries(alexnet2_cifar10_tuner tensor_runtime) +target_link_libraries(alexnet2_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10 dnn_sources/src/vgg16_cifar10.cc) -target_link_libraries(vgg16_cifar10 tensor_runtime) +target_link_libraries(vgg16_cifar10 tensor_runtime 
${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_tuner dnn_sources/src/vgg16_cifar10_tuner.cc) -target_link_libraries(vgg16_cifar10_tuner tensor_runtime) +target_link_libraries(vgg16_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10 dnn_sources/src/resnet18_cifar10.cc) -target_link_libraries(resnet18_cifar10 tensor_runtime) +target_link_libraries(resnet18_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_approx dnn_sources/src/resnet18_cifar10_approx.cc) -target_link_libraries(resnet18_cifar10_approx tensor_runtime) +target_link_libraries(resnet18_cifar10_approx tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_inputapprox dnn_sources/src/resnet18_cifar10_inputapprox.cc) -target_link_libraries(resnet18_cifar10_inputapprox tensor_runtime) +target_link_libraries(resnet18_cifar10_inputapprox tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_tuner dnn_sources/src/resnet18_cifar10_tuner.cc) -target_link_libraries(resnet18_cifar10_tuner tensor_runtime) +target_link_libraries(resnet18_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100 dnn_sources/src/vgg16_cifar100.cc) -target_link_libraries(vgg16_cifar100 tensor_runtime) +target_link_libraries(vgg16_cifar100 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_tuner dnn_sources/src/vgg16_cifar100_tuner.cc) -target_link_libraries(vgg16_cifar100_tuner tensor_runtime) +target_link_libraries(vgg16_cifar100_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_top5 dnn_sources/src/vgg16_cifar100_5.cc) -target_link_libraries(vgg16_cifar100_top5 tensor_runtime) +target_link_libraries(vgg16_cifar100_top5 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # REF binaries add_executable(mobilenet_cifar10 dnn_sources/src/mobilenet_cifar10.cc) -target_link_libraries(mobilenet_cifar10 tensor_runtime) +target_link_libraries(mobilenet_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_cifar10_shallow dnn_sources/src/mobilenet_cifar10_shallow.cc) -target_link_libraries(mobilenet_cifar10_shallow tensor_runtime) +target_link_libraries(mobilenet_cifar10_shallow tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #### Image Pipeline Tuning sources add_executable(pipeline_GEMO dnn_sources/src/pipeline_GEMO.cc) -target_link_libraries(pipeline_GEMO tensor_runtime) +target_link_libraries(pipeline_GEMO tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO dnn_sources/src/pipeline_GEO.cc) -target_link_libraries(pipeline_GEO tensor_runtime) +target_link_libraries(pipeline_GEO tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM dnn_sources/src/pipeline_GEOM.cc) -target_link_libraries(pipeline_GEOM tensor_runtime) +target_link_libraries(pipeline_GEOM tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM dnn_sources/src/pipeline_GSM.cc) -target_link_libraries(pipeline_GSM tensor_runtime) +target_link_libraries(pipeline_GSM tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME dnn_sources/src/pipeline_GSME.cc) -target_link_libraries(pipeline_GSME tensor_runtime) +target_link_libraries(pipeline_GSME tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) @@ -147,101 +159,101 @@ 
target_link_libraries(pipeline_GSME tensor_runtime) #add_executable(fc4_half dnn_sources/src/half/fc4_half.cc) -#target_link_libraries(fc4_half tensor_runtime) +#target_link_libraries(fc4_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #add_executable(lenet_tanh_half dnn_sources/src/half/lenet_tanh_half.cc) -#target_link_libraries(lenet_tanh_half tensor_runtime) +#target_link_libraries(lenet_tanh_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #add_executable(lenet_keras_half dnn_sources/src/half/lenet_keras_half.cc) -#target_link_libraries(lenet_keras_half tensor_runtime) +#target_link_libraries(lenet_keras_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #********* Promise API sources add_executable(lenet_promise dnn_sources/src/promise/lenet_promise.cc) -target_link_libraries(lenet_promise tensor_runtime) +target_link_libraries(lenet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # Quantized PROMISE sources add_executable(alexnet_promise dnn_sources/src/promise/alexnet_promise.cc) -target_link_libraries(alexnet_promise tensor_runtime) +target_link_libraries(alexnet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_promise dnn_sources/src/promise/alexnet2_promise.cc) -target_link_libraries(alexnet2_promise tensor_runtime) +target_link_libraries(alexnet2_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_promise dnn_sources/src/promise/resnet18_promise.cc) -target_link_libraries(resnet18_promise tensor_runtime) +target_link_libraries(resnet18_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_promise dnn_sources/src/promise/vgg16_cifar100_promise.cc) -target_link_libraries(vgg16_cifar100_promise tensor_runtime) +target_link_libraries(vgg16_cifar100_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_promise dnn_sources/src/promise/vgg16_cifar10_promise.cc) -target_link_libraries(vgg16_cifar10_promise tensor_runtime) +target_link_libraries(vgg16_cifar10_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # REF Source - BUILT After Support for SMART QUANTIZATION add_executable(mobilenet_promise dnn_sources/src/promise/mobilenet_promise.cc) -target_link_libraries(mobilenet_promise tensor_runtime) +target_link_libraries(mobilenet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_shallow_promise dnn_sources/src/promise/mobilenet_shallow_promise.cc) -target_link_libraries(mobilenet_shallow_promise tensor_runtime) +target_link_libraries(mobilenet_shallow_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #### Image Pipeline PROMISE sources add_executable(pipeline_GEMO_promise dnn_sources/src/promise/pipeline_GEMO_promise.cc) -target_link_libraries(pipeline_GEMO_promise tensor_runtime) +target_link_libraries(pipeline_GEMO_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO_promise dnn_sources/src/promise/pipeline_GEO_promise.cc) -target_link_libraries(pipeline_GEO_promise tensor_runtime) +target_link_libraries(pipeline_GEO_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM_promise dnn_sources/src/promise/pipeline_GEOM_promise.cc) -target_link_libraries(pipeline_GEOM_promise tensor_runtime) +target_link_libraries(pipeline_GEOM_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM_promise 
dnn_sources/src/promise/pipeline_GSM_promise.cc) -target_link_libraries(pipeline_GSM_promise tensor_runtime) +target_link_libraries(pipeline_GSM_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME_promise dnn_sources/src/promise/pipeline_GSME_promise.cc) -target_link_libraries(pipeline_GSME_promise tensor_runtime) +target_link_libraries(pipeline_GSME_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) ############# Promise Validation Sources ############# add_executable(alexnet_valid dnn_sources/src/promise/alexnet_valid.cc) -target_link_libraries(alexnet_valid tensor_runtime) +target_link_libraries(alexnet_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_valid dnn_sources/src/promise/alexnet2_valid.cc) -target_link_libraries(alexnet2_valid tensor_runtime) +target_link_libraries(alexnet2_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_valid dnn_sources/src/promise/resnet18_valid.cc) -target_link_libraries(resnet18_valid tensor_runtime) +target_link_libraries(resnet18_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_valid dnn_sources/src/promise/vgg16_cifar100_valid.cc) -target_link_libraries(vgg16_cifar100_valid tensor_runtime) +target_link_libraries(vgg16_cifar100_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_valid dnn_sources/src/promise/vgg16_cifar10_valid.cc) -target_link_libraries(vgg16_cifar10_valid tensor_runtime) +target_link_libraries(vgg16_cifar10_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_valid dnn_sources/src/promise/mobilenet_valid.cc) -target_link_libraries(mobilenet_valid tensor_runtime) +target_link_libraries(mobilenet_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_shallow_valid dnn_sources/src/promise/mobilenet_shallow_valid.cc) -target_link_libraries(mobilenet_shallow_valid tensor_runtime) +target_link_libraries(mobilenet_shallow_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) ##### Image pipeline validation sources add_executable(pipeline_GEMO_valid dnn_sources/src/promise/pipeline_GEMO_valid.cc) -target_link_libraries(pipeline_GEMO_valid tensor_runtime) +target_link_libraries(pipeline_GEMO_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO_valid dnn_sources/src/promise/pipeline_GEO_valid.cc) -target_link_libraries(pipeline_GEO_valid tensor_runtime) +target_link_libraries(pipeline_GEO_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM_valid dnn_sources/src/promise/pipeline_GEOM_valid.cc) -target_link_libraries(pipeline_GEOM_valid tensor_runtime) +target_link_libraries(pipeline_GEOM_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM_valid dnn_sources/src/promise/pipeline_GSM_valid.cc) -target_link_libraries(pipeline_GSM_valid tensor_runtime) +target_link_libraries(pipeline_GSM_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME_valid dnn_sources/src/promise/pipeline_GSME_valid.cc) -target_link_libraries(pipeline_GSME_valid tensor_runtime) +target_link_libraries(pipeline_GSME_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) diff --git a/llvm/projects/hpvm-tensor-rt/README.md b/llvm/projects/hpvm-tensor-rt/README.md index 
c243c6dd1662083021a642b9a088fa55f5d1ed3d..5b3e5f99d39cf5c697051fa2580eb74f207bb031 100644 --- a/llvm/projects/hpvm-tensor-rt/README.md +++ b/llvm/projects/hpvm-tensor-rt/README.md @@ -6,10 +6,25 @@ * CUDA-9.0 or above * CUBLAS-9.0 or above - often included with cuda-toolkit +## Dependent Library Builds + +```shell +cd ../gpu_profiler +mkdir lib && cd lib +cmake ../ +make + +cd ../../soc_simulator +mkdir lib && cd lib +cmake ../ +make +``` + + ## BUILD ```shell -source bin/setup_runtime_paths.sh +source bin/setup_cuda_llvm_paths.sh mkdir build cd build cmake ../ diff --git a/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh b/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh new file mode 100644 index 0000000000000000000000000000000000000000..3548f182f198724600aee855b66169a1bdf12a3a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# CUDNN Path setup +module load cuda-toolkit/9.1 +export CUDA_INCLUDE_PATH=/software/cuda-9.1/include +export CUDNN_PATH=/software/cuda-9.1/lib64/ +export LIBRARY_PATH=/software/cuda-9.1/lib64/:$LIBRARY_PATH +export LD_LIBRARY_PATH=/software/cuda-9.1/lib64/:$LD_LIBRARY_PATH + +# HPVM Path setup +export CPATH=$CPATH:/home/hsharif3/anaconda2/include/ +export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH +export LLVM_BUILD_ROOT=/home/hsharif3/Gitlab/hpvm/build/ +export LLVM_SRC_ROOT=/home/hsharif3/Gitlab/hpvm/llvm/ diff --git a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py index 2c57eaf5be7c09a05859221535a7aff709330fcf..e3b94082f5be7b83a1598625afd5ef05a0472506 100644 --- a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py +++ b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py @@ -65,7 +65,7 @@ class TableGenerator: 3.
Writes the internal table to <network_name>_tensors.txt file and uses the <network_name>_ops.txt file as a guideline in terms of row order ''' - #self.__run_inputted_binaries() + self.__run_inputted_binaries() self.__build_internal_table() self.__output_table_to_file() diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h index dd66c2febd94ff1a43dd7bdf0bf9eeb799e95dd9..217f575c2a999ba468d30f4b3c972b052ea83fd9 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h @@ -22,17 +22,25 @@ void* handleTensorAddApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorAdd(input, bias); - RC->addToCurrentIterationComputeTime("tensorAdd", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorAdd", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfAdd(input, bias); - RC->addToCurrentIterationComputeTime("tensorHalfAdd", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); return t_out; } default : @@ -62,17 +70,25 @@ void* handleTensorMulApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorGemmGPU(lhs, rhs); - RC->addToCurrentIterationComputeTime("tensorGemmGPU", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfGemmGPU(lhs, rhs); - RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); return t_out; } default : @@ -104,23 +120,31 @@ void* handleTensorConvApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1); - RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second); return t_out; } case 
GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1); - RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::PERFORATION : @@ -144,12 +168,16 @@ void* handleTensorConvApproximationTuples( break; } void* t_out; + RC->resume_profiler(); t_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1, row, col); - RC->addToCurrentIterationComputeTime("tensorConvPerf", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvPerf", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvPerf", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvPerf", pinfo.second); return t_out; } default : @@ -182,23 +210,31 @@ void* handleTensorGroupConvApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorConvolution(input, filter, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, conv_groups); - RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfConvolution(input, filter, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, conv_groups); - RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second); return t_out; } default : @@ -229,19 +265,27 @@ void* handleTensorBatchNormApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, variance_ptr, epsilon); - RC->addToCurrentIterationComputeTime("tensorBatchNorm", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, variance_ptr, epsilon); - RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", 0.0); - 
RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second); return t_out; } default : @@ -271,17 +315,25 @@ void* handleTensorReluApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorRelu(input); - RC->addToCurrentIterationComputeTime("tensorRelu", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorRelu", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfRelu(input); - RC->addToCurrentIterationComputeTime("tensorHalfRelu", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); return t_out; } default : @@ -311,17 +363,25 @@ void* handleTensorClippedReluApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorRelu2(input, min, max); - RC->addToCurrentIterationComputeTime("tensorRelu2", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorRelu2", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfRelu2(input, min, max); - RC->addToCurrentIterationComputeTime("tensorHalfRelu2", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); return t_out; } default : @@ -351,17 +411,25 @@ void* handleTensorTanhApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorTanh(input); - RC->addToCurrentIterationComputeTime("tensorTanh", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorTanh", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfTanh(input); - RC->addToCurrentIterationComputeTime("tensorHalfTanh", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); + 
RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); return t_out; } default : @@ -394,25 +462,33 @@ void* handleTensorPoolingApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorPooling(input_ptr, poolFunction, window_height, window_width, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - RC->addToCurrentIterationComputeTime("tensorPooling", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorPooling", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfPooling(input_ptr, poolFunction, window_height, window_width, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - RC->addToCurrentIterationComputeTime("tensorHalfPooling", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); return t_out; } default : @@ -437,9 +513,13 @@ void* handleTensorSoftmaxApproximationTuples( //TODO: if approximation choices are added for softmax operation, // implement this like the other handle* functions void* t_out; + RC->resume_profiler(); t_out = tensorSoftmax(input_ptr); - RC->addToCurrentIterationComputeTime("tensorSoftmax", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorSoftmax", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorSoftmax", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorSoftmax", pinfo.second); return t_out; } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h index b510e3849e62d4cdb316875346b87bb1d3715215..d11f9dd5222a579f5a57633331e10088a1119808 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h @@ -2,7 +2,6 @@ #ifndef LLVM_HPVM_RT_CONTROLLER_H #define LLVM_HPVM_RT_CONTROLLER_H - #include <fstream> #include <iostream> #include <map> @@ -12,8 +11,13 @@ #include "configuration.h" +#include "profiler.h" +#include "promise_timing_model.h" + #include <sys/stat.h> +//#define ACTIVE_PROFILING + /* * Check if a file exists * Return true if the file exists, false else @@ -26,33 +30,45 @@ bool fileExists(const std::string& file) { class ProfileInfo { private: // Members - float time_total; // Total execution time of application - float time_compute; // Compute - float time_control; // Control - float time_config; // Apply configuration + double time_total; // Total execution time of application + double time_compute; // Compute + double time_control; // Control + double time_config; // Apply configuration - float energy_total; // Total energy consumed by applcation - float energy_compute; // Compute - float energy_control; // Control - float energy_config; // Apply configuration + double energy_total; // Total energy consumed by applcation + 
double energy_compute; // Compute + double energy_control; // Control + double energy_config; // Apply configuration // Execution time of one loop iteration - float time_compute_current_iteration; // Compute - float time_control_current_iteration; // Control - float time_config_current_iteration; // Apply configuration + double time_compute_current_iteration; // Compute + double time_control_current_iteration; // Control + double time_config_current_iteration; // Apply configuration // Energy comsumed by one loop iteration - float energy_compute_current_iteration; // Compute - float energy_control_current_iteration; // Control - float energy_config_current_iteration; // Apply configuration + double energy_compute_current_iteration; // Compute + double energy_control_current_iteration; // Control + double energy_config_current_iteration; // Apply configuration // Vectors, where compute time and energy information // - for each loop iteration (outer vector) // - per operation (inner vector) // (tensor operation for GPU, or whole layer for PROMISE) // is stored - std::vector< std::vector< std::pair< std::string, float > > > tensor_time_info; - std::vector< std::vector< std::pair< std::string, float > > > tensor_energy_info; + std::vector< std::vector< std::pair< std::string, double > > > tensor_time_info; + std::vector< std::vector< std::pair< std::string, double > > > tensor_energy_info; + + // Vectors, where total compute time and energy information per iteration are stored + std::vector< double > compute_time_info; + std::vector< double > compute_energy_info; + + // Vectors, where control time and energy information per iteration are stored + std::vector< double > control_time_info; + std::vector< double > control_energy_info; + + // Vectors, where control time and energy information per iteration are stored + std::vector< double > config_time_info; + std::vector< double > config_energy_info; bool in_iteration; @@ -76,8 +92,8 @@ class ProfileInfo { if (!in_iteration) { resetCurrentIterationTime(); resetCurrentIterationEnergy(); - tensor_time_info.push_back(std::vector< std::pair< std::string, float > > ()); - tensor_energy_info.push_back(std::vector< std::pair< std::string, float > > ()); + tensor_time_info.push_back(std::vector< std::pair< std::string, double > > ()); + tensor_energy_info.push_back(std::vector< std::pair< std::string, double > > ()); in_iteration = true; } } @@ -102,54 +118,63 @@ class ProfileInfo { energy_control_current_iteration + energy_config_current_iteration); + // Save current iteration counters + compute_time_info.push_back(time_compute_current_iteration); + compute_energy_info.push_back(energy_compute_current_iteration); + control_time_info.push_back(time_control_current_iteration); + control_energy_info.push_back(energy_control_current_iteration); + config_time_info.push_back(time_config_current_iteration); + config_energy_info.push_back(energy_config_current_iteration); + + // Note end of iteration in_iteration = false; } - void addToCurrentIterationComputeTime(const char *s, float t) { + void addToCurrentIterationComputeTime(const char *s, double t) { start_iteration(); time_compute_current_iteration += t; tensor_time_info.back().push_back(std::make_pair(std::string(s), t)); } - void addToCurrentIterationControlTime(float t) { + void addToCurrentIterationControlTime(double t) { start_iteration(); time_control_current_iteration += t; } - void addToCurrentIterationConfigTime(float t) { + void addToCurrentIterationConfigTime(double t) { start_iteration(); 
time_config_current_iteration += t; } - void addToCurrentIterationComputeEnergy(const char *s, float e) { + void addToCurrentIterationComputeEnergy(const char *s, double e) { start_iteration(); energy_compute_current_iteration += e; tensor_energy_info.back().push_back(std::make_pair(std::string(s), e)); } - void addToCurrentIterationControlEnergy(float e) { + void addToCurrentIterationControlEnergy(double e) { start_iteration(); energy_control_current_iteration += e; } - void addToCurrentIterationConfigEnergy(float e) { + void addToCurrentIterationConfigEnergy(double e) { start_iteration(); energy_config_current_iteration += e; } - float getTotalTime() { + double getTotalTime() { return time_total; } - float getTotalEnergy() { + double getTotalEnergy() { return energy_total; } - float getCurrentIterationComputeTime() { + double getCurrentIterationComputeTime() { return time_compute_current_iteration; } - float getCurrentIterationComputeEnergy() { + double getCurrentIterationComputeEnergy() { return energy_compute_current_iteration; } @@ -168,10 +193,18 @@ class ProfileInfo { abort(); } - // By construction, time_unfo and energy_info are expected to have equal - // sizes, in outer and inner vectors both. - CUSTOM_ASSERT((tensor_time_info.size() == tensor_energy_info.size()) && - "time_info and energy_info size: iteration number does not match."); + // By construction, tensor_time_info and tensor_energy_info are expected + // to have equal sizes, in outer and inner vectors both, + // and all time_info and energy_info vectors must have the same size. + unsigned iterations = tensor_time_info.size(); + CUSTOM_ASSERT((tensor_time_info.size() == iterations) && + (tensor_energy_info.size() == iterations) && + (control_time_info.size() == iterations) && + (control_energy_info.size() == iterations) && + (config_time_info.size() == iterations) && + (config_energy_info.size() == iterations) && + "time_info and energy_info size: \ + iteration number does not match."); for (unsigned i = 0; i < tensor_time_info.size(); i++ ) { // time_info.size() == energy_info.size(), since we passed the assertion @@ -187,8 +220,16 @@ class ProfileInfo { << tensor_time_info[i][j].second << " " << tensor_energy_info[i][j].second << "\n"; } + + s_out << "\nIteration Compute Time : " << compute_time_info[i] << "\n"; + s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n"; + s_out << "Iteration Control Time : " << control_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n"; + s_out << "Iteration Config Time : " << config_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n"; + } - s_out << "\nTotal Compute Time : " << time_compute << "\n"; + s_out << "\n\nTotal Compute Time : " << time_compute << "\n"; s_out << "Total Compute Energy: " << energy_compute << "\n"; s_out << "\nTotal Control Time : " << time_control << "\n"; @@ -244,9 +285,23 @@ class RuntimeController { std::vector<struct Configuration *> *Configurations; unsigned configurationIdx = 0; - ProfileInfo PI; + /*** Objects used to gather timing and energy information for execution ***/ + ProfileInfo *PI; + Profiler *profiler; + Promise *promise; //Functions + + // Private functions of profiler + void start_profiler() { + if (profiler) + profiler->start_profiler(); + } + void stop_profiler() { + if (profiler) + profiler->stop_profiler(); + } + void setProfileInfoFilename(const char *); void readQuantizationFile(const char *); void readConfigurationFile(const char 
*); @@ -305,54 +360,148 @@ class RuntimeController { printConfigurations(ThreeDCurveConfigurations); configurationIdx = 0; //TODO: initialize using pareto curve - findTargetConfiguration ? Configurations = &SpeedupConfigurations; + + // Start profiling thread in the background, ready to time + start_profiler(); + pause_profiler(); + reset_profiler(); } - // Transferring functionality of ProfileInfo + // Exposing functionality of ProfileInfo void end_iteration() { - PI.end_iteration(); + if (PI) + PI->end_iteration(); } - void addToCurrentIterationComputeTime(const char *s, float t) { - PI.addToCurrentIterationComputeTime(s, t); + void addToCurrentIterationComputeTime(const char *s, double t) { + if (PI) + PI->addToCurrentIterationComputeTime(s, t); } - void addToCurrentIterationControlTime(float t) { - PI.addToCurrentIterationControlTime(t); + void addToCurrentIterationControlTime(double t) { + if (PI) + PI->addToCurrentIterationControlTime(t); } - void addToCurrentIterationConfigTime(float t) { - PI.addToCurrentIterationConfigTime(t); + void addToCurrentIterationConfigTime(double t) { + if (PI) + PI->addToCurrentIterationConfigTime(t); } - void addToCurrentIterationComputeEnergy(const char *s, float e) { - PI.addToCurrentIterationComputeEnergy(s, e); + void addToCurrentIterationComputeEnergy(const char *s, double e) { + if (PI) + PI->addToCurrentIterationComputeEnergy(s, e); } - void addToCurrentIterationControlEnergy(float e) { - PI.addToCurrentIterationControlEnergy(e); + void addToCurrentIterationControlEnergy(double e) { + if (PI) + PI->addToCurrentIterationControlEnergy(e); } - void addToCurrentIterationConfigEnergy(float e) { - PI.addToCurrentIterationConfigEnergy(e); + void addToCurrentIterationConfigEnergy(double e) { + if (PI) + PI->addToCurrentIterationConfigEnergy(e); } - float getCurrentIterationComputeTime() { - return PI.getCurrentIterationComputeTime(); + double getCurrentIterationComputeTime() { + return (PI ? PI->getCurrentIterationComputeTime() : 0.0) ; } - float getCurrentIterationComputeEnergy() { - return PI.getCurrentIterationComputeEnergy(); + double getCurrentIterationComputeEnergy() { + return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0) ; } void writeProfileInfo() { - PI.printToFile(); + if (PI) + PI->printToFile(); + } + + // Exposing functionality of (gpu) profiler + void resume_profiler() { + if (profiler) + profiler->resume_profiler(); + } + + void pause_profiler() { + if (profiler) + profiler->pause_profiler(); + } + + void reset_profiler() { + if (profiler) + profiler->reset(); + } + + std::pair<double, double> get_time_energy() const { + return (profiler ? profiler->get_time_energy() + : std::make_pair(0.0, 0.0)) ; + } + + // Exposing functionality of promise simulator + std::pair<double, double> fc_profile(const unsigned num_rows_a, + const unsigned num_cols_a, + const unsigned num_rows_b, + const unsigned num_cols_b, + const unsigned voltage_swing, + const unsigned patch_factor) { + return (promise ? promise->fc_profile(num_rows_a, + num_cols_a, + num_rows_b, + num_cols_b, + voltage_swing, + patch_factor) + : std::make_pair(0.0, 0.0)) ; } + std::pair<double, double> conv_profile(const unsigned n, + const unsigned c, + const unsigned h, + const unsigned w, + const unsigned c_out, + const unsigned c_in, + const unsigned k_h, + const unsigned k_w, + const unsigned s_h, + const unsigned s_w, + const unsigned voltage_swing, + const unsigned patch_factor) { + return (promise ? 
promise->conv_profile(n, c, h, w, + c_out, c_in, k_h, k_w, + s_h, s_w, + voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)) ; + } + + // Constructor and destructor RuntimeController() { configurationIdx = 0; +#ifdef ACTIVE_PROFILING + PI = new ProfileInfo(); + profiler = new Profiler(); + promise = new Promise(); +#else + PI = NULL; + profiler = NULL; + promise = NULL; +#endif + } ~RuntimeController() { + + stop_profiler(); + writeProfileInfo(); + + if (PI) { + delete PI; + } + if (profiler) { + delete profiler; + } + if (promise) { + delete promise; + } + for (std::vector<struct Configuration>::iterator it = InitialConfigurations.begin(), ie = InitialConfigurations.end(); it != ie; ++it) { std::map<std::string, NodeConfiguration * > ConfSetup = it->setup; @@ -377,21 +526,23 @@ void RuntimeController::setProfileInfoFilename(const char *str) { - std::string file_path = std::string(str); - size_t idx = file_path.find_last_of("/"); - file_path.erase(idx + 1); - file_path.append("profile_info_"); - - bool found = false; - std::string profile_filename; - for (unsigned i = 0; !found; i++) { - profile_filename = file_path; - profile_filename.append(std::to_string(i)); - profile_filename.append(".txt"); - found = !fileExists(profile_filename); - } + if (PI) { + std::string file_path = std::string(str); + size_t idx = file_path.find_last_of("/"); + file_path.erase(idx + 1); + file_path.append("profile_info_"); + + bool found = false; + std::string profile_filename; + for (unsigned i = 0; !found; i++) { + profile_filename = file_path; + profile_filename.append(std::to_string(i)); + profile_filename.append(".txt"); + found = !fileExists(profile_filename); + } - PI.set_out_file_name(profile_filename); + PI->set_out_file_name(profile_filename); + } } #define NODE_NAME_BUFFER_SIZE 10 @@ -989,14 +1140,15 @@ float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr) { void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end) { - //resume_profiler(); + RC->resume_profiler(); + uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); // Read stats for iteration that was just completed - float current_iteration_time = RC->getCurrentIterationComputeTime(); - float current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); RC->findNextConfiguration(); // Still use findNext configuration, to update the configurationIdx, @@ -1005,13 +1157,11 @@ void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end float goalVal = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss; RC->findTargetConfiguration(goalVal, k); - //pause_profiler(); - //std::pair<float, float> pi = get(); - //reset(); - //RC->addToCurrentIterationControlTime(pi.first); - //RC->addToCurrentIterationControlEnergy(pi.second); - RC->addToCurrentIterationControlTime(0.0); - RC->addToCurrentIterationControlEnergy(0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); INFO("current iteration time = %f, current iteration energy = %f\n\n", current_iteration_time, current_iteration_energy); diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu 
b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu index 220bb6dda4562eabf52fd0777866325710ac591e..67c56aec3caf63b40e0ab499fa87a5e8cacfca6e 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu @@ -1676,6 +1676,24 @@ void* wrapper_ConvLayer(const char* hpvm_node_id, int param = approxTuples[0].second; if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) { DEBUG("Approximation choice for ConvLayer: swing level %d\n", param); + + struct Tensor* input_tensor_cast = (struct Tensor*) input; + struct Tensor* filter_tensor_cast = (struct Tensor*) filter; + std::pair<double, double> pinfo = + RC->conv_profile(input_tensor_cast->dims.dim_sizes[0], //n + input_tensor_cast->dims.dim_sizes[1], //c + input_tensor_cast->dims.dim_sizes[2], //h + input_tensor_cast->dims.dim_sizes[3], //w + filter_tensor_cast->dims.dim_sizes[0], //c_out + filter_tensor_cast->dims.dim_sizes[1], //c_in + filter_tensor_cast->dims.dim_sizes[2], //k_h + filter_tensor_cast->dims.dim_sizes[3], //k_w + conv_stride_h, //s_h + conv_stride_w, //s_w + param, //voltage_swing + 1 /*patch_factor*/); + RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", pinfo.first); + RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", pinfo.second); void* t_out; t_out = ConvLayer_PROMISE(input, QRanges[0], QRanges[1], filter, QRanges[2], QRanges[3], @@ -1685,8 +1703,7 @@ void* wrapper_ConvLayer(const char* hpvm_node_id, pool_id, pool_size, activation_id, QRanges[6], QRanges[7], param); - RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", 0.0); - RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", 0.0); + return t_out; } else { CUSTOM_ASSERT(false && "Unknown approximation type"); @@ -1887,14 +1904,31 @@ void* wrapper_FCLayer(const char* hpvm_node_id, int param = approxTuples[0].second; if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) { DEBUG("Approximation choice for FCLayer: swing level %d\n", param); + + struct Tensor* input_tensor_cast = (struct Tensor*) input; + struct Tensor* weights_tensor_cast = (struct Tensor*) weights; + CUSTOM_ASSERT((input_tensor_cast->dims.dim_sizes[1] * + input_tensor_cast->dims.dim_sizes[2] * + input_tensor_cast->dims.dim_sizes[3] == + weights_tensor_cast->dims.dim_sizes[2]) && + "Dimensions for matrix multiplication do not match."); + std::pair<double, double> pinfo = + RC->fc_profile(input_tensor_cast->dims.dim_sizes[0], //num_rows_a, + input_tensor_cast->dims.dim_sizes[1] * + input_tensor_cast->dims.dim_sizes[2] * + input_tensor_cast->dims.dim_sizes[3], //num_cols_a, + weights_tensor_cast->dims.dim_sizes[2], //num_rows_b, + weights_tensor_cast->dims.dim_sizes[3], //num_cols_b, + param, //voltage_swing, + 1 /*patch_factor*/); + RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", pinfo.first); + RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", pinfo.second); void* t_out; t_out = FCLayer_PROMISE(input, QRanges[0], QRanges[1], weights, QRanges[2], QRanges[3], bias, QRanges[4], QRanges[5], activation_id, QRanges[6], QRanges[7], param); - RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", 0.0); - RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", 0.0); return t_out; } else { CUSTOM_ASSERT(false && "Unknown approximation type"); diff --git a/llvm/projects/soc_simulator/src/driver_new_config.py b/llvm/projects/soc_simulator/src/driver_new_config.py new file mode 100644 index 
0000000000000000000000000000000000000000..115237dac51c96b47d02c84a603d98bdcf0b84a4 --- /dev/null +++ b/llvm/projects/soc_simulator/src/driver_new_config.py @@ -0,0 +1,328 @@ +from collections import defaultdict +import os +import subprocess +import sys + +class Driver: + fp16_swing = 8 + + class ApproxTypes: + FP16 = 0 + FP32 = 1 + PROMISE = 2 + PERF = 3 + + results_time_key = "Time" + results_energy_key = "Energy" + + + def driver(self): + self.__parse_tensor_layer_file() + self.__parse_tensor_table() + self.__run_simulations() + self.__display_results() + + + def __init__(self, layer_filename, table_filename, config_filename, results_filename): + self.__layer_filename = layer_filename + self.__table_filename = table_filename + self.__config_filename = config_filename + self.__results_filename = results_filename + + # NOTE: Use an OrderedDict if we want to search by operation name + # Using a list bc we care about the order the data is read in + # since it corresponds to the data in the configuration file + self.__tensor_layers = [] + + # [layer_name][operation_name][cols] + # Operation names need to be stored in order of insertion + self.__tensor_table = defaultdict(lambda: list(defaultdict(str))) + + # [Time/Energy][number corresponding to order the layer config was read in] = time/energy + self.__aggregate_results = defaultdict(lambda: defaultdict(float)) + self.__config_count = 0 + + + @staticmethod + def is_conv(operation_name): + return operation_name.startswith("Conv") + + + @staticmethod + def is_nml(operation_name): + return operation_name.startswith("NML") + + + @staticmethod + def is_fc(operation_name): + return operation_name.startswith("FC") + + + def __parse_tensor_layer_file(self): + if not os.path.isfile(self.__layer_filename): + print("ERROR: %s was not found." % self.__layer_filename) + exit(1) + + layer_file = open(self.__layer_filename, "r") + for line in layer_file: + layer_data = line.strip().split(',') + layer_name = layer_data[0] + + tensor_layer = defaultdict(str) + tensor_layer["Name"] = layer_name + + if Driver.is_conv(layer_name): + tensor_layer["N"] = float(layer_data[1]) + tensor_layer["Cin"] = float(layer_data[2]) + tensor_layer["H"] = float(layer_data[3]) + tensor_layer["W"] = float(layer_data[4]) + tensor_layer["Cout"] = float(layer_data[5]) + tensor_layer["Kh"] = float(layer_data[7]) + tensor_layer["Kw"] = float(layer_data[8]) + tensor_layer["Sh"] = float(layer_data[9]) + tensor_layer["Sw"] = float(layer_data[10]) + + elif Driver.is_fc(layer_name): + tensor_layer["RA"] = float(layer_data[1]) + tensor_layer["CA"] = float(layer_data[2]) + tensor_layer["RB"] = float(layer_data[3]) + tensor_layer["CB"] = float(layer_data[4]) + + elif not Driver.is_nml(layer_name): # TODO should we store data for NMLs? + print("ERROR: Invalid layer name %s" % layer_name) + exit(1) + + self.__tensor_layers.append(tensor_layer) + layer_file.close() + + + def __parse_tensor_table(self): + if not os.path.isfile(self.__table_filename): + print("ERROR: %s was not found." 
% self.__table_filename) + exit(1) + table_file = open(self.__table_filename, "r") + line = table_file.readline().strip() + + while line: + # Line here MUST be a header or there's a bug + # Get the description of the layer + assert(line.startswith("**")) + + header_contents = line.split(' ')[1:] + layer_name = header_contents[0] + num_ops = int(header_contents[1]) + col_names = header_contents[2:] + + layer_operations = [] + + # Go through all operations in the layer + for op_count in range(num_ops): + operation_data = defaultdict(str) + + line = table_file.readline().strip() + op_data = line.split(' ') + op_name = op_data[0] + operation_data["Name"] = op_name + + # Number of data items (#s) needs to match up with the # of cols + assert(len(op_data) - 1 == len(col_names)) + + # Go through all data items (each col element) per operation + for i in range(len(col_names)): + operation_data[col_names[i]] = float(op_data[i + 1]) + + layer_operations.append(operation_data) + + self.__tensor_table[layer_name] = layer_operations + line = table_file.readline().strip() + table_file.close() + + + @staticmethod + def is_promise(config_layer): + return float(config_layer.split(' ')[0]) < Driver.fp16_swing + + + def __quantize(self, curr_layer, prev_layer, h2f_f2h_operation_ind, layer_data): + if curr_layer == prev_layer or curr_layer == Driver.ApproxTypes.PROMISE \ + or prev_layer == Driver.ApproxTypes.PROMISE: # No quantization needed + return 0.0, 0.0 + + layer_name = layer_data["Name"] + + # NOTE: Ignoring logic where curr == promise or prev == promise bc + # smartDMA is always true so we'd return near the beginning of the method + + # Get h2f/f2h data using the first tensor operation in the layer + # (which is why order matters in the tensor table) + print(layer_name, self.__tensor_table[layer_name]) + tensor_op_row = self.__tensor_table[layer_name][h2f_f2h_operation_ind] + if curr_layer == Driver.ApproxTypes.FP32: + time = tensor_op_row["h2f_time"] + energy = tensor_op_row["h2f_energy"] + elif curr_layer == Driver.ApproxTypes.FP16: + time = tensor_op_row["f2h_time"] + energy = tensor_op_row["f2h_energy"] + + print("Quantization: (%f, %f)" % (time, energy)) + return (time, energy) + + + def __run_promise_simulation(self, swing, layer_data): + layer_name = layer_data["Name"] + patch_factor = 1 + + if Driver.is_conv(layer_name): + rows_a = layer_data["N"] * layer_data["H"] * layer_data["W"] \ + / (layer_data["Sh"] * layer_data["Sw"]) + cols_a = layer_data["Cin"] * layer_data["Kh"] * layer_data["Kw"] + rows_b = cols_a + cols_b = layer_data["Cout"] + patch_factor = layer_data["Kh"] * layer_data["Kw"] + elif Driver.is_fc(layer_name): + rows_a = layer_data["RA"] + cols_a = layer_data["CA"] + rows_b = cols_a + cols_b = layer_data["CB"] + else: + print("ERROR: Layer %s is neither Conv nor FC; PROMISE cannot run it." % layer_name) + exit(1) + # Run promise simulator + # TODO need to print time and energy in the ptm runner so we can pipe it + output = subprocess.Popen(["./ptm", str(rows_a), str(cols_a), str(rows_b), \ + str(cols_b), str(patch_factor), str(swing)], \ + stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0] + total_time_energy = output.strip().split(',') + + assert(len(total_time_energy) == 2) + print("PROMISE: (%s, %s)" % (total_time_energy[0], total_time_energy[1])) + return float(total_time_energy[0]), float(total_time_energy[1]) + + + def __run_simulations(self): + if not os.path.isfile(self.__config_filename): + print("ERROR: %s was not found" % self.__config_filename) + exit(1) + + config_file = 
open(self.__config_filename, "r") + + line = config_file.readline().strip() + + while line: + assert(line.startswith("+++++")) + config_name = config_file.readline().strip().split(' ')[0] # Next line = configuration name + print("CONFIGURATION") + + line = config_file.readline().strip() + layer_ind = 0 # NOTE can also use the leftmost number in the current line + + prev_layer = Driver.ApproxTypes.FP32 + curr_layer = None + + while not line.startswith("-----"): + layer_info = line.split(' ') + layer_data = self.__tensor_layers[layer_ind] + layer_name = layer_data["Name"] + + if layer_info[1] == "promise": + print("Running layer %s on PROMISE" % layer_name) + curr_layer = Driver.ApproxTypes.PROMISE + + swing = int(layer_info[3]) + time, energy = self.__run_promise_simulation(swing, layer_data) + print(time, energy) + self.__aggregate_results[Driver.results_time_key][self.__config_count] += time + self.__aggregate_results[Driver.results_energy_key][self.__config_count] += energy + + elif layer_info[1] == "gpu": + # Parse each individual tensor operation + # TODO not portable bc there can be multiple numbers after each approx later on + total_time = 0 + total_energy = 0 + + tensor_ind = 0 + for i in range(2, len(layer_info), 3): + tensor_op = layer_info[i] + approx_type = layer_info[i + 1] + approx_num = layer_info[i + 2] # only matters if perf + + if approx_type == "fp16": + curr_layer = Driver.ApproxTypes.FP16 + elif approx_type == "fp32": + curr_layer = Driver.ApproxTypes.FP32 + elif approx_type == "perf": + curr_layer = Driver.ApproxTypes.PERF + else: + assert(False) + + quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, tensor_ind, layer_data) + time, energy = self.__run_gpu_simulation(curr_layer, layer_name, tensor_ind, approx_num) + total_time += quant_time + time + total_energy += quant_energy + energy + + tensor_ind += 1 + + self.__aggregate_results[Driver.results_time_key][self.__config_count] += total_time + self.__aggregate_results[Driver.results_energy_key][self.__config_count] += total_energy + + layer_ind += 1 + line = config_file.readline().strip() + + self.__config_count += 1 + line = config_file.readline().strip() + + config_file.close() + + + def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, approx_num): + tensor_info = self.__tensor_table[layer_name][tensor_ind] + + if curr_layer == Driver.ApproxTypes.FP32: + time = tensor_info["fp32_time"] + energy = tensor_info["fp32_energy"] + + elif curr_layer == Driver.ApproxTypes.FP16: + time = tensor_info["fp16_time"] + energy = tensor_info["fp16_energy"] + + elif curr_layer == Driver.ApproxTypes.PERF: + time = tensor_info["perf%s_time" % approx_num] + energy = tensor_info["perf%s_energy" % approx_num] + + print("GPU: (%f, %f)" % (time, energy)) + return time, energy + + + def __display_results(self): + results_file = open(self.__results_filename, "w") + attributes_to_print = [Driver.results_time_key, Driver.results_energy_key] + + for attribute in attributes_to_print: + results_file.write("%s\n" % attribute) + results_file.write("Configuration,Total,Improvement\n") + + baseline_val = self.__aggregate_results[attribute][0] + print(baseline_val) + best_config = None + best_result = None + + for config_ind in range(self.__config_count): + results_file.write("c%d" % config_ind) + time_or_energy_val = self.__aggregate_results[attribute][config_ind] + + # Using repr to keep all decimal digits when writing to file + results_file.write(",%s" % repr(time_or_energy_val)) + results_file.write(",%s\n" % repr(baseline_val / 
(time_or_energy_val + 0.0001))) + + if not best_result or time_or_energy_val < best_result: + best_result = time_or_energy_val + best_config = config_ind + results_file.write("\nc%d,%s\n\n" % (best_config, repr(self.__aggregate_results[attribute][best_config]))) + results_file.close() + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("Usage: python driver_new_config.py <layer info> <tensor info> <configurations> <results file>") + exit(1) + Driver(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]).driver() diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile index 9785fb5439e8f80ebb3ad6f752835ff7d7a9bb23..30c80f2a1a65ad122681b76e531ed5d99ec8a12b 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile @@ -16,6 +16,8 @@ APP = alexnet2 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a +PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a +SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 @@ -65,10 +67,10 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc - $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS) $(BUILD_DIR):
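For reference, the new soc_simulator driver (driver_new_config.py above) is constructed with four positional file names and run via its driver() method, mirroring its __main__ block. A minimal usage sketch follows; all four file names are hypothetical placeholders, not files added by this patch:

```python
# Hypothetical invocation of the PROMISE/GPU time-and-energy simulation driver.
# The file names are placeholders: substitute the per-network layer file, the
# per-op table (e.g. the <network_name>_tensors.txt written by table_generator.py),
# and the configuration file to simulate.
from driver_new_config import Driver

Driver("alexnet2_layers.txt",    # layer info: per-layer shape parameters
       "alexnet2_tensors.txt",   # tensor info: per-op time/energy table
       "alexnet2_confs.txt",     # configurations to simulate
       "alexnet2_results.csv"    # aggregated results are written here
       ).driver()                # parse inputs, run simulations, write results
```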