From 0ea3ecb0d6fe3ffb9daeddc531c4c6879451b34f Mon Sep 17 00:00:00 2001
From: Maria Kotsifakou <kotsifa2@illinois.edu>
Date: Fri, 11 Oct 2019 17:15:42 -0500
Subject: [PATCH] Profiling in RT controller - tensor runtime

---
 llvm/projects/hpvm-tensor-rt/CMakeLists.txt   | 120 +++----
 .../include/approxhpvm_runtime_utils.h        | 160 ++++++---
 .../include/hpvm-rt-controller.h              | 306 +++++++++++++-----
 .../tensor_runtime/src/tensor_runtime.cu      |  42 ++-
 .../benchmarks/alexnet2/Makefile              |  10 +-
 5 files changed, 458 insertions(+), 180 deletions(-)

diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
index efffcb0bb9..0eab861bea 100644
--- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -30,6 +30,8 @@ endif()
 
 include_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/include)
 include_directories(./tensor_runtime/include)
+include_directories(../gpu_profiler/include)
+include_directories(../soc_simulator/include)
 link_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64)
 
 
@@ -40,6 +42,16 @@ cuda_add_cublas_to_target(tensor_runtime)
 # Adding new rule for building a cuDNN runtime library
 cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc)
 
+find_library(GPU_PROFILER_LIB
+    NAMES libgpu_profiler.a
+    HINTS ../gpu_profiler/lib
+)
+
+find_library(SOC_SIMULATOR_LIB
+    NAMES libpromise_profiler.a
+    HINTS ../soc_simulator/lib
+)
+
 
 if(USE_GFLAGS)
   target_link_libraries(tensor_runtime gflags cudnn -lcurand)
@@ -51,94 +63,94 @@ target_link_libraries(tensor_cpu_runtime)
 
 # Adding rule for the debugging source
 add_executable(test_ops  dnn_sources/src/test_ops.cc)
-target_link_libraries(test_ops  tensor_runtime)
+target_link_libraries(test_ops  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 #**** CPU sources
 add_executable(fc2_cpu  dnn_sources/src/fc2_cpu.cc)
-target_link_libraries(fc2_cpu  tensor_cpu_runtime)
+target_link_libraries(fc2_cpu  tensor_cpu_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 
 # Full-Precision versions
 add_executable(lenet_tanh  dnn_sources/src/lenet2_tanh.cc)
-target_link_libraries(lenet_tanh  tensor_runtime)
+target_link_libraries(lenet_tanh  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(lenet_keras  dnn_sources/src/lenet_keras.cc)
-target_link_libraries(lenet_keras  tensor_runtime)
+target_link_libraries(lenet_keras  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(lenet_int32  dnn_sources/src/lenet_int32.cc)
-target_link_libraries(lenet_int32  tensor_runtime)
+target_link_libraries(lenet_int32  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 add_executable(alexnet_cifar10  dnn_sources/src/alexnet_cifar10_front.cc)
-target_link_libraries(alexnet_cifar10  tensor_runtime)
+target_link_libraries(alexnet_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(alexnet_cifar10_tuner  dnn_sources/src/alexnet_cifar10_tuner.cc)
-target_link_libraries(alexnet_cifar10_tuner  tensor_runtime)
+target_link_libraries(alexnet_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(alexnet_cifar10_approx  dnn_sources/src/alexnet_cifar10_approx.cc)
-target_link_libraries(alexnet_cifar10_approx  tensor_runtime)
+target_link_libraries(alexnet_cifar10_approx  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(alexnet2_cifar10  dnn_sources/src/alexnet2_cifar10.cc)
-target_link_libraries(alexnet2_cifar10  tensor_runtime)
+target_link_libraries(alexnet2_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(alexnet2_cifar10_tuner  dnn_sources/src/alexnet2_cifar10_tuner.cc)
-target_link_libraries(alexnet2_cifar10_tuner  tensor_runtime)
+target_link_libraries(alexnet2_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar10  dnn_sources/src/vgg16_cifar10.cc)
-target_link_libraries(vgg16_cifar10  tensor_runtime)
+target_link_libraries(vgg16_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar10_tuner  dnn_sources/src/vgg16_cifar10_tuner.cc)
-target_link_libraries(vgg16_cifar10_tuner  tensor_runtime)
+target_link_libraries(vgg16_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(resnet18_cifar10  dnn_sources/src/resnet18_cifar10.cc)
-target_link_libraries(resnet18_cifar10  tensor_runtime)
+target_link_libraries(resnet18_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(resnet18_cifar10_approx  dnn_sources/src/resnet18_cifar10_approx.cc)
-target_link_libraries(resnet18_cifar10_approx  tensor_runtime)
+target_link_libraries(resnet18_cifar10_approx  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(resnet18_cifar10_inputapprox  dnn_sources/src/resnet18_cifar10_inputapprox.cc)
-target_link_libraries(resnet18_cifar10_inputapprox  tensor_runtime)
+target_link_libraries(resnet18_cifar10_inputapprox  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(resnet18_cifar10_tuner  dnn_sources/src/resnet18_cifar10_tuner.cc)
-target_link_libraries(resnet18_cifar10_tuner  tensor_runtime)
+target_link_libraries(resnet18_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar100  dnn_sources/src/vgg16_cifar100.cc)
-target_link_libraries(vgg16_cifar100  tensor_runtime)
+target_link_libraries(vgg16_cifar100  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar100_tuner  dnn_sources/src/vgg16_cifar100_tuner.cc)
-target_link_libraries(vgg16_cifar100_tuner  tensor_runtime)
+target_link_libraries(vgg16_cifar100_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar100_top5  dnn_sources/src/vgg16_cifar100_5.cc)
-target_link_libraries(vgg16_cifar100_top5  tensor_runtime)
+target_link_libraries(vgg16_cifar100_top5  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 # REF binaries
 add_executable(mobilenet_cifar10  dnn_sources/src/mobilenet_cifar10.cc)
-target_link_libraries(mobilenet_cifar10  tensor_runtime)
+target_link_libraries(mobilenet_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 add_executable(mobilenet_cifar10_shallow  dnn_sources/src/mobilenet_cifar10_shallow.cc)
-target_link_libraries(mobilenet_cifar10_shallow  tensor_runtime)
+target_link_libraries(mobilenet_cifar10_shallow  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 
 #### Image Pipeline Tuning sources
 
 add_executable(pipeline_GEMO  dnn_sources/src/pipeline_GEMO.cc)
-target_link_libraries(pipeline_GEMO  tensor_runtime)
+target_link_libraries(pipeline_GEMO  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GEO  dnn_sources/src/pipeline_GEO.cc)
-target_link_libraries(pipeline_GEO  tensor_runtime)
+target_link_libraries(pipeline_GEO  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GEOM  dnn_sources/src/pipeline_GEOM.cc)
-target_link_libraries(pipeline_GEOM  tensor_runtime)
+target_link_libraries(pipeline_GEOM  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GSM  dnn_sources/src/pipeline_GSM.cc)
-target_link_libraries(pipeline_GSM  tensor_runtime)
+target_link_libraries(pipeline_GSM  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GSME  dnn_sources/src/pipeline_GSME.cc)
-target_link_libraries(pipeline_GSME  tensor_runtime)
+target_link_libraries(pipeline_GSME  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 
@@ -147,101 +159,101 @@ target_link_libraries(pipeline_GSME  tensor_runtime)
 
 
 #add_executable(fc4_half  dnn_sources/src/half/fc4_half.cc)
-#target_link_libraries(fc4_half  tensor_runtime)
+#target_link_libraries(fc4_half  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 #add_executable(lenet_tanh_half  dnn_sources/src/half/lenet_tanh_half.cc)
-#target_link_libraries(lenet_tanh_half  tensor_runtime)
+#target_link_libraries(lenet_tanh_half  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 #add_executable(lenet_keras_half  dnn_sources/src/half/lenet_keras_half.cc)
-#target_link_libraries(lenet_keras_half  tensor_runtime)
+#target_link_libraries(lenet_keras_half  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 #********* Promise API sources
 add_executable(lenet_promise  dnn_sources/src/promise/lenet_promise.cc)
-target_link_libraries(lenet_promise  tensor_runtime)
+target_link_libraries(lenet_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 # Quantized PROMISE sources
 add_executable(alexnet_promise  dnn_sources/src/promise/alexnet_promise.cc)
-target_link_libraries(alexnet_promise  tensor_runtime)
+target_link_libraries(alexnet_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(alexnet2_promise  dnn_sources/src/promise/alexnet2_promise.cc)
-target_link_libraries(alexnet2_promise  tensor_runtime)
+target_link_libraries(alexnet2_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(resnet18_promise  dnn_sources/src/promise/resnet18_promise.cc)
-target_link_libraries(resnet18_promise  tensor_runtime)
+target_link_libraries(resnet18_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar100_promise  dnn_sources/src/promise/vgg16_cifar100_promise.cc)
-target_link_libraries(vgg16_cifar100_promise  tensor_runtime)
+target_link_libraries(vgg16_cifar100_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar10_promise  dnn_sources/src/promise/vgg16_cifar10_promise.cc)
-target_link_libraries(vgg16_cifar10_promise  tensor_runtime)
+target_link_libraries(vgg16_cifar10_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 # REF Source - BUILT After Support for SMART QUANTIZATION
 add_executable(mobilenet_promise  dnn_sources/src/promise/mobilenet_promise.cc)
-target_link_libraries(mobilenet_promise  tensor_runtime)
+target_link_libraries(mobilenet_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(mobilenet_shallow_promise  dnn_sources/src/promise/mobilenet_shallow_promise.cc)
-target_link_libraries(mobilenet_shallow_promise  tensor_runtime)
+target_link_libraries(mobilenet_shallow_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 
 #### Image Pipeline PROMISE sources
 add_executable(pipeline_GEMO_promise  dnn_sources/src/promise/pipeline_GEMO_promise.cc)
-target_link_libraries(pipeline_GEMO_promise  tensor_runtime)
+target_link_libraries(pipeline_GEMO_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GEO_promise  dnn_sources/src/promise/pipeline_GEO_promise.cc)
-target_link_libraries(pipeline_GEO_promise  tensor_runtime)
+target_link_libraries(pipeline_GEO_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GEOM_promise  dnn_sources/src/promise/pipeline_GEOM_promise.cc)
-target_link_libraries(pipeline_GEOM_promise  tensor_runtime)
+target_link_libraries(pipeline_GEOM_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GSM_promise  dnn_sources/src/promise/pipeline_GSM_promise.cc)
-target_link_libraries(pipeline_GSM_promise  tensor_runtime)
+target_link_libraries(pipeline_GSM_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GSME_promise  dnn_sources/src/promise/pipeline_GSME_promise.cc)
-target_link_libraries(pipeline_GSME_promise  tensor_runtime)
+target_link_libraries(pipeline_GSME_promise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 
 #############  Promise Validation Sources #############
 
 add_executable(alexnet_valid  dnn_sources/src/promise/alexnet_valid.cc)
-target_link_libraries(alexnet_valid  tensor_runtime)
+target_link_libraries(alexnet_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(alexnet2_valid  dnn_sources/src/promise/alexnet2_valid.cc)
-target_link_libraries(alexnet2_valid  tensor_runtime)
+target_link_libraries(alexnet2_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(resnet18_valid  dnn_sources/src/promise/resnet18_valid.cc)
-target_link_libraries(resnet18_valid  tensor_runtime)
+target_link_libraries(resnet18_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar100_valid  dnn_sources/src/promise/vgg16_cifar100_valid.cc)
-target_link_libraries(vgg16_cifar100_valid  tensor_runtime)
+target_link_libraries(vgg16_cifar100_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar10_valid  dnn_sources/src/promise/vgg16_cifar10_valid.cc)
-target_link_libraries(vgg16_cifar10_valid  tensor_runtime)
+target_link_libraries(vgg16_cifar10_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 add_executable(mobilenet_valid  dnn_sources/src/promise/mobilenet_valid.cc)
-target_link_libraries(mobilenet_valid  tensor_runtime)
+target_link_libraries(mobilenet_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(mobilenet_shallow_valid  dnn_sources/src/promise/mobilenet_shallow_valid.cc)
-target_link_libraries(mobilenet_shallow_valid  tensor_runtime)
+target_link_libraries(mobilenet_shallow_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 ##### Image pipeline validation sources
 add_executable(pipeline_GEMO_valid  dnn_sources/src/promise/pipeline_GEMO_valid.cc)
-target_link_libraries(pipeline_GEMO_valid  tensor_runtime)
+target_link_libraries(pipeline_GEMO_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GEO_valid  dnn_sources/src/promise/pipeline_GEO_valid.cc)
-target_link_libraries(pipeline_GEO_valid  tensor_runtime)
+target_link_libraries(pipeline_GEO_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GEOM_valid  dnn_sources/src/promise/pipeline_GEOM_valid.cc)
-target_link_libraries(pipeline_GEOM_valid  tensor_runtime)
+target_link_libraries(pipeline_GEOM_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GSM_valid  dnn_sources/src/promise/pipeline_GSM_valid.cc)
-target_link_libraries(pipeline_GSM_valid  tensor_runtime)
+target_link_libraries(pipeline_GSM_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(pipeline_GSME_valid  dnn_sources/src/promise/pipeline_GSME_valid.cc)
-target_link_libraries(pipeline_GSME_valid  tensor_runtime)
+target_link_libraries(pipeline_GSME_valid  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
index dd66c2febd..217f575c2a 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
@@ -22,17 +22,25 @@ void* handleTensorAddApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorAdd(input, bias);
-        RC->addToCurrentIterationComputeTime("tensorAdd", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorAdd", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfAdd(input, bias);
-        RC->addToCurrentIterationComputeTime("tensorHalfAdd", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
         return t_out;
         }
       default :
@@ -62,17 +70,25 @@ void* handleTensorMulApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorGemmGPU(lhs, rhs);
-        RC->addToCurrentIterationComputeTime("tensorGemmGPU", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfGemmGPU(lhs, rhs);
-        RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
         return t_out;
         }
       default :
@@ -104,23 +120,31 @@ void* handleTensorConvApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorConvolution(input, filter,
                                   conv_pad_h, conv_pad_w,
                                   conv_stride_h, conv_stride_w,
                                   1, 1);
-        RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfConvolution(input, filter,
                                       conv_pad_h, conv_pad_w,
                                       conv_stride_h, conv_stride_w,
                                       1, 1);
-        RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::PERFORATION :
@@ -144,12 +168,16 @@ void* handleTensorConvApproximationTuples(
               break;
           }
           void* t_out;
+          RC->resume_profiler();
           t_out = tensorConvPerf(input, filter,
                                  conv_pad_h, conv_pad_w,
                                  conv_stride_h, conv_stride_w,
                                  1, 1, row, col);
-          RC->addToCurrentIterationComputeTime("tensorConvPerf", 0.0);
-          RC->addToCurrentIterationComputeEnergy("tensorConvPerf", 0.0);
+          RC->pause_profiler();
+          std::pair<double, double> pinfo = RC->get_time_energy();
+          RC->reset_profiler();
+          RC->addToCurrentIterationComputeTime("tensorConvPerf", pinfo.first);
+          RC->addToCurrentIterationComputeEnergy("tensorConvPerf", pinfo.second);
           return t_out;
         }
       default :
@@ -182,23 +210,31 @@ void* handleTensorGroupConvApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorConvolution(input, filter,
                                   vertical_pad, horizontal_pad,
                                   vertical_stride, horizontal_stride,
                                   conv_mode, conv_groups);
-        RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfConvolution(input, filter,
                                       vertical_pad, horizontal_pad,
                                       vertical_stride, horizontal_stride,
                                       conv_mode, conv_groups);
-        RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second);
         return t_out;
         }
       default :
@@ -229,19 +265,27 @@ void* handleTensorBatchNormApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr,
                                mean_ptr, variance_ptr, epsilon);
-        RC->addToCurrentIterationComputeTime("tensorBatchNorm", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr,
                                    mean_ptr, variance_ptr, epsilon);
-        RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second);
         return t_out;
         }
       default :
@@ -271,17 +315,25 @@ void* handleTensorReluApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorRelu(input);
-        RC->addToCurrentIterationComputeTime("tensorRelu", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfRelu(input);
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
         return t_out;
         }
       default :
@@ -311,17 +363,25 @@ void* handleTensorClippedReluApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorRelu2(input, min, max);
-        RC->addToCurrentIterationComputeTime("tensorRelu2", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu2", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfRelu2(input, min, max);
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu2", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
         return t_out;
         }
       default :
@@ -351,17 +411,25 @@ void* handleTensorTanhApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorTanh(input);
-        RC->addToCurrentIterationComputeTime("tensorTanh", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorTanh", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfTanh(input);
-        RC->addToCurrentIterationComputeTime("tensorHalfTanh", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
         return t_out;
         }
       default :
@@ -394,25 +462,33 @@ void* handleTensorPoolingApproximationTuples(
       case GPUNodeConfiguration::APPROX::FP32 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorPooling(input_ptr,
                              poolFunction,
                              window_height, window_width,
                              vertical_pad, horizontal_pad,
                              vertical_stride, horizontal_stride);
-        RC->addToCurrentIterationComputeTime("tensorPooling", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorPooling", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
         return t_out;
         }
       case GPUNodeConfiguration::APPROX::FP16 :
         {
         void* t_out;
+        RC->resume_profiler();
         t_out = tensorHalfPooling(input_ptr,
                                  poolFunction,
                                  window_height, window_width,
                                  vertical_pad, horizontal_pad,
                                  vertical_stride, horizontal_stride);
-        RC->addToCurrentIterationComputeTime("tensorHalfPooling", 0.0);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", 0.0);
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
         return t_out;
         }
       default :
@@ -437,9 +513,13 @@ void* handleTensorSoftmaxApproximationTuples(
   //TODO: if approximation choices are added for softmax operation,
   // implement this like the other handle* functions
   void* t_out;
+  RC->resume_profiler();
   t_out = tensorSoftmax(input_ptr);
-  RC->addToCurrentIterationComputeTime("tensorSoftmax", 0.0);
-  RC->addToCurrentIterationComputeEnergy("tensorSoftmax", 0.0);
+  RC->pause_profiler();
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationComputeTime("tensorSoftmax", pinfo.first);
+  RC->addToCurrentIterationComputeEnergy("tensorSoftmax", pinfo.second);
   return t_out;
 }
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
index b510e3849e..d11f9dd522 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
@@ -2,7 +2,6 @@
 #ifndef LLVM_HPVM_RT_CONTROLLER_H
 #define LLVM_HPVM_RT_CONTROLLER_H
 
-
 #include <fstream>
 #include <iostream>
 #include <map>
@@ -12,8 +11,13 @@
 
 #include "configuration.h"
 
+#include "profiler.h"
+#include "promise_timing_model.h"
+
 #include <sys/stat.h>
 
+//#define ACTIVE_PROFILING
+
 /*
  * Check if a file exists
  * Return true if the file exists, false else
@@ -26,33 +30,45 @@ bool fileExists(const std::string& file) {
 class ProfileInfo {
   private:
   // Members
-  float time_total;       // Total execution time of application
-  float time_compute;     // Compute
-  float time_control;     // Control
-  float time_config;      // Apply configuration
+  double time_total;       // Total execution time of application
+  double time_compute;     // Compute
+  double time_control;     // Control
+  double time_config;      // Apply configuration
 
-  float energy_total;     // Total energy consumed by applcation
-  float energy_compute;   // Compute
-  float energy_control;   // Control
-  float energy_config;    // Apply configuration
+  double energy_total;     // Total energy consumed by applcation
+  double energy_compute;   // Compute
+  double energy_control;   // Control
+  double energy_config;    // Apply configuration
 
   // Execution time of one loop iteration
-  float time_compute_current_iteration;   // Compute
-  float time_control_current_iteration;   // Control
-  float time_config_current_iteration;    // Apply configuration
+  double time_compute_current_iteration;   // Compute
+  double time_control_current_iteration;   // Control
+  double time_config_current_iteration;    // Apply configuration
 
   // Energy comsumed by one loop iteration
-  float energy_compute_current_iteration; // Compute
-  float energy_control_current_iteration; // Control
-  float energy_config_current_iteration;  // Apply configuration
+  double energy_compute_current_iteration; // Compute
+  double energy_control_current_iteration; // Control
+  double energy_config_current_iteration;  // Apply configuration
 
   // Vectors, where compute time and energy information
   // - for each loop iteration (outer vector)
   // - per operation (inner vector)
   //                 (tensor operation for GPU, or whole layer for PROMISE)
   // is stored
-  std::vector< std::vector< std::pair< std::string, float > > > tensor_time_info;
-  std::vector< std::vector< std::pair< std::string, float > > > tensor_energy_info;
+  std::vector< std::vector< std::pair< std::string, double > > > tensor_time_info;
+  std::vector< std::vector< std::pair< std::string, double > > > tensor_energy_info;
+
+  // Vectors, where total compute time and energy information per iteration are stored
+  std::vector< double > compute_time_info;
+  std::vector< double > compute_energy_info;
+
+  // Vectors, where control time and energy information per iteration are stored
+  std::vector< double > control_time_info;
+  std::vector< double > control_energy_info;
+
+  // Vectors, where control time and energy information per iteration are stored
+  std::vector< double > config_time_info;
+  std::vector< double > config_energy_info;
 
   bool in_iteration;
 
@@ -76,8 +92,8 @@ class ProfileInfo {
     if (!in_iteration) {
       resetCurrentIterationTime();
       resetCurrentIterationEnergy();
-      tensor_time_info.push_back(std::vector< std::pair< std::string, float > > ());
-      tensor_energy_info.push_back(std::vector< std::pair< std::string, float > > ());
+      tensor_time_info.push_back(std::vector< std::pair< std::string, double > > ());
+      tensor_energy_info.push_back(std::vector< std::pair< std::string, double > > ());
       in_iteration = true;
     }
   }
@@ -102,54 +118,63 @@ class ProfileInfo {
                      energy_control_current_iteration +
                      energy_config_current_iteration);
 
+    // Save current iteration counters
+    compute_time_info.push_back(time_compute_current_iteration);
+    compute_energy_info.push_back(energy_compute_current_iteration);
+    control_time_info.push_back(time_control_current_iteration);
+    control_energy_info.push_back(energy_control_current_iteration);
+    config_time_info.push_back(time_config_current_iteration);
+    config_energy_info.push_back(energy_config_current_iteration);
+
+    // Note end of iteration
     in_iteration = false;
   }
 
-  void addToCurrentIterationComputeTime(const char *s, float t) {
+  void addToCurrentIterationComputeTime(const char *s, double t) {
     start_iteration();
     time_compute_current_iteration += t;
     tensor_time_info.back().push_back(std::make_pair(std::string(s), t));
   }
 
-  void addToCurrentIterationControlTime(float t) {
+  void addToCurrentIterationControlTime(double t) {
     start_iteration();
     time_control_current_iteration += t;
   }
 
-  void addToCurrentIterationConfigTime(float t) {
+  void addToCurrentIterationConfigTime(double t) {
     start_iteration();
     time_config_current_iteration += t;
   }
 
-  void addToCurrentIterationComputeEnergy(const char *s, float e) {
+  void addToCurrentIterationComputeEnergy(const char *s, double e) {
     start_iteration();
     energy_compute_current_iteration += e;
     tensor_energy_info.back().push_back(std::make_pair(std::string(s), e));
   }
 
-  void addToCurrentIterationControlEnergy(float e) {
+  void addToCurrentIterationControlEnergy(double e) {
     start_iteration();
     energy_control_current_iteration += e;
   }
 
-  void addToCurrentIterationConfigEnergy(float e) {
+  void addToCurrentIterationConfigEnergy(double e) {
     start_iteration();
     energy_config_current_iteration += e;
   }
 
-  float getTotalTime() {
+  double getTotalTime() {
     return time_total;
   }
 
-  float getTotalEnergy() {
+  double getTotalEnergy() {
     return energy_total;
   }
 
-  float getCurrentIterationComputeTime() {
+  double getCurrentIterationComputeTime() {
     return time_compute_current_iteration;
   }
 
-  float getCurrentIterationComputeEnergy() {
+  double getCurrentIterationComputeEnergy() {
     return energy_compute_current_iteration;
   }
 
@@ -168,10 +193,18 @@ class ProfileInfo {
       abort();
     }
 
-    // By construction, time_unfo and energy_info are expected to have equal
-    // sizes, in outer and inner vectors both.
-    CUSTOM_ASSERT((tensor_time_info.size() == tensor_energy_info.size()) &&
-                  "time_info and energy_info size: iteration number does not match.");
+    // By construction, tensor_time_info and tensor_energy_info are expected 
+    // to have equal sizes, in outer and inner vectors both,
+    // and all time_info and energy_info vectors must have the same size.
+    unsigned iterations = tensor_time_info.size();
+    CUSTOM_ASSERT((tensor_time_info.size() == iterations) &&
+                  (tensor_energy_info.size() == iterations) &&
+                  (control_time_info.size() == iterations) &&
+                  (control_energy_info.size() == iterations) &&
+                  (config_time_info.size() == iterations) &&
+                  (config_energy_info.size() == iterations) &&
+                  "time_info and energy_info size: \
+                   iteration number does not match.");
 
     for (unsigned i = 0; i < tensor_time_info.size(); i++ ) {
       // time_info.size() == energy_info.size(), since we passed the assertion
@@ -187,8 +220,16 @@ class ProfileInfo {
               << tensor_time_info[i][j].second << " "
               << tensor_energy_info[i][j].second << "\n";
       }
+
+      s_out << "\nIteration Compute Time  : " << compute_time_info[i] << "\n";
+      s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n";
+      s_out << "Iteration Control Time  : " << control_time_info[i] << "\n";
+      s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n";
+      s_out << "Iteration Config Time  : " << config_time_info[i] << "\n";
+      s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n";
+
     }
-    s_out << "\nTotal Compute Time  : " << time_compute << "\n";
+    s_out << "\n\nTotal Compute Time  : " << time_compute << "\n";
     s_out << "Total Compute Energy: " << energy_compute << "\n";
 
     s_out << "\nTotal Control Time  : " << time_control << "\n";
@@ -244,9 +285,23 @@ class RuntimeController {
   std::vector<struct Configuration *> *Configurations;
   unsigned configurationIdx = 0;
 
-  ProfileInfo PI;
+  /*** Objects used to gather timing and energy information for execution ***/
+  ProfileInfo *PI;
+  Profiler *profiler;
+  Promise *promise;
 
   //Functions
+
+  // Private functions of profiler
+  void start_profiler() {
+    if (profiler)
+      profiler->start_profiler();
+  }
+  void stop_profiler() {
+    if (profiler)
+      profiler->stop_profiler();
+  }
+
   void setProfileInfoFilename(const char *);
   void readQuantizationFile(const char *);
   void readConfigurationFile(const char *);
@@ -305,54 +360,148 @@ class RuntimeController {
     printConfigurations(ThreeDCurveConfigurations);
     configurationIdx = 0; //TODO: initialize using pareto curve - findTargetConfiguration ?
     Configurations = &SpeedupConfigurations;
+
+    // Start profiling thread in the background, ready to time
+    start_profiler();
+    pause_profiler();
+    reset_profiler();
  }
 
-  // Transferring functionality of ProfileInfo
+  // Exposing functionality of ProfileInfo
   void end_iteration() {
-    PI.end_iteration();
+    if (PI)
+      PI->end_iteration();
   }
 
-  void addToCurrentIterationComputeTime(const char *s, float t) {
-    PI.addToCurrentIterationComputeTime(s, t);
+  void addToCurrentIterationComputeTime(const char *s, double t) {
+    if (PI)
+      PI->addToCurrentIterationComputeTime(s, t);
   }
 
-  void addToCurrentIterationControlTime(float t) {
-    PI.addToCurrentIterationControlTime(t);
+  void addToCurrentIterationControlTime(double t) {
+    if (PI)
+      PI->addToCurrentIterationControlTime(t);
   }
 
-  void addToCurrentIterationConfigTime(float t) {
-    PI.addToCurrentIterationConfigTime(t);
+  void addToCurrentIterationConfigTime(double t) {
+    if (PI)
+      PI->addToCurrentIterationConfigTime(t);
   }
 
-  void addToCurrentIterationComputeEnergy(const char *s, float e) {
-    PI.addToCurrentIterationComputeEnergy(s, e);
+  void addToCurrentIterationComputeEnergy(const char *s, double e) {
+    if (PI)
+      PI->addToCurrentIterationComputeEnergy(s, e);
   }
 
-  void addToCurrentIterationControlEnergy(float e) {
-    PI.addToCurrentIterationControlEnergy(e);
+  void addToCurrentIterationControlEnergy(double e) {
+    if (PI)
+      PI->addToCurrentIterationControlEnergy(e);
   }
 
-  void addToCurrentIterationConfigEnergy(float e) {
-    PI.addToCurrentIterationConfigEnergy(e);
+  void addToCurrentIterationConfigEnergy(double e) {
+    if (PI)
+      PI->addToCurrentIterationConfigEnergy(e);
   }
 
-  float getCurrentIterationComputeTime() {
-    return PI.getCurrentIterationComputeTime();
+  double getCurrentIterationComputeTime() {
+    return (PI ? PI->getCurrentIterationComputeTime() : 0.0) ;
   }
 
-  float getCurrentIterationComputeEnergy() {
-    return PI.getCurrentIterationComputeEnergy();
+  double getCurrentIterationComputeEnergy() {
+    return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0) ;
   }
 
   void writeProfileInfo() {
-    PI.printToFile();
+    if (PI)
+      PI->printToFile();
+  }
+
+  // Exposing functionality of (gpu) profiler
+  void resume_profiler() {
+    if (profiler)
+      profiler->resume_profiler();
+  }
+
+  void pause_profiler() {
+    if (profiler)
+      profiler->pause_profiler();
+  }
+
+  void reset_profiler() {
+    if (profiler)
+      profiler->reset();
+  }
+
+  std::pair<double, double> get_time_energy() const {
+    return (profiler ? profiler->get_time_energy()
+                     : std::make_pair(0.0, 0.0)) ;
+  }
+
+  // Exposing functionality of promise simulator
+  std::pair<double, double> fc_profile(const unsigned num_rows_a,
+                            const unsigned num_cols_a,
+                            const unsigned num_rows_b,
+                            const unsigned num_cols_b,
+                            const unsigned voltage_swing,
+                            const unsigned patch_factor) {
+    return (promise ? promise->fc_profile(num_rows_a,
+                                          num_cols_a,
+                                          num_rows_b,
+                                          num_cols_b,
+                                          voltage_swing,
+                                          patch_factor)
+                    : std::make_pair(0.0, 0.0)) ;
   }
 
+  std::pair<double, double> conv_profile(const unsigned n,
+                            const unsigned c,
+                            const unsigned h,
+                            const unsigned w,
+                            const unsigned c_out,
+                            const unsigned c_in,
+                            const unsigned k_h,
+                            const unsigned k_w,
+                            const unsigned s_h,
+                            const unsigned s_w,
+                            const unsigned voltage_swing,
+                            const unsigned patch_factor) {
+    return (promise ? promise->conv_profile(n, c, h, w,
+                                            c_out, c_in, k_h, k_w,
+                                            s_h, s_w,
+                                            voltage_swing, patch_factor)
+                    : std::make_pair(0.0, 0.0)) ;
+  }
+
+  // Constructor and descructor
   RuntimeController() {
     configurationIdx = 0;
+#ifdef ACTIVE_PROFILING
+    PI = new ProfileInfo();
+    profiler = new Profiler();
+    promise = new Promise();
+#else
+    PI = NULL;
+    profiler = NULL;
+    promise = NULL;
+#endif
+
   }
 
   ~RuntimeController() {
+
+    stop_profiler();
+    writeProfileInfo();
+
+    if (PI) {
+      delete PI;
+    }
+    if (profiler) {
+      delete profiler;
+    }
+    if (promise) {
+      delete promise;
+    }
+
     for (std::vector<struct Configuration>::iterator it = InitialConfigurations.begin(),
 	   ie = InitialConfigurations.end(); it != ie; ++it) {
       std::map<std::string, NodeConfiguration * > ConfSetup = it->setup;
@@ -377,21 +526,23 @@ class RuntimeController {
 
 void RuntimeController::setProfileInfoFilename(const char *str) {
 
-  std::string file_path = std::string(str);
-  size_t idx = file_path.find_last_of("/");
-  file_path.erase(idx + 1);
-  file_path.append("profile_info_");
-
-  bool found = false;
-  std::string profile_filename;
-  for (unsigned i = 0; !found; i++) {
-    profile_filename = file_path;
-    profile_filename.append(std::to_string(i));
-    profile_filename.append(".txt");
-    found = !fileExists(profile_filename);
-  }
+  if (PI) {
+    std::string file_path = std::string(str);
+    size_t idx = file_path.find_last_of("/");
+    file_path.erase(idx + 1);
+    file_path.append("profile_info_");
+
+    bool found = false;
+    std::string profile_filename;
+    for (unsigned i = 0; !found; i++) {
+      profile_filename = file_path;
+      profile_filename.append(std::to_string(i));
+      profile_filename.append(".txt");
+      found = !fileExists(profile_filename);
+    }
 
-  PI.set_out_file_name(profile_filename);
+    PI->set_out_file_name(profile_filename);
+  }
 }
 
 #define NODE_NAME_BUFFER_SIZE 10
@@ -989,14 +1140,15 @@ float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr) {
 
 void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end) {
 
-  //resume_profiler();
+  RC->resume_profiler();
+
   uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
 
   hpvm_rt_computeAccuracy3(labels_cached, result);
 
   // Read stats for iteration that was just completed
-  float current_iteration_time = RC->getCurrentIterationComputeTime();
-  float current_iteration_energy = RC->getCurrentIterationComputeEnergy();
+  double current_iteration_time = RC->getCurrentIterationComputeTime();
+  double current_iteration_energy = RC->getCurrentIterationComputeEnergy();
 
   RC->findNextConfiguration();
   // Still use findNext configuration, to update the configurationIdx,
@@ -1005,13 +1157,11 @@ void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end
   float goalVal = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss;
   RC->findTargetConfiguration(goalVal, k);
 
-  //pause_profiler();
-  //std::pair<float, float> pi = get();
-  //reset();
-  //RC->addToCurrentIterationControlTime(pi.first);
-  //RC->addToCurrentIterationControlEnergy(pi.second);
-  RC->addToCurrentIterationControlTime(0.0);
-  RC->addToCurrentIterationControlEnergy(0.0);
+  RC->pause_profiler();
+  std::pair<double, double> pinfo = RC->get_time_energy();
+  RC->reset_profiler();
+  RC->addToCurrentIterationControlTime(pinfo.first);
+  RC->addToCurrentIterationControlEnergy(pinfo.second);
 
   INFO("current iteration time = %f, current iteration energy = %f\n\n",
        current_iteration_time, current_iteration_energy);
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 220bb6dda4..67c56aec3c 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -1676,6 +1676,24 @@ void* wrapper_ConvLayer(const char* hpvm_node_id,
       int param = approxTuples[0].second;
       if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) {
         DEBUG("Approximation choice for ConvLayer: swing level %d\n", param);
+
+        struct Tensor* input_tensor_cast = (struct Tensor*) input;
+        struct Tensor* filter_tensor_cast = (struct Tensor*) filter;
+        std::pair<double, double> pinfo =
+          RC->conv_profile(input_tensor_cast->dims.dim_sizes[0], //n
+                           input_tensor_cast->dims.dim_sizes[1], //c
+                           input_tensor_cast->dims.dim_sizes[2], //h
+                           input_tensor_cast->dims.dim_sizes[3], //w
+                           filter_tensor_cast->dims.dim_sizes[0], //c_out
+                           filter_tensor_cast->dims.dim_sizes[1], //c_in
+                           filter_tensor_cast->dims.dim_sizes[2], //k_h
+                           filter_tensor_cast->dims.dim_sizes[3], //k_w
+                           conv_stride_h, //s_h
+                           conv_stride_w, //s_w
+                           param, //voltage_swing
+                           1 /*patch_factor*/);
+        RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", pinfo.second);
         void* t_out;
         t_out = ConvLayer_PROMISE(input, QRanges[0], QRanges[1],
                                   filter, QRanges[2], QRanges[3],
@@ -1685,8 +1703,7 @@ void* wrapper_ConvLayer(const char* hpvm_node_id,
                                   pool_id, pool_size,
                                   activation_id,
                                   QRanges[6], QRanges[7], param);
-        RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", 0.0);
-        RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", 0.0);
+
         return t_out;
       } else {
         CUSTOM_ASSERT(false && "Unknown approximation type");
@@ -1887,14 +1904,31 @@ void* wrapper_FCLayer(const char* hpvm_node_id,
       int param = approxTuples[0].second;
       if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) {
         DEBUG("Approximation choice for FCLayer: swing level %d\n", param);
+
+        struct Tensor* input_tensor_cast = (struct Tensor*) input;
+        struct Tensor* weights_tensor_cast = (struct Tensor*) weights;
+        CUSTOM_ASSERT((input_tensor_cast->dims.dim_sizes[1] *
+                       input_tensor_cast->dims.dim_sizes[2] *
+                       input_tensor_cast->dims.dim_sizes[3] ==
+                         weights_tensor_cast->dims.dim_sizes[2]) &&
+                      "Dimensions for matrix multiplication do not match.");
+        std::pair<double, double> pinfo =
+          RC->fc_profile(input_tensor_cast->dims.dim_sizes[0], //num_rows_a,
+                         input_tensor_cast->dims.dim_sizes[1] *
+                           input_tensor_cast->dims.dim_sizes[2] *
+                           input_tensor_cast->dims.dim_sizes[3], //num_cols_a,
+                         weights_tensor_cast->dims.dim_sizes[2], //num_rows_b,
+                         weights_tensor_cast->dims.dim_sizes[3], //num_cols_b,
+                         param, //voltage_swing,
+                         1 /*patch_factor*/);
+        RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", pinfo.first);
+        RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", pinfo.second);
         void* t_out;
         t_out = FCLayer_PROMISE(input, QRanges[0], QRanges[1],
                                 weights, QRanges[2], QRanges[3],
                                 bias, QRanges[4], QRanges[5],
                                 activation_id,
                                 QRanges[6], QRanges[7], param);
-        RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", 0.0);
-        RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", 0.0);
         return t_out;
       } else {
         CUSTOM_ASSERT(false && "Unknown approximation type");
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile
index 9785fb5439..30c80f2a1a 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile
@@ -16,6 +16,8 @@ APP = alexnet2
 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
 TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
 TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a
+PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a
+SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a
 TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a
 
 CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH)  -fno-exceptions -ffast-math -std=c++11 -O3
@@ -65,10 +67,10 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc
-	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS)
 	#$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS)
 
 $(BUILD_DIR):
-- 
GitLab