From 0ea3ecb0d6fe3ffb9daeddc531c4c6879451b34f Mon Sep 17 00:00:00 2001 From: Maria Kotsifakou <kotsifa2@illinois.edu> Date: Fri, 11 Oct 2019 17:15:42 -0500 Subject: [PATCH] Profiling in RT controller - tensor runtime --- llvm/projects/hpvm-tensor-rt/CMakeLists.txt | 120 +++---- .../include/approxhpvm_runtime_utils.h | 160 ++++++--- .../include/hpvm-rt-controller.h | 306 +++++++++++++----- .../tensor_runtime/src/tensor_runtime.cu | 42 ++- .../benchmarks/alexnet2/Makefile | 10 +- 5 files changed, 458 insertions(+), 180 deletions(-) diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt index efffcb0bb9..0eab861bea 100644 --- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -30,6 +30,8 @@ endif() include_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/include) include_directories(./tensor_runtime/include) +include_directories(../gpu_profiler/include) +include_directories(../soc_simulator/include) link_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64) @@ -40,6 +42,16 @@ cuda_add_cublas_to_target(tensor_runtime) # Adding new rule for building a cuDNN runtime library cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc) +find_library(GPU_PROFILER_LIB + NAMES libgpu_profiler.a + HINTS ../gpu_profiler/lib +) + +find_library(SOC_SIMULATOR_LIB + NAMES libpromise_profiler.a + HINTS ../soc_simulator/lib +) + if(USE_GFLAGS) target_link_libraries(tensor_runtime gflags cudnn -lcurand) @@ -51,94 +63,94 @@ target_link_libraries(tensor_cpu_runtime) # Adding rule for the debugging source add_executable(test_ops dnn_sources/src/test_ops.cc) -target_link_libraries(test_ops tensor_runtime) +target_link_libraries(test_ops tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #**** CPU sources add_executable(fc2_cpu dnn_sources/src/fc2_cpu.cc) -target_link_libraries(fc2_cpu tensor_cpu_runtime) +target_link_libraries(fc2_cpu tensor_cpu_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # Full-Precision versions add_executable(lenet_tanh dnn_sources/src/lenet2_tanh.cc) -target_link_libraries(lenet_tanh tensor_runtime) +target_link_libraries(lenet_tanh tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(lenet_keras dnn_sources/src/lenet_keras.cc) -target_link_libraries(lenet_keras tensor_runtime) +target_link_libraries(lenet_keras tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(lenet_int32 dnn_sources/src/lenet_int32.cc) -target_link_libraries(lenet_int32 tensor_runtime) +target_link_libraries(lenet_int32 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10 dnn_sources/src/alexnet_cifar10_front.cc) -target_link_libraries(alexnet_cifar10 tensor_runtime) +target_link_libraries(alexnet_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10_tuner dnn_sources/src/alexnet_cifar10_tuner.cc) -target_link_libraries(alexnet_cifar10_tuner tensor_runtime) +target_link_libraries(alexnet_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10_approx dnn_sources/src/alexnet_cifar10_approx.cc) -target_link_libraries(alexnet_cifar10_approx tensor_runtime) +target_link_libraries(alexnet_cifar10_approx tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_cifar10 dnn_sources/src/alexnet2_cifar10.cc) -target_link_libraries(alexnet2_cifar10 tensor_runtime) +target_link_libraries(alexnet2_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_cifar10_tuner dnn_sources/src/alexnet2_cifar10_tuner.cc) -target_link_libraries(alexnet2_cifar10_tuner tensor_runtime) +target_link_libraries(alexnet2_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10 dnn_sources/src/vgg16_cifar10.cc) -target_link_libraries(vgg16_cifar10 tensor_runtime) +target_link_libraries(vgg16_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_tuner dnn_sources/src/vgg16_cifar10_tuner.cc) -target_link_libraries(vgg16_cifar10_tuner tensor_runtime) +target_link_libraries(vgg16_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10 dnn_sources/src/resnet18_cifar10.cc) -target_link_libraries(resnet18_cifar10 tensor_runtime) +target_link_libraries(resnet18_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_approx dnn_sources/src/resnet18_cifar10_approx.cc) -target_link_libraries(resnet18_cifar10_approx tensor_runtime) +target_link_libraries(resnet18_cifar10_approx tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_inputapprox dnn_sources/src/resnet18_cifar10_inputapprox.cc) -target_link_libraries(resnet18_cifar10_inputapprox tensor_runtime) +target_link_libraries(resnet18_cifar10_inputapprox tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_tuner dnn_sources/src/resnet18_cifar10_tuner.cc) -target_link_libraries(resnet18_cifar10_tuner tensor_runtime) +target_link_libraries(resnet18_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100 dnn_sources/src/vgg16_cifar100.cc) -target_link_libraries(vgg16_cifar100 tensor_runtime) +target_link_libraries(vgg16_cifar100 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_tuner dnn_sources/src/vgg16_cifar100_tuner.cc) -target_link_libraries(vgg16_cifar100_tuner tensor_runtime) +target_link_libraries(vgg16_cifar100_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_top5 dnn_sources/src/vgg16_cifar100_5.cc) -target_link_libraries(vgg16_cifar100_top5 tensor_runtime) +target_link_libraries(vgg16_cifar100_top5 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # REF binaries add_executable(mobilenet_cifar10 dnn_sources/src/mobilenet_cifar10.cc) -target_link_libraries(mobilenet_cifar10 tensor_runtime) +target_link_libraries(mobilenet_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_cifar10_shallow dnn_sources/src/mobilenet_cifar10_shallow.cc) -target_link_libraries(mobilenet_cifar10_shallow tensor_runtime) +target_link_libraries(mobilenet_cifar10_shallow tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #### Image Pipeline Tuning sources add_executable(pipeline_GEMO dnn_sources/src/pipeline_GEMO.cc) -target_link_libraries(pipeline_GEMO tensor_runtime) +target_link_libraries(pipeline_GEMO tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO dnn_sources/src/pipeline_GEO.cc) -target_link_libraries(pipeline_GEO tensor_runtime) +target_link_libraries(pipeline_GEO tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM dnn_sources/src/pipeline_GEOM.cc) -target_link_libraries(pipeline_GEOM tensor_runtime) +target_link_libraries(pipeline_GEOM tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM dnn_sources/src/pipeline_GSM.cc) -target_link_libraries(pipeline_GSM tensor_runtime) +target_link_libraries(pipeline_GSM tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME dnn_sources/src/pipeline_GSME.cc) -target_link_libraries(pipeline_GSME tensor_runtime) +target_link_libraries(pipeline_GSME tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) @@ -147,101 +159,101 @@ target_link_libraries(pipeline_GSME tensor_runtime) #add_executable(fc4_half dnn_sources/src/half/fc4_half.cc) -#target_link_libraries(fc4_half tensor_runtime) +#target_link_libraries(fc4_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #add_executable(lenet_tanh_half dnn_sources/src/half/lenet_tanh_half.cc) -#target_link_libraries(lenet_tanh_half tensor_runtime) +#target_link_libraries(lenet_tanh_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #add_executable(lenet_keras_half dnn_sources/src/half/lenet_keras_half.cc) -#target_link_libraries(lenet_keras_half tensor_runtime) +#target_link_libraries(lenet_keras_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #********* Promise API sources add_executable(lenet_promise dnn_sources/src/promise/lenet_promise.cc) -target_link_libraries(lenet_promise tensor_runtime) +target_link_libraries(lenet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # Quantized PROMISE sources add_executable(alexnet_promise dnn_sources/src/promise/alexnet_promise.cc) -target_link_libraries(alexnet_promise tensor_runtime) +target_link_libraries(alexnet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_promise dnn_sources/src/promise/alexnet2_promise.cc) -target_link_libraries(alexnet2_promise tensor_runtime) +target_link_libraries(alexnet2_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_promise dnn_sources/src/promise/resnet18_promise.cc) -target_link_libraries(resnet18_promise tensor_runtime) +target_link_libraries(resnet18_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_promise dnn_sources/src/promise/vgg16_cifar100_promise.cc) -target_link_libraries(vgg16_cifar100_promise tensor_runtime) +target_link_libraries(vgg16_cifar100_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_promise dnn_sources/src/promise/vgg16_cifar10_promise.cc) -target_link_libraries(vgg16_cifar10_promise tensor_runtime) +target_link_libraries(vgg16_cifar10_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # REF Source - BUILT After Support for SMART QUANTIZATION add_executable(mobilenet_promise dnn_sources/src/promise/mobilenet_promise.cc) -target_link_libraries(mobilenet_promise tensor_runtime) +target_link_libraries(mobilenet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_shallow_promise dnn_sources/src/promise/mobilenet_shallow_promise.cc) -target_link_libraries(mobilenet_shallow_promise tensor_runtime) +target_link_libraries(mobilenet_shallow_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #### Image Pipeline PROMISE sources add_executable(pipeline_GEMO_promise dnn_sources/src/promise/pipeline_GEMO_promise.cc) -target_link_libraries(pipeline_GEMO_promise tensor_runtime) +target_link_libraries(pipeline_GEMO_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO_promise dnn_sources/src/promise/pipeline_GEO_promise.cc) -target_link_libraries(pipeline_GEO_promise tensor_runtime) +target_link_libraries(pipeline_GEO_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM_promise dnn_sources/src/promise/pipeline_GEOM_promise.cc) -target_link_libraries(pipeline_GEOM_promise tensor_runtime) +target_link_libraries(pipeline_GEOM_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM_promise dnn_sources/src/promise/pipeline_GSM_promise.cc) -target_link_libraries(pipeline_GSM_promise tensor_runtime) +target_link_libraries(pipeline_GSM_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME_promise dnn_sources/src/promise/pipeline_GSME_promise.cc) -target_link_libraries(pipeline_GSME_promise tensor_runtime) +target_link_libraries(pipeline_GSME_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) ############# Promise Validation Sources ############# add_executable(alexnet_valid dnn_sources/src/promise/alexnet_valid.cc) -target_link_libraries(alexnet_valid tensor_runtime) +target_link_libraries(alexnet_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_valid dnn_sources/src/promise/alexnet2_valid.cc) -target_link_libraries(alexnet2_valid tensor_runtime) +target_link_libraries(alexnet2_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_valid dnn_sources/src/promise/resnet18_valid.cc) -target_link_libraries(resnet18_valid tensor_runtime) +target_link_libraries(resnet18_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_valid dnn_sources/src/promise/vgg16_cifar100_valid.cc) -target_link_libraries(vgg16_cifar100_valid tensor_runtime) +target_link_libraries(vgg16_cifar100_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_valid dnn_sources/src/promise/vgg16_cifar10_valid.cc) -target_link_libraries(vgg16_cifar10_valid tensor_runtime) +target_link_libraries(vgg16_cifar10_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_valid dnn_sources/src/promise/mobilenet_valid.cc) -target_link_libraries(mobilenet_valid tensor_runtime) +target_link_libraries(mobilenet_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_shallow_valid dnn_sources/src/promise/mobilenet_shallow_valid.cc) -target_link_libraries(mobilenet_shallow_valid tensor_runtime) +target_link_libraries(mobilenet_shallow_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) ##### Image pipeline validation sources add_executable(pipeline_GEMO_valid dnn_sources/src/promise/pipeline_GEMO_valid.cc) -target_link_libraries(pipeline_GEMO_valid tensor_runtime) +target_link_libraries(pipeline_GEMO_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO_valid dnn_sources/src/promise/pipeline_GEO_valid.cc) -target_link_libraries(pipeline_GEO_valid tensor_runtime) +target_link_libraries(pipeline_GEO_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM_valid dnn_sources/src/promise/pipeline_GEOM_valid.cc) -target_link_libraries(pipeline_GEOM_valid tensor_runtime) +target_link_libraries(pipeline_GEOM_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM_valid dnn_sources/src/promise/pipeline_GSM_valid.cc) -target_link_libraries(pipeline_GSM_valid tensor_runtime) +target_link_libraries(pipeline_GSM_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME_valid dnn_sources/src/promise/pipeline_GSME_valid.cc) -target_link_libraries(pipeline_GSME_valid tensor_runtime) +target_link_libraries(pipeline_GSME_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h index dd66c2febd..217f575c2a 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h @@ -22,17 +22,25 @@ void* handleTensorAddApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorAdd(input, bias); - RC->addToCurrentIterationComputeTime("tensorAdd", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorAdd", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfAdd(input, bias); - RC->addToCurrentIterationComputeTime("tensorHalfAdd", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); return t_out; } default : @@ -62,17 +70,25 @@ void* handleTensorMulApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorGemmGPU(lhs, rhs); - RC->addToCurrentIterationComputeTime("tensorGemmGPU", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfGemmGPU(lhs, rhs); - RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); return t_out; } default : @@ -104,23 +120,31 @@ void* handleTensorConvApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1); - RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1); - RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::PERFORATION : @@ -144,12 +168,16 @@ void* handleTensorConvApproximationTuples( break; } void* t_out; + RC->resume_profiler(); t_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1, row, col); - RC->addToCurrentIterationComputeTime("tensorConvPerf", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvPerf", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvPerf", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvPerf", pinfo.second); return t_out; } default : @@ -182,23 +210,31 @@ void* handleTensorGroupConvApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorConvolution(input, filter, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, conv_groups); - RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfConvolution(input, filter, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, conv_groups); - RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second); return t_out; } default : @@ -229,19 +265,27 @@ void* handleTensorBatchNormApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, variance_ptr, epsilon); - RC->addToCurrentIterationComputeTime("tensorBatchNorm", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, variance_ptr, epsilon); - RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second); return t_out; } default : @@ -271,17 +315,25 @@ void* handleTensorReluApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorRelu(input); - RC->addToCurrentIterationComputeTime("tensorRelu", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorRelu", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfRelu(input); - RC->addToCurrentIterationComputeTime("tensorHalfRelu", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); return t_out; } default : @@ -311,17 +363,25 @@ void* handleTensorClippedReluApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorRelu2(input, min, max); - RC->addToCurrentIterationComputeTime("tensorRelu2", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorRelu2", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfRelu2(input, min, max); - RC->addToCurrentIterationComputeTime("tensorHalfRelu2", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); return t_out; } default : @@ -351,17 +411,25 @@ void* handleTensorTanhApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorTanh(input); - RC->addToCurrentIterationComputeTime("tensorTanh", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorTanh", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfTanh(input); - RC->addToCurrentIterationComputeTime("tensorHalfTanh", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); return t_out; } default : @@ -394,25 +462,33 @@ void* handleTensorPoolingApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorPooling(input_ptr, poolFunction, window_height, window_width, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - RC->addToCurrentIterationComputeTime("tensorPooling", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorPooling", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfPooling(input_ptr, poolFunction, window_height, window_width, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - RC->addToCurrentIterationComputeTime("tensorHalfPooling", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); return t_out; } default : @@ -437,9 +513,13 @@ void* handleTensorSoftmaxApproximationTuples( //TODO: if approximation choices are added for softmax operation, // implement this like the other handle* functions void* t_out; + RC->resume_profiler(); t_out = tensorSoftmax(input_ptr); - RC->addToCurrentIterationComputeTime("tensorSoftmax", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorSoftmax", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorSoftmax", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorSoftmax", pinfo.second); return t_out; } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h index b510e3849e..d11f9dd522 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h @@ -2,7 +2,6 @@ #ifndef LLVM_HPVM_RT_CONTROLLER_H #define LLVM_HPVM_RT_CONTROLLER_H - #include <fstream> #include <iostream> #include <map> @@ -12,8 +11,13 @@ #include "configuration.h" +#include "profiler.h" +#include "promise_timing_model.h" + #include <sys/stat.h> +//#define ACTIVE_PROFILING + /* * Check if a file exists * Return true if the file exists, false else @@ -26,33 +30,45 @@ bool fileExists(const std::string& file) { class ProfileInfo { private: // Members - float time_total; // Total execution time of application - float time_compute; // Compute - float time_control; // Control - float time_config; // Apply configuration + double time_total; // Total execution time of application + double time_compute; // Compute + double time_control; // Control + double time_config; // Apply configuration - float energy_total; // Total energy consumed by applcation - float energy_compute; // Compute - float energy_control; // Control - float energy_config; // Apply configuration + double energy_total; // Total energy consumed by applcation + double energy_compute; // Compute + double energy_control; // Control + double energy_config; // Apply configuration // Execution time of one loop iteration - float time_compute_current_iteration; // Compute - float time_control_current_iteration; // Control - float time_config_current_iteration; // Apply configuration + double time_compute_current_iteration; // Compute + double time_control_current_iteration; // Control + double time_config_current_iteration; // Apply configuration // Energy comsumed by one loop iteration - float energy_compute_current_iteration; // Compute - float energy_control_current_iteration; // Control - float energy_config_current_iteration; // Apply configuration + double energy_compute_current_iteration; // Compute + double energy_control_current_iteration; // Control + double energy_config_current_iteration; // Apply configuration // Vectors, where compute time and energy information // - for each loop iteration (outer vector) // - per operation (inner vector) // (tensor operation for GPU, or whole layer for PROMISE) // is stored - std::vector< std::vector< std::pair< std::string, float > > > tensor_time_info; - std::vector< std::vector< std::pair< std::string, float > > > tensor_energy_info; + std::vector< std::vector< std::pair< std::string, double > > > tensor_time_info; + std::vector< std::vector< std::pair< std::string, double > > > tensor_energy_info; + + // Vectors, where total compute time and energy information per iteration are stored + std::vector< double > compute_time_info; + std::vector< double > compute_energy_info; + + // Vectors, where control time and energy information per iteration are stored + std::vector< double > control_time_info; + std::vector< double > control_energy_info; + + // Vectors, where control time and energy information per iteration are stored + std::vector< double > config_time_info; + std::vector< double > config_energy_info; bool in_iteration; @@ -76,8 +92,8 @@ class ProfileInfo { if (!in_iteration) { resetCurrentIterationTime(); resetCurrentIterationEnergy(); - tensor_time_info.push_back(std::vector< std::pair< std::string, float > > ()); - tensor_energy_info.push_back(std::vector< std::pair< std::string, float > > ()); + tensor_time_info.push_back(std::vector< std::pair< std::string, double > > ()); + tensor_energy_info.push_back(std::vector< std::pair< std::string, double > > ()); in_iteration = true; } } @@ -102,54 +118,63 @@ class ProfileInfo { energy_control_current_iteration + energy_config_current_iteration); + // Save current iteration counters + compute_time_info.push_back(time_compute_current_iteration); + compute_energy_info.push_back(energy_compute_current_iteration); + control_time_info.push_back(time_control_current_iteration); + control_energy_info.push_back(energy_control_current_iteration); + config_time_info.push_back(time_config_current_iteration); + config_energy_info.push_back(energy_config_current_iteration); + + // Note end of iteration in_iteration = false; } - void addToCurrentIterationComputeTime(const char *s, float t) { + void addToCurrentIterationComputeTime(const char *s, double t) { start_iteration(); time_compute_current_iteration += t; tensor_time_info.back().push_back(std::make_pair(std::string(s), t)); } - void addToCurrentIterationControlTime(float t) { + void addToCurrentIterationControlTime(double t) { start_iteration(); time_control_current_iteration += t; } - void addToCurrentIterationConfigTime(float t) { + void addToCurrentIterationConfigTime(double t) { start_iteration(); time_config_current_iteration += t; } - void addToCurrentIterationComputeEnergy(const char *s, float e) { + void addToCurrentIterationComputeEnergy(const char *s, double e) { start_iteration(); energy_compute_current_iteration += e; tensor_energy_info.back().push_back(std::make_pair(std::string(s), e)); } - void addToCurrentIterationControlEnergy(float e) { + void addToCurrentIterationControlEnergy(double e) { start_iteration(); energy_control_current_iteration += e; } - void addToCurrentIterationConfigEnergy(float e) { + void addToCurrentIterationConfigEnergy(double e) { start_iteration(); energy_config_current_iteration += e; } - float getTotalTime() { + double getTotalTime() { return time_total; } - float getTotalEnergy() { + double getTotalEnergy() { return energy_total; } - float getCurrentIterationComputeTime() { + double getCurrentIterationComputeTime() { return time_compute_current_iteration; } - float getCurrentIterationComputeEnergy() { + double getCurrentIterationComputeEnergy() { return energy_compute_current_iteration; } @@ -168,10 +193,18 @@ class ProfileInfo { abort(); } - // By construction, time_unfo and energy_info are expected to have equal - // sizes, in outer and inner vectors both. - CUSTOM_ASSERT((tensor_time_info.size() == tensor_energy_info.size()) && - "time_info and energy_info size: iteration number does not match."); + // By construction, tensor_time_info and tensor_energy_info are expected + // to have equal sizes, in outer and inner vectors both, + // and all time_info and energy_info vectors must have the same size. + unsigned iterations = tensor_time_info.size(); + CUSTOM_ASSERT((tensor_time_info.size() == iterations) && + (tensor_energy_info.size() == iterations) && + (control_time_info.size() == iterations) && + (control_energy_info.size() == iterations) && + (config_time_info.size() == iterations) && + (config_energy_info.size() == iterations) && + "time_info and energy_info size: \ + iteration number does not match."); for (unsigned i = 0; i < tensor_time_info.size(); i++ ) { // time_info.size() == energy_info.size(), since we passed the assertion @@ -187,8 +220,16 @@ class ProfileInfo { << tensor_time_info[i][j].second << " " << tensor_energy_info[i][j].second << "\n"; } + + s_out << "\nIteration Compute Time : " << compute_time_info[i] << "\n"; + s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n"; + s_out << "Iteration Control Time : " << control_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n"; + s_out << "Iteration Config Time : " << config_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n"; + } - s_out << "\nTotal Compute Time : " << time_compute << "\n"; + s_out << "\n\nTotal Compute Time : " << time_compute << "\n"; s_out << "Total Compute Energy: " << energy_compute << "\n"; s_out << "\nTotal Control Time : " << time_control << "\n"; @@ -244,9 +285,23 @@ class RuntimeController { std::vector<struct Configuration *> *Configurations; unsigned configurationIdx = 0; - ProfileInfo PI; + /*** Objects used to gather timing and energy information for execution ***/ + ProfileInfo *PI; + Profiler *profiler; + Promise *promise; //Functions + + // Private functions of profiler + void start_profiler() { + if (profiler) + profiler->start_profiler(); + } + void stop_profiler() { + if (profiler) + profiler->stop_profiler(); + } + void setProfileInfoFilename(const char *); void readQuantizationFile(const char *); void readConfigurationFile(const char *); @@ -305,54 +360,148 @@ class RuntimeController { printConfigurations(ThreeDCurveConfigurations); configurationIdx = 0; //TODO: initialize using pareto curve - findTargetConfiguration ? Configurations = &SpeedupConfigurations; + + // Start profiling thread in the background, ready to time + start_profiler(); + pause_profiler(); + reset_profiler(); } - // Transferring functionality of ProfileInfo + // Exposing functionality of ProfileInfo void end_iteration() { - PI.end_iteration(); + if (PI) + PI->end_iteration(); } - void addToCurrentIterationComputeTime(const char *s, float t) { - PI.addToCurrentIterationComputeTime(s, t); + void addToCurrentIterationComputeTime(const char *s, double t) { + if (PI) + PI->addToCurrentIterationComputeTime(s, t); } - void addToCurrentIterationControlTime(float t) { - PI.addToCurrentIterationControlTime(t); + void addToCurrentIterationControlTime(double t) { + if (PI) + PI->addToCurrentIterationControlTime(t); } - void addToCurrentIterationConfigTime(float t) { - PI.addToCurrentIterationConfigTime(t); + void addToCurrentIterationConfigTime(double t) { + if (PI) + PI->addToCurrentIterationConfigTime(t); } - void addToCurrentIterationComputeEnergy(const char *s, float e) { - PI.addToCurrentIterationComputeEnergy(s, e); + void addToCurrentIterationComputeEnergy(const char *s, double e) { + if (PI) + PI->addToCurrentIterationComputeEnergy(s, e); } - void addToCurrentIterationControlEnergy(float e) { - PI.addToCurrentIterationControlEnergy(e); + void addToCurrentIterationControlEnergy(double e) { + if (PI) + PI->addToCurrentIterationControlEnergy(e); } - void addToCurrentIterationConfigEnergy(float e) { - PI.addToCurrentIterationConfigEnergy(e); + void addToCurrentIterationConfigEnergy(double e) { + if (PI) + PI->addToCurrentIterationConfigEnergy(e); } - float getCurrentIterationComputeTime() { - return PI.getCurrentIterationComputeTime(); + double getCurrentIterationComputeTime() { + return (PI ? PI->getCurrentIterationComputeTime() : 0.0) ; } - float getCurrentIterationComputeEnergy() { - return PI.getCurrentIterationComputeEnergy(); + double getCurrentIterationComputeEnergy() { + return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0) ; } void writeProfileInfo() { - PI.printToFile(); + if (PI) + PI->printToFile(); + } + + // Exposing functionality of (gpu) profiler + void resume_profiler() { + if (profiler) + profiler->resume_profiler(); + } + + void pause_profiler() { + if (profiler) + profiler->pause_profiler(); + } + + void reset_profiler() { + if (profiler) + profiler->reset(); + } + + std::pair<double, double> get_time_energy() const { + return (profiler ? profiler->get_time_energy() + : std::make_pair(0.0, 0.0)) ; + } + + // Exposing functionality of promise simulator + std::pair<double, double> fc_profile(const unsigned num_rows_a, + const unsigned num_cols_a, + const unsigned num_rows_b, + const unsigned num_cols_b, + const unsigned voltage_swing, + const unsigned patch_factor) { + return (promise ? promise->fc_profile(num_rows_a, + num_cols_a, + num_rows_b, + num_cols_b, + voltage_swing, + patch_factor) + : std::make_pair(0.0, 0.0)) ; } + std::pair<double, double> conv_profile(const unsigned n, + const unsigned c, + const unsigned h, + const unsigned w, + const unsigned c_out, + const unsigned c_in, + const unsigned k_h, + const unsigned k_w, + const unsigned s_h, + const unsigned s_w, + const unsigned voltage_swing, + const unsigned patch_factor) { + return (promise ? promise->conv_profile(n, c, h, w, + c_out, c_in, k_h, k_w, + s_h, s_w, + voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)) ; + } + + // Constructor and descructor RuntimeController() { configurationIdx = 0; +#ifdef ACTIVE_PROFILING + PI = new ProfileInfo(); + profiler = new Profiler(); + promise = new Promise(); +#else + PI = NULL; + profiler = NULL; + promise = NULL; +#endif + } ~RuntimeController() { + + stop_profiler(); + writeProfileInfo(); + + if (PI) { + delete PI; + } + if (profiler) { + delete profiler; + } + if (promise) { + delete promise; + } + for (std::vector<struct Configuration>::iterator it = InitialConfigurations.begin(), ie = InitialConfigurations.end(); it != ie; ++it) { std::map<std::string, NodeConfiguration * > ConfSetup = it->setup; @@ -377,21 +526,23 @@ class RuntimeController { void RuntimeController::setProfileInfoFilename(const char *str) { - std::string file_path = std::string(str); - size_t idx = file_path.find_last_of("/"); - file_path.erase(idx + 1); - file_path.append("profile_info_"); - - bool found = false; - std::string profile_filename; - for (unsigned i = 0; !found; i++) { - profile_filename = file_path; - profile_filename.append(std::to_string(i)); - profile_filename.append(".txt"); - found = !fileExists(profile_filename); - } + if (PI) { + std::string file_path = std::string(str); + size_t idx = file_path.find_last_of("/"); + file_path.erase(idx + 1); + file_path.append("profile_info_"); + + bool found = false; + std::string profile_filename; + for (unsigned i = 0; !found; i++) { + profile_filename = file_path; + profile_filename.append(std::to_string(i)); + profile_filename.append(".txt"); + found = !fileExists(profile_filename); + } - PI.set_out_file_name(profile_filename); + PI->set_out_file_name(profile_filename); + } } #define NODE_NAME_BUFFER_SIZE 10 @@ -989,14 +1140,15 @@ float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr) { void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end) { - //resume_profiler(); + RC->resume_profiler(); + uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); // Read stats for iteration that was just completed - float current_iteration_time = RC->getCurrentIterationComputeTime(); - float current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); RC->findNextConfiguration(); // Still use findNext configuration, to update the configurationIdx, @@ -1005,13 +1157,11 @@ void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end float goalVal = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss; RC->findTargetConfiguration(goalVal, k); - //pause_profiler(); - //std::pair<float, float> pi = get(); - //reset(); - //RC->addToCurrentIterationControlTime(pi.first); - //RC->addToCurrentIterationControlEnergy(pi.second); - RC->addToCurrentIterationControlTime(0.0); - RC->addToCurrentIterationControlEnergy(0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); INFO("current iteration time = %f, current iteration energy = %f\n\n", current_iteration_time, current_iteration_energy); diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu index 220bb6dda4..67c56aec3c 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu @@ -1676,6 +1676,24 @@ void* wrapper_ConvLayer(const char* hpvm_node_id, int param = approxTuples[0].second; if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) { DEBUG("Approximation choice for ConvLayer: swing level %d\n", param); + + struct Tensor* input_tensor_cast = (struct Tensor*) input; + struct Tensor* filter_tensor_cast = (struct Tensor*) filter; + std::pair<double, double> pinfo = + RC->conv_profile(input_tensor_cast->dims.dim_sizes[0], //n + input_tensor_cast->dims.dim_sizes[1], //c + input_tensor_cast->dims.dim_sizes[2], //h + input_tensor_cast->dims.dim_sizes[3], //w + filter_tensor_cast->dims.dim_sizes[0], //c_out + filter_tensor_cast->dims.dim_sizes[1], //c_in + filter_tensor_cast->dims.dim_sizes[2], //k_h + filter_tensor_cast->dims.dim_sizes[3], //k_w + conv_stride_h, //s_h + conv_stride_w, //s_w + param, //voltage_swing + 1 /*patch_factor*/); + RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", pinfo.first); + RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", pinfo.second); void* t_out; t_out = ConvLayer_PROMISE(input, QRanges[0], QRanges[1], filter, QRanges[2], QRanges[3], @@ -1685,8 +1703,7 @@ void* wrapper_ConvLayer(const char* hpvm_node_id, pool_id, pool_size, activation_id, QRanges[6], QRanges[7], param); - RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", 0.0); - RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", 0.0); + return t_out; } else { CUSTOM_ASSERT(false && "Unknown approximation type"); @@ -1887,14 +1904,31 @@ void* wrapper_FCLayer(const char* hpvm_node_id, int param = approxTuples[0].second; if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) { DEBUG("Approximation choice for FCLayer: swing level %d\n", param); + + struct Tensor* input_tensor_cast = (struct Tensor*) input; + struct Tensor* weights_tensor_cast = (struct Tensor*) weights; + CUSTOM_ASSERT((input_tensor_cast->dims.dim_sizes[1] * + input_tensor_cast->dims.dim_sizes[2] * + input_tensor_cast->dims.dim_sizes[3] == + weights_tensor_cast->dims.dim_sizes[2]) && + "Dimensions for matrix multiplication do not match."); + std::pair<double, double> pinfo = + RC->fc_profile(input_tensor_cast->dims.dim_sizes[0], //num_rows_a, + input_tensor_cast->dims.dim_sizes[1] * + input_tensor_cast->dims.dim_sizes[2] * + input_tensor_cast->dims.dim_sizes[3], //num_cols_a, + weights_tensor_cast->dims.dim_sizes[2], //num_rows_b, + weights_tensor_cast->dims.dim_sizes[3], //num_cols_b, + param, //voltage_swing, + 1 /*patch_factor*/); + RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", pinfo.first); + RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", pinfo.second); void* t_out; t_out = FCLayer_PROMISE(input, QRanges[0], QRanges[1], weights, QRanges[2], QRanges[3], bias, QRanges[4], QRanges[5], activation_id, QRanges[6], QRanges[7], param); - RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", 0.0); - RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", 0.0); return t_out; } else { CUSTOM_ASSERT(false && "Unknown approximation type"); diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile index 9785fb5439..30c80f2a1a 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile @@ -16,6 +16,8 @@ APP = alexnet2 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a +PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a +SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 @@ -65,10 +67,10 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc - $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS) $(BUILD_DIR): -- GitLab