diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt index efffcb0bb9aa466215fd825c9254ea4d91fe8b6a..0eab861beac699407114d0cae4e063f41dbdab95 100644 --- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -30,6 +30,8 @@ endif() include_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/include) include_directories(./tensor_runtime/include) +include_directories(../gpu_profiler/include) +include_directories(../soc_simulator/include) link_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64) @@ -40,6 +42,16 @@ cuda_add_cublas_to_target(tensor_runtime) # Adding new rule for building a cuDNN runtime library cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc) +find_library(GPU_PROFILER_LIB + NAMES libgpu_profiler.a + HINTS ../gpu_profiler/lib +) + +find_library(SOC_SIMULATOR_LIB + NAMES libpromise_profiler.a + HINTS ../soc_simulator/lib +) + if(USE_GFLAGS) target_link_libraries(tensor_runtime gflags cudnn -lcurand) @@ -51,94 +63,94 @@ target_link_libraries(tensor_cpu_runtime) # Adding rule for the debugging source add_executable(test_ops dnn_sources/src/test_ops.cc) -target_link_libraries(test_ops tensor_runtime) +target_link_libraries(test_ops tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #**** CPU sources add_executable(fc2_cpu dnn_sources/src/fc2_cpu.cc) -target_link_libraries(fc2_cpu tensor_cpu_runtime) +target_link_libraries(fc2_cpu tensor_cpu_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # Full-Precision versions add_executable(lenet_tanh dnn_sources/src/lenet2_tanh.cc) -target_link_libraries(lenet_tanh tensor_runtime) +target_link_libraries(lenet_tanh tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(lenet_keras dnn_sources/src/lenet_keras.cc) -target_link_libraries(lenet_keras tensor_runtime) +target_link_libraries(lenet_keras tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(lenet_int32 dnn_sources/src/lenet_int32.cc) -target_link_libraries(lenet_int32 tensor_runtime) +target_link_libraries(lenet_int32 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10 dnn_sources/src/alexnet_cifar10_front.cc) -target_link_libraries(alexnet_cifar10 tensor_runtime) +target_link_libraries(alexnet_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10_tuner dnn_sources/src/alexnet_cifar10_tuner.cc) -target_link_libraries(alexnet_cifar10_tuner tensor_runtime) +target_link_libraries(alexnet_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet_cifar10_approx dnn_sources/src/alexnet_cifar10_approx.cc) -target_link_libraries(alexnet_cifar10_approx tensor_runtime) +target_link_libraries(alexnet_cifar10_approx tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_cifar10 dnn_sources/src/alexnet2_cifar10.cc) -target_link_libraries(alexnet2_cifar10 tensor_runtime) +target_link_libraries(alexnet2_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_cifar10_tuner dnn_sources/src/alexnet2_cifar10_tuner.cc) -target_link_libraries(alexnet2_cifar10_tuner tensor_runtime) +target_link_libraries(alexnet2_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10 dnn_sources/src/vgg16_cifar10.cc) -target_link_libraries(vgg16_cifar10 tensor_runtime) +target_link_libraries(vgg16_cifar10 tensor_runtime 
${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_tuner dnn_sources/src/vgg16_cifar10_tuner.cc) -target_link_libraries(vgg16_cifar10_tuner tensor_runtime) +target_link_libraries(vgg16_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10 dnn_sources/src/resnet18_cifar10.cc) -target_link_libraries(resnet18_cifar10 tensor_runtime) +target_link_libraries(resnet18_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_approx dnn_sources/src/resnet18_cifar10_approx.cc) -target_link_libraries(resnet18_cifar10_approx tensor_runtime) +target_link_libraries(resnet18_cifar10_approx tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_inputapprox dnn_sources/src/resnet18_cifar10_inputapprox.cc) -target_link_libraries(resnet18_cifar10_inputapprox tensor_runtime) +target_link_libraries(resnet18_cifar10_inputapprox tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_cifar10_tuner dnn_sources/src/resnet18_cifar10_tuner.cc) -target_link_libraries(resnet18_cifar10_tuner tensor_runtime) +target_link_libraries(resnet18_cifar10_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100 dnn_sources/src/vgg16_cifar100.cc) -target_link_libraries(vgg16_cifar100 tensor_runtime) +target_link_libraries(vgg16_cifar100 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_tuner dnn_sources/src/vgg16_cifar100_tuner.cc) -target_link_libraries(vgg16_cifar100_tuner tensor_runtime) +target_link_libraries(vgg16_cifar100_tuner tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_top5 dnn_sources/src/vgg16_cifar100_5.cc) -target_link_libraries(vgg16_cifar100_top5 tensor_runtime) +target_link_libraries(vgg16_cifar100_top5 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # REF binaries add_executable(mobilenet_cifar10 dnn_sources/src/mobilenet_cifar10.cc) -target_link_libraries(mobilenet_cifar10 tensor_runtime) +target_link_libraries(mobilenet_cifar10 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_cifar10_shallow dnn_sources/src/mobilenet_cifar10_shallow.cc) -target_link_libraries(mobilenet_cifar10_shallow tensor_runtime) +target_link_libraries(mobilenet_cifar10_shallow tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #### Image Pipeline Tuning sources add_executable(pipeline_GEMO dnn_sources/src/pipeline_GEMO.cc) -target_link_libraries(pipeline_GEMO tensor_runtime) +target_link_libraries(pipeline_GEMO tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO dnn_sources/src/pipeline_GEO.cc) -target_link_libraries(pipeline_GEO tensor_runtime) +target_link_libraries(pipeline_GEO tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM dnn_sources/src/pipeline_GEOM.cc) -target_link_libraries(pipeline_GEOM tensor_runtime) +target_link_libraries(pipeline_GEOM tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM dnn_sources/src/pipeline_GSM.cc) -target_link_libraries(pipeline_GSM tensor_runtime) +target_link_libraries(pipeline_GSM tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME dnn_sources/src/pipeline_GSME.cc) -target_link_libraries(pipeline_GSME tensor_runtime) +target_link_libraries(pipeline_GSME tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) @@ -147,101 +159,101 @@ 
target_link_libraries(pipeline_GSME tensor_runtime) #add_executable(fc4_half dnn_sources/src/half/fc4_half.cc) -#target_link_libraries(fc4_half tensor_runtime) +#target_link_libraries(fc4_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #add_executable(lenet_tanh_half dnn_sources/src/half/lenet_tanh_half.cc) -#target_link_libraries(lenet_tanh_half tensor_runtime) +#target_link_libraries(lenet_tanh_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #add_executable(lenet_keras_half dnn_sources/src/half/lenet_keras_half.cc) -#target_link_libraries(lenet_keras_half tensor_runtime) +#target_link_libraries(lenet_keras_half tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #********* Promise API sources add_executable(lenet_promise dnn_sources/src/promise/lenet_promise.cc) -target_link_libraries(lenet_promise tensor_runtime) +target_link_libraries(lenet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # Quantized PROMISE sources add_executable(alexnet_promise dnn_sources/src/promise/alexnet_promise.cc) -target_link_libraries(alexnet_promise tensor_runtime) +target_link_libraries(alexnet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_promise dnn_sources/src/promise/alexnet2_promise.cc) -target_link_libraries(alexnet2_promise tensor_runtime) +target_link_libraries(alexnet2_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_promise dnn_sources/src/promise/resnet18_promise.cc) -target_link_libraries(resnet18_promise tensor_runtime) +target_link_libraries(resnet18_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_promise dnn_sources/src/promise/vgg16_cifar100_promise.cc) -target_link_libraries(vgg16_cifar100_promise tensor_runtime) +target_link_libraries(vgg16_cifar100_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_promise dnn_sources/src/promise/vgg16_cifar10_promise.cc) -target_link_libraries(vgg16_cifar10_promise tensor_runtime) +target_link_libraries(vgg16_cifar10_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) # REF Source - BUILT After Support for SMART QUANTIZATION add_executable(mobilenet_promise dnn_sources/src/promise/mobilenet_promise.cc) -target_link_libraries(mobilenet_promise tensor_runtime) +target_link_libraries(mobilenet_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_shallow_promise dnn_sources/src/promise/mobilenet_shallow_promise.cc) -target_link_libraries(mobilenet_shallow_promise tensor_runtime) +target_link_libraries(mobilenet_shallow_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #### Image Pipeline PROMISE sources add_executable(pipeline_GEMO_promise dnn_sources/src/promise/pipeline_GEMO_promise.cc) -target_link_libraries(pipeline_GEMO_promise tensor_runtime) +target_link_libraries(pipeline_GEMO_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO_promise dnn_sources/src/promise/pipeline_GEO_promise.cc) -target_link_libraries(pipeline_GEO_promise tensor_runtime) +target_link_libraries(pipeline_GEO_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM_promise dnn_sources/src/promise/pipeline_GEOM_promise.cc) -target_link_libraries(pipeline_GEOM_promise tensor_runtime) +target_link_libraries(pipeline_GEOM_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM_promise 
dnn_sources/src/promise/pipeline_GSM_promise.cc) -target_link_libraries(pipeline_GSM_promise tensor_runtime) +target_link_libraries(pipeline_GSM_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME_promise dnn_sources/src/promise/pipeline_GSME_promise.cc) -target_link_libraries(pipeline_GSME_promise tensor_runtime) +target_link_libraries(pipeline_GSME_promise tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) ############# Promise Validation Sources ############# add_executable(alexnet_valid dnn_sources/src/promise/alexnet_valid.cc) -target_link_libraries(alexnet_valid tensor_runtime) +target_link_libraries(alexnet_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(alexnet2_valid dnn_sources/src/promise/alexnet2_valid.cc) -target_link_libraries(alexnet2_valid tensor_runtime) +target_link_libraries(alexnet2_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(resnet18_valid dnn_sources/src/promise/resnet18_valid.cc) -target_link_libraries(resnet18_valid tensor_runtime) +target_link_libraries(resnet18_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar100_valid dnn_sources/src/promise/vgg16_cifar100_valid.cc) -target_link_libraries(vgg16_cifar100_valid tensor_runtime) +target_link_libraries(vgg16_cifar100_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(vgg16_cifar10_valid dnn_sources/src/promise/vgg16_cifar10_valid.cc) -target_link_libraries(vgg16_cifar10_valid tensor_runtime) +target_link_libraries(vgg16_cifar10_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_valid dnn_sources/src/promise/mobilenet_valid.cc) -target_link_libraries(mobilenet_valid tensor_runtime) +target_link_libraries(mobilenet_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(mobilenet_shallow_valid dnn_sources/src/promise/mobilenet_shallow_valid.cc) -target_link_libraries(mobilenet_shallow_valid tensor_runtime) +target_link_libraries(mobilenet_shallow_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) ##### Image pipeline validation sources add_executable(pipeline_GEMO_valid dnn_sources/src/promise/pipeline_GEMO_valid.cc) -target_link_libraries(pipeline_GEMO_valid tensor_runtime) +target_link_libraries(pipeline_GEMO_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEO_valid dnn_sources/src/promise/pipeline_GEO_valid.cc) -target_link_libraries(pipeline_GEO_valid tensor_runtime) +target_link_libraries(pipeline_GEO_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GEOM_valid dnn_sources/src/promise/pipeline_GEOM_valid.cc) -target_link_libraries(pipeline_GEOM_valid tensor_runtime) +target_link_libraries(pipeline_GEOM_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSM_valid dnn_sources/src/promise/pipeline_GSM_valid.cc) -target_link_libraries(pipeline_GSM_valid tensor_runtime) +target_link_libraries(pipeline_GSM_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) add_executable(pipeline_GSME_valid dnn_sources/src/promise/pipeline_GSME_valid.cc) -target_link_libraries(pipeline_GSME_valid tensor_runtime) +target_link_libraries(pipeline_GSME_valid tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) diff --git a/llvm/projects/hpvm-tensor-rt/README.md b/llvm/projects/hpvm-tensor-rt/README.md index 
c243c6dd1662083021a642b9a088fa55f5d1ed3d..5b3e5f99d39cf5c697051fa2580eb74f207bb031 100644 --- a/llvm/projects/hpvm-tensor-rt/README.md +++ b/llvm/projects/hpvm-tensor-rt/README.md @@ -6,10 +6,25 @@ * CUDA-9.0 or above * CUBLAS-9.0 or above - often included with cuda-toolkit +## Dependent Library Builds + +```shell +cd ../gpu_profiler +mkdir lib && cd lib +cmake ../ +make + +cd ../../soc_simulator +mkdir lib && cd lib +cmake ../ +make +``` + + ## BUILD ```shell -source bin/setup_runtime_paths.sh +source bin/setup_cuda_llvm_paths.sh mkdir build cd build cmake ../ diff --git a/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh b/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh new file mode 100644 index 0000000000000000000000000000000000000000..3548f182f198724600aee855b66169a1bdf12a3a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# CUDNN Path setup +module load cuda-toolkit/9.1 +export CUDA_INCLUDE_PATH=/software/cuda-9.1/include +export CUDNN_PATH=/software/cuda-9.1/lib64/ +export LIBRARY_PATH=/software/cuda-9.1/lib64/:$LIBRARY_PATH +export LD_LIBRARY_PATH=/software/cuda-9.1/lib64/:$LD_LIBRARY_PATH + +# HPVM Path setup +export CPATH=$CPATH:/home/hsharif3/anaconda2/include/ +export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH +export LLVM_BUILD_ROOT=/home/hsharif3/Gitlab/hpvm/build/ +export LLVM_SRC_ROOT=/home/hsharif3/Gitlab/hpvm/llvm/ diff --git a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py index 2c57eaf5be7c09a05859221535a7aff709330fcf..e3b94082f5be7b83a1598625afd5ef05a0472506 100644 --- a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py +++ b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py @@ -65,7 +65,7 @@ class TableGenerator: 3.
Writes the internal table to <network_name>_tensors.txt file and uses the <network_name>_ops.txt file as a guideline in terms of row order ''' - #self.__run_inputted_binaries() + self.__run_inputted_binaries() self.__build_internal_table() self.__output_table_to_file() diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h index dd66c2febd94ff1a43dd7bdf0bf9eeb799e95dd9..217f575c2a999ba468d30f4b3c972b052ea83fd9 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h @@ -22,17 +22,25 @@ void* handleTensorAddApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorAdd(input, bias); - RC->addToCurrentIterationComputeTime("tensorAdd", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorAdd", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfAdd(input, bias); - RC->addToCurrentIterationComputeTime("tensorHalfAdd", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); return t_out; } default : @@ -62,17 +70,25 @@ void* handleTensorMulApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorGemmGPU(lhs, rhs); - RC->addToCurrentIterationComputeTime("tensorGemmGPU", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfGemmGPU(lhs, rhs); - RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); return t_out; } default : @@ -104,23 +120,31 @@ void* handleTensorConvApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1); - RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second); return t_out; } case 
GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1); - RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::PERFORATION : @@ -144,12 +168,16 @@ void* handleTensorConvApproximationTuples( break; } void* t_out; + RC->resume_profiler(); t_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 1, row, col); - RC->addToCurrentIterationComputeTime("tensorConvPerf", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvPerf", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvPerf", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvPerf", pinfo.second); return t_out; } default : @@ -182,23 +210,31 @@ void* handleTensorGroupConvApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorConvolution(input, filter, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, conv_groups); - RC->addToCurrentIterationComputeTime("tensorConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvolution", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfConvolution(input, filter, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, conv_groups); - RC->addToCurrentIterationComputeTime("tensorHalfConvolution", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvolution", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvolution", pinfo.second); return t_out; } default : @@ -229,19 +265,27 @@ void* handleTensorBatchNormApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, variance_ptr, epsilon); - RC->addToCurrentIterationComputeTime("tensorBatchNorm", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, variance_ptr, epsilon); - RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", 0.0); - 
RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second); return t_out; } default : @@ -271,17 +315,25 @@ void* handleTensorReluApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorRelu(input); - RC->addToCurrentIterationComputeTime("tensorRelu", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorRelu", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfRelu(input); - RC->addToCurrentIterationComputeTime("tensorHalfRelu", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); return t_out; } default : @@ -311,17 +363,25 @@ void* handleTensorClippedReluApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorRelu2(input, min, max); - RC->addToCurrentIterationComputeTime("tensorRelu2", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorRelu2", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfRelu2(input, min, max); - RC->addToCurrentIterationComputeTime("tensorHalfRelu2", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); return t_out; } default : @@ -351,17 +411,25 @@ void* handleTensorTanhApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorTanh(input); - RC->addToCurrentIterationComputeTime("tensorTanh", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorTanh", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfTanh(input); - RC->addToCurrentIterationComputeTime("tensorHalfTanh", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); + 
RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); return t_out; } default : @@ -394,25 +462,33 @@ void* handleTensorPoolingApproximationTuples( case GPUNodeConfiguration::APPROX::FP32 : { void* t_out; + RC->resume_profiler(); t_out = tensorPooling(input_ptr, poolFunction, window_height, window_width, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - RC->addToCurrentIterationComputeTime("tensorPooling", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorPooling", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); return t_out; } case GPUNodeConfiguration::APPROX::FP16 : { void* t_out; + RC->resume_profiler(); t_out = tensorHalfPooling(input_ptr, poolFunction, window_height, window_width, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - RC->addToCurrentIterationComputeTime("tensorHalfPooling", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); return t_out; } default : @@ -437,9 +513,13 @@ void* handleTensorSoftmaxApproximationTuples( //TODO: if approximation choices are added for softmax operation, // implement this like the other handle* functions void* t_out; + RC->resume_profiler(); t_out = tensorSoftmax(input_ptr); - RC->addToCurrentIterationComputeTime("tensorSoftmax", 0.0); - RC->addToCurrentIterationComputeEnergy("tensorSoftmax", 0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorSoftmax", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorSoftmax", pinfo.second); return t_out; } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h index b510e3849e62d4cdb316875346b87bb1d3715215..d11f9dd5222a579f5a57633331e10088a1119808 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h @@ -2,7 +2,6 @@ #ifndef LLVM_HPVM_RT_CONTROLLER_H #define LLVM_HPVM_RT_CONTROLLER_H - #include <fstream> #include <iostream> #include <map> @@ -12,8 +11,13 @@ #include "configuration.h" +#include "profiler.h" +#include "promise_timing_model.h" + #include <sys/stat.h> +//#define ACTIVE_PROFILING + /* * Check if a file exists * Return true if the file exists, false else @@ -26,33 +30,45 @@ bool fileExists(const std::string& file) { class ProfileInfo { private: // Members - float time_total; // Total execution time of application - float time_compute; // Compute - float time_control; // Control - float time_config; // Apply configuration + double time_total; // Total execution time of application + double time_compute; // Compute + double time_control; // Control + double time_config; // Apply configuration - float energy_total; // Total energy consumed by applcation - float energy_compute; // Compute - float energy_control; // Control - float energy_config; // Apply configuration + double energy_total; // Total energy consumed by applcation + 
double energy_compute; // Compute + double energy_control; // Control + double energy_config; // Apply configuration // Execution time of one loop iteration - float time_compute_current_iteration; // Compute - float time_control_current_iteration; // Control - float time_config_current_iteration; // Apply configuration + double time_compute_current_iteration; // Compute + double time_control_current_iteration; // Control + double time_config_current_iteration; // Apply configuration // Energy comsumed by one loop iteration - float energy_compute_current_iteration; // Compute - float energy_control_current_iteration; // Control - float energy_config_current_iteration; // Apply configuration + double energy_compute_current_iteration; // Compute + double energy_control_current_iteration; // Control + double energy_config_current_iteration; // Apply configuration // Vectors, where compute time and energy information // - for each loop iteration (outer vector) // - per operation (inner vector) // (tensor operation for GPU, or whole layer for PROMISE) // is stored - std::vector< std::vector< std::pair< std::string, float > > > tensor_time_info; - std::vector< std::vector< std::pair< std::string, float > > > tensor_energy_info; + std::vector< std::vector< std::pair< std::string, double > > > tensor_time_info; + std::vector< std::vector< std::pair< std::string, double > > > tensor_energy_info; + + // Vectors, where total compute time and energy information per iteration are stored + std::vector< double > compute_time_info; + std::vector< double > compute_energy_info; + + // Vectors, where control time and energy information per iteration are stored + std::vector< double > control_time_info; + std::vector< double > control_energy_info; + + // Vectors, where control time and energy information per iteration are stored + std::vector< double > config_time_info; + std::vector< double > config_energy_info; bool in_iteration; @@ -76,8 +92,8 @@ class ProfileInfo { if (!in_iteration) { resetCurrentIterationTime(); resetCurrentIterationEnergy(); - tensor_time_info.push_back(std::vector< std::pair< std::string, float > > ()); - tensor_energy_info.push_back(std::vector< std::pair< std::string, float > > ()); + tensor_time_info.push_back(std::vector< std::pair< std::string, double > > ()); + tensor_energy_info.push_back(std::vector< std::pair< std::string, double > > ()); in_iteration = true; } } @@ -102,54 +118,63 @@ class ProfileInfo { energy_control_current_iteration + energy_config_current_iteration); + // Save current iteration counters + compute_time_info.push_back(time_compute_current_iteration); + compute_energy_info.push_back(energy_compute_current_iteration); + control_time_info.push_back(time_control_current_iteration); + control_energy_info.push_back(energy_control_current_iteration); + config_time_info.push_back(time_config_current_iteration); + config_energy_info.push_back(energy_config_current_iteration); + + // Note end of iteration in_iteration = false; } - void addToCurrentIterationComputeTime(const char *s, float t) { + void addToCurrentIterationComputeTime(const char *s, double t) { start_iteration(); time_compute_current_iteration += t; tensor_time_info.back().push_back(std::make_pair(std::string(s), t)); } - void addToCurrentIterationControlTime(float t) { + void addToCurrentIterationControlTime(double t) { start_iteration(); time_control_current_iteration += t; } - void addToCurrentIterationConfigTime(float t) { + void addToCurrentIterationConfigTime(double t) { start_iteration(); 
time_config_current_iteration += t; } - void addToCurrentIterationComputeEnergy(const char *s, float e) { + void addToCurrentIterationComputeEnergy(const char *s, double e) { start_iteration(); energy_compute_current_iteration += e; tensor_energy_info.back().push_back(std::make_pair(std::string(s), e)); } - void addToCurrentIterationControlEnergy(float e) { + void addToCurrentIterationControlEnergy(double e) { start_iteration(); energy_control_current_iteration += e; } - void addToCurrentIterationConfigEnergy(float e) { + void addToCurrentIterationConfigEnergy(double e) { start_iteration(); energy_config_current_iteration += e; } - float getTotalTime() { + double getTotalTime() { return time_total; } - float getTotalEnergy() { + double getTotalEnergy() { return energy_total; } - float getCurrentIterationComputeTime() { + double getCurrentIterationComputeTime() { return time_compute_current_iteration; } - float getCurrentIterationComputeEnergy() { + double getCurrentIterationComputeEnergy() { return energy_compute_current_iteration; } @@ -168,10 +193,18 @@ class ProfileInfo { abort(); } - // By construction, time_unfo and energy_info are expected to have equal - // sizes, in outer and inner vectors both. - CUSTOM_ASSERT((tensor_time_info.size() == tensor_energy_info.size()) && - "time_info and energy_info size: iteration number does not match."); + // By construction, tensor_time_info and tensor_energy_info are expected + // to have equal sizes, in outer and inner vectors both, + // and all time_info and energy_info vectors must have the same size. + unsigned iterations = tensor_time_info.size(); + CUSTOM_ASSERT((tensor_time_info.size() == iterations) && + (tensor_energy_info.size() == iterations) && + (control_time_info.size() == iterations) && + (control_energy_info.size() == iterations) && + (config_time_info.size() == iterations) && + (config_energy_info.size() == iterations) && + "time_info and energy_info size: \ + iteration number does not match."); for (unsigned i = 0; i < tensor_time_info.size(); i++ ) { // time_info.size() == energy_info.size(), since we passed the assertion @@ -187,8 +220,16 @@ class ProfileInfo { << tensor_time_info[i][j].second << " " << tensor_energy_info[i][j].second << "\n"; } + + s_out << "\nIteration Compute Time : " << compute_time_info[i] << "\n"; + s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n"; + s_out << "Iteration Control Time : " << control_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n"; + s_out << "Iteration Config Time : " << config_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n"; + } - s_out << "\nTotal Compute Time : " << time_compute << "\n"; + s_out << "\n\nTotal Compute Time : " << time_compute << "\n"; s_out << "Total Compute Energy: " << energy_compute << "\n"; s_out << "\nTotal Control Time : " << time_control << "\n"; @@ -244,9 +285,23 @@ class RuntimeController { std::vector<struct Configuration *> *Configurations; unsigned configurationIdx = 0; - ProfileInfo PI; + /*** Objects used to gather timing and energy information for execution ***/ + ProfileInfo *PI; + Profiler *profiler; + Promise *promise; //Functions + + // Private functions of profiler + void start_profiler() { + if (profiler) + profiler->start_profiler(); + } + void stop_profiler() { + if (profiler) + profiler->stop_profiler(); + } + void setProfileInfoFilename(const char *); void readQuantizationFile(const char *); void readConfigurationFile(const char 
*); @@ -305,54 +360,148 @@ class RuntimeController { printConfigurations(ThreeDCurveConfigurations); configurationIdx = 0; //TODO: initialize using pareto curve - findTargetConfiguration ? Configurations = &SpeedupConfigurations; + + // Start profiling thread in the background, ready to time + start_profiler(); + pause_profiler(); + reset_profiler(); } - // Transferring functionality of ProfileInfo + // Exposing functionality of ProfileInfo void end_iteration() { - PI.end_iteration(); + if (PI) + PI->end_iteration(); } - void addToCurrentIterationComputeTime(const char *s, float t) { - PI.addToCurrentIterationComputeTime(s, t); + void addToCurrentIterationComputeTime(const char *s, double t) { + if (PI) + PI->addToCurrentIterationComputeTime(s, t); } - void addToCurrentIterationControlTime(float t) { - PI.addToCurrentIterationControlTime(t); + void addToCurrentIterationControlTime(double t) { + if (PI) + PI->addToCurrentIterationControlTime(t); } - void addToCurrentIterationConfigTime(float t) { - PI.addToCurrentIterationConfigTime(t); + void addToCurrentIterationConfigTime(double t) { + if (PI) + PI->addToCurrentIterationConfigTime(t); } - void addToCurrentIterationComputeEnergy(const char *s, float e) { - PI.addToCurrentIterationComputeEnergy(s, e); + void addToCurrentIterationComputeEnergy(const char *s, double e) { + if (PI) + PI->addToCurrentIterationComputeEnergy(s, e); } - void addToCurrentIterationControlEnergy(float e) { - PI.addToCurrentIterationControlEnergy(e); + void addToCurrentIterationControlEnergy(double e) { + if (PI) + PI->addToCurrentIterationControlEnergy(e); } - void addToCurrentIterationConfigEnergy(float e) { - PI.addToCurrentIterationConfigEnergy(e); + void addToCurrentIterationConfigEnergy(double e) { + if (PI) + PI->addToCurrentIterationConfigEnergy(e); } - float getCurrentIterationComputeTime() { - return PI.getCurrentIterationComputeTime(); + double getCurrentIterationComputeTime() { + return (PI ? PI->getCurrentIterationComputeTime() : 0.0) ; } - float getCurrentIterationComputeEnergy() { - return PI.getCurrentIterationComputeEnergy(); + double getCurrentIterationComputeEnergy() { + return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0) ; } void writeProfileInfo() { - PI.printToFile(); + if (PI) + PI->printToFile(); + } + + // Exposing functionality of (gpu) profiler + void resume_profiler() { + if (profiler) + profiler->resume_profiler(); + } + + void pause_profiler() { + if (profiler) + profiler->pause_profiler(); + } + + void reset_profiler() { + if (profiler) + profiler->reset(); + } + + std::pair<double, double> get_time_energy() const { + return (profiler ? profiler->get_time_energy() + : std::make_pair(0.0, 0.0)) ; + } + + // Exposing functionality of promise simulator + std::pair<double, double> fc_profile(const unsigned num_rows_a, + const unsigned num_cols_a, + const unsigned num_rows_b, + const unsigned num_cols_b, + const unsigned voltage_swing, + const unsigned patch_factor) { + return (promise ? promise->fc_profile(num_rows_a, + num_cols_a, + num_rows_b, + num_cols_b, + voltage_swing, + patch_factor) + : std::make_pair(0.0, 0.0)) ; } + std::pair<double, double> conv_profile(const unsigned n, + const unsigned c, + const unsigned h, + const unsigned w, + const unsigned c_out, + const unsigned c_in, + const unsigned k_h, + const unsigned k_w, + const unsigned s_h, + const unsigned s_w, + const unsigned voltage_swing, + const unsigned patch_factor) { + return (promise ? 
promise->conv_profile(n, c, h, w, + c_out, c_in, k_h, k_w, + s_h, s_w, + voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)) ; + } + + // Constructor and destructor RuntimeController() { configurationIdx = 0; +#ifdef ACTIVE_PROFILING + PI = new ProfileInfo(); + profiler = new Profiler(); + promise = new Promise(); +#else + PI = NULL; + profiler = NULL; + promise = NULL; +#endif + } ~RuntimeController() { + + stop_profiler(); + writeProfileInfo(); + + if (PI) { + delete PI; + } + if (profiler) { + delete profiler; + } + if (promise) { + delete promise; + } + for (std::vector<struct Configuration>::iterator it = InitialConfigurations.begin(), ie = InitialConfigurations.end(); it != ie; ++it) { std::map<std::string, NodeConfiguration * > ConfSetup = it->setup; @@ -377,21 +526,23 @@ void RuntimeController::setProfileInfoFilename(const char *str) { - std::string file_path = std::string(str); - size_t idx = file_path.find_last_of("/"); - file_path.erase(idx + 1); - file_path.append("profile_info_"); - - bool found = false; - std::string profile_filename; - for (unsigned i = 0; !found; i++) { - profile_filename = file_path; - profile_filename.append(std::to_string(i)); - profile_filename.append(".txt"); - found = !fileExists(profile_filename); - } + if (PI) { + std::string file_path = std::string(str); + size_t idx = file_path.find_last_of("/"); + file_path.erase(idx + 1); + file_path.append("profile_info_"); + + bool found = false; + std::string profile_filename; + for (unsigned i = 0; !found; i++) { + profile_filename = file_path; + profile_filename.append(std::to_string(i)); + profile_filename.append(".txt"); + found = !fileExists(profile_filename); + } - PI.set_out_file_name(profile_filename); + PI->set_out_file_name(profile_filename); + } } #define NODE_NAME_BUFFER_SIZE 10 @@ -989,14 +1140,15 @@ float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr) { void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end) { - //resume_profiler(); + RC->resume_profiler(); + uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); // Read stats for iteration that was just completed - float current_iteration_time = RC->getCurrentIterationComputeTime(); - float current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); RC->findNextConfiguration(); // Still use findNext configuration, to update the configurationIdx, @@ -1005,13 +1157,11 @@ void llvm_hpvm_invokeRtControl(void* result, const char* str, int start, int end float goalVal = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss; RC->findTargetConfiguration(goalVal, k); - //pause_profiler(); - //std::pair<float, float> pi = get(); - //reset(); - //RC->addToCurrentIterationControlTime(pi.first); - //RC->addToCurrentIterationControlEnergy(pi.second); - RC->addToCurrentIterationControlTime(0.0); - RC->addToCurrentIterationControlEnergy(0.0); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); INFO("current iteration time = %f, current iteration energy = %f\n\n", current_iteration_time, current_iteration_energy); diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu 
b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu index 220bb6dda4562eabf52fd0777866325710ac591e..67c56aec3caf63b40e0ab499fa87a5e8cacfca6e 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu @@ -1676,6 +1676,24 @@ void* wrapper_ConvLayer(const char* hpvm_node_id, int param = approxTuples[0].second; if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) { DEBUG("Approximation choice for ConvLayer: swing level %d\n", param); + + struct Tensor* input_tensor_cast = (struct Tensor*) input; + struct Tensor* filter_tensor_cast = (struct Tensor*) filter; + std::pair<double, double> pinfo = + RC->conv_profile(input_tensor_cast->dims.dim_sizes[0], //n + input_tensor_cast->dims.dim_sizes[1], //c + input_tensor_cast->dims.dim_sizes[2], //h + input_tensor_cast->dims.dim_sizes[3], //w + filter_tensor_cast->dims.dim_sizes[0], //c_out + filter_tensor_cast->dims.dim_sizes[1], //c_in + filter_tensor_cast->dims.dim_sizes[2], //k_h + filter_tensor_cast->dims.dim_sizes[3], //k_w + conv_stride_h, //s_h + conv_stride_w, //s_w + param, //voltage_swing + 1 /*patch_factor*/); + RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", pinfo.first); + RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", pinfo.second); void* t_out; t_out = ConvLayer_PROMISE(input, QRanges[0], QRanges[1], filter, QRanges[2], QRanges[3], @@ -1685,8 +1703,7 @@ void* wrapper_ConvLayer(const char* hpvm_node_id, pool_id, pool_size, activation_id, QRanges[6], QRanges[7], param); - RC->addToCurrentIterationComputeTime("ConvLayer_PROMISE", 0.0); - RC->addToCurrentIterationComputeEnergy("ConvLayer_PROMISE", 0.0); + return t_out; } else { CUSTOM_ASSERT(false && "Unknown approximation type"); @@ -1887,14 +1904,31 @@ void* wrapper_FCLayer(const char* hpvm_node_id, int param = approxTuples[0].second; if (approx == PROMISENodeConfiguration::APPROX::SWING_LEVEL) { DEBUG("Approximation choice for FCLayer: swing level %d\n", param); + + struct Tensor* input_tensor_cast = (struct Tensor*) input; + struct Tensor* weights_tensor_cast = (struct Tensor*) weights; + CUSTOM_ASSERT((input_tensor_cast->dims.dim_sizes[1] * + input_tensor_cast->dims.dim_sizes[2] * + input_tensor_cast->dims.dim_sizes[3] == + weights_tensor_cast->dims.dim_sizes[2]) && + "Dimensions for matrix multiplication do not match."); + std::pair<double, double> pinfo = + RC->fc_profile(input_tensor_cast->dims.dim_sizes[0], //num_rows_a, + input_tensor_cast->dims.dim_sizes[1] * + input_tensor_cast->dims.dim_sizes[2] * + input_tensor_cast->dims.dim_sizes[3], //num_cols_a, + weights_tensor_cast->dims.dim_sizes[2], //num_rows_b, + weights_tensor_cast->dims.dim_sizes[3], //num_cols_b, + param, //voltage_swing, + 1 /*patch_factor*/); + RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", pinfo.first); + RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", pinfo.second); void* t_out; t_out = FCLayer_PROMISE(input, QRanges[0], QRanges[1], weights, QRanges[2], QRanges[3], bias, QRanges[4], QRanges[5], activation_id, QRanges[6], QRanges[7], param); - RC->addToCurrentIterationComputeTime("FCLayer_PROMISE", 0.0); - RC->addToCurrentIterationComputeEnergy("FCLayer_PROMISE", 0.0); return t_out; } else { CUSTOM_ASSERT(false && "Unknown approximation type"); diff --git a/llvm/projects/soc_simulator/src/driver_new_config.py b/llvm/projects/soc_simulator/src/driver_new_config.py new file mode 100644 index 
0000000000000000000000000000000000000000..115237dac51c96b47d02c84a603d98bdcf0b84a4 --- /dev/null +++ b/llvm/projects/soc_simulator/src/driver_new_config.py @@ -0,0 +1,328 @@ +from collections import defaultdict +import os +import subprocess +import sys + +class Driver: + fp16_swing = 8 + + class ApproxTypes: + FP16 = 0 + FP32 = 1 + PROMISE = 2 + PERF = 3 + + results_time_key = "Time" + results_energy_key = "Energy" + + + def driver(self): + self.__parse_tensor_layer_file() + self.__parse_tensor_table() + self.__run_simulations() + self.__display_results() + + + def __init__(self, layer_filename, table_filename, config_filename, results_filename): + self.__layer_filename = layer_filename + self.__table_filename = table_filename + self.__config_filename = config_filename + self.__results_filename = results_filename + + # NOTE: Use an OrderedDict if we want to search by operation name + # Using a list bc we care about the order the data is read in + # since it corresponds to the data in the configuration file + self.__tensor_layers = [] + + # [layer_name][operation_name][cols] + # Operation names need to be stored in order of insertion + self.__tensor_table = defaultdict(lambda: list(defaultdict(str))) + + # [Time/Energy][number corresponding to order the layer config was read in] = time/energy + self.__aggregate_results = defaultdict(lambda: defaultdict(float)) + self.__config_count = 0 + + + @staticmethod + def is_conv(operation_name): + return operation_name.startswith("Conv") + + + @staticmethod + def is_nml(operation_name): + return operation_name.startswith("NML") + + + @staticmethod + def is_fc(operation_name): + return operation_name.startswith("FC") + + + def __parse_tensor_layer_file(self): + if not os.path.isfile(self.__layer_filename): + print("ERROR: %s was not found." % self.__layer_filename) + exit(1) + + layer_file = open(self.__layer_filename, "r") + for line in layer_file: + layer_data = line.strip().split(',') + layer_name = layer_data[0] + + tensor_layer = defaultdict(str) + tensor_layer["Name"] = layer_name + + if Driver.is_conv(layer_name): + tensor_layer["N"] = float(layer_data[1]) + tensor_layer["Cin"] = float(layer_data[2]) + tensor_layer["H"] = float(layer_data[3]) + tensor_layer["W"] = float(layer_data[4]) + tensor_layer["Cout"] = float(layer_data[5]) + tensor_layer["Kh"] = float(layer_data[7]) + tensor_layer["Kw"] = float(layer_data[8]) + tensor_layer["Sh"] = float(layer_data[9]) + tensor_layer["Sw"] = float(layer_data[10]) + + elif Driver.is_fc(layer_name): + tensor_layer["RA"] = float(layer_data[1]) + tensor_layer["CA"] = float(layer_data[2]) + tensor_layer["RB"] = float(layer_data[3]) + tensor_layer["CB"] = float(layer_data[4]) + + elif not Driver.is_nml(layer_name): # TODO should we store data for NMLs? + print("ERROR: Invalid layer name %s" % layer_name) + exit(1) + + self.__tensor_layers.append(tensor_layer) + layer_file.close() + + + def __parse_tensor_table(self): + if not os.path.isfile(self.__table_filename): + print("ERROR: %s was not found." 
% self.__table_filename) + exit(1) + table_file = open(self.__table_filename, "r") + line = table_file.readline().strip() + + while line: + # Line here MUST be a header or there's a bug + # Get the description of the layer + assert(line.startswith("**")) + + header_contents = line.split(' ')[1:] + layer_name = header_contents[0] + num_ops = int(header_contents[1]) + col_names = header_contents[2:] + + layer_operations = [] + + # Go through all operations in the layer + for op_count in range(num_ops): + operation_data = defaultdict(str) + + line = table_file.readline().strip() + op_data = line.split(' ') + op_name = op_data[0] + operation_data["Name"] = op_name + + # Number of data items (#s) needs to match up with the # of cols + assert(len(op_data) - 1 == len(col_names)) + + # Go through all data items (each col element) per operation + for i in range(len(col_names)): + operation_data[col_names[i]] = float(op_data[i + 1]) + + layer_operations.append(operation_data) + + self.__tensor_table[layer_name] = layer_operations + line = table_file.readline().strip() + table_file.close() + + + @staticmethod + def is_promise(config_layer): + return float(config_layer.split(' ')[0]) < Driver.fp16_swing + + + def __quantize(self, curr_layer, prev_layer, h2f_f2h_operation_ind, layer_data): + if curr_layer == prev_layer or curr_layer == Driver.ApproxTypes.PROMISE \ + or prev_layer == Driver.ApproxTypes.PROMISE: # No quantization needed + return 0.0, 0.0 + + layer_name = layer_data["Name"] + + # NOTE: Ignoring logic where curr == promise or prev == promise bc + # smartDMA is always true so we'd return near the beginning of the method + + # Get h2f/f2h data using the first tensor operation in the layer + # (which is why order matters in the tensor table) + print(layer_name, self.__tensor_table[layer_name]) + tensor_op_row = self.__tensor_table[layer_name][h2f_f2h_operation_ind] + if curr_layer == Driver.ApproxTypes.FP32: + time = tensor_op_row["h2f_time"] + energy = tensor_op_row["h2f_energy"] + elif curr_layer == Driver.ApproxTypes.FP16: + time = tensor_op_row["f2h_time"] + energy = tensor_op_row["f2h_energy"] + + print("Quantization: (%f, %f)" % (time, energy)) + return (time, energy) + + + def __run_promise_simulation(self, swing, layer_data): + layer_name = layer_data["Name"] + patch_factor = 1 + + if Driver.is_conv(layer_name): + rows_a = layer_data["N"] * layer_data["H"] * layer_data["W"] \ + / (layer_data["Sh"] * layer_data["Sw"]) + cols_a = layer_data["Cin"] * layer_data["Kh"] * layer_data["Kw"] + rows_b = cols_a + cols_b = layer_data["Cout"] + patch_factor = layer_data["Kh"] * layer_data["Kw"] + elif Driver.is_fc(layer_name): + rows_a = layer_data["RA"] + cols_a = layer_data["CA"] + rows_b = cols_a + cols_b = layer_data["CB"] + else: + print("ERROR: Layer %s is neither Conv nor FC; PROMISE cannot run it." % layer_name) + exit(1) + # Run promise simulator + # TODO need to print time and energy in the ptm runner so we can pipe it + output = subprocess.Popen(["./ptm", str(rows_a), str(cols_a), str(rows_b), \ + str(cols_b), str(patch_factor), str(swing)], \ + stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0] + total_time_energy = output.strip().split(',') + + assert(len(total_time_energy) == 2) + print("PROMISE: (%s, %s)" % (total_time_energy[0], total_time_energy[1])) + return float(total_time_energy[0]), float(total_time_energy[1]) + + + def __run_simulations(self): + if not os.path.isfile(self.__config_filename): + print("ERROR: %s was not found" % self.__config_filename) + exit(1) + + config_file = 
open(self.__config_filename, "r") + + line = config_file.readline().strip() + + while line: + assert(line.startswith("+++++")) + config_name = config_file.readline().strip().split(' ')[0] # Next line = configuration name + print("CONFIGURATION") + + line = config_file.readline().strip() + layer_ind = 0 # NOTE can also use the leftmost number in the current line + + prev_layer = Driver.ApproxTypes.FP32 + curr_layer = None + + while not line.startswith("-----"): + layer_info = line.split(' ') + layer_data = self.__tensor_layers[layer_ind] + layer_name = layer_data["Name"] + + if layer_info[1] == "promise": + print("Running layer %s on PROMISE" % layer_name) + curr_layer = Driver.ApproxTypes.PROMISE + + swing = int(layer_info[3]) + time, energy = self.__run_promise_simulation(swing, layer_data) + print(time, energy) + self.__aggregate_results[Driver.results_time_key][self.__config_count] += time + self.__aggregate_results[Driver.results_energy_key][self.__config_count] += energy + + elif layer_info[1] == "gpu": + # Parse each individual tensor operation + # TODO not portable bc there can be multiple numbers after each approx later on + total_time = 0 + total_energy = 0 + + tensor_ind = 0 + for i in range(2, len(layer_info), 3): + tensor_op = layer_info[i] + approx_type = layer_info[i + 1] + approx_num = layer_info[i + 2] # only matters if perf + + if approx_type == "fp16": + curr_layer = Driver.ApproxTypes.FP16 + elif approx_type == "fp32": + curr_layer = Driver.ApproxTypes.FP32 + elif approx_type == "perf": + curr_layer = Driver.ApproxTypes.PERF + else: + assert(False) + + quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, tensor_ind, layer_data) + time, energy = self.__run_gpu_simulation(curr_layer, layer_name, tensor_ind, approx_num) + total_time += quant_time + time + total_energy += quant_energy + energy + + tensor_ind += 1 + + self.__aggregate_results[Driver.results_time_key][self.__config_count] += total_time + self.__aggregate_results[Driver.results_energy_key][self.__config_count] += total_energy + + layer_ind += 1 + line = config_file.readline().strip() + + self.__config_count += 1 + line = config_file.readline().strip() + + config_file.close() + + + def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, approx_num): + tensor_info = self.__tensor_table[layer_name][tensor_ind] + + if curr_layer == Driver.ApproxTypes.FP32: + time = tensor_info["fp32_time"] + energy = tensor_info["fp32_energy"] + + elif curr_layer == Driver.ApproxTypes.FP16: + time = tensor_info["fp16_time"] + energy = tensor_info["fp16_energy"] + + elif curr_layer == Driver.ApproxTypes.PERF: + time = tensor_info["perf%s_time" % approx_num] + energy = tensor_info["perf%s_energy" % approx_num] + + print("GPU: (%f, %f)" % (time, energy)) + return time, energy + + + def __display_results(self): + results_file = open(self.__results_filename, "w") + attributes_to_print = [Driver.results_time_key, Driver.results_energy_key] + + for attribute in attributes_to_print: + results_file.write("%s\n" % attribute) + results_file.write("Configuration,Total,Improvement\n") + + baseline_val = self.__aggregate_results[attribute][0] + print(baseline_val) + best_config = None + best_result = None + + for config_ind in range(self.__config_count): + results_file.write("c%d" % config_ind) + time_or_energy_val = self.__aggregate_results[attribute][config_ind] + + # Using repr to keep all decimal digits when writing to file + results_file.write(",%s" % repr(time_or_energy_val)) + results_file.write(",%s\n" % repr(baseline_val / 
(time_or_energy_val + 0.0001))) + + if not best_result or time_or_energy_val < best_result: + best_result = time_or_energy_val + best_config = config_ind + results_file.write("\nc%d,%s\n\n" % (best_config, repr(self.__aggregate_results[attribute][best_config]))) + results_file.close() + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("Usage: python driver_new_config.py <layer info> <tensor info> <configurations> <results file>") + exit(1) + Driver(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]).driver() diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile index 9785fb5439e8f80ebb3ad6f752835ff7d7a9bb23..30c80f2a1a65ad122681b76e531ed5d99ec8a12b 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile @@ -16,6 +16,8 @@ APP = alexnet2 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a +PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a +SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 @@ -65,10 +67,10 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc - $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS) $(BUILD_DIR):
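For reference, the new soc_simulator driver (driver_new_config.py above) is constructed with four positional file names and run via its driver() method, mirroring its __main__ block. A minimal usage sketch follows; all four file names are hypothetical placeholders, not files added by this patch:

```python
# Hypothetical invocation of the PROMISE/GPU time-and-energy simulation driver.
# The file names are placeholders: substitute the per-network layer file, the
# per-op table (e.g. the <network_name>_tensors.txt written by table_generator.py),
# and the configuration file to simulate.
from driver_new_config import Driver

Driver("alexnet2_layers.txt",    # layer info: per-layer shape parameters
       "alexnet2_tensors.txt",   # tensor info: per-op time/energy table
       "alexnet2_confs.txt",     # configurations to simulate
       "alexnet2_results.csv"    # aggregated results are written here
       ).driver()                # parse inputs, run simulations, write results
```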