diff --git a/llvm/projects/.gitignore b/llvm/projects/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e45ff9f4750ef9c2823589f4fcfe03b5e686aab8 --- /dev/null +++ b/llvm/projects/.gitignore @@ -0,0 +1,2 @@ +soc_simulator/lib +gpu_profiler/lib diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt index bf6f0f3a2668af7c25c955042d257f4e32c75b33..74ec43d03292045e7e0b745996dba0e9437bc813 100644 --- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -46,43 +46,62 @@ include_directories(../gpu_profiler/include) include_directories(../soc_simulator/include) link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64) -# Adding new rule for building a cuDNN runtime library -cuda_add_library(tensor_runtime tensor_runtime/src/tensor_runtime.cu tensor_runtime/src/half_precision_api.cu tensor_runtime/src/tensor_utils.cu tensor_runtime/src/fp16_gemm.cu tensor_runtime/src/debug.cc tensor_runtime/src/global_data.cc tensor_runtime/src/approx_techniques.cu tensor_runtime/src/approx_techniques2.cu tensor_runtime/src/approx_simulation.cu tensor_runtime/src/profiling.cc tensor_runtime/src/op_overheads.cc tensor_runtime/src/error.cu tensor_runtime/src/wrapper_runtime.cu ) - -cuda_add_cublas_to_target(tensor_runtime) - +set( + RUNTIME_SRCS_FILENAME + approx_simulation.cu + approx_techniques.cu + approx_techniques2.cu + common.cpp + configuration.cpp + debug.cc + device_math.cu + error.cu + fp16_gemm.cu + global_data.cc + half_precision_api.cu + hpvm-rt-controller.cpp + img_tensor_runtime.cu + img_tensor_utils.cpp + op_overheads.cc + profiling.cc + tensor_runtime.cu + tensor_utils.cu + wrapper_runtime.cu +) +foreach(FILE ${RUNTIME_SRCS_FILENAME}) + list(APPEND RUNTIME_SRCS "tensor_runtime/src/${FILE}") +endforeach() find_library(GPU_PROFILER_LIB NAMES libgpu_profiler.a HINTS ../gpu_profiler/lib ) - find_library(SOC_SIMULATOR_LIB NAMES libpromise_profiler.a HINTS ../soc_simulator/lib ) - -target_link_libraries(tensor_runtime cudnn cufft stdc++fs -lcurand) +set(LINK_LIBS cudnn cufft stdc++fs -lcurand) if(USE_GFLAGS) - target_link_libraries(tensor_runtime gflags) + list(APPEND LINK_LIBS gflags) endif() +# Adding new rule for building a cuDNN runtime library +# Offline version +cuda_add_library(tensor_runtime ${RUNTIME_SRCS}) +cuda_add_cublas_to_target(tensor_runtime) +target_link_libraries(tensor_runtime ${LINK_LIBS}) +# Online version remove_definitions(-DONLINE_PROFILING=false) add_definitions(-DONLINE_PROFILING=true) - -cuda_add_library(tensor_runtime_online tensor_runtime/src/tensor_runtime.cu tensor_runtime/src/half_precision_api.cu tensor_runtime/src/tensor_utils.cu tensor_runtime/src/fp16_gemm.cu tensor_runtime/src/debug.cc tensor_runtime/src/global_data.cc tensor_runtime/src/approx_techniques.cu tensor_runtime/src/approx_techniques2.cu tensor_runtime/src/approx_simulation.cu tensor_runtime/src/profiling.cc tensor_runtime/src/op_overheads.cc tensor_runtime/src/error.cu tensor_runtime/src/wrapper_runtime.cu ) - +cuda_add_library(tensor_runtime_online ${RUNTIME_SRCS}) cuda_add_cublas_to_target(tensor_runtime_online) - target_link_libraries(tensor_runtime_online cudnn -lcurand) # Adding new rule for building a cuDNN runtime library cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc) target_link_libraries(tensor_cpu_runtime) - - ### TODO: Remove unsued CMake rules after careful consideration # Adding rule for the debugging source diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h index e2905db99ae142b013215dcf90e0e0cbb9c5f70e..22a6b5ca951793d26003f0c5ff4dc1e7d4c39f95 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h @@ -1,5 +1,4 @@ - -#include "tensor_utils.cu" +#include "tensor_utils.h" //produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h index 99c465434a2879d85624b7ff6bb4141dd8fe4634..75e60a1c4a5d6fda8bf32fcaee3e2b9d192cbbcd 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h @@ -11,24 +11,15 @@ // Describes the internal choices made for an ApproxHPVM node class NodeConfiguration { public: - enum NODE_CONFIGURATION_TARGET - { - PROMISE, - GPU, - END - }; + enum NODE_CONFIGURATION_TARGET { PROMISE, GPU, END }; protected: enum NODE_CONFIGURATION_TARGET NODE_CONFIGURATION_TARGET_ID; public: - bool isPROMISENodeConfiguration() { - return NODE_CONFIGURATION_TARGET_ID == PROMISE; - } + bool isPROMISENodeConfiguration(); - bool isGPUNodeConfiguration() { - return NODE_CONFIGURATION_TARGET_ID == GPU; - } + bool isGPUNodeConfiguration(); virtual void print() = 0; }; @@ -36,30 +27,20 @@ public: class PROMISENodeConfiguration : public NodeConfiguration { public: // Approximation methods available for this HW type - enum APPROX - { - SWING_LEVEL, - END - }; + enum APPROX { SWING_LEVEL, END }; private: // A vector, containing pairs of approximation method and tunable parameter // (expressed as int, or ignored when not applicable) - std::vector< std::pair<enum APPROX, int> > ApproxChoices; + std::vector<std::pair<enum APPROX, int>> ApproxChoices; public: - void pushNewApproximationChoice(enum APPROX approx, int u) { - ApproxChoices.push_back(std::make_pair(approx, u)); - } + void pushNewApproximationChoice(enum APPROX approx, int u); - std::vector<std::pair<enum APPROX, int> > &getApproxChoices() { - return ApproxChoices; - } + std::vector<std::pair<enum APPROX, int>> &getApproxChoices(); - PROMISENodeConfiguration() { - NODE_CONFIGURATION_TARGET_ID = PROMISE; - } - ~PROMISENodeConfiguration() {} + PROMISENodeConfiguration(); + ~PROMISENodeConfiguration(); void print() override; }; @@ -67,20 +48,18 @@ public: class GPUNodeConfiguration : public NodeConfiguration { public: // Approximation methods available for this HW type - enum APPROX - { + enum APPROX { FP32, FP16, PERFORATION, INPUT_SAMPLING, REDUCTION_SAMPLING, -// ADDITIONAL_APPROXIMATION_METHOD + // ADDITIONAL_APPROXIMATION_METHOD APPROX_END }; // Operations to be approximated in the node using this configuration - enum TENSOR_OP - { + enum TENSOR_OP { ADD, BATCHNORM, CONV, @@ -99,37 +78,30 @@ public: MAP1, MAP2, MAP3, -// STENCIL, -// COSINE_T, -// ADDITIONAL_TENSOR_OPERATION + // STENCIL, + // COSINE_T, + // ADDITIONAL_TENSOR_OPERATION TENSOR_OP_END }; private: // A vector, containing pairs of approximation method and tunable parameter // (expressed as int, or ignored when not applicable) for each operation - std::vector< std::pair< enum TENSOR_OP, std::vector< std::pair<enum APPROX, int> > > > ApproxChoices; + std::vector< + std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>> + ApproxChoices; public: - void pushNewTensorOperation(enum TENSOR_OP top) { - std::vector< std::pair<enum APPROX, int> > emptyVec; - ApproxChoices.push_back(std::make_pair(top, emptyVec)); - } - - void pushNewApproximationChoiceForOperation(enum APPROX approx, int u) { - unsigned size = ApproxChoices.size(); - CUSTOM_ASSERT(size >=1 && "Cannot apply approximation choice to non existent operation."); - ApproxChoices[size-1].second.push_back(std::make_pair(approx, u)); - } - - std::vector< std::pair< enum TENSOR_OP, std::vector< std::pair<enum APPROX, int> > > > &getApproxChoices() { - return ApproxChoices; - } - - GPUNodeConfiguration() { - NODE_CONFIGURATION_TARGET_ID = GPU; - } - ~GPUNodeConfiguration() {} + void pushNewTensorOperation(enum TENSOR_OP top); + + void pushNewApproximationChoiceForOperation(enum APPROX approx, int u); + + std::vector< + std::pair<enum TENSOR_OP, std::vector<std::pair<enum APPROX, int>>>> & + getApproxChoices(); + + GPUNodeConfiguration(); + ~GPUNodeConfiguration(); void print() override; }; @@ -140,33 +112,25 @@ public: // - energy // - accuracy (compared to golden output) // - accuracy loss (compared to baseline) -// - a hardware choice and set or operations-approximation choices, described in setup +// - a hardware choice and set or operations-approximation choices, described in +// setup struct Configuration { std::string name; float speedup; float energy; float accuracy; float accuracyLoss; - std::map<std::string, NodeConfiguration * > setup; + std::map<std::string, NodeConfiguration *> setup; - Configuration(std::string &n, float f, float e, float a, float al) : - name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {} + Configuration(std::string &n, float f, float e, float a, float al); - float getSpeedup() { - return speedup; - } + float getSpeedup(); - float getEnergy() { - return energy; - } + float getEnergy(); - float getAccuracy() { - return accuracy; - } + float getAccuracy(); - float getAccuracyLoss() { - return accuracyLoss; - } + float getAccuracyLoss(); void print(); }; @@ -174,187 +138,28 @@ struct Configuration { // Comparison operator definition, in increasing accuracy loss // (for std sort, used in pareto optimal computation) struct ConfigurationLessThan { - bool operator()(const struct Configuration &a, const struct Configuration &b) const { - return (a.accuracyLoss < b.accuracyLoss) ; - } + bool operator()( + const struct Configuration &a, const struct Configuration &b) const; }; // Comparison operator definition, in increasing accuracy loss // (for std lower bound, used in pareto optimal frontier search) struct ConfigurationLessThan_AL { - bool operator()(const struct Configuration *a, const float &b) const { - return (a->accuracyLoss < b) ; - } + bool operator()(const struct Configuration *a, const float &b) const; }; // Comparison operator definition, in increasing speedup // (for std lower bound, used in pareto optimal frontier search) struct ConfigurationLessThan_SP { - bool operator()(const struct Configuration *a, const float &b) const { - return (a->speedup < b) ; - } + bool operator()(const struct Configuration *a, const float &b) const; }; // Comparison operator definition, in decreasing energy // (for std lower bound, used in pareto optimal frontier search) struct ConfigurationLessThan_E { - bool operator()(const struct Configuration *a, const float &b) const { - return (a->energy < b) ; - } + bool operator()(const struct Configuration *a, const float &b) const; }; -enum SEARCH_KIND - { - SPEEDUP, - ENERGY, - ACCURACY_LOSS, - END - }; - -//****** HEADER Ends - Source Starts - - -// Helper configuration print methods - -void PROMISENodeConfiguration::print() { - - printf(" promise"); - for (auto &it : ApproxChoices) { - printf(" "); - switch (it.first) { - case APPROX::SWING_LEVEL : - printf("swing_level"); - break; - default: - ERROR("Unknown approximation option"); - break; - // TODO additional approx methods to be printed here - } - printf(" %d", it.second); - } - - printf("\n"); - -} - -void GPUNodeConfiguration::print() { - - printf(" gpu"); - for (auto &it : ApproxChoices) { - - printf(" "); - switch (it.first) { - case TENSOR_OP::ADD : - printf("add"); - break; - case TENSOR_OP::BATCHNORM : - printf("batchnorm"); - break; - case TENSOR_OP::CONV : - printf("conv"); - break; - case TENSOR_OP::GROUP_CONV : - printf("group_conv"); - break; - case TENSOR_OP::MUL : - printf("mul"); - break; - case TENSOR_OP::RELU : - printf("relu"); - break; - case TENSOR_OP::CLIPPED_RELU : - printf("clipped_relu"); - break; - case TENSOR_OP::TANH : - printf("tanh"); - break; - case TENSOR_OP::POOL_MAX : - printf("pool_max"); - break; - case TENSOR_OP::POOL_MEAN : - printf("pool_mean"); - break; - case TENSOR_OP::POOL_MIN : - printf("pool_min"); - break; - case TENSOR_OP::SOFTMAX : - printf("softmax"); - break; - case TENSOR_OP::FFT : - printf("fft"); - break; - case TENSOR_OP::REDUCE : - printf("reduce"); - break; - case TENSOR_OP::PROJECTIVE_T : - printf("projectiveT"); - break; - case TENSOR_OP::MAP1 : - printf("map1"); - break; - case TENSOR_OP::MAP2 : - printf("map2"); - break; - case TENSOR_OP::MAP3 : - printf("map3"); - break; - default : - ERROR("Unknown tensor operation."); - break; - // TODO additional operations to be printed here - } - - auto &approxVec = it.second; - for (auto &inner_it : approxVec) { - printf(" "); - switch (inner_it.first) { - case APPROX::FP32 : - printf("fp32"); - break; - case APPROX::FP16 : - printf("fp16"); - break; - case APPROX::PERFORATION : - printf("perf"); - break; - case APPROX::INPUT_SAMPLING : - printf("samp"); - break; - case APPROX::REDUCTION_SAMPLING : - printf("red_samp"); - break; - default: - ERROR("Unknown approximation option"); - break; - // TODO additional approx methods to be printed here - } - - printf(" %d", inner_it.second); - } - - } - - printf("\n"); - -} - -void Configuration::print() { - - printf("+++++\n"); - printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, accuracyLoss); - for (std::map<std::string, NodeConfiguration* >::const_iterator it = setup.begin(); - it != setup.end(); ++it) { - printf("%s :", it->first.c_str()); - - it->second->print(); - } - - printf("-----\n"); -} - - - - - +enum SEARCH_KIND { SPEEDUP, ENERGY, ACCURACY_LOSS, END }; #endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h index 8c0922ba67451cdbb32e7af7925cf8b95ef0f647..235ac088da8bc9b19785f2e0f00bbb81afba42a6 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h @@ -2,6 +2,8 @@ #define DEVICE_MATH_H #include <device_launch_parameters.h> +#include <cuda_fp16.h> +#include <stdexcept> #include <limits> enum class MathOp { diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h index 6cffc38201f43874de02a12a0fcc77369dc1786f..0b4d7d056550c6f7f32dbc2c276388328fce6e60 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h @@ -9,6 +9,7 @@ #include "debug.h" #include "tensor.h" +#include "profiling.h" template <typename T> __host__ __device__ __forceinline__ T ceilDiv(T a, T b) { return (a + b - 1) / b; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh index 5e2f672161f3fcfcc1c59c6ba6437bfa963c7a90..7727ffc34962a338e69f08b626ad504f2071244b 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh @@ -11,7 +11,7 @@ #include "debug.h" #include "map_typing.h" #include "tensor.h" -#include "tensor_utils.cu" +#include "tensor_utils.h" template <size_t N> void mapPrecheck(const std::array<Tensor *, N> &srcs) { for (Tensor *src : srcs) { diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh index 111b50725e4ffc9d676762118d7c46124858c3a8..de5402942cf7be965a342ebfedf80d21475c49d3 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh @@ -5,7 +5,7 @@ #include "common.h" #include "debug.h" #include "tensor.h" -#include "tensor_utils.cu" +#include "tensor_utils.h" // Between CUDA compute capability 1.0 and 7.5, // Least "max # threads per block" is 512, so 512 is used to be compatible; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h index cc3d785985ffac4ffc3863f19f27dae3bc0d3b52..a14fe22700de6a12017294dfec9f73e28c7bd0b9 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h @@ -47,7 +47,7 @@ extern int total_ops; extern std::vector<int> op_accuracies; extern std::vector<Range*> quant_ranges; -std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr; +extern std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr; extern std::unordered_map<void*, int> tracked_tensors; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h index 6a3f9b610938b19ca33a3544ac30e78b7c264710..4e3542fdfae1ae3e5d8f074f791e2de05b21f9ad 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h @@ -1,4 +1,3 @@ - #ifndef LLVM_HPVM_RT_CONTROLLER_H #define LLVM_HPVM_RT_CONTROLLER_H @@ -23,10 +22,7 @@ * Check if a file exists * Return true if the file exists, false else */ -bool fileExists(const std::string& file) { - struct stat buf; - return (stat(file.c_str(), &buf) == 0); -} +bool fileExists(const std::string& file); class ProfileInfo { private: @@ -77,184 +73,40 @@ class ProfileInfo { std::string out_file_name; // Functions - void resetCurrentIterationTime() { - time_compute_current_iteration = 0.0; - time_control_current_iteration = 0.0; - time_config_current_iteration = 0.0; - } - - void resetCurrentIterationEnergy() { - energy_compute_current_iteration = 0.0; - energy_control_current_iteration = 0.0; - energy_config_current_iteration = 0.0; - } - - void start_iteration() { - if (!in_iteration) { - resetCurrentIterationTime(); - resetCurrentIterationEnergy(); - tensor_time_info.push_back(std::vector< std::pair< std::string, double > > ()); - tensor_energy_info.push_back(std::vector< std::pair< std::string, double > > ()); - in_iteration = true; - } - } + void resetCurrentIterationTime(); + + void resetCurrentIterationEnergy(); + + void start_iteration(); public: - void end_iteration() { - // Update time counters - time_compute += time_compute_current_iteration; - time_control += time_control_current_iteration; - time_config += time_config_current_iteration; - - time_total += (time_compute_current_iteration + - time_control_current_iteration + - time_config_current_iteration); - - // Update energy counters - energy_compute += energy_compute_current_iteration; - energy_control += energy_control_current_iteration; - energy_config += energy_config_current_iteration; - - energy_total += (energy_compute_current_iteration + - energy_control_current_iteration + - energy_config_current_iteration); - - // Save current iteration counters - compute_time_info.push_back(time_compute_current_iteration); - compute_energy_info.push_back(energy_compute_current_iteration); - control_time_info.push_back(time_control_current_iteration); - control_energy_info.push_back(energy_control_current_iteration); - config_time_info.push_back(time_config_current_iteration); - config_energy_info.push_back(energy_config_current_iteration); - - // Note end of iteration - in_iteration = false; - } - - void addToCurrentIterationComputeTime(const char *s, double t) { - start_iteration(); - time_compute_current_iteration += t; - tensor_time_info.back().push_back(std::make_pair(std::string(s), t)); - } - - void addToCurrentIterationControlTime(double t) { - start_iteration(); - time_control_current_iteration += t; - } - - void addToCurrentIterationConfigTime(double t) { - start_iteration(); - time_config_current_iteration += t; - } - - void addToCurrentIterationComputeEnergy(const char *s, double e) { - start_iteration(); - energy_compute_current_iteration += e; - tensor_energy_info.back().push_back(std::make_pair(std::string(s), e)); - } - - void addToCurrentIterationControlEnergy(double e) { - start_iteration(); - energy_control_current_iteration += e; - } - - void addToCurrentIterationConfigEnergy(double e) { - start_iteration(); - energy_config_current_iteration += e; - } - - double getTotalTime() { - return time_total; - } - - double getTotalEnergy() { - return energy_total; - } - - double getCurrentIterationComputeTime() { - return time_compute_current_iteration; - } - - double getCurrentIterationComputeEnergy() { - return energy_compute_current_iteration; - } - - void set_out_file_name(std::string &str) { - out_file_name = str; - } - - void printToFile() { - - INFO("Writing Runtime Profile Info File...\n"); - std::ofstream s_out(out_file_name.c_str()); - - if (!s_out) { - ERROR("Failed to open output file."); - abort(); - } - - // By construction, tensor_time_info and tensor_energy_info are expected - // to have equal sizes, in outer and inner vectors both, - // and all time_info and energy_info vectors must have the same size. - unsigned iterations = tensor_time_info.size(); - CUSTOM_ASSERT((tensor_time_info.size() == iterations) && - (tensor_energy_info.size() == iterations) && - (control_time_info.size() == iterations) && - (control_energy_info.size() == iterations) && - (config_time_info.size() == iterations) && - (config_energy_info.size() == iterations) && - "time_info and energy_info size: \ - iteration number does not match."); - - for (unsigned i = 0; i < tensor_time_info.size(); i++ ) { - // time_info.size() == energy_info.size(), since we passed the assertion - s_out << "Iteration " << i << "\n"; - - CUSTOM_ASSERT((tensor_time_info[i].size() == tensor_energy_info[i].size()) && - "time_info and energy_info size: operation number does not match."); - for (unsigned j = 0; j < tensor_time_info[i].size(); j++) { - // time_info[i].size() == energy_info[i].size(), we passed the assertion - CUSTOM_ASSERT((tensor_time_info[i][j].first == tensor_energy_info[i][j].first) && - "time_info and energy_info: operation does not match."); - s_out << tensor_time_info[i][j].first << " " - << tensor_time_info[i][j].second << " " - << tensor_energy_info[i][j].second << "\n"; - } - - s_out << "\nIteration Compute Time : " << compute_time_info[i] << "\n"; - s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n"; - s_out << "Iteration Control Time : " << control_time_info[i] << "\n"; - s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n"; - s_out << "Iteration Config Time : " << config_time_info[i] << "\n"; - s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n\n\n"; - - } - s_out << "\n\nTotal Compute Time : " << time_compute << "\n"; - s_out << "Total Compute Energy: " << energy_compute << "\n"; - - s_out << "\nTotal Control Time : " << time_control << "\n"; - s_out << "Total Control Energy: " << energy_control << "\n"; - - s_out << "\nTotal Config Time : " << time_config << "\n"; - s_out << "Total Config Energy: " << energy_config << "\n"; - - s_out << "\nTotal Time : " << time_total << "\n"; - s_out << "Total Energy: " << energy_total << "\n"; - - s_out.close(); - - INFO("Done writing profile.\n"); - - } - - ProfileInfo() : time_total(0.0), energy_total(0.0), - time_compute_current_iteration(0.0), - time_control_current_iteration(0.0), - time_config_current_iteration(0.0), - energy_compute_current_iteration(0.0), - energy_control_current_iteration(0.0), - energy_config_current_iteration(0.0), - in_iteration(false) {} + void end_iteration(); + + void addToCurrentIterationComputeTime(const char *s, double t); + + void addToCurrentIterationControlTime(double t); + + void addToCurrentIterationConfigTime(double t); + + void addToCurrentIterationComputeEnergy(const char *s, double e); + + void addToCurrentIterationControlEnergy(double e); + + void addToCurrentIterationConfigEnergy(double e); + + double getTotalTime(); + + double getTotalEnergy(); + + double getCurrentIterationComputeTime(); + + double getCurrentIterationComputeEnergy(); + + void set_out_file_name(std::string &str); + + void printToFile(); + + ProfileInfo(); }; @@ -264,32 +116,11 @@ class Slowdowns { unsigned idx; public: - Slowdowns() { - idx = 0; - - std::ifstream s_in("slowdowns.txt"); - if (!s_in) { - DEBUG("slowdowns file not found. Initializing slowdowns randomly.\n"); - for (unsigned i = 0; i < 10; i++) { - slowdowns.push_back( 1.0 + (rand()/(RAND_MAX/(5.0-1.0))) ); - } - } else { - for (std::string line; std::getline(s_in, line); ) { - float s = std::stof(line); - slowdowns.push_back(s); - } - } - } - - unsigned getSlowdownsNumber() { - return slowdowns.size(); - } - - float getNextSlowdown() { - float tmp = slowdowns[idx]; - idx = (idx + 1) % slowdowns.size(); - return tmp; - } + Slowdowns(); + + unsigned getSlowdownsNumber(); + + float getNextSlowdown(); }; @@ -333,14 +164,8 @@ class RuntimeController { //Functions // Private functions of profiler - void start_profiler() { - if (profiler) - profiler->start_profiler(); - } - void stop_profiler() { - if (profiler) - profiler->stop_profiler(); - } + void start_profiler(); + void stop_profiler(); void setProfileInfoFilename(const char *); void readQuantizationFile(const char *); @@ -351,33 +176,17 @@ class RuntimeController { public: // For testing purposes only - do not use widely - std::vector<struct Configuration *> &getSpeedupConfigurations() { - return SpeedupConfigurations; - } + std::vector<struct Configuration *> &getSpeedupConfigurations(); // For testing purposes only - do not use widely - std::vector<struct Configuration *> &getEnergyConfigurations() { - return EnergyConfigurations; - } + std::vector<struct Configuration *> &getEnergyConfigurations(); // For testing purposes only - do not use widely - std::vector<struct Configuration *> &getThreeDCurveConfigurations() { - return ThreeDCurveConfigurations; - } + std::vector<struct Configuration *> &getThreeDCurveConfigurations(); // For testing purposes only - do not use widely - unsigned getConfigurationIdx() { - return configurationIdx; - } - - std::vector<float> &getQuantizationRanges(const char *data) { - std::string s(data); - // All nodes are expected to have quantization ranges - return QuantizationMap.at(s); - } - - NodeConfiguration *getNodeConfiguration(const char *data) { - std::string s(data); - // All nodes are expected to have a configuration - return (*Configurations)[configurationIdx]->setup.at(s); - } + unsigned getConfigurationIdx(); + + std::vector<float> &getQuantizationRanges(const char *data); + + NodeConfiguration *getNodeConfiguration(const char *data); // Functions for runtime control void findNextConfiguration(); @@ -387,105 +196,37 @@ class RuntimeController { double getBaselineTime(); Slowdowns *getSlowdowns(); - void init(const char *Cstr, const char *Qstr) { - // We initialize the path to the profile info output file, - // based on the path given for the configuration file - setProfileInfoFilename(Cstr); - - readQuantizationFile(Qstr); - readConfigurationFile(Cstr); - Configurations = NULL; - computeParetoConfigurationPoints(); -// compute3DParetoConfigurationPoints(); Not using 3D curve - INFO("Speedup Configurations\n"); - printConfigurations(SpeedupConfigurations); -// INFO("Energy Configurations\n"); -// printConfigurations(EnergyConfigurations); -// INFO("3D Configurations\n"); -// printConfigurations(ThreeDCurveConfigurations); - configurationIdx = 0; //TODO: initialize using pareto curve - findTargetConfiguration ? - Configurations = &SpeedupConfigurations; - - // Initializations for different runtime control strategies - srand(static_cast <unsigned> (time(0))); - slowdowns = new Slowdowns(); - pseudo_rd = 0.0; - - // Start profiling thread in the background, ready to time - start_profiler(); - pause_profiler(); - reset_profiler(); - } + void init(const char *Cstr, const char *Qstr); // Exposing functionality of ProfileInfo - void end_iteration() { - if (PI) - PI->end_iteration(); - } - - void addToCurrentIterationComputeTime(const char *s, double t) { - if (PI) - PI->addToCurrentIterationComputeTime(s, t); - } - - void addToCurrentIterationControlTime(double t) { - if (PI) - PI->addToCurrentIterationControlTime(t); - } - - void addToCurrentIterationConfigTime(double t) { - if (PI) - PI->addToCurrentIterationConfigTime(t); - } - - void addToCurrentIterationComputeEnergy(const char *s, double e) { - if (PI) - PI->addToCurrentIterationComputeEnergy(s, e); - } - - void addToCurrentIterationControlEnergy(double e) { - if (PI) - PI->addToCurrentIterationControlEnergy(e); - } - - void addToCurrentIterationConfigEnergy(double e) { - if (PI) - PI->addToCurrentIterationConfigEnergy(e); - } - - double getCurrentIterationComputeTime() { - return (PI ? PI->getCurrentIterationComputeTime() : 0.0) ; - } - - double getCurrentIterationComputeEnergy() { - return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0) ; - } - - void writeProfileInfo() { - if (PI) - PI->printToFile(); - } + void end_iteration(); + + void addToCurrentIterationComputeTime(const char *s, double t); + + void addToCurrentIterationControlTime(double t); + + void addToCurrentIterationConfigTime(double t); + + void addToCurrentIterationComputeEnergy(const char *s, double e); + + void addToCurrentIterationControlEnergy(double e); + + void addToCurrentIterationConfigEnergy(double e); + + double getCurrentIterationComputeTime(); + + double getCurrentIterationComputeEnergy(); + + void writeProfileInfo(); // Exposing functionality of (gpu) profiler - void resume_profiler() { - if (profiler) - profiler->resume_profiler(); - } - - void pause_profiler() { - if (profiler) - profiler->pause_profiler(); - } - - void reset_profiler() { - if (profiler) - profiler->reset(); - } - - std::pair<double, double> get_time_energy() const { - return (profiler ? profiler->get_time_energy() - : std::make_pair(0.0, 0.0)) ; - } + void resume_profiler(); + + void pause_profiler(); + + void reset_profiler(); + + std::pair<double, double> get_time_energy() const; // Exposing functionality of promise simulator std::pair<double, double> fc_profile(const unsigned num_rows_a, @@ -493,15 +234,7 @@ class RuntimeController { const unsigned num_rows_b, const unsigned num_cols_b, const unsigned voltage_swing, - const unsigned patch_factor) { - return (promise ? promise->fc_profile(num_rows_a, - num_cols_a, - num_rows_b, - num_cols_b, - voltage_swing, - patch_factor) - : std::make_pair(0.0, 0.0)) ; - } + const unsigned patch_factor); std::pair<double, double> conv_profile(const unsigned n, const unsigned c, @@ -514,58 +247,12 @@ class RuntimeController { const unsigned s_h, const unsigned s_w, const unsigned voltage_swing, - const unsigned patch_factor) { - return (promise ? promise->conv_profile(n, c, h, w, - c_out, c_in, k_h, k_w, - s_h, s_w, - voltage_swing, patch_factor) - : std::make_pair(0.0, 0.0)) ; - } + const unsigned patch_factor); // Constructor and descructor - RuntimeController() { - configurationIdx = 0; -#ifdef ACTIVE_PROFILING - PI = new ProfileInfo(); - profiler = new Profiler(); - promise = new Promise(); -#else - PI = NULL; - profiler = NULL; - promise = NULL; -#endif + RuntimeController(); - } - - ~RuntimeController() { - - stop_profiler(); - writeProfileInfo(); - - if (PI) { - delete PI; - } - if (profiler) { - delete profiler; - } - if (promise) { - delete promise; - } - - for (std::vector<struct Configuration>::iterator it = InitialConfigurations.begin(), - ie = InitialConfigurations.end(); it != ie; ++it) { - std::map<std::string, NodeConfiguration * > ConfSetup = it->setup; - for (std::map<std::string, NodeConfiguration* >::const_iterator it = ConfSetup.begin(); - it != ConfSetup.end(); ++it) { - delete it->second; - } - } - // Handle freeing memory, for all configurations - // A way to do that is to not free the initial configurations in the pareto curve, - // and free all at once in the end - // This is done because configurations are stored in different containers, but - // share the node setup - } + ~RuntimeController(); // Helper Functions void printQuantizationMap(); @@ -573,1037 +260,35 @@ class RuntimeController { void printConfigurations(std::vector<struct Configuration *> &); }; - -void RuntimeController::setProfileInfoFilename(const char *str) { - - if (PI) { - std::string file_path = std::string(str); - size_t idx = file_path.find_last_of("/"); - file_path.erase(idx + 1); - file_path.append("profile_info_"); - - bool found = false; - std::string profile_filename; - for (unsigned i = 0; !found; i++) { - profile_filename = file_path; - profile_filename.append(std::to_string(i)); - profile_filename.append(".txt"); - found = !fileExists(profile_filename); - } - - PI->set_out_file_name(profile_filename); - } -} - #define NODE_NAME_BUFFER_SIZE 10 - -void RuntimeController::readQuantizationFile(const char *str) { - - INFO("Reading Quantization Ranges File...\n"); - - if (std::string(str).empty()) { - INFO("Empty quantization file string.\n"); - return; - } - - std::ifstream qin(str); - - if (!qin) { - ERROR("Failed to open PROMISE quantization file."); - abort(); - } - - while (!qin.eof()) { - char NodeName[NODE_NAME_BUFFER_SIZE]; - std::vector<float> QuantRangeVector; - - qin >> NodeName; - - float qrange; - for (unsigned i = 0; i < 8; i++ ) { - qin >> qrange; - QuantRangeVector.push_back(qrange); - } - // See if we need to insert this in map instead - my lookup test seemed to work without it - // std::string s(NodeName); - QuantizationMap.insert(std::pair< std::string, std::vector<float> > (NodeName, - QuantRangeVector)); - } - - qin.close(); - INFO("DONE.\n"); -} - -void RuntimeController::printQuantizationMap() { - - DEBUG("Quantization Ranges Map:\n"); - - for (std::map<std::string, std::vector<float> >::const_iterator it = QuantizationMap.begin(); - it != QuantizationMap.end(); ++it) { - DEBUG("%s :", it->first.c_str()); - - for (unsigned i = 0; i < it->second.size() ; i++) { - DEBUG(" %f", it->second[i]); - } - - DEBUG("\n"); - } - -} - - -void RuntimeController::readConfigurationFile(const char *str) { - - INFO("Reading Configuration File...\n"); - - std::ifstream qin(str); - - if (!qin) { - ERROR("Failed to open configuration file."); - abort(); - } - - bool readingConfiguration = false; - bool readingFirstLine = false; - - // Read baseline_time from first line of configuration file - std::string first_line; - std::getline(qin, first_line); - DEBUG("first_line: %s\n", first_line.c_str()); - baseline_time = std::stod(first_line); - DEBUG("Baseline time: %lf\n\n", baseline_time); - - for (std::string line; std::getline(qin, line); ) { - DEBUG("line: %s\n", line.c_str()); - - // Tokenize using ' ' as delimiter - // Vector to store tokens - std::vector<std::string> tokens; - - for (auto i = strtok(&line[0], " "); i != NULL; i = strtok(NULL, " ")) - tokens.push_back(i); - - for (unsigned i = 0; i < tokens.size(); i++ ) - DEBUG("t: %s\n", tokens[i].c_str()); - - DEBUG("\n"); - - if (tokens[0] == "+++++") { // Found new configuration start token - // Mark the start of a new configuration - readingConfiguration = true; - readingFirstLine = true; - continue; - } - - if (tokens[0] == "-----") { // Found configuration end token - readingConfiguration = false; - // Mark the end of current configuration - continue; - } - - if (readingFirstLine) { - // Read first line, to create the new configuration struct - readingFirstLine = false; - InitialConfigurations.push_back(Configuration(tokens[0], - std::stof(tokens[1]), - std::stof(tokens[2]), - std::stof(tokens[3]), - std::stof(tokens[4]))); - continue; - } - - if (tokens[1] == "promise") { - DEBUG("Found promise configuration\n"); - - // There must be at least one approximation option - CUSTOM_ASSERT((tokens.size() >= 2) && "Not enough approximation options."); - - PROMISENodeConfiguration *NodeConf = new PROMISENodeConfiguration(); - InitialConfigurations.back().setup.insert(std::make_pair(tokens[0], NodeConf)); - - // In increments of two, to handle pairs of approx option - tunable parameter - for (unsigned idx = 2; idx < tokens.size(); idx += 2) { - if (tokens[idx] == "swing_level") { - DEBUG("Found swing voltage option\n"); - int vswing = std::stoi(tokens[idx+1]); - DEBUG("vswing: %d\n", vswing); - NodeConf->pushNewApproximationChoice(PROMISENodeConfiguration::APPROX::SWING_LEVEL, - vswing); - } - // TODO: other approximation options handled here - } - - } else if (tokens[1] == "gpu") { - DEBUG("Found gpu configuration\n"); - - // There must be at least one operation, with an approximation option - CUSTOM_ASSERT((tokens.size() >= 5) && "Not enough operations - approximation options."); - - GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration(); - InitialConfigurations.back().setup.insert(std::make_pair(tokens[0], NodeConf)); - - unsigned idx = 2; - while (idx < tokens.size()) { - if (tokens[idx] == "add") { - DEBUG("Found add operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::ADD); - idx++; - } else if (tokens[idx] == "batchnorm") { - DEBUG("Found batchnorm operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::BATCHNORM); - idx++; - } else if (tokens[idx] == "conv") { - DEBUG("Found conv operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::CONV); - idx++; - } else if (tokens[idx] == "group_conv") { - DEBUG("Found group_conv operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::GROUP_CONV); - idx++; - } else if (tokens[idx] == "mul") { - DEBUG("Found mul operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MUL); - idx++; - } else if (tokens[idx] == "relu") { - DEBUG("Found relu operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::RELU); - idx++; - } else if (tokens[idx] == "clipped_relu") { - DEBUG("Found clipped_relu operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU); - idx++; - } else if (tokens[idx] == "tanh") { - DEBUG("Found tanh operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::TANH); - idx++; - } else if (tokens[idx] == "pool_max") { - DEBUG("Found pool_max operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::POOL_MAX); - idx++; - } else if (tokens[idx] == "pool_mean") { - DEBUG("Found pool_mean operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::POOL_MEAN); - idx++; - } else if (tokens[idx] == "pool_min") { - DEBUG("Found pool_min operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::POOL_MIN); - idx++; - } else if (tokens[idx] == "softmax") { - DEBUG ("Found softmax operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::SOFTMAX); - idx++; - } else if (tokens[idx] == "fft") { - DEBUG ("Found fft operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::FFT); - idx++; - } else if (tokens[idx] == "reduce") { - DEBUG ("Found reduce operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::REDUCE); - idx++; - } else if (tokens[idx] == "projectiveT") { - DEBUG ("Found projectiveT operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T); - idx++; - } else if (tokens[idx] == "map1") { - DEBUG ("Found map1 operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP1); - idx++; - } else if (tokens[idx] == "map2") { - DEBUG ("Found map2 operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP2); - idx++; - } else if (tokens[idx] == "map3") { - DEBUG ("Found map3 operation\n"); - NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP3); - idx++; - } else /*Not a new operation. This means an approximation option*/ - if (tokens[idx] == "fp32") { - DEBUG("Found fp32 option\n"); - int fp32 = std::stoi(tokens[idx+1]); - DEBUG("fp32 parameter: %d, ignoring\n", fp32); - NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::FP32, - fp32); - idx += 2; - } else if (tokens[idx] == "fp16") { - DEBUG("Found fp16 option\n"); - int fp16 = std::stoi(tokens[idx+1]); - DEBUG("fp16 parameter: %d, ignoring\n", fp16); - NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::FP16, - fp16); - idx += 2; - } else if (tokens[idx] == "perf") { - DEBUG("Found perf option\n"); - int perf = std::stoi(tokens[idx+1]); - DEBUG("perf parameter: %d\n", perf); - NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::PERFORATION, perf); - idx += 2; - } else if (tokens[idx] == "samp") { - DEBUG("Found samp option\n"); - int samp = std::stoi(tokens[idx+1]); - DEBUG("samp parameter: %d\n", samp); - NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::INPUT_SAMPLING, samp); - idx += 2; - } else if (tokens[idx] == "red_samp") { - DEBUG("Found red_samp option\n"); - int red_samp = std::stoi(tokens[idx+1]); - DEBUG("red_samp parameter: %d\n", red_samp); - NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp); - idx += 2; - } - // TODO: other approximation options handled here - - } - - } else { - DEBUG ("Invalid Configuration File\n"); - exit(1); - } - - } - - qin.close(); - DEBUG("DONE.\n"); - -} - #define AL_THRESHOLD 0.01 -void RuntimeController::computeParetoConfigurationPoints() { - - // Keep indices of pareto optimal points (configurations from - // InitialConfigurations vector that were copied to Configurations vector.) - // The others' setup pointer needs to be deleted - std::vector<unsigned> Indices; - - // Baseline configuration (first one we read) always belongs to the curve - SpeedupConfigurations.push_back(&InitialConfigurations[0]); - EnergyConfigurations.push_back(&InitialConfigurations[0]); - - // Sort the configurations according to accuracy loss - INFO("Sorting autotuner configurations...\n"); - std::sort(InitialConfigurations.begin()+1, - InitialConfigurations.end(), - ConfigurationLessThan()); - INFO("Done sorting.\n"); - - for (unsigned start_idx = 1; start_idx < InitialConfigurations.size(); ) { - // Points to first Configuration with different (higher) accuracy loss - // compared to the one pointed by start_idx - unsigned end_idx = start_idx + 1; - while ((end_idx < InitialConfigurations.size()) && - (InitialConfigurations[end_idx].accuracyLoss - - InitialConfigurations[start_idx].accuracyLoss < AL_THRESHOLD)) { - end_idx++; - } - DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx); - // Now, all elements in [start_idx, end_idx) have equal accuracy loss, - // that is lower from later ones. - - // Find the best speedup and energy between them as well - float sp = -1.0; //FLT_MIN - unsigned sp_idx = 0; - - float en = -1.0; //FLT_MIN - unsigned en_idx = 0; - - for (unsigned i = start_idx; i < end_idx; i++) { - if (InitialConfigurations[i].speedup > sp) { - sp = InitialConfigurations[i].speedup; - sp_idx = i; - } - if (InitialConfigurations[i].energy > en) { - en = InitialConfigurations[i].energy; - en_idx = i; - } - } - DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n", - InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); - // Found best speedup for this accuracy point (not dominated by any of these). - DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n", - InitialConfigurations[en_idx].accuracyLoss, en, en_idx); - // Found best energy for this accuracy point (not dominated by any of these). - - // Now, we need to check that it is not dominated. - // - better accuracy loss of all in initial configurations out of - // start_idx, end_idx range - // - better or equal speedup to the ones within this range - // We only need to check the points already in Configurations, that have - // already been inserted in pareto frontier. These have better accuracy - // loss, so this one will only be added if it shows better speedup - // The one in curve with best speedup so far is the last one (with worst - // = highest accuracy loss), so compare only with that one. - - // Similar handling of energy vector - - bool sp_notDominated = true; - if (!SpeedupConfigurations.empty()) { - if (SpeedupConfigurations.back()->speedup >= sp) - sp_notDominated = false; - } - - bool en_notDominated = true; - if (!EnergyConfigurations.empty()) { - if (EnergyConfigurations.back()->energy >= en) - en_notDominated = false; - } - - DEBUG("sp_notDominated = %d\n", sp_notDominated); - DEBUG("en_notDominated = %d\n", en_notDominated); - - // If not dominated, insert in pareto frontier set - if (sp_notDominated) { - SpeedupConfigurations.push_back(&InitialConfigurations[sp_idx]); - } - if (en_notDominated) { - EnergyConfigurations.push_back(&InitialConfigurations[en_idx]); - } - - // Keep track of unnecessary configurations - for (unsigned i = start_idx; i < end_idx; i++) { - if (((i != sp_idx) || (!sp_notDominated)) && - ((i != en_idx) || (!en_notDominated))) - Indices.push_back(i); - } - - // Continue from next accuracy loss level - start_idx = end_idx; - - } - - // All elements in InitialConfigurations whose index is in Indices are no - // longer needed. -// for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e = Indices.end(); -// idx_it != idx_e; ++idx_it) { -// std::map<std::string, NodeConfiguration * > ConfSetup = -// InitialConfigurations[*idx_it].setup; -// for (std::map<std::string, NodeConfiguration* >::const_iterator it = ConfSetup.begin(); -// it != ConfSetup.end(); ++it) { -// delete it->second; -// } -// } -// InitialConfigurations.clear(); - -} - -void RuntimeController::compute3DParetoConfigurationPoints() { - - // Sort the configurations according to accuracy loss - INFO("Sorting autotuner configurations...\n"); - std::sort(InitialConfigurations.begin(), - InitialConfigurations.end(), - ConfigurationLessThan()); - INFO("Done sorting.\n"); - - for (unsigned start_idx = 0; start_idx < InitialConfigurations.size(); ) { - // Points to first Configuration with different (higher) accuracy loss - // compared to the one pointed by start_idx - unsigned end_idx = start_idx + 1; - while ((end_idx < InitialConfigurations.size()) && - (InitialConfigurations[end_idx].accuracyLoss - - InitialConfigurations[start_idx].accuracyLoss < AL_THRESHOLD)) { - end_idx++; - } - DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx); - // Now, all elements in [start_idx, end_idx) have equal accuracy loss, - // that is lower from later ones and worse than those already in curve - // (so they cannot displace them). - - // Find candidates from [start_idx, end_idx) to be inserted - // Keep their indices. If a point is dominated (strictly worse), - // its index will not be inserted - std::vector<unsigned> Indices; - - for (unsigned i = start_idx; i < end_idx; i++) { - bool dominated = false; - for (unsigned j = i+1; (j < end_idx) && !dominated; j++) { - if ((InitialConfigurations[i].speedup < InitialConfigurations[j].speedup) && - (InitialConfigurations[i].energy < InitialConfigurations[j].energy)) { - dominated = true; - } - } - if (!dominated) { - DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", - InitialConfigurations[i].accuracyLoss, - InitialConfigurations[i].speedup, - InitialConfigurations[i].energy, - i); - Indices.push_back(i); - } - } - - for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e = Indices.end(); - idx_it != idx_e; ++idx_it) { - Configuration &CandidateConfiguration = InitialConfigurations[*idx_it]; - - if (!ThreeDCurveConfigurations.empty()) { - bool notDominated = true; - for (unsigned i = 0; (i < ThreeDCurveConfigurations.size()) && notDominated; i++) { - if ((CandidateConfiguration.speedup <= ThreeDCurveConfigurations[i]->speedup) && - (CandidateConfiguration.energy <= ThreeDCurveConfigurations[i]->energy)) { - // This configuration is not better, in at least one characteristic, - // compared to the existing ones in the curve. - notDominated = false; - } - } - if (notDominated) { - ThreeDCurveConfigurations.push_back(&CandidateConfiguration); - } - } else { - // If the curve is empty, we know that this is a point that must be - // inserted. It has the best accuracy loss, and belongs here because - // it is not dominated by any point in this accuracy range. - ThreeDCurveConfigurations.push_back(&CandidateConfiguration); - } - } - - // Continue from next accuracy loss level - start_idx = end_idx; - } - -} - - -void RuntimeController::printConfigurations(std::vector<struct Configuration> &Confs) { - - for (std::vector<struct Configuration>::iterator it = Confs.begin(), - ie = Confs.end(); it != ie; ++it) { - it->print(); - } - -} - -void RuntimeController::printConfigurations(std::vector<struct Configuration *> &Confs) { - - for (std::vector<struct Configuration *>::iterator it = Confs.begin(), - ie = Confs.end(); it != ie; ++it) { - (*it)->print(); - } - -} - -void RuntimeController::findNextConfiguration() { - configurationIdx = (configurationIdx + 1) % Configurations->size() ; - DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n", configurationIdx); -} - -void RuntimeController::findTargetConfiguration(float goal, enum SEARCH_KIND sk) { - // We search in range begin(), end()-1 . It is OK to decrement end(), because - // the configurations vector always points to one of the pareto curves, and - // they are never empty - we have always pushed at least one configuration. - - DEBUG("findTargetConfiguration: goalVal: %f, search kind: %d.\n", goal, sk); - std::vector<struct Configuration *>::iterator low_it; - switch (sk) { - case SPEEDUP: - { - Configurations = &SpeedupConfigurations; - low_it = std::lower_bound(Configurations->begin(), - Configurations->end()-1, - goal, - ConfigurationLessThan_SP()); - configurationIdx = low_it - Configurations->begin(); - break; - } - case ENERGY: - { - Configurations = &EnergyConfigurations; - low_it = std::lower_bound(Configurations->begin(), - Configurations->end()-1, - goal, - ConfigurationLessThan_E()); - configurationIdx = low_it - Configurations->begin(); - break; - } - case ACCURACY_LOSS: - { - Configurations = &SpeedupConfigurations; - low_it = std::lower_bound(Configurations->begin(), - Configurations->end()-1, - goal, - ConfigurationLessThan_AL()); - if ((*low_it)->accuracyLoss > goal) - --low_it; - configurationIdx = low_it - Configurations->begin(); - break; - } - default: - { - CUSTOM_ASSERT(false && "Unknown search option for optimization target"); - ERROR("Unknown search option for optimization target."); - abort(); - } - } - // After search, low_it points to the Configuration to the element with the - // goal value or the immediately lower value if it does not exist - - DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n", configurationIdx); - -} - -void RuntimeController::adjustTargetConfiguration(float goal) { - - DEBUG("adjustTargetConfiguration: goalVal: %f.\n\n", goal); - - // Find configuration before the selected one. There is always one. - // Get the two configurations' speedup, and compute the appropriate ranges - float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup; - float prev_conf_speedup = (*Configurations)[configurationIdx-1]->speedup; - float sp_diff = curr_conf_speedup - prev_conf_speedup; - - float high_range = curr_conf_speedup - goal; - float low_range = goal - prev_conf_speedup; - - // These represent how likely we are to pick the upper or lower configuration - float high_pb = low_range / sp_diff; - float low_pb = high_range / sp_diff; - - DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: %f.\n", - ((*Configurations)[configurationIdx]->name).c_str(), high_pb); - DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: %f.\n\n", - ((*Configurations)[configurationIdx-1]->name).c_str(), low_pb); - - // Select a random number from 0 to 1 - // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1) to the upper -// float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ; - pseudo_rd += 0.1f; - float rd = pseudo_rd; - if (rd < low_pb) { - // If the probability is in the low range - configurationIdx--; - } - - DEBUG("adjustTargetConfiguration: rand: %f : Updated configurationIdx to %u.\n", - rd, configurationIdx); -} - #define MAX_GOAL_SPEEDUP 9 -float RuntimeController::getGoalSpeedup() { - return 1.0 + (rand() / (RAND_MAX / (MAX_GOAL_SPEEDUP - 1.0) ) ); -} - -double RuntimeController::getBaselineTime() { - return baseline_time; -} - -Slowdowns *RuntimeController::getSlowdowns() { - return slowdowns; -} // Functions to be inserted with initializeTensorRT and clearTensorRT -void llvm_hpvm_initializeRuntimeController(const char *ConfigFile, const char *QRangeFile) { - RC = new RuntimeController(); - RC->init(ConfigFile, QRangeFile); - return; -} +void llvm_hpvm_initializeRuntimeController(const char *ConfigFile, const char *QRangeFile); -void llvm_hpvm_clearRuntimeController() { - delete RC; - return; -} +void llvm_hpvm_clearRuntimeController(); //*** Methods to compute accuracy of a tensor by the runtime controller ***// -uint32_t* labels_from_file = NULL; - -uint32_t* hpvm_rt_readLabelsBatch_cached(const char* labels_file, int start, int end) { - - // Initialize buffer - if (!labels_from_file) { - FILE* file = fopen(labels_file, "rb"); - if (file == NULL) { - ERROR("Data file %s is not found. Aborting...\n", labels_file); - abort(); - } - // Get number of labels - fseek(file, 0, SEEK_END); - long size = ftell(file); - fseek(file, 0, SEEK_SET); // return file pointer to beginning - - // Allocate memory for labels - labels_from_file = (uint32_t*) malloc(size); - if (labels_from_file == NULL) { - ERROR("Memory allocation for labels unsucessfull. Aborting...\n"); - abort(); - } - - // Copy the labels file into the allocated buffer - size_t result = fread(labels_from_file, 1, size, file); - if (result != size) { - // We did not read as many elemets as there are in the file - ERROR("Reading labels file unsucessfull. Aborting...\n"); - abort(); - } - - fclose(file); - } - -// int num_labels = end - start; -// uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels); -// for (unsigned i = start; i < end; i++) { -// labels[i-start] = labels_from_file[i]; -// } -// return labels; - - // Return pointer to labels - return &labels_from_file[start]; - -} +extern uint32_t* labels_from_file; + +uint32_t* hpvm_rt_readLabelsBatch_cached(const char* labels_file, int start, int end); //*** Copied from dnn_sources/include/utils.h ***// -float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr) { - - struct Tensor* result = (struct Tensor*) result_ptr; - - size_t batch_dim = result->dims.dim_sizes[0]; - size_t num_classes = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; - int num_errors = 0; - - printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes); - - for(int i = 0; i < batch_dim; i++){ - - int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id; - } - - if(chosen != labels[i]) - num_errors++; - } - - float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; - printf("****** Accuracy = %f \n\n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ - - std::ostringstream ss; - ss << std::fixed << accuracy; - std::string print_str = ss.str(); - - fwrite(print_str.c_str(), 1, print_str.length(), fp); - } - - fclose(fp); - - return accuracy; -} +float hpvm_rt_computeAccuracy3(uint32_t* labels, void* result_ptr); #define llvm_hpvm_invokeRtControl_SLOWDOWN_PR llvm_hpvm_invokeRtControl -void llvm_hpvm_invokeRtControl_BASE(void* result, const char* str, int start, int end) { - - RC->resume_profiler(); - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); - - // Note the end of iteration - RC->end_iteration(); -} - -void llvm_hpvm_invokeRtControl_ITERATE(void* result, const char* str, int start, int end) { - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->resume_profiler(); - RC->findNextConfiguration(); - // Still use findNext configuration, to update the configurationIdx, - // to point to next location - enum SEARCH_KIND k = ACCURACY_LOSS; - float goalVal = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss; - RC->findTargetConfiguration(goalVal, k); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); - - // Note the end of iteration - RC->end_iteration(); -} +void llvm_hpvm_invokeRtControl_BASE(void* result, const char* str, int start, int end); -void llvm_hpvm_invokeRtControl_ADJUST(void* result, const char* str, int start, int end) { +void llvm_hpvm_invokeRtControl_ITERATE(void* result, const char* str, int start, int end); - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); +void llvm_hpvm_invokeRtControl_ADJUST(void* result, const char* str, int start, int end); - // Read stats for iteration that was just completed - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); +void llvm_hpvm_invokeRtControl_SLOWDOWN(void* result, const char* str, int start, int end); - RC->resume_profiler(); - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double baseline_time = RC->getBaselineTime(); - double target_speedup = current_iteration_time / baseline_time; - RC->findTargetConfiguration(target_speedup, SPEEDUP); - RC->pause_profiler(); +void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void* result, const char* str, int start, int end); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); +void llvm_hpvm_invokeRtControl_RAND(void* result, const char* str, int start, int end); - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("target speedup = %lf\n\n", target_speedup); - - // Note the end of iteration - RC->end_iteration(); -} - -void llvm_hpvm_invokeRtControl_SLOWDOWN(void* result, const char* str, int start, int end) { - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - std::string prev_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - - RC->resume_profiler(); - float slowdown = RC->getSlowdowns()->getNextSlowdown(); - RC->findTargetConfiguration(slowdown, SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - std::string next_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - float next_conf_speedup = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("slowdown (target speedup) = %f\n", slowdown); - INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); - - // Note the end of iteration - RC->end_iteration(); -} - -void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void* result, const char* str, int start, int end) { - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - std::string prev_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - - RC->resume_profiler(); - float slowdown = RC->getSlowdowns()->getNextSlowdown(); - RC->findTargetConfiguration(slowdown, SPEEDUP); - RC->adjustTargetConfiguration(slowdown); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - std::string next_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - float next_conf_speedup = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("slowdown (target speedup) = %f\n", slowdown); - INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); - - // Note the end of iteration - RC->end_iteration(); -} - -void llvm_hpvm_invokeRtControl_RAND(void* result, const char* str, int start, int end) { - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->resume_profiler(); - RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); - - // Note the end of iteration - RC->end_iteration(); -} - - - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->resume_profiler(); - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double baseline_time = RC->getBaselineTime(); - double target_speedup = current_iteration_time / baseline_time; - RC->findTargetConfiguration(target_speedup, SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("target speedup = %lf\n\n", target_speedup); - - // Note the end of iteration - RC->end_iteration(); -} - -void llvm_hpvm_invokeRtControl_SLOWDOWN(void* result, const char* str, int start, int end) { - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - std::string prev_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - - RC->resume_profiler(); - float slowdown = RC->getSlowdowns()->getNextSlowdown(); - RC->findTargetConfiguration(slowdown, SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - std::string next_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - float next_conf_speedup = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("slowdown (target speedup) = %f\n", slowdown); - INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); - - // Note the end of iteration - RC->end_iteration(); -} - -void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void* result, const char* str, int start, int end) { - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - std::string prev_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - - RC->resume_profiler(); - float slowdown = RC->getSlowdowns()->getNextSlowdown(); - RC->findTargetConfiguration(slowdown, SPEEDUP); - RC->adjustTargetConfiguration(slowdown); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - std::string next_conf_name = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; - float next_conf_speedup = - RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - - INFO("current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); - INFO("slowdown (target speedup) = %f\n", slowdown); - INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO("Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); - - // Note the end of iteration - RC->end_iteration(); -} - -void llvm_hpvm_invokeRtControl_RAND(void* result, const char* str, int start, int end) { - - uint32_t* labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); - hpvm_rt_computeAccuracy3(labels_cached, result); - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->resume_profiler(); - RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP); - RC->pause_profiler(); - - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); - - // Note the end of iteration - RC->end_iteration(); -} - - - -#endif \ No newline at end of file +#endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp index b1c472dc61665144c09743629192ce069c43a2d2..a35fd3eae58ac708ea10386416072736ddd98d91 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp @@ -1,4 +1,5 @@ #include "functional/common.h" +#include "tensor_utils.h" #include <algorithm> #include <functional> diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp new file mode 100644 index 0000000000000000000000000000000000000000..63dd927cd09f25ed995a8dd3ec2fd3f91c1a6032 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp @@ -0,0 +1,221 @@ +#include "configuration.h" + +using P_APPROX = PROMISENodeConfiguration::APPROX; +using G_APPROX = GPUNodeConfiguration::APPROX; +using G_TENSOR_OP = GPUNodeConfiguration::TENSOR_OP; + +bool NodeConfiguration::isPROMISENodeConfiguration() { + return NODE_CONFIGURATION_TARGET_ID == PROMISE; +} + +bool NodeConfiguration::isGPUNodeConfiguration() { + return NODE_CONFIGURATION_TARGET_ID == GPU; +} + +void PROMISENodeConfiguration::pushNewApproximationChoice( + P_APPROX approx, int u) { + ApproxChoices.push_back(std::make_pair(approx, u)); +} + +std::vector<std::pair<P_APPROX, int>> & +PROMISENodeConfiguration::getApproxChoices() { + return ApproxChoices; +} + +PROMISENodeConfiguration::PROMISENodeConfiguration() { + NODE_CONFIGURATION_TARGET_ID = PROMISE; +} + +PROMISENodeConfiguration::~PROMISENodeConfiguration() {} + +void GPUNodeConfiguration::pushNewTensorOperation(enum TENSOR_OP top) { + std::vector<std::pair<G_APPROX, int>> emptyVec; + ApproxChoices.push_back(std::make_pair(top, emptyVec)); +} + +void GPUNodeConfiguration::pushNewApproximationChoiceForOperation( + G_APPROX approx, int u) { + unsigned size = ApproxChoices.size(); + CUSTOM_ASSERT( + size >= 1 && + "Cannot apply approximation choice to non existent operation."); + ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u)); +} + +std::vector<std::pair<G_TENSOR_OP, std::vector<std::pair<G_APPROX, int>>>> & +GPUNodeConfiguration::getApproxChoices() { + return ApproxChoices; +} + +GPUNodeConfiguration::GPUNodeConfiguration() { + NODE_CONFIGURATION_TARGET_ID = GPU; +} +GPUNodeConfiguration::~GPUNodeConfiguration() {} + +Configuration::Configuration( + std::string &n, float f, float e, float a, float al) + : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {} + +float Configuration::getSpeedup() { return speedup; } + +float Configuration::getEnergy() { return energy; } + +float Configuration::getAccuracy() { return accuracy; } + +float Configuration::getAccuracyLoss() { return accuracyLoss; } +bool ConfigurationLessThan:: +operator()(const struct Configuration &a, const struct Configuration &b) const { + return (a.accuracyLoss < b.accuracyLoss); +} +bool ConfigurationLessThan_AL:: +operator()(const struct Configuration *a, const float &b) const { + return (a->accuracyLoss < b); +} +bool ConfigurationLessThan_SP:: +operator()(const struct Configuration *a, const float &b) const { + return (a->speedup < b); +} +bool ConfigurationLessThan_E:: +operator()(const struct Configuration *a, const float &b) const { + return (a->energy < b); +} + +//****** HEADER Ends - Source Starts + +// Helper configuration print methods + +void PROMISENodeConfiguration::print() { + + printf(" promise"); + for (auto &it : ApproxChoices) { + printf(" "); + switch (it.first) { + case P_APPROX::SWING_LEVEL: + printf("swing_level"); + break; + default: + ERROR("Unknown approximation option"); + break; + // TODO additional approx methods to be printed here + } + printf(" %d", it.second); + } + + printf("\n"); +} + +void GPUNodeConfiguration::print() { + + printf(" gpu"); + for (auto &it : ApproxChoices) { + + printf(" "); + switch (it.first) { + case TENSOR_OP::ADD: + printf("add"); + break; + case TENSOR_OP::BATCHNORM: + printf("batchnorm"); + break; + case TENSOR_OP::CONV: + printf("conv"); + break; + case TENSOR_OP::GROUP_CONV: + printf("group_conv"); + break; + case TENSOR_OP::MUL: + printf("mul"); + break; + case TENSOR_OP::RELU: + printf("relu"); + break; + case TENSOR_OP::CLIPPED_RELU: + printf("clipped_relu"); + break; + case TENSOR_OP::TANH: + printf("tanh"); + break; + case TENSOR_OP::POOL_MAX: + printf("pool_max"); + break; + case TENSOR_OP::POOL_MEAN: + printf("pool_mean"); + break; + case TENSOR_OP::POOL_MIN: + printf("pool_min"); + break; + case TENSOR_OP::SOFTMAX: + printf("softmax"); + break; + case TENSOR_OP::FFT: + printf("fft"); + break; + case TENSOR_OP::REDUCE: + printf("reduce"); + break; + case TENSOR_OP::PROJECTIVE_T: + printf("projectiveT"); + break; + case TENSOR_OP::MAP1: + printf("map1"); + break; + case TENSOR_OP::MAP2: + printf("map2"); + break; + case TENSOR_OP::MAP3: + printf("map3"); + break; + default: + ERROR("Unknown tensor operation."); + break; + // TODO additional operations to be printed here + } + + auto &approxVec = it.second; + for (auto &inner_it : approxVec) { + printf(" "); + switch (inner_it.first) { + case G_APPROX::FP32: + printf("fp32"); + break; + case G_APPROX::FP16: + printf("fp16"); + break; + case G_APPROX::PERFORATION: + printf("perf"); + break; + case G_APPROX::INPUT_SAMPLING: + printf("samp"); + break; + case G_APPROX::REDUCTION_SAMPLING: + printf("red_samp"); + break; + default: + ERROR("Unknown approximation option"); + break; + // TODO additional approx methods to be printed here + } + + printf(" %d", inner_it.second); + } + } + + printf("\n"); +} + +void Configuration::print() { + + printf("+++++\n"); + printf( + "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, + accuracyLoss); + for (std::map<std::string, NodeConfiguration *>::const_iterator it = + setup.begin(); + it != setup.end(); ++it) { + printf("%s :", it->first.c_str()); + + it->second->print(); + } + + printf("-----\n"); +} diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu index 6a8e5cb0f4fae763328114576213cb66f131e00c..41be0d65a46214daffac44719140a8faca505946 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu @@ -1,6 +1,6 @@ #include "device_math.h" +#include "error.h" -#include <cuda_fp16.h> #include <thrust/complex.h> #define DEF_FUNC_PTR(fname) __device__ void *fname##_ptr = (void *)(fname); @@ -18,6 +18,18 @@ namespace _internal { +// The following functions are not used, but they reference their cuda counterpart +// which is necessary, otherwise ptx compilation breaks +__device__ float hypotf_(float x, float y) { return hypotf(x, y); } + +__device__ float atan2f_(float x, float y) { return atan2f(x, y); } + +__device__ float sqrtf_(float x) { return sqrtf(x); } + +__device__ float fmax_(float x, float y) { return fmax(x, y); } + +__device__ float fmin_(float x, float y) { return fmin(x, y); } + __device__ float add(float x, float y) { return x + y; } __device__ float sub(float x, float y) { return x - y; } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc index 951cfbfdb0097e3376a7c0038ed3e550ce795c07..98ea25df2ed793220857e9da944054543472a724 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc @@ -1,8 +1,3 @@ - -#ifndef GLOBAL_DATA_HEADER -#define GLOBAL_DATA_HEADER - - #include <stdio.h> #include <stdarg.h> #include <cstdio> @@ -14,11 +9,13 @@ #include <cublas_v2.h> #include <cudnn.h> #include <cublas_api.h> -#include "tensor.h" #include <string> #include <unordered_map> #include <vector> +#include "tensor.h" +#include "global_data.h" + /* Data declarations */ cudnnHandle_t cudnnHandle; cublasHandle_t cublasHandle; @@ -35,9 +32,7 @@ int total_ops = 0; std::vector<int> op_accuracies; std::vector<Range*> quant_ranges; -std::vector<void*> tensors_ptr; -std::vector<void*> host_ptr; -std::vector<void*> obj_ptr; +std::unordered_set<void*> tensors_ptr, host_ptr, obj_ptr; std::unordered_map<void*, int> tracked_tensors; @@ -47,6 +42,3 @@ std::unordered_map<int, int> skip_tensors; // Profiling Data std::unordered_map<std::string, int> func_counters; std::string profile_data = ""; - - -#endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2730e2f2786987eb77527011bf29de1a87b3f582 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp @@ -0,0 +1,1357 @@ +#include "hpvm-rt-controller.h" + +/* + * Check if a file exists + * Return true if the file exists, false else + */ +bool fileExists(const std::string &file) { + struct stat buf; + return (stat(file.c_str(), &buf) == 0); +} + +// Functions +void ProfileInfo::resetCurrentIterationTime() { + time_compute_current_iteration = 0.0; + time_control_current_iteration = 0.0; + time_config_current_iteration = 0.0; +} + +void ProfileInfo::resetCurrentIterationEnergy() { + energy_compute_current_iteration = 0.0; + energy_control_current_iteration = 0.0; + energy_config_current_iteration = 0.0; +} + +void ProfileInfo::start_iteration() { + if (!in_iteration) { + resetCurrentIterationTime(); + resetCurrentIterationEnergy(); + tensor_time_info.push_back(std::vector<std::pair<std::string, double>>()); + tensor_energy_info.push_back(std::vector<std::pair<std::string, double>>()); + in_iteration = true; + } +} +void ProfileInfo::end_iteration() { + // Update time counters + time_compute += time_compute_current_iteration; + time_control += time_control_current_iteration; + time_config += time_config_current_iteration; + + time_total += + (time_compute_current_iteration + time_control_current_iteration + + time_config_current_iteration); + + // Update energy counters + energy_compute += energy_compute_current_iteration; + energy_control += energy_control_current_iteration; + energy_config += energy_config_current_iteration; + + energy_total += + (energy_compute_current_iteration + energy_control_current_iteration + + energy_config_current_iteration); + + // Save current iteration counters + compute_time_info.push_back(time_compute_current_iteration); + compute_energy_info.push_back(energy_compute_current_iteration); + control_time_info.push_back(time_control_current_iteration); + control_energy_info.push_back(energy_control_current_iteration); + config_time_info.push_back(time_config_current_iteration); + config_energy_info.push_back(energy_config_current_iteration); + + // Note end of iteration + in_iteration = false; +} + +void ProfileInfo::addToCurrentIterationComputeTime(const char *s, double t) { + start_iteration(); + time_compute_current_iteration += t; + tensor_time_info.back().push_back(std::make_pair(std::string(s), t)); +} + +void ProfileInfo::addToCurrentIterationControlTime(double t) { + start_iteration(); + time_control_current_iteration += t; +} + +void ProfileInfo::addToCurrentIterationConfigTime(double t) { + start_iteration(); + time_config_current_iteration += t; +} + +void ProfileInfo::addToCurrentIterationComputeEnergy(const char *s, double e) { + start_iteration(); + energy_compute_current_iteration += e; + tensor_energy_info.back().push_back(std::make_pair(std::string(s), e)); +} + +void ProfileInfo::addToCurrentIterationControlEnergy(double e) { + start_iteration(); + energy_control_current_iteration += e; +} + +void ProfileInfo::addToCurrentIterationConfigEnergy(double e) { + start_iteration(); + energy_config_current_iteration += e; +} + +double ProfileInfo::getTotalTime() { return time_total; } + +double ProfileInfo::getTotalEnergy() { return energy_total; } + +double ProfileInfo::getCurrentIterationComputeTime() { + return time_compute_current_iteration; +} + +double ProfileInfo::getCurrentIterationComputeEnergy() { + return energy_compute_current_iteration; +} + +void ProfileInfo::set_out_file_name(std::string &str) { out_file_name = str; } + +void ProfileInfo::printToFile() { + + INFO("Writing Runtime Profile Info File...\n"); + std::ofstream s_out(out_file_name.c_str()); + + if (!s_out) { + ERROR("Failed to open output file."); + abort(); + } + + // By construction, tensor_time_info and tensor_energy_info are expected + // to have equal sizes, in outer and inner vectors both, + // and all time_info and energy_info vectors must have the same size. + unsigned iterations = tensor_time_info.size(); + CUSTOM_ASSERT( + (tensor_time_info.size() == iterations) && + (tensor_energy_info.size() == iterations) && + (control_time_info.size() == iterations) && + (control_energy_info.size() == iterations) && + (config_time_info.size() == iterations) && + (config_energy_info.size() == iterations) && + "time_info and energy_info size: \ + iteration number does not match."); + + for (unsigned i = 0; i < tensor_time_info.size(); i++) { + // time_info.size() == energy_info.size(), since we passed the assertion + s_out << "Iteration " << i << "\n"; + + CUSTOM_ASSERT( + (tensor_time_info[i].size() == tensor_energy_info[i].size()) && + "time_info and energy_info size: operation number does not match."); + for (unsigned j = 0; j < tensor_time_info[i].size(); j++) { + // time_info[i].size() == energy_info[i].size(), we passed the assertion + CUSTOM_ASSERT( + (tensor_time_info[i][j].first == tensor_energy_info[i][j].first) && + "time_info and energy_info: operation does not match."); + s_out << tensor_time_info[i][j].first << " " + << tensor_time_info[i][j].second << " " + << tensor_energy_info[i][j].second << "\n"; + } + + s_out << "\nIteration Compute Time : " << compute_time_info[i] << "\n"; + s_out << "Iteration Compute Energy: " << compute_energy_info[i] << "\n"; + s_out << "Iteration Control Time : " << control_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << control_energy_info[i] << "\n"; + s_out << "Iteration Config Time : " << config_time_info[i] << "\n"; + s_out << "Iteration Control Energy: " << config_energy_info[i] << "\n\n\n"; + } + s_out << "\n\nTotal Compute Time : " << time_compute << "\n"; + s_out << "Total Compute Energy: " << energy_compute << "\n"; + + s_out << "\nTotal Control Time : " << time_control << "\n"; + s_out << "Total Control Energy: " << energy_control << "\n"; + + s_out << "\nTotal Config Time : " << time_config << "\n"; + s_out << "Total Config Energy: " << energy_config << "\n"; + + s_out << "\nTotal Time : " << time_total << "\n"; + s_out << "Total Energy: " << energy_total << "\n"; + + s_out.close(); + + INFO("Done writing profile.\n"); +} + +ProfileInfo::ProfileInfo() + : time_total(0.0), energy_total(0.0), time_compute_current_iteration(0.0), + time_control_current_iteration(0.0), time_config_current_iteration(0.0), + energy_compute_current_iteration(0.0), + energy_control_current_iteration(0.0), + energy_config_current_iteration(0.0), in_iteration(false) {} +Slowdowns::Slowdowns() { + idx = 0; + + std::ifstream s_in("slowdowns.txt"); + if (!s_in) { + DEBUG("slowdowns file not found. Initializing slowdowns randomly.\n"); + for (unsigned i = 0; i < 10; i++) { + slowdowns.push_back(1.0 + (rand() / (RAND_MAX / (5.0 - 1.0)))); + } + } else { + for (std::string line; std::getline(s_in, line);) { + float s = std::stof(line); + slowdowns.push_back(s); + } + } +} + +unsigned Slowdowns::getSlowdownsNumber() { return slowdowns.size(); } + +float Slowdowns::getNextSlowdown() { + float tmp = slowdowns[idx]; + idx = (idx + 1) % slowdowns.size(); + return tmp; +} + +// Functions + +// Private functions of profiler +void RuntimeController::start_profiler() { + if (profiler) + profiler->start_profiler(); +} +void RuntimeController::stop_profiler() { + if (profiler) + profiler->stop_profiler(); +} +// For testing purposes only - do not use widely +std::vector<struct Configuration *> &RuntimeController:: +getSpeedupConfigurations() { + return SpeedupConfigurations; +} +// For testing purposes only - do not use widely +std::vector<struct Configuration *> &RuntimeController:: +getEnergyConfigurations() { + return EnergyConfigurations; +} +// For testing purposes only - do not use widely +std::vector<struct Configuration *> &RuntimeController:: +getThreeDCurveConfigurations() { + return ThreeDCurveConfigurations; +} +// For testing purposes only - do not use widely +unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; } + +std::vector<float> &RuntimeController::getQuantizationRanges(const char *data) { + std::string s(data); + // All nodes are expected to have quantization ranges + return QuantizationMap.at(s); +} + +NodeConfiguration *RuntimeController::getNodeConfiguration(const char *data) { + std::string s(data); + // All nodes are expected to have a configuration + return (*Configurations)[configurationIdx]->setup.at(s); +} + +void RuntimeController::init(const char *Cstr, const char *Qstr) { + // We initialize the path to the profile info output file, + // based on the path given for the configuration file + setProfileInfoFilename(Cstr); + + readQuantizationFile(Qstr); + readConfigurationFile(Cstr); + Configurations = NULL; + computeParetoConfigurationPoints(); + // compute3DParetoConfigurationPoints(); Not using 3D curve + INFO("Speedup Configurations\n"); + printConfigurations(SpeedupConfigurations); + // INFO("Energy Configurations\n"); + // printConfigurations(EnergyConfigurations); + // INFO("3D Configurations\n"); + // printConfigurations(ThreeDCurveConfigurations); + configurationIdx = + 0; // TODO: initialize using pareto curve - findTargetConfiguration ? + Configurations = &SpeedupConfigurations; + + // Initializations for different runtime control strategies + srand(static_cast<unsigned>(time(0))); + slowdowns = new Slowdowns(); + pseudo_rd = 0.0; + + // Start profiling thread in the background, ready to time + start_profiler(); + pause_profiler(); + reset_profiler(); +} + +// Exposing functionality of ProfileInfo +void RuntimeController::end_iteration() { + if (PI) + PI->end_iteration(); +} + +void RuntimeController::addToCurrentIterationComputeTime( + const char *s, double t) { + if (PI) + PI->addToCurrentIterationComputeTime(s, t); +} + +void RuntimeController::addToCurrentIterationControlTime(double t) { + if (PI) + PI->addToCurrentIterationControlTime(t); +} + +void RuntimeController::addToCurrentIterationConfigTime(double t) { + if (PI) + PI->addToCurrentIterationConfigTime(t); +} + +void RuntimeController::addToCurrentIterationComputeEnergy( + const char *s, double e) { + if (PI) + PI->addToCurrentIterationComputeEnergy(s, e); +} + +void RuntimeController::addToCurrentIterationControlEnergy(double e) { + if (PI) + PI->addToCurrentIterationControlEnergy(e); +} + +void RuntimeController::addToCurrentIterationConfigEnergy(double e) { + if (PI) + PI->addToCurrentIterationConfigEnergy(e); +} + +double RuntimeController::getCurrentIterationComputeTime() { + return (PI ? PI->getCurrentIterationComputeTime() : 0.0); +} + +double RuntimeController::getCurrentIterationComputeEnergy() { + return (PI ? PI->getCurrentIterationComputeEnergy() : 0.0); +} + +void RuntimeController::writeProfileInfo() { + if (PI) + PI->printToFile(); +} + +// Exposing functionality of (gpu) profiler +void RuntimeController::resume_profiler() { + if (profiler) + profiler->resume_profiler(); +} + +void RuntimeController::pause_profiler() { + if (profiler) + profiler->pause_profiler(); +} + +void RuntimeController::reset_profiler() { + if (profiler) + profiler->reset(); +} + +std::pair<double, double> RuntimeController::get_time_energy() const { + return (profiler ? profiler->get_time_energy() : std::make_pair(0.0, 0.0)); +} + +// Exposing functionality of promise simulator +std::pair<double, double> RuntimeController::fc_profile( + const unsigned num_rows_a, const unsigned num_cols_a, + const unsigned num_rows_b, const unsigned num_cols_b, + const unsigned voltage_swing, const unsigned patch_factor) { + return ( + promise ? promise->fc_profile( + num_rows_a, num_cols_a, num_rows_b, num_cols_b, + voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)); +} + +std::pair<double, double> RuntimeController::conv_profile( + const unsigned n, const unsigned c, const unsigned h, const unsigned w, + const unsigned c_out, const unsigned c_in, const unsigned k_h, + const unsigned k_w, const unsigned s_h, const unsigned s_w, + const unsigned voltage_swing, const unsigned patch_factor) { + return ( + promise ? promise->conv_profile( + n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing, + patch_factor) + : std::make_pair(0.0, 0.0)); +} + +// Constructor and descructor +RuntimeController::RuntimeController() { + configurationIdx = 0; +#ifdef ACTIVE_PROFILING + PI = new ProfileInfo(); + profiler = new Profiler(); + promise = new Promise(); +#else + PI = NULL; + profiler = NULL; + promise = NULL; +#endif +} + +RuntimeController::~RuntimeController() { + + stop_profiler(); + writeProfileInfo(); + + if (PI) { + delete PI; + } + if (profiler) { + delete profiler; + } + if (promise) { + delete promise; + } + + for (std::vector<struct Configuration>::iterator + it = InitialConfigurations.begin(), + ie = InitialConfigurations.end(); + it != ie; ++it) { + std::map<std::string, NodeConfiguration *> ConfSetup = it->setup; + for (std::map<std::string, NodeConfiguration *>::const_iterator it = + ConfSetup.begin(); + it != ConfSetup.end(); ++it) { + delete it->second; + } + } + // Handle freeing memory, for all configurations + // A way to do that is to not free the initial configurations in the pareto + // curve, and free all at once in the end This is done because configurations + // are stored in different containers, but share the node setup +} + +void RuntimeController::setProfileInfoFilename(const char *str) { + + if (PI) { + std::string file_path = std::string(str); + size_t idx = file_path.find_last_of("/"); + file_path.erase(idx + 1); + file_path.append("profile_info_"); + + bool found = false; + std::string profile_filename; + for (unsigned i = 0; !found; i++) { + profile_filename = file_path; + profile_filename.append(std::to_string(i)); + profile_filename.append(".txt"); + found = !fileExists(profile_filename); + } + + PI->set_out_file_name(profile_filename); + } +} +void RuntimeController::readQuantizationFile(const char *str) { + + INFO("Reading Quantization Ranges File...\n"); + + if (std::string(str).empty()) { + INFO("Empty quantization file string.\n"); + return; + } + + std::ifstream qin(str); + + if (!qin) { + ERROR("Failed to open PROMISE quantization file."); + abort(); + } + + while (!qin.eof()) { + char NodeName[NODE_NAME_BUFFER_SIZE]; + std::vector<float> QuantRangeVector; + + qin >> NodeName; + + float qrange; + for (unsigned i = 0; i < 8; i++) { + qin >> qrange; + QuantRangeVector.push_back(qrange); + } + // See if we need to insert this in map instead - my lookup test seemed to + // work without it std::string s(NodeName); + QuantizationMap.insert( + std::pair<std::string, std::vector<float>>(NodeName, QuantRangeVector)); + } + + qin.close(); + INFO("DONE.\n"); +} + +void RuntimeController::printQuantizationMap() { + + DEBUG("Quantization Ranges Map:\n"); + + for (std::map<std::string, std::vector<float>>::const_iterator it = + QuantizationMap.begin(); + it != QuantizationMap.end(); ++it) { + DEBUG("%s :", it->first.c_str()); + + for (unsigned i = 0; i < it->second.size(); i++) { + DEBUG(" %f", it->second[i]); + } + + DEBUG("\n"); + } +} + +void RuntimeController::readConfigurationFile(const char *str) { + + INFO("Reading Configuration File...\n"); + + std::ifstream qin(str); + + if (!qin) { + ERROR("Failed to open configuration file."); + abort(); + } + + bool readingConfiguration = false; + bool readingFirstLine = false; + + // Read baseline_time from first line of configuration file + std::string first_line; + std::getline(qin, first_line); + DEBUG("first_line: %s\n", first_line.c_str()); + baseline_time = std::stod(first_line); + DEBUG("Baseline time: %lf\n\n", baseline_time); + + for (std::string line; std::getline(qin, line);) { + DEBUG("line: %s\n", line.c_str()); + + // Tokenize using ' ' as delimiter + // Vector to store tokens + std::vector<std::string> tokens; + + for (auto i = strtok(&line[0], " "); i != NULL; i = strtok(NULL, " ")) + tokens.push_back(i); + + for (unsigned i = 0; i < tokens.size(); i++) + DEBUG("t: %s\n", tokens[i].c_str()); + + DEBUG("\n"); + + if (tokens[0] == "+++++") { // Found new configuration start token + // Mark the start of a new configuration + readingConfiguration = true; + readingFirstLine = true; + continue; + } + + if (tokens[0] == "-----") { // Found configuration end token + readingConfiguration = false; + // Mark the end of current configuration + continue; + } + + if (readingFirstLine) { + // Read first line, to create the new configuration struct + readingFirstLine = false; + InitialConfigurations.push_back(Configuration( + tokens[0], std::stof(tokens[1]), std::stof(tokens[2]), + std::stof(tokens[3]), std::stof(tokens[4]))); + continue; + } + + if (tokens[1] == "promise") { + DEBUG("Found promise configuration\n"); + + // There must be at least one approximation option + CUSTOM_ASSERT( + (tokens.size() >= 2) && "Not enough approximation options."); + + PROMISENodeConfiguration *NodeConf = new PROMISENodeConfiguration(); + InitialConfigurations.back().setup.insert( + std::make_pair(tokens[0], NodeConf)); + + // In increments of two, to handle pairs of approx option - tunable + // parameter + for (unsigned idx = 2; idx < tokens.size(); idx += 2) { + if (tokens[idx] == "swing_level") { + DEBUG("Found swing voltage option\n"); + int vswing = std::stoi(tokens[idx + 1]); + DEBUG("vswing: %d\n", vswing); + NodeConf->pushNewApproximationChoice( + PROMISENodeConfiguration::APPROX::SWING_LEVEL, vswing); + } + // TODO: other approximation options handled here + } + + } else if (tokens[1] == "gpu") { + DEBUG("Found gpu configuration\n"); + + // There must be at least one operation, with an approximation option + CUSTOM_ASSERT( + (tokens.size() >= 5) && + "Not enough operations - approximation options."); + + GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration(); + InitialConfigurations.back().setup.insert( + std::make_pair(tokens[0], NodeConf)); + + unsigned idx = 2; + while (idx < tokens.size()) { + if (tokens[idx] == "add") { + DEBUG("Found add operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::ADD); + idx++; + } else if (tokens[idx] == "batchnorm") { + DEBUG("Found batchnorm operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::BATCHNORM); + idx++; + } else if (tokens[idx] == "conv") { + DEBUG("Found conv operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::CONV); + idx++; + } else if (tokens[idx] == "group_conv") { + DEBUG("Found group_conv operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::GROUP_CONV); + idx++; + } else if (tokens[idx] == "mul") { + DEBUG("Found mul operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::MUL); + idx++; + } else if (tokens[idx] == "relu") { + DEBUG("Found relu operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::RELU); + idx++; + } else if (tokens[idx] == "clipped_relu") { + DEBUG("Found clipped_relu operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU); + idx++; + } else if (tokens[idx] == "tanh") { + DEBUG("Found tanh operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::TANH); + idx++; + } else if (tokens[idx] == "pool_max") { + DEBUG("Found pool_max operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::POOL_MAX); + idx++; + } else if (tokens[idx] == "pool_mean") { + DEBUG("Found pool_mean operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::POOL_MEAN); + idx++; + } else if (tokens[idx] == "pool_min") { + DEBUG("Found pool_min operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::POOL_MIN); + idx++; + } else if (tokens[idx] == "softmax") { + DEBUG("Found softmax operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::SOFTMAX); + idx++; + } else if (tokens[idx] == "fft") { + DEBUG("Found fft operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::FFT); + idx++; + } else if (tokens[idx] == "reduce") { + DEBUG("Found reduce operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::REDUCE); + idx++; + } else if (tokens[idx] == "projectiveT") { + DEBUG("Found projectiveT operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T); + idx++; + } else if (tokens[idx] == "map1") { + DEBUG("Found map1 operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::MAP1); + idx++; + } else if (tokens[idx] == "map2") { + DEBUG("Found map2 operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::MAP2); + idx++; + } else if (tokens[idx] == "map3") { + DEBUG("Found map3 operation\n"); + NodeConf->pushNewTensorOperation( + GPUNodeConfiguration::TENSOR_OP::MAP3); + idx++; + } else /*Not a new operation. This means an approximation option*/ + if (tokens[idx] == "fp32") { + DEBUG("Found fp32 option\n"); + int fp32 = std::stoi(tokens[idx + 1]); + DEBUG("fp32 parameter: %d, ignoring\n", fp32); + NodeConf->pushNewApproximationChoiceForOperation( + GPUNodeConfiguration::APPROX::FP32, fp32); + idx += 2; + } else if (tokens[idx] == "fp16") { + DEBUG("Found fp16 option\n"); + int fp16 = std::stoi(tokens[idx + 1]); + DEBUG("fp16 parameter: %d, ignoring\n", fp16); + NodeConf->pushNewApproximationChoiceForOperation( + GPUNodeConfiguration::APPROX::FP16, fp16); + idx += 2; + } else if (tokens[idx] == "perf") { + DEBUG("Found perf option\n"); + int perf = std::stoi(tokens[idx + 1]); + DEBUG("perf parameter: %d\n", perf); + NodeConf->pushNewApproximationChoiceForOperation( + GPUNodeConfiguration::APPROX::PERFORATION, perf); + idx += 2; + } else if (tokens[idx] == "samp") { + DEBUG("Found samp option\n"); + int samp = std::stoi(tokens[idx + 1]); + DEBUG("samp parameter: %d\n", samp); + NodeConf->pushNewApproximationChoiceForOperation( + GPUNodeConfiguration::APPROX::INPUT_SAMPLING, samp); + idx += 2; + } else if (tokens[idx] == "red_samp") { + DEBUG("Found red_samp option\n"); + int red_samp = std::stoi(tokens[idx + 1]); + DEBUG("red_samp parameter: %d\n", red_samp); + NodeConf->pushNewApproximationChoiceForOperation( + GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp); + idx += 2; + } + // TODO: other approximation options handled here + } + + } else { + DEBUG("Invalid Configuration File\n"); + exit(1); + } + } + + qin.close(); + DEBUG("DONE.\n"); +} +void RuntimeController::computeParetoConfigurationPoints() { + + // Keep indices of pareto optimal points (configurations from + // InitialConfigurations vector that were copied to Configurations vector.) + // The others' setup pointer needs to be deleted + std::vector<unsigned> Indices; + + // Baseline configuration (first one we read) always belongs to the curve + SpeedupConfigurations.push_back(&InitialConfigurations[0]); + EnergyConfigurations.push_back(&InitialConfigurations[0]); + + // Sort the configurations according to accuracy loss + INFO("Sorting autotuner configurations...\n"); + std::sort( + InitialConfigurations.begin() + 1, InitialConfigurations.end(), + ConfigurationLessThan()); + INFO("Done sorting.\n"); + + for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) { + // Points to first Configuration with different (higher) accuracy loss + // compared to the one pointed by start_idx + unsigned end_idx = start_idx + 1; + while ((end_idx < InitialConfigurations.size()) && + (InitialConfigurations[end_idx].accuracyLoss - + InitialConfigurations[start_idx].accuracyLoss < + AL_THRESHOLD)) { + end_idx++; + } + DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx); + // Now, all elements in [start_idx, end_idx) have equal accuracy loss, + // that is lower from later ones. + + // Find the best speedup and energy between them as well + float sp = -1.0; // FLT_MIN + unsigned sp_idx = 0; + + float en = -1.0; // FLT_MIN + unsigned en_idx = 0; + + for (unsigned i = start_idx; i < end_idx; i++) { + if (InitialConfigurations[i].speedup > sp) { + sp = InitialConfigurations[i].speedup; + sp_idx = i; + } + if (InitialConfigurations[i].energy > en) { + en = InitialConfigurations[i].energy; + en_idx = i; + } + } + DEBUG( + "accuracy loss = %f, speedup = %f, at sp_idx = %d\n", + InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); + // Found best speedup for this accuracy point (not dominated by any of + // these). + DEBUG( + "accuracy loss = %f, energy = %f, at en_idx = %d\n", + InitialConfigurations[en_idx].accuracyLoss, en, en_idx); + // Found best energy for this accuracy point (not dominated by any of + // these). + + // Now, we need to check that it is not dominated. + // - better accuracy loss of all in initial configurations out of + // start_idx, end_idx range + // - better or equal speedup to the ones within this range + // We only need to check the points already in Configurations, that have + // already been inserted in pareto frontier. These have better accuracy + // loss, so this one will only be added if it shows better speedup + // The one in curve with best speedup so far is the last one (with worst + // = highest accuracy loss), so compare only with that one. + + // Similar handling of energy vector + + bool sp_notDominated = true; + if (!SpeedupConfigurations.empty()) { + if (SpeedupConfigurations.back()->speedup >= sp) + sp_notDominated = false; + } + + bool en_notDominated = true; + if (!EnergyConfigurations.empty()) { + if (EnergyConfigurations.back()->energy >= en) + en_notDominated = false; + } + + DEBUG("sp_notDominated = %d\n", sp_notDominated); + DEBUG("en_notDominated = %d\n", en_notDominated); + + // If not dominated, insert in pareto frontier set + if (sp_notDominated) { + SpeedupConfigurations.push_back(&InitialConfigurations[sp_idx]); + } + if (en_notDominated) { + EnergyConfigurations.push_back(&InitialConfigurations[en_idx]); + } + + // Keep track of unnecessary configurations + for (unsigned i = start_idx; i < end_idx; i++) { + if (((i != sp_idx) || (!sp_notDominated)) && + ((i != en_idx) || (!en_notDominated))) + Indices.push_back(i); + } + + // Continue from next accuracy loss level + start_idx = end_idx; + } + + // All elements in InitialConfigurations whose index is in Indices are no + // longer needed. + // for (std::vector<unsigned>::iterator idx_it = Indices.begin(), idx_e = + // Indices.end(); + // idx_it != idx_e; ++idx_it) { + // std::map<std::string, NodeConfiguration * > ConfSetup = + // InitialConfigurations[*idx_it].setup; + // for (std::map<std::string, NodeConfiguration* >::const_iterator it = + // ConfSetup.begin(); + // it != ConfSetup.end(); ++it) { + // delete it->second; + // } + // } + // InitialConfigurations.clear(); +} + +void RuntimeController::compute3DParetoConfigurationPoints() { + + // Sort the configurations according to accuracy loss + INFO("Sorting autotuner configurations...\n"); + std::sort( + InitialConfigurations.begin(), InitialConfigurations.end(), + ConfigurationLessThan()); + INFO("Done sorting.\n"); + + for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) { + // Points to first Configuration with different (higher) accuracy loss + // compared to the one pointed by start_idx + unsigned end_idx = start_idx + 1; + while ((end_idx < InitialConfigurations.size()) && + (InitialConfigurations[end_idx].accuracyLoss - + InitialConfigurations[start_idx].accuracyLoss < + AL_THRESHOLD)) { + end_idx++; + } + DEBUG("start_idx = %d, end_idx = %d\n", start_idx, end_idx); + // Now, all elements in [start_idx, end_idx) have equal accuracy loss, + // that is lower from later ones and worse than those already in curve + // (so they cannot displace them). + + // Find candidates from [start_idx, end_idx) to be inserted + // Keep their indices. If a point is dominated (strictly worse), + // its index will not be inserted + std::vector<unsigned> Indices; + + for (unsigned i = start_idx; i < end_idx; i++) { + bool dominated = false; + for (unsigned j = i + 1; (j < end_idx) && !dominated; j++) { + if ((InitialConfigurations[i].speedup < + InitialConfigurations[j].speedup) && + (InitialConfigurations[i].energy < + InitialConfigurations[j].energy)) { + dominated = true; + } + } + if (!dominated) { + DEBUG( + "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", + InitialConfigurations[i].accuracyLoss, + InitialConfigurations[i].speedup, InitialConfigurations[i].energy, + i); + Indices.push_back(i); + } + } + + for (std::vector<unsigned>::iterator idx_it = Indices.begin(), + idx_e = Indices.end(); + idx_it != idx_e; ++idx_it) { + Configuration &CandidateConfiguration = InitialConfigurations[*idx_it]; + + if (!ThreeDCurveConfigurations.empty()) { + bool notDominated = true; + for (unsigned i = 0; + (i < ThreeDCurveConfigurations.size()) && notDominated; i++) { + if ((CandidateConfiguration.speedup <= + ThreeDCurveConfigurations[i]->speedup) && + (CandidateConfiguration.energy <= + ThreeDCurveConfigurations[i]->energy)) { + // This configuration is not better, in at least one characteristic, + // compared to the existing ones in the curve. + notDominated = false; + } + } + if (notDominated) { + ThreeDCurveConfigurations.push_back(&CandidateConfiguration); + } + } else { + // If the curve is empty, we know that this is a point that must be + // inserted. It has the best accuracy loss, and belongs here because + // it is not dominated by any point in this accuracy range. + ThreeDCurveConfigurations.push_back(&CandidateConfiguration); + } + } + + // Continue from next accuracy loss level + start_idx = end_idx; + } +} + +void RuntimeController::printConfigurations( + std::vector<struct Configuration> &Confs) { + + for (std::vector<struct Configuration>::iterator it = Confs.begin(), + ie = Confs.end(); + it != ie; ++it) { + it->print(); + } +} + +void RuntimeController::printConfigurations( + std::vector<struct Configuration *> &Confs) { + + for (std::vector<struct Configuration *>::iterator it = Confs.begin(), + ie = Confs.end(); + it != ie; ++it) { + (*it)->print(); + } +} + +void RuntimeController::findNextConfiguration() { + configurationIdx = (configurationIdx + 1) % Configurations->size(); + DEBUG( + "findNextConfiguration: Updated configurationIdx to %u.\n", + configurationIdx); +} + +void RuntimeController::findTargetConfiguration( + float goal, enum SEARCH_KIND sk) { + // We search in range begin(), end()-1 . It is OK to decrement end(), because + // the configurations vector always points to one of the pareto curves, and + // they are never empty - we have always pushed at least one configuration. + + DEBUG("findTargetConfiguration: goalVal: %f, search kind: %d.\n", goal, sk); + std::vector<struct Configuration *>::iterator low_it; + switch (sk) { + case SPEEDUP: { + Configurations = &SpeedupConfigurations; + low_it = std::lower_bound( + Configurations->begin(), Configurations->end() - 1, goal, + ConfigurationLessThan_SP()); + configurationIdx = low_it - Configurations->begin(); + break; + } + case ENERGY: { + Configurations = &EnergyConfigurations; + low_it = std::lower_bound( + Configurations->begin(), Configurations->end() - 1, goal, + ConfigurationLessThan_E()); + configurationIdx = low_it - Configurations->begin(); + break; + } + case ACCURACY_LOSS: { + Configurations = &SpeedupConfigurations; + low_it = std::lower_bound( + Configurations->begin(), Configurations->end() - 1, goal, + ConfigurationLessThan_AL()); + if ((*low_it)->accuracyLoss > goal) + --low_it; + configurationIdx = low_it - Configurations->begin(); + break; + } + default: { + CUSTOM_ASSERT(false && "Unknown search option for optimization target"); + ERROR("Unknown search option for optimization target."); + abort(); + } + } + // After search, low_it points to the Configuration to the element with the + // goal value or the immediately lower value if it does not exist + + DEBUG( + "findTargetConfiguration: Updated configurationIdx to %u.\n", + configurationIdx); +} + +void RuntimeController::adjustTargetConfiguration(float goal) { + + DEBUG("adjustTargetConfiguration: goalVal: %f.\n\n", goal); + + // Find configuration before the selected one. There is always one. + // Get the two configurations' speedup, and compute the appropriate ranges + float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup; + float prev_conf_speedup = (*Configurations)[configurationIdx - 1]->speedup; + float sp_diff = curr_conf_speedup - prev_conf_speedup; + + float high_range = curr_conf_speedup - goal; + float low_range = goal - prev_conf_speedup; + + // These represent how likely we are to pick the upper or lower configuration + float high_pb = low_range / sp_diff; + float low_pb = high_range / sp_diff; + + DEBUG( + "**---- adjustTargetConfiguration: upper conf = %s with probability: " + "%f.\n", + ((*Configurations)[configurationIdx]->name).c_str(), high_pb); + DEBUG( + "**---- adjustTargetConfiguration: lower conf = %s with probability: " + "%f.\n\n", + ((*Configurations)[configurationIdx - 1]->name).c_str(), low_pb); + + // Select a random number from 0 to 1 + // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1) + // to the upper + // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ; + pseudo_rd += 0.1f; + float rd = pseudo_rd; + if (rd < low_pb) { + // If the probability is in the low range + configurationIdx--; + } + + DEBUG( + "adjustTargetConfiguration: rand: %f : Updated configurationIdx to %u.\n", + rd, configurationIdx); +} +float RuntimeController::getGoalSpeedup() { + return 1.0 + (rand() / (RAND_MAX / (MAX_GOAL_SPEEDUP - 1.0))); +} + +double RuntimeController::getBaselineTime() { return baseline_time; } + +Slowdowns *RuntimeController::getSlowdowns() { return slowdowns; } + +// Functions to be inserted with initializeTensorRT and clearTensorRT +void llvm_hpvm_initializeRuntimeController( + const char *ConfigFile, const char *QRangeFile) { + RC = new RuntimeController(); + RC->init(ConfigFile, QRangeFile); + return; +} + +void llvm_hpvm_clearRuntimeController() { + delete RC; + return; +} + +//*** Methods to compute accuracy of a tensor by the runtime controller ***// +uint32_t *labels_from_file = NULL; + +uint32_t * +hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) { + + // Initialize buffer + if (!labels_from_file) { + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { + ERROR("Data file %s is not found. Aborting...\n", labels_file); + abort(); + } + // Get number of labels + fseek(file, 0, SEEK_END); + long size = ftell(file); + fseek(file, 0, SEEK_SET); // return file pointer to beginning + + // Allocate memory for labels + labels_from_file = (uint32_t *)malloc(size); + if (labels_from_file == NULL) { + ERROR("Memory allocation for labels unsucessfull. Aborting...\n"); + abort(); + } + + // Copy the labels file into the allocated buffer + size_t result = fread(labels_from_file, 1, size, file); + if (result != size) { + // We did not read as many elemets as there are in the file + ERROR("Reading labels file unsucessfull. Aborting...\n"); + abort(); + } + + fclose(file); + } + + // int num_labels = end - start; + // uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels); + // for (unsigned i = start; i < end; i++) { + // labels[i-start] = labels_from_file[i]; + // } + // return labels; + + // Return pointer to labels + return &labels_from_file[start]; +} + +//*** Copied from dnn_sources/include/utils.h ***// +float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { + + struct Tensor *result = (struct Tensor *)result_ptr; + + size_t batch_dim = result->dims.dim_sizes[0]; + size_t num_classes = result->dims.dim_sizes[1]; + float *data = (float *)result->host_data; + int num_errors = 0; + + printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes); + + for (int i = 0; i < batch_dim; i++) { + + int chosen = 0; + for (int id = 1; id < num_classes; ++id) { + if (data[i * num_classes + chosen] < data[i * num_classes + id]) + chosen = id; + } + + if (chosen != labels[i]) + num_errors++; + } + + float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; + printf("****** Accuracy = %f \n\n", accuracy); + + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { + + std::ostringstream ss; + ss << std::fixed << accuracy; + std::string print_str = ss.str(); + + fwrite(print_str.c_str(), 1, print_str.length(), fp); + } + + fclose(fp); + + return accuracy; +} +void llvm_hpvm_invokeRtControl_BASE( + void *result, const char *str, int start, int end) { + + RC->resume_profiler(); + + uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); + hpvm_rt_computeAccuracy3(labels_cached, result); + + // Read stats for iteration that was just completed + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); + + INFO( + "current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); + + // Note the end of iteration + RC->end_iteration(); +} + +void llvm_hpvm_invokeRtControl_ITERATE( + void *result, const char *str, int start, int end) { + + uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); + hpvm_rt_computeAccuracy3(labels_cached, result); + + // Read stats for iteration that was just completed + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + + RC->resume_profiler(); + RC->findNextConfiguration(); + // Still use findNext configuration, to update the configurationIdx, + // to point to next location + enum SEARCH_KIND k = ACCURACY_LOSS; + float goalVal = + RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->accuracyLoss; + RC->findTargetConfiguration(goalVal, k); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); + + INFO( + "current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); + + // Note the end of iteration + RC->end_iteration(); +} + +void llvm_hpvm_invokeRtControl_ADJUST( + void *result, const char *str, int start, int end) { + + uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); + hpvm_rt_computeAccuracy3(labels_cached, result); + + // Read stats for iteration that was just completed + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + + RC->resume_profiler(); + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double baseline_time = RC->getBaselineTime(); + double target_speedup = current_iteration_time / baseline_time; + RC->findTargetConfiguration(target_speedup, SPEEDUP); + RC->pause_profiler(); + + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); + + INFO( + "current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); + INFO("target speedup = %lf\n\n", target_speedup); + + // Note the end of iteration + RC->end_iteration(); +} + +void llvm_hpvm_invokeRtControl_SLOWDOWN( + void *result, const char *str, int start, int end) { + + uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); + hpvm_rt_computeAccuracy3(labels_cached, result); + + // Read stats for iteration that was just completed + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + + std::string prev_conf_name = + RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; + + RC->resume_profiler(); + float slowdown = RC->getSlowdowns()->getNextSlowdown(); + RC->findTargetConfiguration(slowdown, SPEEDUP); + RC->pause_profiler(); + + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); + + std::string next_conf_name = + RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; + float next_conf_speedup = + RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; + + INFO( + "current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); + INFO("slowdown (target speedup) = %f\n", slowdown); + INFO("Previous configuration: %s\n", prev_conf_name.c_str()); + INFO( + "Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), next_conf_speedup); + + // Note the end of iteration + RC->end_iteration(); +} + +void llvm_hpvm_invokeRtControl_SLOWDOWN_PR( + void *result, const char *str, int start, int end) { + + uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); + hpvm_rt_computeAccuracy3(labels_cached, result); + + // Read stats for iteration that was just completed + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + + std::string prev_conf_name = + RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; + + RC->resume_profiler(); + float slowdown = RC->getSlowdowns()->getNextSlowdown(); + RC->findTargetConfiguration(slowdown, SPEEDUP); + RC->adjustTargetConfiguration(slowdown); + RC->pause_profiler(); + + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); + + std::string next_conf_name = + RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->name; + float next_conf_speedup = + RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; + + INFO( + "current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); + INFO("slowdown (target speedup) = %f\n", slowdown); + INFO("Previous configuration: %s\n", prev_conf_name.c_str()); + INFO( + "Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), next_conf_speedup); + + // Note the end of iteration + RC->end_iteration(); +} + +void llvm_hpvm_invokeRtControl_RAND( + void *result, const char *str, int start, int end) { + + uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); + hpvm_rt_computeAccuracy3(labels_cached, result); + + // Read stats for iteration that was just completed + double current_iteration_time = RC->getCurrentIterationComputeTime(); + double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); + + RC->resume_profiler(); + RC->findTargetConfiguration(RC->getGoalSpeedup(), SPEEDUP); + RC->pause_profiler(); + + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationControlTime(pinfo.first); + RC->addToCurrentIterationControlEnergy(pinfo.second); + + INFO( + "current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); + + // Note the end of iteration + RC->end_iteration(); +} diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu index 6929785df3532d917bff726e6d0af0b2dbb93842..7f10ee0acdb3da4ec7008b23cc3322ab3a0b13fa 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu @@ -4,7 +4,7 @@ #include "functional/map.cuh" #include "functional/reduce.cuh" -#include "tensor_utils.cu" +#include "tensor_utils.h" #include <cufft.h> #include <cufftXt.h> diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp index dc65b8acdc31242879b4b1f1e34af1b1ccc661c2..de254b218853047b5f2147054c7a18a2faa71cf5 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp @@ -1,13 +1,17 @@ +#include <algorithm> #include <cmath> #include <cstring> #include <experimental/filesystem> -#include <string> +#include <numeric> #include <sstream> +#include <string> #include "debug.h" #include "device_math.h" +#include "functional/common.h" #include "img_tensor_runtime.h" #include "img_tensor_utils.h" +#include "tensor_utils.h" // Image I/O utilities #define STB_IMAGE_IMPLEMENTATION @@ -18,12 +22,12 @@ static inline uint8_t *float_to_uint8(const float *fl, size_t count) { auto *ret = new uint8_t[count]; - float max_v = *std::max_element(fl, fl + count), min_v = *std::min_element(fl, fl + count); + float max_v = *std::max_element(fl, fl + count), + min_v = *std::min_element(fl, fl + count); if (max_v - min_v < 1e-3) { for (size_t i = 0; i < count; i++) ret[i] = 0; - } - else { + } else { float frac = 255 / (max_v - min_v); for (size_t i = 0; i < count; i++) ret[i] = uint8_t(frac * (fl[i] - min_v)); @@ -149,7 +153,8 @@ readDataSet(const char *path, size_t start, size_t count, size_t n_color) { throw std::runtime_error("Image load failed"); if (x != h || y != w) { std::ostringstream os; - os << "Image file " << path << " have different shape (" << x << ", " << y << ")"; + os << "Image file " << path << " have different shape (" << x << ", " << y + << ")"; throw std::runtime_error(os.str()); } float *converted = uint8_to_float(data, n_floats); @@ -398,7 +403,7 @@ float violationRate( float mean(const std::vector<float> &values) { std::vector<float> non_nan; - for (float f: values) + for (float f : values) if (!std::isnan(f)) non_nan.push_back(f); if (non_nan.empty()) @@ -428,5 +433,6 @@ void reshape(void *t, const std::vector<size_t> &shape) { free(tensor->dims.dim_sizes); tensor->dims.dim_sizes = (size_t *)malloc(sizeof(size_t) * shape.size()); std::copy(shape.begin(), shape.end(), tensor->dims.dim_sizes); - set4DTensorDescriptor(tensor, tensor->data_format, shape[0], shape[1], shape[2], shape[3]); + set4DTensorDescriptor( + tensor, tensor->data_format, shape[0], shape[1], shape[2], shape[3]); } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu index 4210d5cfcee8b38c426c418d611e3e5264ec50b8..1d848430330c346f0e10efef407324f23defd911 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu @@ -1,8 +1,3 @@ - -#ifndef TENSOR_UTILS_HEADER -#define TENSOR_UTILS_HEADER - - #include <stdio.h> #include <stdlib.h> #include <stdarg.h> @@ -30,6 +25,7 @@ #include <cublas_api.h> #include <vector> +#include "tensor_utils.h" #include "tensor_runtime.h" #include "debug.h" #include "tensor.h" @@ -58,7 +54,7 @@ void freeTensor(void* tensor_ptr){ // Returns the size of the target datatype -static int getTypeSize(int data_type){ +int getTypeSize(int data_type){ // TODO: Add support for more data types switch (data_type) { case float_type: @@ -530,14 +526,4 @@ void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement ERROR("Tensor == NULL"); tensor->data_placement = data_placement; } - - - - - } // end of Extern"C" - - -#endif - -