// profiler.h — runtime GPU/DDR energy profiler for the NVIDIA Jetson TX2.
//
// Build note (from the accompanying CMake change): built as the static
// library `gpu_profiler` from src/profiler.cpp with include/ on the
// include path:
//   set(libsrc src/profiler.cpp)
//   target_include_directories(gpu_profiler PRIVATE include)

#pragma once  // FIX: header previously had no include guard

#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <fstream>
#include <iostream>
#include <mutex>    // FIX: std::mutex member is used below but <mutex> was never included
#include <string>
#include <thread>
#include <utility>  // FIX: std::pair is used in get_time_energy()'s return type
#include <vector>

// Reads power rails at runtime and computes the GPU and DDR energy within a
// window of time, which is delimited by the calls to resume_profiler() and
// pause_profiler().
//
// IMPORTANT: Must call stop_profiler() to kill the profiler thread.
// (FIX: the original comment said pause_profiler(), but pause only puts the
// thread to sleep — stop_profiler() is what exits and joins it.)
//
// Public interface methods:
//   void start_profiler();
//   void resume_profiler();
//   void pause_profiler();
//   std::pair<double, double> get_time_energy() const;
//   void reset();
//   void stop_profiler();
class Profiler {
public:
    Profiler();

    ~Profiler();

    // Reinitializes boolean vars used for control flow and launches the
    // profiler thread. DOES NOT reset other internal data structures.
    void start_profiler();

    // Resumes the profiling of whatever executable is currently running.
    // DOES NOT reset any data.
    void resume_profiler();

    // Pauses the profiler by putting the profiler thread to sleep.
    void pause_profiler();

    // Gets the delta time and total GPU and DDR energy between the last two
    // calls to resume_profiler and pause_profiler.
    //
    // Returns this as a pair of <delta time in milliseconds, energy>.
    std::pair<double, double> get_time_energy() const;

    // Resets all internal data structures, including the vector storing all
    // power readings.
    void reset();

    // Exits the profiler and kills the thread.
    // Must call start_profiler() to reuse this object after calling
    // stop_profiler().
    void stop_profiler();

private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;
    const unsigned core2 = 4;
    const unsigned core3 = 5;

    // Power rails are mounted as sysfs files. Keeping the old power rail file
    // names for possible future integrations.
    const std::string cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const std::string gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const std::string ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const std::string soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const std::string sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // An individual power reading (one sample of every rail).
    struct PowerReading {
        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
        double cpu_;
        double gpu_;
        double ddr_;
        double soc_;
        double sys_;
    };

    // Stores all power readings; cleared only when reset() is called.
    std::vector<PowerReading> power_readings_;

    // Timestamp of the most recent resume_profiler() call.
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // For reading the i2c buses via sysfs.
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    std::mutex mutex_;

    std::condition_variable cond_var_;

    // True if we want the profiling thread to sample (guarded by mutex_).
    bool should_run_profiler_;

    // Quit profiling (atomic: checked outside the lock by the thread loop).
    std::atomic_bool should_stop_profiler_;

    std::thread profiler_thread_;

    // Obtains a single power reading from the GPU and DDR rails.
    void obtain_power_reading();

    // Pins the given thread to the specified core.
    void pin_thread(std::thread &t, const unsigned core) const;

    // Runs the profiler thread, keeping it alive by wrapping the functionality
    // in an infinite loop.
    void run_profiler();
};
// offline_profiler.cpp — samples the Jetson TX2 power rails while a child
// program runs and attributes time/energy to individual tensor operations.

#include <cmath>
#include <chrono>
#include <cstdlib>   // std::system, exit — FIX: previously relied on transitive includes
#include <iostream>
#include <fstream>
#include <sstream>   // std::istringstream — stdlib replacement for boost::split
#include <string>
#include <vector>
#include <map>
#include <thread>
#include <atomic>
#include <sched.h>
#include <pthread.h> // pthread_setaffinity_np — FIX: include explicitly

#define NUM_ARGS 4

// This is a simple power profiler that can sample the power of the various
// components in a Jetson TX2. The usage is simple: profile() measures power
// for the specified program, and then dumpOutput() prints the readings to a
// file. profile() can be called as many times as desired - the internal state
// is reset each time and thus the measurements are not cumulative.
class Profiler {
private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;
    const unsigned core2 = 4;
    const unsigned core3 = 5;

    // sysfs paths for i2c buses of various components
    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // It takes some time for the GPU's power to return to idle (ms)
    const unsigned gpu_idle_time = 0;

    // An individual power reading
    struct PowerReading {
        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
        double cpu_;
        double gpu_;
        double ddr_;
        double soc_;
        double sys_;
    };

    // Individual tensor op: timestamps plus the energy/power attributed to it
    struct TensorOp {
        std::string name_;

        double start_;
        double finish_;
        double time_;

        double energy_;
        double gpu_energy_;
        double ddr_energy_;

        double power_;
        double gpu_power_;
        double ddr_power_;

        TensorOp(std::string name, double start, double finish)
            : name_(name), start_(start), finish_(finish), time_(finish - start),
              energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
              power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {
        }
    };

    // Aggregate tensor info (mean and standard deviation across iterations)
    struct AggTensorInfo {
        // Op name
        std::string name_;

        // Averages
        double average_time_;

        double average_energy_;
        double average_gpu_energy_;
        double average_ddr_energy_;

        double average_power_;
        double average_gpu_power_;
        double average_ddr_power_;

        // Standard deviations
        double time_std_;

        double energy_std_;
        double gpu_energy_std_;
        double ddr_energy_std_;

        double power_std_;
        double gpu_power_std_;
        double ddr_power_std_;
    };

    // Total time, energy, and power accumulated across all iterations
    struct TotalInfo {
        double time_;

        double energy_;
        double gpu_energy_;
        double ddr_energy_;

        double power_;
        double gpu_power_;
        double ddr_power_;

        void clear() {
            time_ = 0.0;

            energy_ = 0.0;
            gpu_energy_ = 0.0;
            ddr_energy_ = 0.0;

            power_ = 0.0;
            gpu_power_ = 0.0;
            ddr_power_ = 0.0;
        }
    };

    // For reading the i2c buses via sysfs
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    // Start time (so graph begins from t=0)
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // Per-run info
    std::vector<PowerReading> power_readings_;

    // Aggregate (across all runs) info
    std::map<std::string, std::vector<TensorOp>> tensor_info_;
    std::vector<AggTensorInfo> agg_tensor_info_;
    TotalInfo total_info_;
    unsigned iterations_;

    // Start and stop flags to synchronize the program and profiling threads
    std::atomic_bool start_;
    std::atomic_bool stop_;

private:
    // Splits a line on tab characters.
    // FIX: stdlib replacement for boost::split — removes the only third-party
    // dependency this tool had.
    static std::vector<std::string> splitTabs(const std::string &line) {
        std::vector<std::string> tokens;
        std::istringstream iss(line);
        std::string token;
        while (std::getline(iss, token, '\t'))
            tokens.push_back(token);
        return tokens;
    }

    // Resets tensor info and total time and energy
    void resetGlobal() {
        tensor_info_.clear();
        agg_tensor_info_.clear();
        total_info_.clear();
    }

    // Resets power readings and flags
    void resetLocal() {
        power_readings_.clear();
        start_ = false;
        stop_ = false;
    }

    // Pins the given thread to the specified core
    void pinThread(std::thread &t, const unsigned core) const {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(core, &cpuset);
        if (pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset) != 0)
            std::cout << "Couldn't set thread affinity\n";
    }

    // Adds a tensor op to the map; operator[] default-constructs the vector
    // on first use, which replaces the original find/insert dance.
    void addTensorOp(const std::string &op_name, const TensorOp &top) {
        tensor_info_[op_name].push_back(top);
    }

    // Obtains a single power reading from the GPU and DDR rails
    void getPowerReading() {
        PowerReading reading;

        // The order matters here. All the reads have to happen together first
        // and then all the seeks have to happen together at the end, otherwise
        // there will be a significant time difference between the readings of
        // the different rails.
        reading.time_ = std::chrono::high_resolution_clock::now();
        gpu_stream_ >> reading.gpu_;
        ddr_stream_ >> reading.ddr_;
        power_readings_.push_back(reading);

        // Reset the input position of the files
        gpu_stream_.seekg(0);
        ddr_stream_.seekg(0);
    }

    // Executes the program to be profiled
    void runProgram(const char * const program) {
        // Tell the profiling thread to start, execute the program that needs
        // to be profiled, and then tell the profiling thread to stop.
        start_ = true;
        const int result = std::system(program);
        // FIX: the exit status was silently discarded; at least warn so a
        // failed run isn't mistaken for a valid measurement.
        if (result != 0)
            std::cout << "WARNING: profiled program returned non-zero status " << result << "\n";
        stop_ = true;
    }

    // Records power while the program is running
    void recordPower() {
        // Obtain the new start time, wait for the start signal, and keep
        // profiling until the stop flag is set.
        start_time_ = std::chrono::high_resolution_clock::now();
        while (!start_);
        while (!stop_)
            getPowerReading();
    }

    // Calculates stats for the entire execution (CPU+GPU phase)
    void updateTotalStats() {
        double gpu_energy = 0.0;
        double ddr_energy = 0.0;

        std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_;
        // const& — the original copied every PowerReading per iteration
        for (const auto &reading : power_readings_) {
            std::chrono::duration<double> duration = reading.time_ - prev_time;
            gpu_energy += reading.gpu_ * duration.count();
            ddr_energy += reading.ddr_ * duration.count();
            prev_time = reading.time_;
        }
        const double energy = gpu_energy + ddr_energy;
        const double time = std::chrono::duration<double>(prev_time - start_time_).count();

        total_info_.time_ += time;
        total_info_.energy_ += energy;
        total_info_.gpu_energy_ += gpu_energy;
        total_info_.ddr_energy_ += ddr_energy;

        // FIX: guard against division by zero when no readings were taken
        if (time > 0.0) {
            total_info_.power_ += (energy / time);
            total_info_.gpu_power_ += (gpu_energy / time);
            total_info_.ddr_power_ += (ddr_energy / time);
        }
    }

    // Calculates energy and power usage of the given tensor operation by
    // integrating the power samples that fall inside [start_, finish_].
    void calculateTensorEP(TensorOp &top) const {
        double prev_time = top.start_;
        const std::size_t num_readings = power_readings_.size();
        std::size_t i = 0;

        // Skip until we hit the start time of the operation.
        // FIX: the original indexed power_readings_[i] with no bounds check
        // here, and in the loop below checked the bounds only AFTER indexing —
        // both are out-of-bounds reads (UB) when the op extends past the last
        // sample. Bounds are now checked before every access.
        while (i < num_readings &&
               std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count() < top.start_)
            i++;

        // Keep going until we hit the finish time of the operation or we run
        // out of readings
        for (; i < num_readings; i++) {
            const double curr_time =
                std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count();
            if (curr_time > top.finish_)
                break;
            const double duration = curr_time - prev_time;
            prev_time = curr_time;

            top.gpu_energy_ += power_readings_[i].gpu_ * duration;
            top.ddr_energy_ += power_readings_[i].ddr_ * duration;
        }
        top.energy_ = top.gpu_energy_ + top.ddr_energy_;

        // FIX: guard against division by zero for zero-length ops
        if (top.time_ > 0.0) {
            top.power_ = top.energy_ / top.time_;
            top.gpu_power_ = top.gpu_energy_ / top.time_;
            top.ddr_power_ = top.ddr_energy_ / top.time_;
        }
    }

    // Calculates stats for all the tensors in the timestamp file
    void updatePerOpStats() {
        const char * const op_file = "profile_data.txt";
        std::string line;
        std::ifstream ifs(op_file, std::ios::in);

        // Calculate time and energy for each tensor operation. There are two
        // possibilities for the file format:
        //   If the line doesn't begin with #, we are looking at FP32 code
        //   without any conversions to/from FP16, and each operation occupies
        //   two consecutive lines in the timestamp file.
        //   If the line does begin with #, we are looking at FP16 code with
        //   conversion routines in the middle. In this case, *after* the
        //   current line, there will be two lines for F2H, two lines for H2F,
        //   and then one line for the end of the operation.
        while (std::getline(ifs, line)) {
            std::vector<std::string> tokens = splitTabs(line);
            // FIX: blank or malformed lines previously caused out-of-bounds
            // access on tokens[0]/tokens[1]
            if (tokens.size() < 2)
                continue;

            // FP32
            if (tokens[0][0] != '#') {
                // First line with tensor op name and start time
                // (FIX: the original redundantly re-declared op_name here,
                // shadowing the outer copy)
                const std::string op_name = tokens[0];
                const double start = std::stod(tokens[1]);

                // Second line with tensor op end time
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double finish = std::stod(tokens[1]);

                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);
                addTensorOp(op_name, top);
            } else {
                // First line with tensor op name (after '#') and start time
                const std::string op_name = tokens[0].substr(1);
                const double start = std::stod(tokens[1]);

                // Second line with f2h
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const std::string f2h_name = op_name + "_f2h";
                const double f2h_start = std::stod(tokens[1]);

                // Third line with f2h
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double f2h_finish = std::stod(tokens[1]);

                // Add f2h
                TensorOp f2h(f2h_name, f2h_start, f2h_finish);
                calculateTensorEP(f2h);
                addTensorOp(f2h_name, f2h);

                // Fourth line with h2f
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const std::string h2f_name = op_name + "_h2f";
                const double h2f_start = std::stod(tokens[1]);

                // Fifth line with h2f
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double h2f_finish = std::stod(tokens[1]);

                // Add h2f
                TensorOp h2f(h2f_name, h2f_start, h2f_finish);
                calculateTensorEP(h2f);
                addTensorOp(h2f_name, h2f);

                // Sixth and final line with tensor op end time
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double finish = std::stod(tokens[1]);

                // Subtract f2h's and h2f's time and energy to get just the
                // computation's info
                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);

                top.time_ -= (f2h.time_ + h2f.time_);
                top.energy_ -= (f2h.energy_ + h2f.energy_);
                top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
                top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
                if (top.time_ > 0.0) {
                    top.power_ = top.energy_ / top.time_;
                    top.gpu_power_ = top.gpu_energy_ / top.time_;
                    top.ddr_power_ = top.ddr_energy_ / top.time_;
                }

                addTensorOp(op_name, top);
            }
        }
        ifs.close();
    }

    void updateStats() {
        updatePerOpStats();
        updateTotalStats();
    }

    // Calculates the average and standard deviation of each metric of each
    // tensor op
    void calculateAggregateStats() {
        for (auto it = tensor_info_.begin(); it != tensor_info_.end(); it++) {
            AggTensorInfo ati;
            ati.name_ = it->first;
            // const& — the original copied the whole per-op vector here
            const auto &topv = it->second;

            double total_time = 0.0;
            double total_energy = 0.0;
            double total_gpu_energy = 0.0;
            double total_ddr_energy = 0.0;
            double total_power = 0.0;
            double total_gpu_power = 0.0;
            double total_ddr_power = 0.0;

            double time_sum = 0.0;
            double energy_sum = 0.0;
            double gpu_energy_sum = 0.0;
            double ddr_energy_sum = 0.0;
            double power_sum = 0.0;
            double gpu_power_sum = 0.0;
            double ddr_power_sum = 0.0;

            // Calculate average
            for (const auto &top : topv) {
                total_time += top.time_;
                total_energy += top.energy_;
                total_gpu_energy += top.gpu_energy_;
                total_ddr_energy += top.ddr_energy_;
                total_power += top.power_;
                total_gpu_power += top.gpu_power_;
                total_ddr_power += top.ddr_power_;
            }

            ati.average_time_ = total_time / iterations_;
            ati.average_energy_ = total_energy / iterations_;
            ati.average_gpu_energy_ = total_gpu_energy / iterations_;
            ati.average_ddr_energy_ = total_ddr_energy / iterations_;
            ati.average_power_ = total_power / iterations_;
            ati.average_gpu_power_ = total_gpu_power / iterations_;
            ati.average_ddr_power_ = total_ddr_power / iterations_;

            // Calculate standard deviation
            for (const auto &top : topv) {
                const auto time_diff = top.time_ - ati.average_time_;
                time_sum += time_diff * time_diff;

                const auto energy_diff = top.energy_ - ati.average_energy_;
                energy_sum += energy_diff * energy_diff;
                const auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
                gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
                const auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
                ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;

                const auto power_diff = top.power_ - ati.average_power_;
                power_sum += power_diff * power_diff;
                const auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
                gpu_power_sum += gpu_power_diff * gpu_power_diff;
                const auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
                ddr_power_sum += ddr_power_diff * ddr_power_diff;
            }

            ati.time_std_ = std::sqrt(time_sum / iterations_);
            ati.energy_std_ = std::sqrt(energy_sum / iterations_);
            ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
            ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
            ati.power_std_ = std::sqrt(power_sum / iterations_);
            ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
            ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);

            agg_tensor_info_.push_back(ati);
        }
    }

public:
    Profiler() {
        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
        soc_stream_.open(soc_power_rail, std::ifstream::in);
        sys_stream_.open(sys_power_rail, std::ifstream::in);

        if (!cpu_stream_.is_open() || !gpu_stream_.is_open() || !ddr_stream_.is_open()
                || !soc_stream_.is_open() || !sys_stream_.is_open()) {
            std::cout << "Failed to open one of the power rails for reading\n";
            exit(1);
        }
    }

    ~Profiler() {
        cpu_stream_.close();
        gpu_stream_.close();
        ddr_stream_.close();
        soc_stream_.close();
        sys_stream_.close();
    }

    // Runs `program` `iterations` times, sampling power in a second pinned
    // thread, and accumulates per-op and total statistics.
    void profile(const char * const program, const int iterations) {
        iterations_ = iterations;
        resetGlobal();

        for (unsigned i = 0; i < iterations_; i++) {
            resetLocal();

            // Launch two threads: one for running the program and one for
            // profiling it. Pin the threads to specific cores to remove
            // migration overhead. Profiling showed that the sampling rate
            // increases slightly with pinning.
            std::thread prog(&Profiler::runProgram, this, program);
            std::thread power(&Profiler::recordPower, this);
            pinThread(prog, core1);
            pinThread(power, core2);
            prog.join();
            power.join();

            updateStats();

            // Sleep for some time to bring the GPU back to idle
            std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
        }

        calculateAggregateStats();
    }

    // Writes per-op averages as CSV. Most columns are intentionally disabled
    // (commented out) but kept for easy re-enabling.
    void dumpTensorInfo(const char * const filename) const {
        const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
        std::ofstream ofs;
        ofs.open(filename);
        //ofs << header;
        for (const auto &ati : agg_tensor_info_) {
            ofs << ati.name_
                << "," << ati.average_time_ * 1e3
                << "," << ati.average_energy_
                /*
                << "," << ati.average_gpu_energy_
                << "," << ati.average_ddr_energy_
                << "," << ati.average_power_
                << "," << ati.average_gpu_power_
                << "," << ati.average_ddr_power_
                << "," << ati.time_std_ * 1e3
                << "," << ati.energy_std_
                << "," << ati.gpu_energy_std_
                << "," << ati.ddr_energy_std_
                << "," << ati.power_std_
                << "," << ati.gpu_power_std_
                << "," << ati.ddr_power_std_*/
                << "\n";

            std::cout << ati.average_time_ * 1e3 << "," << ati.average_energy_ << "\n";
        }
        ofs.close();
    }

    // Writes the raw per-sample GPU/DDR power readings (t=0 at run start).
    void dumpPowerReadings(const char * const filename) const {
        std::ofstream ofs;
        ofs.open(filename);
        for (const auto &reading : power_readings_) {
            std::chrono::duration<double> duration = reading.time_ - start_time_;
            //std::chrono::duration<double> duration = reading.time_.time_since_epoch();
            ofs << std::to_string(duration.count())
                << " " << reading.gpu_
                << " " << reading.ddr_
                << "\n";
        }
        ofs.close();
    }

    // Prints per-iteration averages of total time, energy, and power.
    void dumpTotalInfo() const {
        const auto total_time = total_info_.time_ / iterations_;

        const auto total_energy = total_info_.energy_ / iterations_;
        const auto gpu_energy = total_info_.gpu_energy_ / iterations_;
        const auto ddr_energy = total_info_.ddr_energy_ / iterations_;

        const auto power = total_info_.power_ / iterations_;
        const auto gpu_power = total_info_.gpu_power_ / iterations_;
        const auto ddr_power = total_info_.ddr_power_ / iterations_;

        std::cout << "-----------------------------------------------------\n";
        std::cout << "Program info (average)\n";
        std::cout << "-----------------------------------------------------\n";
        std::cout << "\tExecution time: " << total_time << " seconds\n";
        std::cout << "\tTotal energy: " << total_energy << " mJ\n";
        std::cout << "\t    GPU: " << gpu_energy << " mJ\n";
        std::cout << "\t    DDR: " << ddr_energy << " mJ\n";
        std::cout << "\tPower: " << power << " mW\n";
        std::cout << "\t    GPU: " << gpu_power << " mW\n";
        std::cout << "\t    DDR: " << ddr_power << " mW\n";
        std::cout << "-----------------------------------------------------\n";
    }
};

int main(int argc, char *argv[]) {
    if (argc < NUM_ARGS) {
        std::cout << "Usage: " << argv[0] << " <program> <iterations> <tensor output file> [power output file]\n";
        exit(1);
    }

    Profiler pp;
    pp.profile(argv[1], std::stoi(argv[2]));
    pp.dumpTensorInfo(argv[3]);

    if (argc > NUM_ARGS)
        pp.dumpPowerReadings(argv[4]);

    return 0;
}
should_stop_profiler_(false) { + // Open all streams. Not done in start_profiler() function bc the streams + // should be strictly opened once + cpu_stream_.open(cpu_power_rail, std::ifstream::in); + gpu_stream_.open(gpu_power_rail, std::ifstream::in); + ddr_stream_.open(ddr_power_rail, std::ifstream::in); + soc_stream_.open(soc_power_rail, std::ifstream::in); + sys_stream_.open(sys_power_rail, std::ifstream::in); + + if (!cpu_stream_.is_open() || !gpu_stream_.is_open() || !ddr_stream_.is_open() + || !soc_stream_.is_open() || !sys_stream_.is_open()) { + std::cout << "Failed to open one of the power rails for reading\n"; + exit(1); + } +} + +Profiler::~Profiler() { + cpu_stream_.close(); + gpu_stream_.close(); + ddr_stream_.close(); + soc_stream_.close(); + sys_stream_.close(); +} + +// Reinitializes boolean vars used for control flow and launches the profiler +// thread. DOES NOT reset other internal data structures. +void Profiler::start_profiler(){ + // Reinitialize in case the profiler object has been used before + should_run_profiler_ = false; + should_stop_profiler_ = false; + + // Launch profiler thread + profiler_thread_ = std::thread(&Profiler::run_profiler, this); + pin_thread(profiler_thread_, core1); +} + +// Resumes the profiling of whatever executable's currently running +// DOES NOT reset any data +void Profiler::resume_profiler() { + { + std::unique_lock<std::mutex> mutex_lock(mutex_); + if (should_run_profiler_){ + std::cout << "WARNING: resume_profiler was already called\n"; + } + should_run_profiler_ = true; + start_time_ = std::chrono::high_resolution_clock::now(); + } + cond_var_.notify_one(); +} + +// Stops profiler by putting profiler thread to sleep +void Profiler::pause_profiler() { + { + std::unique_lock<std::mutex> mutex_lock(mutex_); + if (!should_run_profiler_){ + std::cout << "WARNING: pause_profiler was already called\n"; + } + should_run_profiler_ = false; + } + cond_var_.notify_one(); +} + +// Gets the delta time and total GPU and 
DDR energy between the last two +// calls to resume_profiler and pause_profiler +// +// Returns this as a pair of <delta time in milliseconds, energy> +std::pair<double, double> Profiler::get_time_energy() const { + double total_energy = 0.0; + + std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_; + for (auto reading : power_readings_) { + std::chrono::duration<double> duration = reading.time_ - prev_time; + total_energy += reading.gpu_ * duration.count(); + total_energy += reading.ddr_ * duration.count(); + prev_time = reading.time_; + } + double delta_time = std::chrono::duration<double, std::milli>(prev_time + - start_time_).count(); + return std::make_pair(delta_time, total_energy); +} + +// Resets all internal data structures, including the vector storing all power_readings. +void Profiler::reset() { + should_stop_profiler_ = false; // Can call reset after calling pause_profiler() + should_run_profiler_ = false; // Can call reset after calling resume + power_readings_.clear(); +} + +// Exit the profiler and kill the thread +// Must call start_profiler() to reuse this object after calling pause_profiler() +void Profiler::stop_profiler() { + std::cout << "Exiting profiler\n"; + should_stop_profiler_ = true; + cond_var_.notify_one(); + profiler_thread_.join(); +} + +// Obtain's a single power reading from the GPU and DDR rails +void Profiler::obtain_power_reading() { + PowerReading reading; + + // The order matters here. All the reads have to happen together first + // and then all the seeks have to happen together at the end, otherwise + // there will be a significant time difference between the readings of + // the different rails. 
+ reading.time_ = std::chrono::high_resolution_clock::now(); + gpu_stream_ >> reading.gpu_; + ddr_stream_ >> reading.ddr_; + power_readings_.push_back(reading); + + // Reset the input position of the files + gpu_stream_.seekg(0); + ddr_stream_.seekg(0); +} + +// Pins the given thread to the specified core +void Profiler::pin_thread(std::thread &t, const unsigned core) const { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core, &cpuset); + if (pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset) != 0) + std::cout << "Couldn't set thread affinity\n"; +} + +// Runs the profiler thread, keeping it alive by wrapping the functionality +// in an infinite loop +void Profiler::run_profiler(){ + while (true){ + if (should_stop_profiler_) { + break; + } + // Need to lock the mutex and check the condition var + { + std::unique_lock<std::mutex> mutex_lock(mutex_); + if (should_stop_profiler_) { + break; + } + // Wake the thread up when it's time to run the profiler or exit + // the profiler + cond_var_.wait(mutex_lock, [this]{return should_run_profiler_ + || should_stop_profiler_; }); + } + if (should_stop_profiler_) { + break; + } + obtain_power_reading(); + } +} + +/* +// TESTS +void resume_pause_profiler(Profiler& profile_wrapper, unsigned long sleep_millis){ + profile_wrapper.resume_profiler(); + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis)); + profile_wrapper.pause_profiler(); + + auto time_energy_pair = profile_wrapper.get_time_energy(); + profile_wrapper.reset(); + + printf("time: %f, energy: %f\n", time_energy_pair.first, time_energy_pair.second); + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis)); +} + +int main(){ + Profiler profile_wrapper; + profile_wrapper.start_profiler(); + + unsigned long sleep_millis = 500; + resume_pause_profiler(profile_wrapper, sleep_millis); + resume_pause_profiler(profile_wrapper, sleep_millis); + resume_pause_profiler(profile_wrapper, sleep_millis); + 
resume_pause_profiler(profile_wrapper, sleep_millis); + + // IMPORTANT + profile_wrapper.stop_profiler(); + return 0; +} +*/