Skip to content
Snippets Groups Projects
Commit a1b00e80 authored by Elizabeth's avatar Elizabeth
Browse files

Separated .cpp and .h files into different dirs

parent 9e890e90
No related branches found
No related tags found
No related merge requests found
# Build the GPU profiler as a static library.
# (The scraped diff duplicated each line old/new; this is the post-commit version:
# sources live under src/, headers under include/.)
cmake_minimum_required(VERSION 3.5)

set(libsrc src/profiler.cpp)
set(CMAKE_CXX_STANDARD 11)

add_library(gpu_profiler STATIC ${libsrc})
target_include_directories(gpu_profiler PRIVATE include)
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <utility>
#include <vector>
// Reads power rails at runtime and computes the GPU and DDR energy within a window
// of time, which is delimited by the calls to resume_profiler() and pause_profiler()
//
// IMPORTANT: Must call stop_profiler() to kill the profiler thread
//
// Public interface methods:
// void start_profiler();
// void resume_profiler();
// void pause_profiler();
// std::pair<double, double> get_time_energy() const;
// void reset();
// void stop_profiler();
// Samples the GPU and DDR power rails on a dedicated background thread and
// integrates the samples into an energy figure over a resume/pause window.
//
// Lifecycle: construct -> start_profiler() -> (resume_profiler() ...
// pause_profiler())* -> stop_profiler(). stop_profiler() joins the thread;
// the object can be reused afterwards via start_profiler().
class Profiler {
public:
    Profiler();
    ~Profiler();

    // Reinitializes boolean vars used for control flow and launches the profiler
    // thread. DOES NOT reset other internal data structures.
    void start_profiler();

    // Resumes the profiling of whatever executable's currently running.
    // DOES NOT reset any data.
    void resume_profiler();

    // Stops profiling by putting the profiler thread to sleep.
    void pause_profiler();

    // Gets the delta time and total GPU and DDR energy between the last two
    // calls to resume_profiler and pause_profiler.
    //
    // Returns this as a pair of <delta time in milliseconds, energy>
    std::pair<double, double> get_time_energy() const;

    // Resets all internal data structures, including the vector storing all
    // power readings.
    void reset();

    // Exits the profiler and joins its thread.
    // Must call start_profiler() to reuse this object afterwards.
    void stop_profiler();

private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;
    const unsigned core2 = 4;
    const unsigned core3 = 5;

    // Power rails are mounted as files. Keeping the old power rail file names
    // for possible future integrations.
    const std::string cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const std::string gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const std::string ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const std::string soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const std::string sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // An individual power reading: a timestamp plus one value per rail.
    struct PowerReading {
        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
        double cpu_;
        double gpu_;
        double ddr_;
        double soc_;
        double sys_;
    };

    // Stores all power readings and is cleared only when reset() is called
    std::vector<PowerReading> power_readings_;

    // Timestamp of the last resume_profiler() call; basis for get_time_energy().
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // For reading the i2c buses via sysfs (opened once in the constructor)
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    // Guard + signal for the run/stop flags below.
    std::mutex mutex_;
    std::condition_variable cond_var_;
    bool should_run_profiler_; // True if we want to resume the profiling thread
    std::atomic_bool should_stop_profiler_; // Quit profiling
    std::thread profiler_thread_;

    // Obtains a single power reading from the GPU and DDR rails
    void obtain_power_reading();

    // Pins the given thread to the specified core
    void pin_thread(std::thread &t, const unsigned core) const;

    // Runs the profiler thread, keeping it alive by wrapping the functionality
    // in an infinite loop
    void run_profiler();
};
#include <cmath>
#include <chrono>
#include <iostream>
#include <fstream>
#include <string>
#include <boost/algorithm/string.hpp>
#include <vector>
#include <map>
#include <thread>
#include <atomic>
#include <sched.h>
#define NUM_ARGS 4
// This is a simple power profiler that can sample the power of the various
// components in a Jetson TX2. The usage is simple: profile() measures power
// for the specified program, and then the dump* methods print the readings to a
// file. profile() can be called as many times as desired - the internal state
// is reset each time and thus the measurements are not cumulative.
class Profiler {
private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;
    const unsigned core2 = 4;
    const unsigned core3 = 5;

    // sysfs paths for i2c buses of various components
    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // It takes some time for the GPU's power to return to idle (ms).
    // NOTE(review): currently 0, i.e. the idle wait is effectively disabled.
    const unsigned gpu_idle_time = 0;

    // An individual power reading: a timestamp plus one value per rail.
    struct PowerReading {
        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
        double cpu_;
        double gpu_;
        double ddr_;
        double soc_;
        double sys_;
    };

    // One profiled tensor operation: its window and derived energy/power.
    struct TensorOp {
        std::string name_;
        double start_;
        double finish_;
        double time_;
        double energy_;
        double gpu_energy_;
        double ddr_energy_;
        double power_;
        double gpu_power_;
        double ddr_power_;

        // Energy/power fields start at zero and are filled in later by
        // calculateTensorEP().
        TensorOp(std::string name, double start, double finish)
        : name_(name), start_(start), finish_(finish), time_(finish - start),
        energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
        power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {
        }
    };

    // Aggregate (mean and standard deviation across runs) info for one op.
    struct AggTensorInfo {
        // Op name
        std::string name_;
        // Averages
        double average_time_;
        double average_energy_;
        double average_gpu_energy_;
        double average_ddr_energy_;
        double average_power_;
        double average_gpu_power_;
        double average_ddr_power_;
        // Standard deviations
        double time_std_;
        double energy_std_;
        double gpu_energy_std_;
        double ddr_energy_std_;
        double power_std_;
        double gpu_power_std_;
        double ddr_power_std_;
    };

    // Total time, energy, and power accumulated across all iterations.
    struct TotalInfo {
        double time_;
        double energy_;
        double gpu_energy_;
        double ddr_energy_;
        double power_;
        double gpu_power_;
        double ddr_power_;
        void clear() {
            time_ = 0.0;
            energy_ = 0.0;
            gpu_energy_ = 0.0;
            ddr_energy_ = 0.0;
            power_ = 0.0;
            gpu_power_ = 0.0;
            ddr_power_ = 0.0;
        }
    };

    // For reading the i2c buses via sysfs
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    // Start time (so graph begins from t=0)
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // Per-run info (cleared by resetLocal() before each iteration)
    std::vector<PowerReading> power_readings_;

    // Aggregate (across all runs) info
    std::map<std::string, std::vector<TensorOp>> tensor_info_;
    std::vector<AggTensorInfo> agg_tensor_info_;
    TotalInfo total_info_;
    unsigned iterations_;

    // Start and stop flags to synchronize the program and profiling threads
    std::atomic_bool start_;
    std::atomic_bool stop_;

private:
// Clears the cross-run aggregates: per-op samples, the aggregated per-op
// statistics, and the program-wide running totals.
void resetGlobal() {
    tensor_info_.clear();
    agg_tensor_info_.clear();
    total_info_.clear();
}
// Drops the current iteration's samples and lowers both thread-sync flags
// so the next iteration starts from a clean slate.
void resetLocal() {
    power_readings_.clear();
    stop_ = false;
    start_ = false;
}
// Restricts thread t to run only on the given physical core; failure is
// reported but non-fatal (profiling still works, just with more jitter).
void pinThread(std::thread &t, const unsigned core) const {
    cpu_set_t allowed_cores;
    CPU_ZERO(&allowed_cores);
    CPU_SET(core, &allowed_cores);
    const int rc = pthread_setaffinity_np(t.native_handle(),
                                          sizeof(cpu_set_t), &allowed_cores);
    if (rc != 0)
        std::cout << "Couldn't set thread affinity\n";
}
// Appends a completed tensor-op sample to the per-op history.
//
// op_name: key identifying the tensor operation
// top:     the populated sample to record
void addTensorOp(std::string &op_name, TensorOp &top) {
    // std::map::operator[] value-initializes an empty vector on first access,
    // so the original's explicit find + insert was redundant.
    tensor_info_[op_name].push_back(top);
}
// Obtain's a single power reading from the GPU and DDR rails
void getPowerReading() {
PowerReading reading;
// The order matters here. All the reads have to happen together first
// and then all the seeks have to happen together at the end, otherwise
// there will be a significant time difference between the readings of
// the different rails.
reading.time_ = std::chrono::high_resolution_clock::now();
gpu_stream_ >> reading.gpu_;
ddr_stream_ >> reading.ddr_;
power_readings_.push_back(reading);
// Reset the input position of the files
gpu_stream_.seekg(0);
ddr_stream_.seekg(0);
}
// Executes the program to be profiled, raising start_ just before and stop_
// just after so the sampling thread brackets the run.
//
// program: shell command line handed to std::system()
void runProgram(const char * const program) {
    start_ = true;
    const auto result = std::system(program);
    stop_ = true;
    // The original discarded the exit status entirely; surface a failure so
    // the user knows this iteration's measurements are suspect.
    if (result != 0)
        std::cout << "WARNING: profiled program exited with status "
                  << result << "\n";
}
// Records power while the program is running: spin-waits for runProgram() to
// raise start_, then samples continuously until stop_ goes up.
//
// The busy-wait is deliberate — a condvar handoff would add latency at the
// start of the measured window.
// NOTE(review): start_time_ is captured BEFORE the start signal arrives, so
// the spin-wait interval is included in the run's time base — confirm intended.
void recordPower() {
    start_time_ = std::chrono::high_resolution_clock::now();
    while (!start_);
    while (!stop_)
        getPowerReading();
}
// Folds the run's power samples into the whole-run time/energy/power totals.
//
// Energy is accumulated by rectangular integration: each sample's power is
// weighted by the time elapsed since the previous sample.
void updateTotalStats() {
    double gpu_energy = 0.0;
    double ddr_energy = 0.0;
    auto prev_time = start_time_;
    // const ref: the original copied a PowerReading per iteration.
    for (const auto &reading : power_readings_) {
        const std::chrono::duration<double> duration = reading.time_ - prev_time;
        gpu_energy += reading.gpu_ * duration.count();
        ddr_energy += reading.ddr_ * duration.count();
        prev_time = reading.time_;
    }
    const double energy = gpu_energy + ddr_energy;
    const auto time = std::chrono::duration<double>(prev_time - start_time_).count();
    total_info_.time_ += time;
    total_info_.energy_ += energy;
    total_info_.gpu_energy_ += gpu_energy;
    total_info_.ddr_energy_ += ddr_energy;
    total_info_.power_ += (energy / time);
    total_info_.gpu_power_ += (gpu_energy / time);
    total_info_.ddr_power_ += (ddr_energy / time);
}
// Integrates GPU/DDR energy for one tensor op over [top.start_, top.finish_]
// (epoch seconds) and derives its average power figures.
//
// Fixes two out-of-bounds reads in the original: power_readings_[i] was
// dereferenced before the i < size() check in the main loop, and the initial
// skip loop had no bound at all.
void calculateTensorEP(TensorOp &top) const {
    auto prev_time = top.start_;
    std::size_t i = 0;
    // Skip samples taken before the op started.
    while (i < power_readings_.size()
           && std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count() < top.start_)
        i++;
    // Accumulate until the op finishes or we run out of samples.
    for (; i < power_readings_.size(); i++) {
        const double curr_time =
            std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count();
        if (curr_time > top.finish_)
            break;
        const auto duration = curr_time - prev_time;
        prev_time = curr_time;
        top.gpu_energy_ += power_readings_[i].gpu_ * duration;
        top.ddr_energy_ += power_readings_[i].ddr_ * duration;
    }
    top.energy_ = top.gpu_energy_ + top.ddr_energy_;
    top.power_ = top.energy_ / top.time_;
    top.gpu_power_ = top.gpu_energy_ / top.time_;
    top.ddr_power_ = top.ddr_energy_ / top.time_;
}
// Calculates stats for all the tensors in the timestamp file.
//
// Parses "profile_data.txt" (tab-separated "<name>\t<timestamp>" lines) and,
// for each op, builds a TensorOp, integrates its energy over the recorded
// power samples, and files it under its name.
void updatePerOpStats() {
    const char * const op_file = "profile_data.txt";
    std::string line;
    std::ifstream ifs(op_file, std::ios::in);
    // Calculate time and energy for each tensor operation. There are two
    // possibilities for the file format:
    // If the line doesn't begin with #, we are looking at FP32 code
    // without any conversions to/from FP16, and each operation occupies
    // two consecutive lines in the timestamp file.
    // If the line does begin with #, we are looking at FP16 code with
    // conversion routines in the middle. In this case, *after* the current
    // line, there will be two lines for F2H, two lines for H2F, and then
    // one line for the end of the operation.
    while (std::getline(ifs, line)) {
        std::vector<std::string> tokens;
        boost::split(tokens, line, boost::is_any_of("\t"));
        std::string op_name = tokens[0];
        // FP32
        if (tokens[0][0] != '#') {
            // First line with tensor op name and start time
            // (shadows the op_name above; both hold the same value here)
            std::string op_name = tokens[0];
            const auto start = std::stod(tokens[1]);
            // Second line with tensor op end time
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of("\t"));
            const auto finish = std::stod(tokens[1]);
            TensorOp top(op_name, start, finish);
            calculateTensorEP(top);
            addTensorOp(op_name, top);
        } else {
            // First line with tensor op name (leading '#' stripped) and start time
            std::string op_name = tokens[0].substr(1);
            const auto start = std::stod(tokens[1]);
            // Second line with f2h
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of("\t"));
            std::string f2h_name = op_name + "_f2h";
            const auto f2h_start = std::stod(tokens[1]);
            // Third line with f2h
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of("\t"));
            const auto f2h_finish = std::stod(tokens[1]);
            // Add f2h as its own pseudo-op
            TensorOp f2h(f2h_name, f2h_start, f2h_finish);
            calculateTensorEP(f2h);
            addTensorOp(f2h_name, f2h);
            // Fourth line with h2f
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of("\t"));
            std::string h2f_name = op_name + "_h2f";
            const auto h2f_start = std::stod(tokens[1]);
            // Fifth line with h2f
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of("\t"));
            const auto h2f_finish = std::stod(tokens[1]);
            // Add h2f as its own pseudo-op
            TensorOp h2f(h2f_name, h2f_start, h2f_finish);
            calculateTensorEP(h2f);
            addTensorOp(h2f_name, h2f);
            // Sixth and final line with tensor op end time
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of("\t"));
            const auto finish = std::stod(tokens[1]);
            // Subtract f2h's and h2f's time and energy to get just the
            // computation's info (the op window spans both conversions)
            TensorOp top(op_name, start, finish);
            calculateTensorEP(top);
            top.time_ -= (f2h.time_ + h2f.time_);
            top.energy_ -= (f2h.energy_ + h2f.energy_);
            top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
            top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
            top.power_ = top.energy_ / top.time_;
            top.gpu_power_ = top.gpu_energy_ / top.time_;
            top.ddr_power_ = top.ddr_energy_ / top.time_;
            addTensorOp(op_name, top);
        }
    }
    ifs.close();
}
// Folds the just-finished run into both the per-op and whole-run statistics.
void updateStats() {
    updatePerOpStats();
    updateTotalStats();
}
// Computes, per tensor op, the mean and (population) standard deviation of
// every recorded metric across the profiled iterations.
//
// NOTE(review): divides by iterations_ rather than the sample count; the two
// only agree when each op appears exactly once per iteration — confirm.
void calculateAggregateStats() {
    // const ref on the map entry and on the vector: the original copied the
    // whole per-op sample vector (`auto topv = it->second;`) for every op.
    for (const auto &entry : tensor_info_) {
        const auto &topv = entry.second;
        AggTensorInfo ati;
        ati.name_ = entry.first;
        double total_time = 0.0;
        double total_energy = 0.0;
        double total_gpu_energy = 0.0;
        double total_ddr_energy = 0.0;
        double total_power = 0.0;
        double total_gpu_power = 0.0;
        double total_ddr_power = 0.0;
        double time_sum = 0.0;
        double energy_sum = 0.0;
        double gpu_energy_sum = 0.0;
        double ddr_energy_sum = 0.0;
        double power_sum = 0.0;
        double gpu_power_sum = 0.0;
        double ddr_power_sum = 0.0;
        // First pass: sums for the averages
        for (const auto &top : topv) {
            total_time += top.time_;
            total_energy += top.energy_;
            total_gpu_energy += top.gpu_energy_;
            total_ddr_energy += top.ddr_energy_;
            total_power += top.power_;
            total_gpu_power += top.gpu_power_;
            total_ddr_power += top.ddr_power_;
        }
        ati.average_time_ = total_time / iterations_;
        ati.average_energy_ = total_energy / iterations_;
        ati.average_gpu_energy_ = total_gpu_energy / iterations_;
        ati.average_ddr_energy_ = total_ddr_energy / iterations_;
        ati.average_power_ = total_power / iterations_;
        ati.average_gpu_power_ = total_gpu_power / iterations_;
        ati.average_ddr_power_ = total_ddr_power / iterations_;
        // Second pass: sums of squared deviations for the standard deviations
        for (const auto &top : topv) {
            const auto time_diff = top.time_ - ati.average_time_;
            time_sum += time_diff * time_diff;
            const auto energy_diff = top.energy_ - ati.average_energy_;
            energy_sum += energy_diff * energy_diff;
            const auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
            gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
            const auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
            ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;
            const auto power_diff = top.power_ - ati.average_power_;
            power_sum += power_diff * power_diff;
            const auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
            gpu_power_sum += gpu_power_diff * gpu_power_diff;
            const auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
            ddr_power_sum += ddr_power_diff * ddr_power_diff;
        }
        ati.time_std_ = std::sqrt(time_sum / iterations_);
        ati.energy_std_ = std::sqrt(energy_sum / iterations_);
        ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
        ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
        ati.power_std_ = std::sqrt(power_sum / iterations_);
        ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
        ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);
        agg_tensor_info_.push_back(ati);
    }
}
public:
    // Opens all five rail streams for the lifetime of the object; aborts the
    // process if any rail is unavailable, since every later sample would
    // silently read garbage.
    Profiler() {
        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
        soc_stream_.open(soc_power_rail, std::ifstream::in);
        sys_stream_.open(sys_power_rail, std::ifstream::in);
        const bool all_open = cpu_stream_.is_open() && gpu_stream_.is_open()
            && ddr_stream_.is_open() && soc_stream_.is_open()
            && sys_stream_.is_open();
        if (!all_open) {
            std::cout << "Failed to open one of the power rails for reading\n";
            exit(1);
        }
    }
// Closes every rail stream opened by the constructor.
~Profiler() {
    sys_stream_.close();
    soc_stream_.close();
    ddr_stream_.close();
    gpu_stream_.close();
    cpu_stream_.close();
}
// Profiles `program` end-to-end `iterations` times, accumulating per-op and
// whole-run statistics, then computes the cross-iteration aggregates.
//
// program:    shell command line to execute and measure
// iterations: number of runs to average over
void profile(const char * const program, const int iterations) {
    iterations_ = iterations;
    resetGlobal();
    for (unsigned i = 0; i < iterations_; i++) {
        resetLocal();
        // Launch two threads: one for running the program and one for
        // profiling it. Pin the threads to specific cores to remove migration
        // overhead. Profiling showed that the sampling rate increases slightly
        // with pinning.
        std::thread prog(&Profiler::runProgram, this, program);
        std::thread power(&Profiler::recordPower, this);
        pinThread(prog, core1);
        pinThread(power, core2);
        prog.join();
        power.join();
        updateStats();
        // Sleep for some time to bring the GPU back to idle
        // (gpu_idle_time is currently 0, so this is a no-op).
        std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
    }
    calculateAggregateStats();
}
// Writes one CSV row per op to `filename`: name, average time (ms), average
// energy (mJ). The full header and the remaining columns are deliberately
// commented out — re-enable them together if the extra metrics are needed.
void dumpTensorInfo(const char * const filename) const {
    const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
    std::ofstream ofs;
    ofs.open(filename);
    //ofs << header;
    for (const auto &ati : agg_tensor_info_) {
        ofs << ati.name_
            << "," << ati.average_time_ * 1e3
            << "," << ati.average_energy_
            /*
            << "," << ati.average_gpu_energy_
            << "," << ati.average_ddr_energy_
            << "," << ati.average_power_
            << "," << ati.average_gpu_power_
            << "," << ati.average_ddr_power_
            << "," << ati.time_std_ * 1e3
            << "," << ati.energy_std_
            << "," << ati.gpu_energy_std_
            << "," << ati.ddr_energy_std_
            << "," << ati.power_std_
            << "," << ati.gpu_power_std_
            << "," << ati.ddr_power_std_*/
            << "\n";
        // Mirror each row to stdout for quick inspection.
        std::cout << ati.average_time_ * 1e3 << "," << ati.average_energy_ << "\n";
    }
    ofs.close();
}
// Writes one "<seconds since start> <gpu> <ddr>" line per recorded sample to
// the given file.
void dumpPowerReadings(const char * const filename) const {
    std::ofstream out;
    out.open(filename);
    for (const auto &sample : power_readings_) {
        const std::chrono::duration<double> elapsed = sample.time_ - start_time_;
        out << std::to_string(elapsed.count())
            << " " << sample.gpu_
            << " " << sample.ddr_
            << "\n";
    }
    out.close();
}
// Prints the per-iteration averages of whole-run time, energy, and power to
// stdout in a human-readable report.
void dumpTotalInfo() const {
    const double n = iterations_;
    const char * const rule =
        "-----------------------------------------------------\n";
    std::cout << rule
              << "Program info (average)\n"
              << rule
              << "\tExecution time: " << total_info_.time_ / n << " seconds\n"
              << "\tTotal energy: " << total_info_.energy_ / n << " mJ\n"
              << "\t GPU: " << total_info_.gpu_energy_ / n << " mJ\n"
              << "\t DDR: " << total_info_.ddr_energy_ / n << " mJ\n"
              << "\tPower: " << total_info_.power_ / n << " mW\n"
              << "\t GPU: " << total_info_.gpu_power_ / n << " mW\n"
              << "\t DDR: " << total_info_.ddr_power_ / n << " mW\n"
              << rule;
}
};
// Entry point: profiler <program> <iterations> <tensor output file>
//              [power output file]
int main(int argc, char *argv[]) {
    if (argc < NUM_ARGS) {
        std::cout << "Usage: " << argv[0] << " <program> <iterations> <tensor output file> [power output file]\n";
        exit(1);
    }
    // The original passed argv[2] straight to std::stoi, which throws an
    // uncaught exception on non-numeric input; validate it here instead.
    int iterations = 0;
    try {
        iterations = std::stoi(argv[2]);
    } catch (const std::exception &) {
        std::cout << "Invalid iteration count: " << argv[2] << "\n";
        exit(1);
    }
    if (iterations <= 0) {
        std::cout << "Iteration count must be a positive integer\n";
        exit(1);
    }
    Profiler pp;
    pp.profile(argv[1], iterations);
    pp.dumpTensorInfo(argv[3]);
    if (argc > NUM_ARGS)
        pp.dumpPowerReadings(argv[4]);
    return 0;
}
#include "profiler.h"
// Opens the sysfs power-rail streams once for the object's lifetime and
// initializes both control flags to "not running". Aborts the process if any
// rail is unavailable — every later sample would silently read garbage.
Profiler::Profiler() : should_run_profiler_(false), should_stop_profiler_(false) {
    cpu_stream_.open(cpu_power_rail, std::ifstream::in);
    gpu_stream_.open(gpu_power_rail, std::ifstream::in);
    ddr_stream_.open(ddr_power_rail, std::ifstream::in);
    soc_stream_.open(soc_power_rail, std::ifstream::in);
    sys_stream_.open(sys_power_rail, std::ifstream::in);
    const bool all_open = cpu_stream_.is_open() && gpu_stream_.is_open()
        && ddr_stream_.is_open() && soc_stream_.is_open()
        && sys_stream_.is_open();
    if (!all_open) {
        std::cout << "Failed to open one of the power rails for reading\n";
        exit(1);
    }
}
// Closes every rail stream opened by the constructor.
Profiler::~Profiler() {
    sys_stream_.close();
    soc_stream_.close();
    ddr_stream_.close();
    gpu_stream_.close();
    cpu_stream_.close();
}
// Reinitializes boolean vars used for control flow and launches the profiler
// thread. DOES NOT reset other internal data structures.
//
// NOTE(review): calling this while a previous thread is still joinable (i.e.
// without an intervening stop_profiler()) reassigns profiler_thread_ and
// would terminate the process — confirm callers always stop first.
void Profiler::start_profiler(){
    // Reinitialize in case the profiler object has been used before
    should_run_profiler_ = false;
    should_stop_profiler_ = false;
    // Launch profiler thread; pin it to a dedicated core to reduce jitter
    profiler_thread_ = std::thread(&Profiler::run_profiler, this);
    pin_thread(profiler_thread_, core1);
}
// Wakes the sampling thread and records the start of a new measurement
// window. Does not clear any previously collected data.
void Profiler::resume_profiler() {
    {
        // lock_guard suffices: the lock is never released early.
        std::lock_guard<std::mutex> guard(mutex_);
        if (should_run_profiler_)
            std::cout << "WARNING: resume_profiler was already called\n";
        should_run_profiler_ = true;
        start_time_ = std::chrono::high_resolution_clock::now();
    }
    cond_var_.notify_one();
}
// Puts the sampling thread back to sleep, ending the current measurement
// window.
void Profiler::pause_profiler() {
    {
        // lock_guard suffices: the lock is never released early.
        std::lock_guard<std::mutex> guard(mutex_);
        if (!should_run_profiler_)
            std::cout << "WARNING: pause_profiler was already called\n";
        should_run_profiler_ = false;
    }
    cond_var_.notify_one();
}
// Gets the delta time and total GPU and DDR energy between the last two
// calls to resume_profiler and pause_profiler, integrating power over the
// recorded samples.
//
// Returns a pair of <delta time in milliseconds, energy>.
// NOTE(review): reads power_readings_ without the mutex — callers must pause
// the profiler first, as the commented-out test harness does; confirm.
std::pair<double, double> Profiler::get_time_energy() const {
    double total_energy = 0.0;
    auto prev_time = start_time_;
    // const ref: the original copied a PowerReading per iteration.
    for (const auto &reading : power_readings_) {
        const std::chrono::duration<double> duration = reading.time_ - prev_time;
        total_energy += reading.gpu_ * duration.count();
        total_energy += reading.ddr_ * duration.count();
        prev_time = reading.time_;
    }
    const double delta_time =
        std::chrono::duration<double, std::milli>(prev_time - start_time_).count();
    return std::make_pair(delta_time, total_energy);
}
// Clears all collected samples and lowers both control flags so the object
// can be reused after a pause or a stop.
void Profiler::reset() {
    should_stop_profiler_ = false;
    should_run_profiler_ = false;
    power_readings_.clear();
}
// Exit the profiler and kill the thread
// Must call start_profiler() to reuse this object after calling pause_profiler()
void Profiler::stop_profiler() {
std::cout << "Exiting profiler\n";
should_stop_profiler_ = true;
cond_var_.notify_one();
profiler_thread_.join();
}
// Obtain's a single power reading from the GPU and DDR rails
void Profiler::obtain_power_reading() {
PowerReading reading;
// The order matters here. All the reads have to happen together first
// and then all the seeks have to happen together at the end, otherwise
// there will be a significant time difference between the readings of
// the different rails.
reading.time_ = std::chrono::high_resolution_clock::now();
gpu_stream_ >> reading.gpu_;
ddr_stream_ >> reading.ddr_;
power_readings_.push_back(reading);
// Reset the input position of the files
gpu_stream_.seekg(0);
ddr_stream_.seekg(0);
}
// Restricts thread t to run only on the given physical core; failure is
// reported but non-fatal.
void Profiler::pin_thread(std::thread &t, const unsigned core) const {
    cpu_set_t allowed_cores;
    CPU_ZERO(&allowed_cores);
    CPU_SET(core, &allowed_cores);
    const int rc = pthread_setaffinity_np(t.native_handle(),
                                          sizeof(cpu_set_t), &allowed_cores);
    if (rc != 0)
        std::cout << "Couldn't set thread affinity\n";
}
// Body of the sampling thread: sleep on the condition variable until resumed
// (or told to stop), then sample continuously while the run flag stays set.
//
// The original checked should_stop_profiler_ three times per iteration; one
// check after the wait is sufficient, since the wait predicate already wakes
// on either flag and the checks outside the lock were redundant reads of the
// same atomic.
void Profiler::run_profiler(){
    while (true) {
        {
            std::unique_lock<std::mutex> mutex_lock(mutex_);
            // Sleep until there is either work to do or a shutdown request.
            cond_var_.wait(mutex_lock, [this]{
                return should_run_profiler_ || should_stop_profiler_;
            });
        }
        if (should_stop_profiler_)
            break;
        obtain_power_reading();
    }
}
/*
// TESTS
void resume_pause_profiler(Profiler& profile_wrapper, unsigned long sleep_millis){
profile_wrapper.resume_profiler();
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis));
profile_wrapper.pause_profiler();
auto time_energy_pair = profile_wrapper.get_time_energy();
profile_wrapper.reset();
printf("time: %f, energy: %f\n", time_energy_pair.first, time_energy_pair.second);
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis));
}
int main(){
Profiler profile_wrapper;
profile_wrapper.start_profiler();
unsigned long sleep_millis = 500;
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
// IMPORTANT
profile_wrapper.stop_profiler();
return 0;
}
*/
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment