Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hpvm-release
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
llvm
hpvm-release
Commits
5febbf3a
"hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp" did not exist on "3a7d5153488f34646254bf2d0aaf729e5b79db56"
Commit
5febbf3a
authored
5 years ago
by
Elizabeth
Browse files
Options
Downloads
Patches
Plain Diff
Removed all fields besides total energy/time from output file
parent
dbfd8a93
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
llvm/projects/gpu_profiler/offline_profiler.cpp
+584
-0
584 additions, 0 deletions
llvm/projects/gpu_profiler/offline_profiler.cpp
with
584 additions
and
0 deletions
llvm/projects/gpu_profiler/offline_profiler.cpp
0 → 100644
+
584
−
0
View file @
5febbf3a
#include <sched.h>

#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

#include <boost/algorithm/string.hpp>

#define NUM_ARGS 4
// This is a simple power profiler that can sample the power of the various
// components in a Jetson TX2. The usage is simple: profile() measures power
// for the specified program, and then dumpOutput() prints the readings to a
// file. profile() can be called as many times as desired - the internal state
// is reset each time and thus the measurements are not cumulative.
class Profiler {
private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;  // used to pin the program-runner thread
    const unsigned core2 = 4;  // used to pin the power-sampling thread
    const unsigned core3 = 5;

    // sysfs paths for i2c buses of various components.
    // NOTE(review): these look like TX2 INA3221 power-monitor rails — the
    // device addresses (0-0040/0-0041) are board-specific; confirm per board.
    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // It takes some time for the GPU's power to return to idle (ms).
    // Currently 0, i.e. no cooldown between iterations.
    const unsigned gpu_idle_time = 0;
// An individual power reading: one sample of every monitored rail,
// tagged with the instant it was taken.
struct PowerReading {
    // Timestamp of the sample (used to integrate power into energy).
    std::chrono::time_point<std::chrono::high_resolution_clock> time_;
    double cpu_;  // CPU rail power at time_
    double gpu_;  // GPU rail power at time_
    double ddr_;  // DDR rail power at time_
    double soc_;  // SoC rail power at time_
    double sys_;  // system (board) rail power at time_
};
// Individual tensor op: one timed occurrence of a named tensor operation,
// together with the energy/power later attributed to it by calculateTensorEP().
struct TensorOp {
    std::string name_;    // operation name from the timestamp file
    double start_;        // start timestamp (seconds)
    double finish_;       // finish timestamp (seconds)
    double time_;         // duration, finish_ - start_

    // Energy/power fields start at zero and are filled in by
    // calculateTensorEP() after the power trace is available.
    double energy_;       // total energy (GPU + DDR)
    double gpu_energy_;
    double ddr_energy_;
    double power_;        // average power over the op's duration
    double gpu_power_;
    double ddr_power_;

    TensorOp(std::string name, double start, double finish)
        : name_(name), start_(start), finish_(finish), time_(finish - start),
          energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
          power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {}
};
// Aggregate tensor info: per-op statistics accumulated across all profiling
// iterations (filled in by calculateAggregateStats()).
struct AggTensorInfo {
    // Op name
    std::string name_;

    // Averages (totals divided by the iteration count)
    double average_time_;
    double average_energy_;
    double average_gpu_energy_;
    double average_ddr_energy_;
    double average_power_;
    double average_gpu_power_;
    double average_ddr_power_;

    // Standard deviations of the same metrics across iterations
    double time_std_;
    double energy_std_;
    double gpu_energy_std_;
    double ddr_energy_std_;
    double power_std_;
    double gpu_power_std_;
    double ddr_power_std_;
};
// Total time, energy, and power
struct
TotalInfo
{
double
time_
;
double
energy_
;
double
gpu_energy_
;
double
ddr_energy_
;
double
power_
;
double
gpu_power_
;
double
ddr_power_
;
void
clear
()
{
time_
=
0.0
;
energy_
=
0.0
;
gpu_energy_
=
0.0
;
ddr_energy_
=
0.0
;
power_
=
0.0
;
gpu_power_
=
0.0
;
ddr_power_
=
0.0
;
}
};
    // For reading the i2c buses via sysfs; opened once in the constructor
    // and rewound (seekg(0)) after every sample.
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    // Start time (so graph begins from t=0); reset by recordPower() each run.
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // Per-run info: the raw power trace of the most recent iteration.
    std::vector<PowerReading> power_readings_;

    // Aggregate (across all runs) info: per-op-name samples, their
    // aggregated statistics, and whole-program totals.
    std::map<std::string, std::vector<TensorOp>> tensor_info_;
    std::vector<AggTensorInfo> agg_tensor_info_;
    TotalInfo total_info_;
    // Number of iterations requested in the current profile() call;
    // used as the divisor for averages and standard deviations.
    unsigned iterations_;

    // Start and stop flags to synchronize the program and profiling threads
    // (the sampler spin-waits on start_ and samples until stop_).
    std::atomic_bool start_;
    std::atomic_bool stop_;
private:
    // Wipes every cross-run accumulator (per-op samples, aggregated
    // statistics, and program totals) ahead of a new profiling session.
    void resetGlobal() {
        total_info_.clear();
        tensor_info_.clear();
        agg_tensor_info_.clear();
    }
// Resets power readings and flags
void
resetLocal
()
{
power_readings_
.
clear
();
start_
=
false
;
stop_
=
false
;
}
// Pins the given thread to the specified core
void
pinThread
(
std
::
thread
&
t
,
const
unsigned
core
)
const
{
cpu_set_t
cpuset
;
CPU_ZERO
(
&
cpuset
);
CPU_SET
(
core
,
&
cpuset
);
if
(
pthread_setaffinity_np
(
t
.
native_handle
(),
sizeof
(
cpu_set_t
),
&
cpuset
)
!=
0
)
std
::
cout
<<
"Couldn't set thread affinity
\n
"
;
}
// Adds a tensor op to the map
void
addTensorOp
(
std
::
string
&
op_name
,
TensorOp
&
top
)
{
// Create a vector if this is the first entry
auto
it
=
tensor_info_
.
find
(
op_name
);
if
(
it
==
tensor_info_
.
end
())
{
tensor_info_
.
insert
(
std
::
pair
<
std
::
string
,
std
::
vector
<
TensorOp
>>
(
op_name
,
std
::
vector
<
TensorOp
>
()));
}
tensor_info_
[
op_name
].
push_back
(
top
);
}
// Obtain's a single power reading from the GPU and DDR rails
void
getPowerReading
()
{
PowerReading
reading
;
// The order matters here. All the reads have to happen together first
// and then all the seeks have to happen together at the end, otherwise
// there will be a significant time difference between the readings of
// the different rails.
reading
.
time_
=
std
::
chrono
::
high_resolution_clock
::
now
();
gpu_stream_
>>
reading
.
gpu_
;
ddr_stream_
>>
reading
.
ddr_
;
power_readings_
.
push_back
(
reading
);
// Reset the input position of the files
gpu_stream_
.
seekg
(
0
);
ddr_stream_
.
seekg
(
0
);
}
// Executes the program to be profiled
void
runProgram
(
const
char
*
const
program
)
{
// Tell the profiling thread to start, execute the program that needs
// to be profiled, and then tell the profiling thread to stop.
start_
=
true
;
const
auto
result
=
std
::
system
(
program
);
stop_
=
true
;
}
    // Records power while the program is running.
    // Runs on its own (pinned) thread: it spin-waits on start_ — a busy
    // loop, deliberate here to keep sampling latency minimal — then samples
    // as fast as possible until runProgram() raises stop_.
    void recordPower() {
        // Obtain the new start time, wait for the start signal, and keep
        // profiling until the stop flag is set.
        start_time_ = std::chrono::high_resolution_clock::now();
        while (!start_);
        while (!stop_)
            getPowerReading();
    }
// Calculates stats for the entire execution (CPU+GPU phase)
void
updateTotalStats
()
{
double
energy
=
0.0
;
double
gpu_energy
=
0.0
;
double
ddr_energy
=
0.0
;
std
::
chrono
::
time_point
<
std
::
chrono
::
high_resolution_clock
>
prev_time
=
start_time_
;
for
(
auto
reading
:
power_readings_
)
{
std
::
chrono
::
duration
<
double
>
duration
=
reading
.
time_
-
prev_time
;
gpu_energy
+=
reading
.
gpu_
*
duration
.
count
();
ddr_energy
+=
reading
.
ddr_
*
duration
.
count
();
prev_time
=
reading
.
time_
;
}
energy
=
gpu_energy
+
ddr_energy
;
auto
time
=
std
::
chrono
::
duration
<
double
>
(
prev_time
-
start_time_
).
count
();
total_info_
.
time_
+=
time
;
total_info_
.
energy_
+=
(
gpu_energy
+
ddr_energy
);
total_info_
.
gpu_energy_
+=
gpu_energy
;
total_info_
.
ddr_energy_
+=
ddr_energy
;
total_info_
.
power_
+=
(
energy
/
time
);
total_info_
.
gpu_power_
+=
(
gpu_energy
/
time
);
total_info_
.
ddr_power_
+=
(
ddr_energy
/
time
);
}
// Calculates energy and power usage of the given tensor operation
void
calculateTensorEP
(
TensorOp
&
top
)
const
{
auto
prev_time
=
top
.
start_
;
unsigned
i
=
0
;
// Skip until we hit the start time of the operation
for
(;
std
::
chrono
::
duration
<
double
>
(
power_readings_
[
i
].
time_
.
time_since_epoch
()).
count
()
<
top
.
start_
;
i
++
);
// Keep going until we hit the finish time of the operation or we run out of readings
for
(
double
curr_time
;
((
curr_time
=
std
::
chrono
::
duration
<
double
>
(
power_readings_
[
i
].
time_
.
time_since_epoch
()).
count
())
<=
top
.
finish_
)
&&
(
i
<
power_readings_
.
size
());
i
++
)
{
auto
duration
=
curr_time
-
prev_time
;
prev_time
=
curr_time
;
top
.
gpu_energy_
+=
power_readings_
[
i
].
gpu_
*
duration
;
top
.
ddr_energy_
+=
power_readings_
[
i
].
ddr_
*
duration
;
}
top
.
energy_
=
top
.
gpu_energy_
+
top
.
ddr_energy_
;
top
.
power_
=
top
.
energy_
/
top
.
time_
;
top
.
gpu_power_
=
top
.
gpu_energy_
/
top
.
time_
;
top
.
ddr_power_
=
top
.
ddr_energy_
/
top
.
time_
;
}
    // Calculates stats for all the tensors in the timestamp file
    // ("profile_data.txt", presumably written by the profiled program —
    // TODO confirm the producer and its exact delimiter).
    void updatePerOpStats() {
        const char * const op_file = "profile_data.txt";
        std::string line;
        std::ifstream ifs(op_file, std::ios::in);
        // Calculate time and energy for each tensor operation. There are two
        // possibilities for the file format:
        // If the line doesn't begin with #, we are looking at FP32 code
        // without any conversions to/from FP16, and each operation occupies
        // two consecutive lines in the timestamp file.
        // If the line does begin with #, we are looking at FP16 code with
        // conversion routines in the middle. In this case, *after* the current
        // line, there will be two lines for F2H, two lines for H2F, and then
        // one line for the end of the operation.
        // NOTE(review): tokens[0]/tokens[1] are indexed without length checks
        // and std::stod may throw — a malformed or blank line aborts here.
        while (std::getline(ifs, line)) {
            std::vector<std::string> tokens;
            boost::split(tokens, line, boost::is_any_of(" \t"));
            // NOTE: this outer op_name is shadowed inside both branches below.
            std::string op_name = tokens[0];

            // FP32: "<name> <start>" followed by "<name> <finish>".
            if (tokens[0][0] != '#') {
                // First line with tensor op name and start time
                std::string op_name = tokens[0];
                const auto start = std::stod(tokens[1]);
                // Second line with tensor op end time
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto finish = std::stod(tokens[1]);

                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);
                addTensorOp(op_name, top);
            } else {
                // First line with tensor op name (minus the leading '#')
                // and start time
                std::string op_name = tokens[0].substr(1);
                const auto start = std::stod(tokens[1]);

                // Second line with f2h (start of the float->half conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                std::string f2h_name = op_name + "_f2h";
                const auto f2h_start = std::stod(tokens[1]);
                // Third line with f2h (end of the conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto f2h_finish = std::stod(tokens[1]);
                // Add f2h as its own op
                TensorOp f2h(f2h_name, f2h_start, f2h_finish);
                calculateTensorEP(f2h);
                addTensorOp(f2h_name, f2h);

                // Fourth line with h2f (start of the half->float conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                std::string h2f_name = op_name + "_h2f";
                const auto h2f_start = std::stod(tokens[1]);
                // Fifth line with h2f (end of the conversion)
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto h2f_finish = std::stod(tokens[1]);
                // Add h2f as its own op
                TensorOp h2f(h2f_name, h2f_start, h2f_finish);
                calculateTensorEP(h2f);
                addTensorOp(h2f_name, h2f);

                // Sixth and final line with tensor op end time
                std::getline(ifs, line);
                tokens.clear();
                boost::split(tokens, line, boost::is_any_of(" \t"));
                const auto finish = std::stod(tokens[1]);

                // Subtract f2h's and h2f's time and energy to get just the
                // computation's info (the op's span includes both conversions).
                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);
                top.time_ -= (f2h.time_ + h2f.time_);
                top.energy_ -= (f2h.energy_ + h2f.energy_);
                top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
                top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
                // Recompute average powers from the adjusted energy/time.
                top.power_ = top.energy_ / top.time_;
                top.gpu_power_ = top.gpu_energy_ / top.time_;
                top.ddr_power_ = top.ddr_energy_ / top.time_;
                addTensorOp(op_name, top);
            }
        }
        ifs.close();
    }
    // Refreshes both the per-op statistics (from the timestamp file) and the
    // whole-program totals (from the power trace) for the run that just ended.
    void updateStats() {
        updatePerOpStats();
        updateTotalStats();
    }
    // Calculates the average and standard deviation of each metric of each
    // tensor op, across all iterations, and appends one AggTensorInfo per op
    // to agg_tensor_info_.
    // NOTE(review): divisors are iterations_, not topv.size() — this assumes
    // each op appears exactly once per iteration; verify for multi-call ops.
    void calculateAggregateStats() {
        for (auto it = tensor_info_.begin(); it != tensor_info_.end(); it++) {
            AggTensorInfo ati;
            ati.name_ = it->first;
            // NOTE: copies the sample vector; read-only use.
            auto topv = it->second;

            // Running totals for the means.
            double total_time = 0.0;
            double total_energy = 0.0;
            double total_gpu_energy = 0.0;
            double total_ddr_energy = 0.0;
            double total_power = 0.0;
            double total_gpu_power = 0.0;
            double total_ddr_power = 0.0;

            // Sums of squared deviations for the standard deviations.
            double time_sum = 0.0;
            double energy_sum = 0.0;
            double gpu_energy_sum = 0.0;
            double ddr_energy_sum = 0.0;
            double power_sum = 0.0;
            double gpu_power_sum = 0.0;
            double ddr_power_sum = 0.0;

            // Calculate average
            for (const auto &top : topv) {
                total_time += top.time_;
                total_energy += top.energy_;
                total_gpu_energy += top.gpu_energy_;
                total_ddr_energy += top.ddr_energy_;
                total_power += top.power_;
                total_gpu_power += top.gpu_power_;
                total_ddr_power += top.ddr_power_;
            }
            ati.average_time_ = total_time / iterations_;
            ati.average_energy_ = total_energy / iterations_;
            ati.average_gpu_energy_ = total_gpu_energy / iterations_;
            ati.average_ddr_energy_ = total_ddr_energy / iterations_;
            ati.average_power_ = total_power / iterations_;
            ati.average_gpu_power_ = total_gpu_power / iterations_;
            ati.average_ddr_power_ = total_ddr_power / iterations_;

            // Calculate standard deviation (population form: divide by N,
            // where N is the iteration count).
            for (const auto &top : topv) {
                auto time_diff = top.time_ - ati.average_time_;
                time_sum += time_diff * time_diff;
                auto energy_diff = top.energy_ - ati.average_energy_;
                energy_sum += energy_diff * energy_diff;
                auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
                gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
                auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
                ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;
                auto power_diff = top.power_ - ati.average_power_;
                power_sum += power_diff * power_diff;
                auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
                gpu_power_sum += gpu_power_diff * gpu_power_diff;
                auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
                ddr_power_sum += ddr_power_diff * ddr_power_diff;
            }
            ati.time_std_ = std::sqrt(time_sum / iterations_);
            ati.energy_std_ = std::sqrt(energy_sum / iterations_);
            ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
            ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
            ati.power_std_ = std::sqrt(power_sum / iterations_);
            ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
            ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);

            agg_tensor_info_.push_back(ati);
        }
    }
public:
    // Opens every monitored power rail via sysfs. The process exits with
    // status 1 if any rail cannot be opened (e.g. when not running on a
    // Jetson TX2 or without sufficient permissions).
    Profiler() {
        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
        soc_stream_.open(soc_power_rail, std::ifstream::in);
        sys_stream_.open(sys_power_rail, std::ifstream::in);
        if (!cpu_stream_.is_open() or !gpu_stream_.is_open()
                or !ddr_stream_.is_open() or !soc_stream_.is_open()
                or !sys_stream_.is_open()) {
            std::cout << "Failed to open one of the power rails for reading\n";
            exit(1);
        }
    }
~
Profiler
()
{
cpu_stream_
.
close
();
gpu_stream_
.
close
();
ddr_stream_
.
close
();
soc_stream_
.
close
();
sys_stream_
.
close
();
}
    // Profiles the given shell command `iterations` times, accumulating
    // per-op and whole-program statistics. State from any previous call is
    // discarded first, so results are never cumulative across calls.
    //
    // @param program    command passed to std::system() by the runner thread
    // @param iterations number of times to run and measure the program
    void profile(const char * const program, const int iterations) {
        iterations_ = iterations;
        resetGlobal();
        for (unsigned i = 0; i < iterations_; i++) {
            resetLocal();
            // Launch two threads: one for running the program and one for
            // profiling it. Pin the threads to specific cores to remove
            // migration overhead. Profiling showed that the sampling rate
            // increases slightly with pinning.
            std::thread prog(&Profiler::runProgram, this, program);
            std::thread power(&Profiler::recordPower, this);
            pinThread(prog, core1);
            pinThread(power, core2);
            prog.join();
            power.join();
            updateStats();
            // Sleep for some time to bring the GPU back to idle
            // (gpu_idle_time is currently 0, i.e. no pause).
            std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
        }
        calculateAggregateStats();
    }
    // Writes per-op aggregate info to `filename` as CSV. Only name, average
    // time (ms) and average energy (mJ) are emitted — the remaining columns
    // (and the header) are deliberately commented out (commit: "Removed all
    // fields besides total energy/time from output file"). Each row is also
    // echoed (without the name) to stdout.
    //
    // @param filename path of the CSV output file
    void dumpTensorInfo(const char * const filename) const {
        const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
        std::ofstream ofs;
        ofs.open(filename);
        //ofs << header;
        for (const auto &ati : agg_tensor_info_) {
            ofs << ati.name_
                << "," << ati.average_time_ * 1e3
                << "," << ati.average_energy_
                /*<< "," << ati.average_gpu_energy_
                << "," << ati.average_ddr_energy_
                << "," << ati.average_power_
                << "," << ati.average_gpu_power_
                << "," << ati.average_ddr_power_
                << "," << ati.time_std_ * 1e3
                << "," << ati.energy_std_
                << "," << ati.gpu_energy_std_
                << "," << ati.ddr_energy_std_
                << "," << ati.power_std_
                << "," << ati.gpu_power_std_
                << "," << ati.ddr_power_std_*/
                << "\n";
            std::cout << ati.average_time_ * 1e3 << ","
                      << ati.average_energy_ << "\n";
        }
        ofs.close();
    }
void
dumpPowerReadings
(
const
char
*
const
filename
)
const
{
std
::
ofstream
ofs
;
ofs
.
open
(
filename
);
for
(
const
auto
&
reading
:
power_readings_
)
{
std
::
chrono
::
duration
<
double
>
duration
=
reading
.
time_
-
start_time_
;
//std::chrono::duration<double> duration = reading.time_.time_since_epoch();
ofs
<<
std
::
to_string
(
duration
.
count
())
<<
" "
<<
reading
.
gpu_
<<
" "
<<
reading
.
ddr_
<<
"
\n
"
;
}
ofs
.
close
();
}
    // Prints whole-program statistics to stdout, averaged over the
    // iterations of the last profile() call.
    void dumpTotalInfo() const {
        // Totals were accumulated per iteration; divide to get per-run means.
        auto total_time = total_info_.time_ / iterations_;
        auto total_energy = total_info_.energy_ / iterations_;
        auto gpu_energy = total_info_.gpu_energy_ / iterations_;
        auto ddr_energy = total_info_.ddr_energy_ / iterations_;
        auto power = total_info_.power_ / iterations_;
        auto gpu_power = total_info_.gpu_power_ / iterations_;
        auto ddr_power = total_info_.ddr_power_ / iterations_;

        std::cout << "-----------------------------------------------------\n";
        std::cout << "Program info (average)\n";
        std::cout << "-----------------------------------------------------\n";
        std::cout << "\tExecution time: " << total_time << " seconds\n";
        std::cout << "\tTotal energy: " << total_energy << " mJ\n";
        std::cout << "\tGPU: " << gpu_energy << " mJ\n";
        std::cout << "\tDDR: " << ddr_energy << " mJ\n";
        std::cout << "\tPower: " << power << " mW\n";
        std::cout << "\tGPU: " << gpu_power << " mW\n";
        std::cout << "\tDDR: " << ddr_power << " mW\n";
        std::cout << "-----------------------------------------------------\n";
    }
};
// Entry point. Usage:
//   <program> <iterations> <tensor output file> [power output file]
// Profiles <program> <iterations> times, writes per-op CSV to the tensor
// output file, and optionally dumps the last run's raw power trace.
int main(int argc, char *argv[]) {
    if (argc < NUM_ARGS) {
        std::cout << "Usage: " << argv[0]
                  << " <program> <iterations> <tensor output file> [power output file]\n";
        exit(1);
    }
    // BUG FIX: std::stoi throws std::invalid_argument/std::out_of_range on
    // bad input, which previously terminated the process with an unhandled
    // exception; fail with a diagnostic instead.
    int iterations = 0;
    try {
        iterations = std::stoi(argv[2]);
    } catch (const std::invalid_argument &) {
        std::cout << "Invalid iteration count: " << argv[2] << "\n";
        exit(1);
    } catch (const std::out_of_range &) {
        std::cout << "Invalid iteration count: " << argv[2] << "\n";
        exit(1);
    }
    Profiler pp;
    pp.profile(argv[1], iterations);
    pp.dumpTensorInfo(argv[3]);
    if (argc > NUM_ARGS)
        pp.dumpPowerReadings(argv[4]);
    return 0;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment