Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
H
hpvm-release
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
llvm
hpvm-release
Commits
730b473a
Commit
730b473a
authored
5 years ago
by
kotsifa2
Browse files
Options
Downloads
Plain Diff
Merge branch 'approx_hpvm' of gitlab.engr.illinois.edu:llvm/hpvm into approx_hpvm
parents
8ab6c870
5bb2d3f3
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
llvm/projects/gpu_profiler/profiler.cpp
+179
-526
179 additions, 526 deletions
llvm/projects/gpu_profiler/profiler.cpp
with
179 additions
and
526 deletions
llvm/projects/gpu_profiler/profiler.cpp
+
179
−
526
View file @
730b473a
#include
<
cmath
>
#include
<
atomic
>
#include
<chrono>
#include
<chrono>
#include
<cmath>
#include
<
iostream
>
#include
<
condition_variable
>
#include
<fstream>
#include
<fstream>
#include
<iostream>
#include
<string>
#include
<string>
#include
<boost/algorithm/string.hpp>
#include
<thread>
#include
<vector>
#include
<vector>
#include
<map>
#include
<thread>
// Reads power rails at runtime and computes the GPU and DDR energy within a window
#include
<atomic>
// of time, which is delimited by the calls to resume_profiler() and stop_profiler()
#include
<sched.h>
//
// IMPORTANT: Must call exit_profiler() to kill the profiler thread
//
// Public interface methods:
// void initialize();
// void run_profiler();
// void resume_profiler();
// void stop_profiler();
// std::pair<double, double> get_time_energy() const;
// void reset()
// void exit_profiler();
class
Profiler
{
public:
Profiler
()
:
should_run_profiler_
(
false
),
should_exit_profiler_
(
false
)
{
// Open all streams. Not done in initialize() function bc the streams
// should be strictly opened once
cpu_stream_
.
open
(
cpu_power_rail
,
std
::
ifstream
::
in
);
gpu_stream_
.
open
(
gpu_power_rail
,
std
::
ifstream
::
in
);
ddr_stream_
.
open
(
ddr_power_rail
,
std
::
ifstream
::
in
);
soc_stream_
.
open
(
soc_power_rail
,
std
::
ifstream
::
in
);
sys_stream_
.
open
(
sys_power_rail
,
std
::
ifstream
::
in
);
#define NUM_ARGS 4
if
(
!
cpu_stream_
.
is_open
()
or
!
gpu_stream_
.
is_open
()
or
!
ddr_stream_
.
is_open
()
or
!
soc_stream_
.
is_open
()
or
!
sys_stream_
.
is_open
())
{
std
::
cout
<<
"Failed to open one of the power rails for reading
\n
"
;
exit
(
1
);
}
}
// This is a simple power profiler that can sample the power of the various
~
Profiler
()
{
// components in a Jetson TX2. The usage is simple: profile() measures power
cpu_stream_
.
close
();
// for the specified program, and then dumpOutput() prints the readings to a
gpu_stream_
.
close
();
// file. profile() can be called as many times as desired - the internal state
ddr_stream_
.
close
();
// is reset each time and thus the measurements are not cumulative.
soc_stream_
.
close
();
class
Profiler
{
sys_stream_
.
close
();
private:
}
// Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
// we can't use them.
// Reinitializes boolean vars used for control flow and launches the profiler
const
unsigned
core0
=
0
;
// thread. DOES NOT reset other internal data structures.
const
unsigned
core1
=
3
;
// Re-arms the control-flow flags and spawns the background profiler thread.
// DOES NOT reset other internal data structures (see reset() for that).
void initialize() {
    // Clear both flags so a previously-used Profiler object starts fresh.
    should_run_profiler_  = false;
    should_exit_profiler_ = false;
    // Spawn the worker; it parks itself in run_profiler() until resumed.
    profiler_thread_ = std::thread(&Profiler::run_profiler, this);
}
// Runs the profiler thread, keeping it alive by wrapping the functionality
// in an infinite loop.
//
// Lifecycle: the thread sleeps on cond_var_ until resume_profiler() or
// exit_profiler() wakes it; while should_run_profiler_ is set it samples
// power in a tight loop; exit_profiler() breaks it out entirely.
void run_profiler(){
    while (true){
        // Fast-path exit check before touching the mutex.
        if (should_exit_profiler_) {
            break;
        }
        // TODO overhead between calls to obtain_power_reading
        // Need to lock the mutex and check the condition var
        {
            std::unique_lock<std::mutex> mutex_lock(mutex_);
            // Re-check under the lock: the exit flag may have been set
            // between the check above and acquiring the mutex.
            if (should_exit_profiler_) {
                break;
            }
            // Wake the thread up when it's time to run the profiler or exit
            // the profiler
            cond_var_.wait(mutex_lock, [this]{
                return should_run_profiler_ || should_exit_profiler_;
            });
        }
        // Woken either to sample or to quit; quit wins.
        if (should_exit_profiler_) {
            break;
        }
        // One sample per loop iteration while profiling is active.
        obtain_power_reading();
    }
}
// Resumes the profiling of whatever executable's currently running.
// DOES NOT reset any data; readings keep accumulating.
void resume_profiler() {
    {
        // Flag and timestamp are updated under the same mutex the worker
        // uses for its condition-variable wait.
        std::unique_lock<std::mutex> guard(mutex_);
        if (should_run_profiler_) {
            std::cout << "WARNING: resume_profiler was already called\n";
        }
        should_run_profiler_ = true;
        // New measurement window starts now.
        start_time_ = std::chrono::high_resolution_clock::now();
    }
    // Notify outside the critical section so the worker can grab the lock
    // immediately on wakeup.
    cond_var_.notify_one();
}
// sysfs paths for i2c buses of various components
// Stops profiler by putting profiler thread to sleep
const
char
*
const
cpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input"
;
void
stop_profiler
()
{
const
char
*
const
gpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input"
;
{
const
char
*
const
ddr_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input"
;
std
::
unique_lock
<
std
::
mutex
>
mutex_lock
(
mutex_
);
const
char
*
const
soc_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input"
;
if
(
!
should_run_profiler_
){
const
char
*
const
sys_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input"
;
std
::
cout
<<
"WARNING: stop_profiler was already called
\n
"
;
}
should_run_profiler_
=
false
;
}
cond_var_
.
notify_one
();
}
// It takes some time for the GPU's power to return to idle (ms)
// Gets the delta time and total GPU and DDR energy between the last two
const
unsigned
gpu_idle_time
=
0
;
// Gets the delta time and total GPU and DDR energy between the last two
// calls to resume_profiler and stop_profiler.
//
// Energy is integrated with a left-rectangle rule: each reading's power is
// applied over the interval since the previous reading's timestamp.
//
// Returns a pair of <delta time in milliseconds, energy (GPU + DDR)>.
std::pair<double, double> get_time_energy() const {
    double total_energy = 0.0;
    std::chrono::time_point<std::chrono::high_resolution_clock> prev_time =
        start_time_;
    // FIX: iterate by const reference — the old `for (auto reading : ...)`
    // copied every PowerReading struct on each iteration.
    for (const auto &reading : power_readings_) {
        std::chrono::duration<double> duration = reading.time_ - prev_time;
        total_energy += reading.gpu_ * duration.count();
        total_energy += reading.ddr_ * duration.count();
        prev_time = reading.time_;
    }
    // Elapsed time is measured up to the last reading, not "now".
    double delta_time =
        std::chrono::duration<double, std::milli>(prev_time - start_time_)
            .count();
    return std::make_pair(delta_time, total_energy);
}
// Resets all internal data structures, including the vector storing all
// power readings. Safe to call after exit_profiler() or after resume.
void reset() {
    // Drop accumulated samples first, then clear both control flags.
    power_readings_.clear();
    should_run_profiler_  = false;   // ok to call reset after resume
    should_exit_profiler_ = false;   // ok to call reset after exit_profiler()
}
// Exit the profiler and kill the thread
// Must call initialize() to reuse this object after calling exit_profiler()
void
exit_profiler
()
{
std
::
cout
<<
"Exiting profiler
\n
"
;
should_exit_profiler_
=
true
;
cond_var_
.
notify_one
();
profiler_thread_
.
join
();
}
private
:
// Power rails are mounted as files. Keeping the old power rail file names for possible future
// integrations
const
std
::
string
cpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input"
;
const
std
::
string
gpu_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input"
;
const
std
::
string
ddr_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input"
;
const
std
::
string
soc_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input"
;
const
std
::
string
sys_power_rail
=
"/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input"
;
// An individual power reading
// An individual power reading
struct
PowerReading
{
struct
PowerReading
{
...
@@ -49,81 +164,10 @@ private:
...
@@ -49,81 +164,10 @@ private:
double
sys_
;
double
sys_
;
};
};
// Individual tensor op
// Stores all power readings and is cleared only when reset() is called
struct
TensorOp
{
std
::
vector
<
PowerReading
>
power_readings_
;
std
::
string
name_
;
double
start_
;
double
finish_
;
double
time_
;
double
energy_
;
double
gpu_energy_
;
double
ddr_energy_
;
double
power_
;
double
gpu_power_
;
double
ddr_power_
;
TensorOp
(
std
::
string
name
,
double
start
,
double
finish
)
:
name_
(
name
),
start_
(
start
),
finish_
(
finish
),
time_
(
finish
-
start
),
energy_
(
0.0
),
gpu_energy_
(
0.0
),
ddr_energy_
(
0.0
),
power_
(
0.0
),
gpu_power_
(
0.0
),
ddr_power_
(
0.0
)
{
}
};
// Aggregate tensor info
struct
AggTensorInfo
{
// Op name
std
::
string
name_
;
// Averages
double
average_time_
;
double
average_energy_
;
double
average_gpu_energy_
;
double
average_ddr_energy_
;
double
average_power_
;
double
average_gpu_power_
;
double
average_ddr_power_
;
// Standard deviations
double
time_std_
;
double
energy_std_
;
double
gpu_energy_std_
;
double
ddr_energy_std_
;
double
power_std_
;
double
gpu_power_std_
;
double
ddr_power_std_
;
};
// Total time, energy, and power
struct
TotalInfo
{
double
time_
;
double
energy_
;
double
gpu_energy_
;
double
ddr_energy_
;
double
power_
;
double
gpu_power_
;
double
ddr_power_
;
void
clear
()
{
time_
=
0.0
;
energy_
=
0.0
;
gpu_energy_
=
0.0
;
ddr_energy_
=
0.0
;
power_
=
0.0
;
std
::
chrono
::
time_point
<
std
::
chrono
::
high_resolution_clock
>
start_time_
;
gpu_power_
=
0.0
;
ddr_power_
=
0.0
;
}
};
// For reading the i2c buses via sysfs
// For reading the i2c buses via sysfs
std
::
ifstream
cpu_stream_
;
std
::
ifstream
cpu_stream_
;
...
@@ -132,58 +176,18 @@ private:
...
@@ -132,58 +176,18 @@ private:
std
::
ifstream
soc_stream_
;
std
::
ifstream
soc_stream_
;
std
::
ifstream
sys_stream_
;
std
::
ifstream
sys_stream_
;
// Start time (so graph begins from t=0)
std
::
mutex
mutex_
;
std
::
chrono
::
time_point
<
std
::
chrono
::
high_resolution_clock
>
start_time_
;
std
::
condition_variable
cond_var_
;
// Per-run info
std
::
vector
<
PowerReading
>
power_readings_
;
// Aggregate (across all runs) info
bool
should_run_profiler_
;
// True if we want to resume the profiling thread
std
::
map
<
std
::
string
,
std
::
vector
<
TensorOp
>>
tensor_info_
;
std
::
vector
<
AggTensorInfo
>
agg_tensor_info_
;
TotalInfo
total_info_
;
unsigned
iterations_
;
// Start and stop flags to synchronize the program and profiling threads
std
::
atomic_bool
should_exit_profiler_
;
// Quit profiling
std
::
atomic_bool
start_
;
std
::
atomic_bool
stop_
;
private
:
std
::
thread
profiler_thread_
;
// Resets the cross-run aggregates: per-op tensor data, the aggregated
// per-op statistics, and the run-wide totals.
void resetGlobal() {
    tensor_info_.clear();
    agg_tensor_info_.clear();
    total_info_.clear();
}
// Resets per-run state: drops the sampled power readings and lowers both
// of the start/stop synchronization flags.
void resetLocal() {
    power_readings_.clear();
    start_ = false;
    stop_  = false;
}
// Pins the given thread to the specified core via its native pthread
// handle; logs (but does not abort) on failure.
void pinThread(std::thread &t, const unsigned core) const {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core, &cpuset);
    const int rc =
        pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset);
    if (rc != 0) {
        std::cout << "Couldn't set thread affinity\n";
    }
}
// Adds a tensor op to the map, creating the per-op vector on first use.
void addTensorOp(std::string &op_name, TensorOp &top) {
    // IDIOM: std::map::operator[] default-constructs an empty vector the
    // first time an op name is seen, which is exactly what the old explicit
    // find()/insert(make empty vector) dance did — in a single lookup.
    tensor_info_[op_name].push_back(top);
}
// Obtains a single power reading from the GPU and DDR rails
// Obtains a single power reading from the GPU and DDR rails
void
getP
ower
R
eading
()
{
void
obtain_p
ower
_r
eading
()
{
PowerReading
reading
;
PowerReading
reading
;
// The order matters here. All the reads have to happen together first
// The order matters here. All the reads have to happen together first
...
@@ -199,385 +203,34 @@ private:
...
@@ -199,385 +203,34 @@ private:
gpu_stream_
.
seekg
(
0
);
gpu_stream_
.
seekg
(
0
);
ddr_stream_
.
seekg
(
0
);
ddr_stream_
.
seekg
(
0
);
}
}
};
// Executes the program to be profiled
/*
void
runProgram
(
const
char
*
const
program
)
{
// TESTS
// Tell the profiling thread to start, execute the program that needs
void resume_pause_profiler(Profiler& profile_wrapper, unsigned long sleep_millis){
// to be profiled, and then tell the profiling thread to stop.
profile_wrapper.resume_profiler();
start_
=
true
;
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis));
const
auto
result
=
std
::
system
(
program
);
profile_wrapper.stop_profiler();
stop_
=
true
;
}
// Records power while the program is running
void
recordPower
()
{
// Obtain the new start time, wait for the start signal, and keep
// profiling until the stop flag is set.
start_time_
=
std
::
chrono
::
high_resolution_clock
::
now
();
while
(
!
start_
);
while
(
!
stop_
)
getPowerReading
();
}
// Calculates stats for the entire execution (CPU+GPU phase) and folds them
// into total_info_. Energy is integrated with a left-rectangle rule over
// the recorded power readings; time is measured up to the last reading.
void updateTotalStats() {
    double energy = 0.0;
    double gpu_energy = 0.0;
    double ddr_energy = 0.0;
    std::chrono::time_point<std::chrono::high_resolution_clock> prev_time =
        start_time_;
    // FIX: iterate by const reference — the old `for (auto reading : ...)`
    // copied every PowerReading struct on each iteration.
    for (const auto &reading : power_readings_) {
        std::chrono::duration<double> duration = reading.time_ - prev_time;
        gpu_energy += reading.gpu_ * duration.count();
        ddr_energy += reading.ddr_ * duration.count();
        prev_time = reading.time_;
    }
    energy = gpu_energy + ddr_energy;
    auto time =
        std::chrono::duration<double>(prev_time - start_time_).count();
    // NOTE(review): if no readings were taken, time == 0 and the power
    // divisions below divide by zero — assumed non-empty in practice.
    total_info_.time_       += time;
    total_info_.energy_     += (gpu_energy + ddr_energy);
    total_info_.gpu_energy_ += gpu_energy;
    total_info_.ddr_energy_ += ddr_energy;
    total_info_.power_      += (energy / time);
    total_info_.gpu_power_  += (gpu_energy / time);
    total_info_.ddr_power_  += (ddr_energy / time);
}
// Calculates energy and power usage of the given tensor operation by
// integrating the recorded power readings that fall inside the op's
// [start_, finish_] window.
//
// BUG FIX: both loops previously indexed power_readings_[i] without (or
// before) checking i against the vector size. The skip loop had no bounds
// check at all, and the accumulate loop evaluated power_readings_[i] in
// its condition *before* testing i < size() — either one reads past the
// end of the vector when an op's window extends beyond the samples.
void calculateTensorEP(TensorOp &top) const {
    auto prev_time = top.start_;
    const auto n = power_readings_.size();
    unsigned i = 0;
    // Skip readings taken before the operation started.
    while (i < n &&
           std::chrono::duration<double>(
               power_readings_[i].time_.time_since_epoch()).count() <
               top.start_) {
        ++i;
    }
    // Accumulate until we pass the op's finish time or run out of readings.
    for (; i < n; ++i) {
        const double curr_time =
            std::chrono::duration<double>(
                power_readings_[i].time_.time_since_epoch()).count();
        if (curr_time > top.finish_) {
            break;
        }
        const auto duration = curr_time - prev_time;
        prev_time = curr_time;
        top.gpu_energy_ += power_readings_[i].gpu_ * duration;
        top.ddr_energy_ += power_readings_[i].ddr_ * duration;
    }
    top.energy_ = top.gpu_energy_ + top.ddr_energy_;
    // NOTE(review): time_ == 0 would divide by zero here — assumed nonzero
    // for any real operation (finish > start in the timestamp file).
    top.power_     = top.energy_     / top.time_;
    top.gpu_power_ = top.gpu_energy_ / top.time_;
    top.ddr_power_ = top.ddr_energy_ / top.time_;
}
// Calculates stats for all the tensors in the timestamp file
// ("profile_data.txt"), building one TensorOp per operation and feeding it
// through calculateTensorEP() / addTensorOp().
void updatePerOpStats() {
    const char *const op_file = "profile_data.txt";
    std::string line;
    std::ifstream ifs(op_file, std::ios::in);
    // Calculate time and energy for each tensor operation. There are two
    // possibilities for the file format:
    // If the line doesn't begin with #, we are looking at FP32 code
    // without any conversions to/from FP16, and each operation occupies
    // two consecutive lines in the timestamp file.
    // If the line does begin with #, we are looking at FP16 code with
    // conversion routines in the middle. In this case, *after* the current
    // line, there will be two lines for F2H, two lines for H2F, and then
    // one line for the end of the operation.
    while (std::getline(ifs, line)) {
        // Each line is "<name-or-#name> <timestamp>", whitespace-separated.
        std::vector<std::string> tokens;
        boost::split(tokens, line, boost::is_any_of(" \t"));
        std::string op_name = tokens[0];
        // FP32
        if (tokens[0][0] != '#') {
            // First line with tensor op name and start time
            // (note: this declaration shadows the outer op_name)
            std::string op_name = tokens[0];
            const auto start = std::stod(tokens[1]);
            // Second line with tensor op end time
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto finish = std::stod(tokens[1]);
            TensorOp top(op_name, start, finish);
            calculateTensorEP(top);
            addTensorOp(op_name, top);
        } else {
            // FP16 path: strip the leading '#' from the op name.
            // First line with tensor op name and start time
            std::string op_name = tokens[0].substr(1);
            const auto start = std::stod(tokens[1]);
            // Second line with f2h
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            std::string f2h_name = op_name + "_f2h";
            const auto f2h_start = std::stod(tokens[1]);
            // Third line with f2h
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto f2h_finish = std::stod(tokens[1]);
            // Add f2h (the float->half conversion is tracked as its own op)
            TensorOp f2h(f2h_name, f2h_start, f2h_finish);
            calculateTensorEP(f2h);
            addTensorOp(f2h_name, f2h);
            // Fourth line with h2f
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            std::string h2f_name = op_name + "_h2f";
            const auto h2f_start = std::stod(tokens[1]);
            // Fifth line with h2f
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto h2f_finish = std::stod(tokens[1]);
            // Add h2f (the half->float conversion is tracked as its own op)
            TensorOp h2f(h2f_name, h2f_start, h2f_finish);
            calculateTensorEP(h2f);
            addTensorOp(h2f_name, h2f);
            // Sixth and final line with tensor op end time
            std::getline(ifs, line);
            tokens.clear();
            boost::split(tokens, line, boost::is_any_of(" \t"));
            const auto finish = std::stod(tokens[1]);
            // Subtract f2h's and h2f's time and energy to get just the
            // computation's info (the op window spans the conversions too)
            TensorOp top(op_name, start, finish);
            calculateTensorEP(top);
            top.time_       -= (f2h.time_ + h2f.time_);
            top.energy_     -= (f2h.energy_ + h2f.energy_);
            top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
            top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
            // Recompute power from the corrected time/energy.
            top.power_      = top.energy_ / top.time_;
            top.gpu_power_  = top.gpu_energy_ / top.time_;
            top.ddr_power_  = top.ddr_energy_ / top.time_;
            addTensorOp(op_name, top);
        }
    }
    ifs.close();
}
// Refreshes both statistics views after a run: per-operation first, then
// the run-wide totals.
void updateStats() {
    updatePerOpStats();
    updateTotalStats();
}
// Calculates the average and standard deviation of each metric of each
// tensor op across all profiled iterations, appending one AggTensorInfo
// per op to agg_tensor_info_.
void calculateAggregateStats() {
    for (auto it = tensor_info_.begin(); it != tensor_info_.end(); ++it) {
        AggTensorInfo ati;
        ati.name_ = it->first;
        // FIX: bind by const reference — the old `auto topv = it->second;`
        // copied the entire std::vector<TensorOp> for every op.
        const auto &topv = it->second;

        double total_time = 0.0;
        double total_energy = 0.0;
        double total_gpu_energy = 0.0;
        double total_ddr_energy = 0.0;
        double total_power = 0.0;
        double total_gpu_power = 0.0;
        double total_ddr_power = 0.0;

        double time_sum = 0.0;
        double energy_sum = 0.0;
        double gpu_energy_sum = 0.0;
        double ddr_energy_sum = 0.0;
        double power_sum = 0.0;
        double gpu_power_sum = 0.0;
        double ddr_power_sum = 0.0;

        // Calculate average.
        // NOTE(review): divisors use iterations_, which assumes each op
        // appears exactly once per iteration — confirm against the
        // timestamp file format.
        for (const auto &top : topv) {
            total_time       += top.time_;
            total_energy     += top.energy_;
            total_gpu_energy += top.gpu_energy_;
            total_ddr_energy += top.ddr_energy_;
            total_power      += top.power_;
            total_gpu_power  += top.gpu_power_;
            total_ddr_power  += top.ddr_power_;
        }
        ati.average_time_       = total_time       / iterations_;
        ati.average_energy_     = total_energy     / iterations_;
        ati.average_gpu_energy_ = total_gpu_energy / iterations_;
        ati.average_ddr_energy_ = total_ddr_energy / iterations_;
        ati.average_power_      = total_power      / iterations_;
        ati.average_gpu_power_  = total_gpu_power  / iterations_;
        ati.average_ddr_power_  = total_ddr_power  / iterations_;

        // Calculate standard deviation (population, divided by iterations_).
        for (const auto &top : topv) {
            auto time_diff = top.time_ - ati.average_time_;
            time_sum += time_diff * time_diff;

            auto energy_diff = top.energy_ - ati.average_energy_;
            energy_sum += energy_diff * energy_diff;

            auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
            gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;

            auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
            ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;

            auto power_diff = top.power_ - ati.average_power_;
            power_sum += power_diff * power_diff;

            auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
            gpu_power_sum += gpu_power_diff * gpu_power_diff;

            auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
            ddr_power_sum += ddr_power_diff * ddr_power_diff;
        }
        ati.time_std_       = std::sqrt(time_sum       / iterations_);
        ati.energy_std_     = std::sqrt(energy_sum     / iterations_);
        ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
        ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
        ati.power_std_      = std::sqrt(power_sum      / iterations_);
        ati.gpu_power_std_  = std::sqrt(gpu_power_sum  / iterations_);
        ati.ddr_power_std_  = std::sqrt(ddr_power_sum  / iterations_);

        agg_tensor_info_.push_back(ati);
    }
}
public
:
Profiler
()
{
cpu_stream_
.
open
(
cpu_power_rail
,
std
::
ifstream
::
in
);
gpu_stream_
.
open
(
gpu_power_rail
,
std
::
ifstream
::
in
);
ddr_stream_
.
open
(
ddr_power_rail
,
std
::
ifstream
::
in
);
soc_stream_
.
open
(
soc_power_rail
,
std
::
ifstream
::
in
);
sys_stream_
.
open
(
sys_power_rail
,
std
::
ifstream
::
in
);
if
(
!
cpu_stream_
.
is_open
()
or
!
gpu_stream_
.
is_open
()
or
!
ddr_stream_
.
is_open
()
or
!
soc_stream_
.
is_open
()
or
!
sys_stream_
.
is_open
())
{
std
::
cout
<<
"Failed to open one of the power rails for reading
\n
"
;
exit
(
1
);
}
}
~
Profiler
()
{
cpu_stream_
.
close
();
gpu_stream_
.
close
();
ddr_stream_
.
close
();
soc_stream_
.
close
();
sys_stream_
.
close
();
}
void
profile
(
const
char
*
const
program
,
const
int
iterations
)
{
iterations_
=
iterations
;
resetGlobal
();
for
(
unsigned
i
=
0
;
i
<
iterations_
;
i
++
)
{
resetLocal
();
// Launch two threads: one for running the program and one for
// profiling it. Pin the threads to specific cores to remove migration
// overhead. Profiling showed that the sampling rate increases slightly
// with pinning.
std
::
thread
prog
(
&
Profiler
::
runProgram
,
this
,
program
);
std
::
thread
power
(
&
Profiler
::
recordPower
,
this
);
pinThread
(
prog
,
core1
);
pinThread
(
power
,
core2
);
prog
.
join
();
power
.
join
();
updateStats
();
// Sleep for some time to bring the GPU back to idle
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
gpu_idle_time
));
}
calculateAggregateStats
();
}
// Writes one CSV row per aggregated tensor op to `filename` (times scaled
// to milliseconds) and echoes each op's average time/energy to stdout.
void dumpTensorInfo(const char *const filename) const {
    const std::string header =
        "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
    std::ofstream ofs;
    ofs.open(filename);
    //ofs << header;
    for (const auto &ati : agg_tensor_info_) {
        // Averages (time converted from seconds to ms)...
        ofs << ati.name_ << ","
            << ati.average_time_ * 1e3 << ","
            << ati.average_energy_ << ","
            << ati.average_gpu_energy_ << ","
            << ati.average_ddr_energy_ << ","
            << ati.average_power_ << ","
            << ati.average_gpu_power_ << ","
            << ati.average_ddr_power_ << ",";
        // ...followed by the standard deviations on the same row.
        ofs << ati.time_std_ * 1e3 << ","
            << ati.energy_std_ << ","
            << ati.gpu_energy_std_ << ","
            << ati.ddr_energy_std_ << ","
            << ati.power_std_ << ","
            << ati.gpu_power_std_ << ","
            << ati.ddr_power_std_ << "\n";
        std::cout << ati.average_time_ * 1e3 << ","
                  << ati.average_energy_ << "\n";
    }
    ofs.close();
}
// Writes every recorded power reading to `filename`, one line per sample:
// "<seconds since start> <gpu power> <ddr power>".
void dumpPowerReadings(const char *const filename) const {
    std::ofstream out;
    out.open(filename);
    for (const auto &sample : power_readings_) {
        // Timestamps are reported relative to start_time_ so plots begin at t=0.
        std::chrono::duration<double> elapsed = sample.time_ - start_time_;
        //std::chrono::duration<double> duration = reading.time_.time_since_epoch();
        out << std::to_string(elapsed.count())
            << " " << sample.gpu_
            << " " << sample.ddr_
            << "\n";
    }
    out.close();
}
void
dumpTotalInfo
()
const
{
auto
total_time
=
total_info_
.
time_
/
iterations_
;
auto
total_energy
=
total_info_
.
energy_
/
iterations_
;
auto
gpu_energy
=
total_info_
.
gpu_energy_
/
iterations_
;
auto
ddr_energy
=
total_info_
.
ddr_energy_
/
iterations_
;
auto
power
=
total_info_
.
power_
/
iterations_
;
auto time_energy_pair = profile_wrapper.get_time_energy();
auto
gpu_power
=
total_info_
.
gpu_power_
/
iterations_
;
profile_wrapper.reset();
auto
ddr_power
=
total_info_
.
ddr_power_
/
iterations_
;
std
::
cout
<<
"-----------------------------------------------------
\n
"
;
printf("time: %f, energy: %f\n", time_energy_pair.first, time_energy_pair.second);
std
::
cout
<<
"Program info (average)
\n
"
;
std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis));
std
::
cout
<<
"-----------------------------------------------------
\n
"
;
}
std
::
cout
<<
"
\t
Execution time: "
<<
total_time
<<
" seconds
\n
"
;
std
::
cout
<<
"
\t
Total energy: "
<<
total_energy
<<
" mJ
\n
"
;
std
::
cout
<<
"
\t
GPU: "
<<
gpu_energy
<<
" mJ
\n
"
;
std
::
cout
<<
"
\t
DDR: "
<<
ddr_energy
<<
" mJ
\n
"
;
std
::
cout
<<
"
\t
Power: "
<<
power
<<
" mW
\n
"
;
std
::
cout
<<
"
\t
GPU: "
<<
gpu_power
<<
" mW
\n
"
;
std
::
cout
<<
"
\t
DDR: "
<<
ddr_power
<<
" mW
\n
"
;
std
::
cout
<<
"-----------------------------------------------------
\n
"
;
}
};
int
main
(
int
argc
,
char
*
argv
[])
{
if
(
argc
<
NUM_ARGS
)
{
std
::
cout
<<
"Usage: "
<<
argv
[
0
]
<<
" <program> <iterations> <tensor output file> [power output file]
\n
"
;
exit
(
1
);
}
Profiler
pp
;
int main(){
pp
.
p
rofile
(
argv
[
1
],
std
::
stoi
(
argv
[
2
]))
;
P
rofile
r profile_wrapper
;
pp
.
dumpTensorInfo
(
argv
[
3
]
);
profile_wrapper.initialize(
);
if
(
argc
>
NUM_ARGS
)
unsigned long sleep_millis = 5000;
pp
.
dumpPowerReadings
(
argv
[
4
]);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
resume_pause_profiler(profile_wrapper, sleep_millis);
// IMPORTANT
profile_wrapper.exit_profiler();
return 0;
return 0;
}
}
*/
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment