// profiler.h — runtime GPU/DDR energy profiler for the NVIDIA Jetson TX2.
//
// Build note (from the accompanying CMake change): built as the static
// library `gpu_profiler` from src/profiler.cpp with include/ on the
// include path:
//   set(libsrc src/profiler.cpp)
//   target_include_directories(gpu_profiler PRIVATE include)

#pragma once  // FIX: header previously had no include guard

#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <fstream>
#include <iostream>
#include <mutex>    // FIX: std::mutex member is used below but <mutex> was never included
#include <string>
#include <thread>
#include <utility>  // FIX: std::pair is used in get_time_energy()'s return type
#include <vector>

// Reads power rails at runtime and computes the GPU and DDR energy within a
// window of time, which is delimited by the calls to resume_profiler() and
// pause_profiler().
//
// IMPORTANT: Must call stop_profiler() to kill the profiler thread.
// (FIX: the original comment said pause_profiler(), but pause only puts the
// thread to sleep — stop_profiler() is what exits and joins it.)
//
// Public interface methods:
//   void start_profiler();
//   void resume_profiler();
//   void pause_profiler();
//   std::pair<double, double> get_time_energy() const;
//   void reset();
//   void stop_profiler();
class Profiler {
public:
    Profiler();

    ~Profiler();

    // Reinitializes boolean vars used for control flow and launches the
    // profiler thread. DOES NOT reset other internal data structures.
    void start_profiler();

    // Resumes the profiling of whatever executable is currently running.
    // DOES NOT reset any data.
    void resume_profiler();

    // Pauses the profiler by putting the profiler thread to sleep.
    void pause_profiler();

    // Gets the delta time and total GPU and DDR energy between the last two
    // calls to resume_profiler and pause_profiler.
    //
    // Returns this as a pair of <delta time in milliseconds, energy>.
    std::pair<double, double> get_time_energy() const;

    // Resets all internal data structures, including the vector storing all
    // power readings.
    void reset();

    // Exits the profiler and kills the thread.
    // Must call start_profiler() to reuse this object after calling
    // stop_profiler().
    void stop_profiler();

private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;
    const unsigned core2 = 4;
    const unsigned core3 = 5;

    // Power rails are mounted as sysfs files. Keeping the old power rail file
    // names for possible future integrations.
    const std::string cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const std::string gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const std::string ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const std::string soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const std::string sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // An individual power reading (one sample of every rail).
    struct PowerReading {
        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
        double cpu_;
        double gpu_;
        double ddr_;
        double soc_;
        double sys_;
    };

    // Stores all power readings; cleared only when reset() is called.
    std::vector<PowerReading> power_readings_;

    // Timestamp of the most recent resume_profiler() call.
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // For reading the i2c buses via sysfs.
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    std::mutex mutex_;

    std::condition_variable cond_var_;

    // True if we want the profiling thread to sample (guarded by mutex_).
    bool should_run_profiler_;

    // Quit profiling (atomic: checked outside the lock by the thread loop).
    std::atomic_bool should_stop_profiler_;

    std::thread profiler_thread_;

    // Obtains a single power reading from the GPU and DDR rails.
    void obtain_power_reading();

    // Pins the given thread to the specified core.
    void pin_thread(std::thread &t, const unsigned core) const;

    // Runs the profiler thread, keeping it alive by wrapping the functionality
    // in an infinite loop.
    void run_profiler();
};
// offline_profiler.cpp — samples the Jetson TX2 power rails while a child
// program runs and attributes time/energy to individual tensor operations.

#include <cmath>
#include <chrono>
#include <cstdlib>   // std::system, exit — FIX: previously relied on transitive includes
#include <iostream>
#include <fstream>
#include <sstream>   // std::istringstream — stdlib replacement for boost::split
#include <string>
#include <vector>
#include <map>
#include <thread>
#include <atomic>
#include <sched.h>
#include <pthread.h> // pthread_setaffinity_np — FIX: include explicitly

#define NUM_ARGS 4

// This is a simple power profiler that can sample the power of the various
// components in a Jetson TX2. The usage is simple: profile() measures power
// for the specified program, and then dumpOutput() prints the readings to a
// file. profile() can be called as many times as desired - the internal state
// is reset each time and thus the measurements are not cumulative.
class Profiler {
private:
    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
    // we can't use them.
    const unsigned core0 = 0;
    const unsigned core1 = 3;
    const unsigned core2 = 4;
    const unsigned core3 = 5;

    // sysfs paths for i2c buses of various components
    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";

    // It takes some time for the GPU's power to return to idle (ms)
    const unsigned gpu_idle_time = 0;

    // An individual power reading
    struct PowerReading {
        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
        double cpu_;
        double gpu_;
        double ddr_;
        double soc_;
        double sys_;
    };

    // Individual tensor op: timestamps plus the energy/power attributed to it
    struct TensorOp {
        std::string name_;

        double start_;
        double finish_;
        double time_;

        double energy_;
        double gpu_energy_;
        double ddr_energy_;

        double power_;
        double gpu_power_;
        double ddr_power_;

        TensorOp(std::string name, double start, double finish)
            : name_(name), start_(start), finish_(finish), time_(finish - start),
              energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
              power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {
        }
    };

    // Aggregate tensor info (mean and standard deviation across iterations)
    struct AggTensorInfo {
        // Op name
        std::string name_;

        // Averages
        double average_time_;

        double average_energy_;
        double average_gpu_energy_;
        double average_ddr_energy_;

        double average_power_;
        double average_gpu_power_;
        double average_ddr_power_;

        // Standard deviations
        double time_std_;

        double energy_std_;
        double gpu_energy_std_;
        double ddr_energy_std_;

        double power_std_;
        double gpu_power_std_;
        double ddr_power_std_;
    };

    // Total time, energy, and power accumulated across all iterations
    struct TotalInfo {
        double time_;

        double energy_;
        double gpu_energy_;
        double ddr_energy_;

        double power_;
        double gpu_power_;
        double ddr_power_;

        void clear() {
            time_ = 0.0;

            energy_ = 0.0;
            gpu_energy_ = 0.0;
            ddr_energy_ = 0.0;

            power_ = 0.0;
            gpu_power_ = 0.0;
            ddr_power_ = 0.0;
        }
    };

    // For reading the i2c buses via sysfs
    std::ifstream cpu_stream_;
    std::ifstream gpu_stream_;
    std::ifstream ddr_stream_;
    std::ifstream soc_stream_;
    std::ifstream sys_stream_;

    // Start time (so graph begins from t=0)
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;

    // Per-run info
    std::vector<PowerReading> power_readings_;

    // Aggregate (across all runs) info
    std::map<std::string, std::vector<TensorOp>> tensor_info_;
    std::vector<AggTensorInfo> agg_tensor_info_;
    TotalInfo total_info_;
    unsigned iterations_;

    // Start and stop flags to synchronize the program and profiling threads
    std::atomic_bool start_;
    std::atomic_bool stop_;

private:
    // Splits a line on tab characters.
    // FIX: stdlib replacement for boost::split — removes the only third-party
    // dependency this tool had.
    static std::vector<std::string> splitTabs(const std::string &line) {
        std::vector<std::string> tokens;
        std::istringstream iss(line);
        std::string token;
        while (std::getline(iss, token, '\t'))
            tokens.push_back(token);
        return tokens;
    }

    // Resets tensor info and total time and energy
    void resetGlobal() {
        tensor_info_.clear();
        agg_tensor_info_.clear();
        total_info_.clear();
    }

    // Resets power readings and flags
    void resetLocal() {
        power_readings_.clear();
        start_ = false;
        stop_ = false;
    }

    // Pins the given thread to the specified core
    void pinThread(std::thread &t, const unsigned core) const {
        cpu_set_t cpuset;
        CPU_ZERO(&cpuset);
        CPU_SET(core, &cpuset);
        if (pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset) != 0)
            std::cout << "Couldn't set thread affinity\n";
    }

    // Adds a tensor op to the map; operator[] default-constructs the vector
    // on first use, which replaces the original find/insert dance.
    void addTensorOp(const std::string &op_name, const TensorOp &top) {
        tensor_info_[op_name].push_back(top);
    }

    // Obtains a single power reading from the GPU and DDR rails
    void getPowerReading() {
        PowerReading reading;

        // The order matters here. All the reads have to happen together first
        // and then all the seeks have to happen together at the end, otherwise
        // there will be a significant time difference between the readings of
        // the different rails.
        reading.time_ = std::chrono::high_resolution_clock::now();
        gpu_stream_ >> reading.gpu_;
        ddr_stream_ >> reading.ddr_;
        power_readings_.push_back(reading);

        // Reset the input position of the files
        gpu_stream_.seekg(0);
        ddr_stream_.seekg(0);
    }

    // Executes the program to be profiled
    void runProgram(const char * const program) {
        // Tell the profiling thread to start, execute the program that needs
        // to be profiled, and then tell the profiling thread to stop.
        start_ = true;
        const int result = std::system(program);
        // FIX: the exit status was silently discarded; at least warn so a
        // failed run isn't mistaken for a valid measurement.
        if (result != 0)
            std::cout << "WARNING: profiled program returned non-zero status " << result << "\n";
        stop_ = true;
    }

    // Records power while the program is running
    void recordPower() {
        // Obtain the new start time, wait for the start signal, and keep
        // profiling until the stop flag is set.
        start_time_ = std::chrono::high_resolution_clock::now();
        while (!start_);
        while (!stop_)
            getPowerReading();
    }

    // Calculates stats for the entire execution (CPU+GPU phase)
    void updateTotalStats() {
        double gpu_energy = 0.0;
        double ddr_energy = 0.0;

        std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_;
        // const& — the original copied every PowerReading per iteration
        for (const auto &reading : power_readings_) {
            std::chrono::duration<double> duration = reading.time_ - prev_time;
            gpu_energy += reading.gpu_ * duration.count();
            ddr_energy += reading.ddr_ * duration.count();
            prev_time = reading.time_;
        }
        const double energy = gpu_energy + ddr_energy;
        const double time = std::chrono::duration<double>(prev_time - start_time_).count();

        total_info_.time_ += time;
        total_info_.energy_ += energy;
        total_info_.gpu_energy_ += gpu_energy;
        total_info_.ddr_energy_ += ddr_energy;

        // FIX: guard against division by zero when no readings were taken
        if (time > 0.0) {
            total_info_.power_ += (energy / time);
            total_info_.gpu_power_ += (gpu_energy / time);
            total_info_.ddr_power_ += (ddr_energy / time);
        }
    }

    // Calculates energy and power usage of the given tensor operation by
    // integrating the power samples that fall inside [start_, finish_].
    void calculateTensorEP(TensorOp &top) const {
        double prev_time = top.start_;
        const std::size_t num_readings = power_readings_.size();
        std::size_t i = 0;

        // Skip until we hit the start time of the operation.
        // FIX: the original indexed power_readings_[i] with no bounds check
        // here, and in the loop below checked the bounds only AFTER indexing —
        // both are out-of-bounds reads (UB) when the op extends past the last
        // sample. Bounds are now checked before every access.
        while (i < num_readings &&
               std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count() < top.start_)
            i++;

        // Keep going until we hit the finish time of the operation or we run
        // out of readings
        for (; i < num_readings; i++) {
            const double curr_time =
                std::chrono::duration<double>(power_readings_[i].time_.time_since_epoch()).count();
            if (curr_time > top.finish_)
                break;
            const double duration = curr_time - prev_time;
            prev_time = curr_time;

            top.gpu_energy_ += power_readings_[i].gpu_ * duration;
            top.ddr_energy_ += power_readings_[i].ddr_ * duration;
        }
        top.energy_ = top.gpu_energy_ + top.ddr_energy_;

        // FIX: guard against division by zero for zero-length ops
        if (top.time_ > 0.0) {
            top.power_ = top.energy_ / top.time_;
            top.gpu_power_ = top.gpu_energy_ / top.time_;
            top.ddr_power_ = top.ddr_energy_ / top.time_;
        }
    }

    // Calculates stats for all the tensors in the timestamp file
    void updatePerOpStats() {
        const char * const op_file = "profile_data.txt";
        std::string line;
        std::ifstream ifs(op_file, std::ios::in);

        // Calculate time and energy for each tensor operation. There are two
        // possibilities for the file format:
        //   If the line doesn't begin with #, we are looking at FP32 code
        //   without any conversions to/from FP16, and each operation occupies
        //   two consecutive lines in the timestamp file.
        //   If the line does begin with #, we are looking at FP16 code with
        //   conversion routines in the middle. In this case, *after* the
        //   current line, there will be two lines for F2H, two lines for H2F,
        //   and then one line for the end of the operation.
        while (std::getline(ifs, line)) {
            std::vector<std::string> tokens = splitTabs(line);
            // FIX: blank or malformed lines previously caused out-of-bounds
            // access on tokens[0]/tokens[1]
            if (tokens.size() < 2)
                continue;

            // FP32
            if (tokens[0][0] != '#') {
                // First line with tensor op name and start time
                // (FIX: the original redundantly re-declared op_name here,
                // shadowing the outer copy)
                const std::string op_name = tokens[0];
                const double start = std::stod(tokens[1]);

                // Second line with tensor op end time
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double finish = std::stod(tokens[1]);

                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);
                addTensorOp(op_name, top);
            } else {
                // First line with tensor op name (after '#') and start time
                const std::string op_name = tokens[0].substr(1);
                const double start = std::stod(tokens[1]);

                // Second line with f2h
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const std::string f2h_name = op_name + "_f2h";
                const double f2h_start = std::stod(tokens[1]);

                // Third line with f2h
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double f2h_finish = std::stod(tokens[1]);

                // Add f2h
                TensorOp f2h(f2h_name, f2h_start, f2h_finish);
                calculateTensorEP(f2h);
                addTensorOp(f2h_name, f2h);

                // Fourth line with h2f
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const std::string h2f_name = op_name + "_h2f";
                const double h2f_start = std::stod(tokens[1]);

                // Fifth line with h2f
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double h2f_finish = std::stod(tokens[1]);

                // Add h2f
                TensorOp h2f(h2f_name, h2f_start, h2f_finish);
                calculateTensorEP(h2f);
                addTensorOp(h2f_name, h2f);

                // Sixth and final line with tensor op end time
                std::getline(ifs, line);
                tokens = splitTabs(line);
                const double finish = std::stod(tokens[1]);

                // Subtract f2h's and h2f's time and energy to get just the
                // computation's info
                TensorOp top(op_name, start, finish);
                calculateTensorEP(top);

                top.time_ -= (f2h.time_ + h2f.time_);
                top.energy_ -= (f2h.energy_ + h2f.energy_);
                top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
                top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
                if (top.time_ > 0.0) {
                    top.power_ = top.energy_ / top.time_;
                    top.gpu_power_ = top.gpu_energy_ / top.time_;
                    top.ddr_power_ = top.ddr_energy_ / top.time_;
                }

                addTensorOp(op_name, top);
            }
        }
        ifs.close();
    }

    void updateStats() {
        updatePerOpStats();
        updateTotalStats();
    }

    // Calculates the average and standard deviation of each metric of each
    // tensor op
    void calculateAggregateStats() {
        for (auto it = tensor_info_.begin(); it != tensor_info_.end(); it++) {
            AggTensorInfo ati;
            ati.name_ = it->first;
            // const& — the original copied the whole per-op vector here
            const auto &topv = it->second;

            double total_time = 0.0;
            double total_energy = 0.0;
            double total_gpu_energy = 0.0;
            double total_ddr_energy = 0.0;
            double total_power = 0.0;
            double total_gpu_power = 0.0;
            double total_ddr_power = 0.0;

            double time_sum = 0.0;
            double energy_sum = 0.0;
            double gpu_energy_sum = 0.0;
            double ddr_energy_sum = 0.0;
            double power_sum = 0.0;
            double gpu_power_sum = 0.0;
            double ddr_power_sum = 0.0;

            // Calculate average
            for (const auto &top : topv) {
                total_time += top.time_;
                total_energy += top.energy_;
                total_gpu_energy += top.gpu_energy_;
                total_ddr_energy += top.ddr_energy_;
                total_power += top.power_;
                total_gpu_power += top.gpu_power_;
                total_ddr_power += top.ddr_power_;
            }

            ati.average_time_ = total_time / iterations_;
            ati.average_energy_ = total_energy / iterations_;
            ati.average_gpu_energy_ = total_gpu_energy / iterations_;
            ati.average_ddr_energy_ = total_ddr_energy / iterations_;
            ati.average_power_ = total_power / iterations_;
            ati.average_gpu_power_ = total_gpu_power / iterations_;
            ati.average_ddr_power_ = total_ddr_power / iterations_;

            // Calculate standard deviation
            for (const auto &top : topv) {
                const auto time_diff = top.time_ - ati.average_time_;
                time_sum += time_diff * time_diff;

                const auto energy_diff = top.energy_ - ati.average_energy_;
                energy_sum += energy_diff * energy_diff;
                const auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
                gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
                const auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
                ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;

                const auto power_diff = top.power_ - ati.average_power_;
                power_sum += power_diff * power_diff;
                const auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
                gpu_power_sum += gpu_power_diff * gpu_power_diff;
                const auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
                ddr_power_sum += ddr_power_diff * ddr_power_diff;
            }

            ati.time_std_ = std::sqrt(time_sum / iterations_);
            ati.energy_std_ = std::sqrt(energy_sum / iterations_);
            ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
            ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
            ati.power_std_ = std::sqrt(power_sum / iterations_);
            ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
            ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);

            agg_tensor_info_.push_back(ati);
        }
    }

public:
    Profiler() {
        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
        soc_stream_.open(soc_power_rail, std::ifstream::in);
        sys_stream_.open(sys_power_rail, std::ifstream::in);

        if (!cpu_stream_.is_open() || !gpu_stream_.is_open() || !ddr_stream_.is_open()
                || !soc_stream_.is_open() || !sys_stream_.is_open()) {
            std::cout << "Failed to open one of the power rails for reading\n";
            exit(1);
        }
    }

    ~Profiler() {
        cpu_stream_.close();
        gpu_stream_.close();
        ddr_stream_.close();
        soc_stream_.close();
        sys_stream_.close();
    }

    // Runs `program` `iterations` times, sampling power in a second pinned
    // thread, and accumulates per-op and total statistics.
    void profile(const char * const program, const int iterations) {
        iterations_ = iterations;
        resetGlobal();

        for (unsigned i = 0; i < iterations_; i++) {
            resetLocal();

            // Launch two threads: one for running the program and one for
            // profiling it. Pin the threads to specific cores to remove
            // migration overhead. Profiling showed that the sampling rate
            // increases slightly with pinning.
            std::thread prog(&Profiler::runProgram, this, program);
            std::thread power(&Profiler::recordPower, this);
            pinThread(prog, core1);
            pinThread(power, core2);
            prog.join();
            power.join();

            updateStats();

            // Sleep for some time to bring the GPU back to idle
            std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
        }

        calculateAggregateStats();
    }

    // Writes per-op averages as CSV. Most columns are intentionally disabled
    // (commented out) but kept for easy re-enabling.
    void dumpTensorInfo(const char * const filename) const {
        const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
        std::ofstream ofs;
        ofs.open(filename);
        //ofs << header;
        for (const auto &ati : agg_tensor_info_) {
            ofs << ati.name_
                << "," << ati.average_time_ * 1e3
                << "," << ati.average_energy_
                /*
                << "," << ati.average_gpu_energy_
                << "," << ati.average_ddr_energy_
                << "," << ati.average_power_
                << "," << ati.average_gpu_power_
                << "," << ati.average_ddr_power_
                << "," << ati.time_std_ * 1e3
                << "," << ati.energy_std_
                << "," << ati.gpu_energy_std_
                << "," << ati.ddr_energy_std_
                << "," << ati.power_std_
                << "," << ati.gpu_power_std_
                << "," << ati.ddr_power_std_*/
                << "\n";

            std::cout << ati.average_time_ * 1e3 << "," << ati.average_energy_ << "\n";
        }
        ofs.close();
    }

    // Writes the raw per-sample GPU/DDR power readings (t=0 at run start).
    void dumpPowerReadings(const char * const filename) const {
        std::ofstream ofs;
        ofs.open(filename);
        for (const auto &reading : power_readings_) {
            std::chrono::duration<double> duration = reading.time_ - start_time_;
            //std::chrono::duration<double> duration = reading.time_.time_since_epoch();
            ofs << std::to_string(duration.count())
                << " " << reading.gpu_
                << " " << reading.ddr_
                << "\n";
        }
        ofs.close();
    }

    // Prints per-iteration averages of total time, energy, and power.
    void dumpTotalInfo() const {
        const auto total_time = total_info_.time_ / iterations_;

        const auto total_energy = total_info_.energy_ / iterations_;
        const auto gpu_energy = total_info_.gpu_energy_ / iterations_;
        const auto ddr_energy = total_info_.ddr_energy_ / iterations_;

        const auto power = total_info_.power_ / iterations_;
        const auto gpu_power = total_info_.gpu_power_ / iterations_;
        const auto ddr_power = total_info_.ddr_power_ / iterations_;

        std::cout << "-----------------------------------------------------\n";
        std::cout << "Program info (average)\n";
        std::cout << "-----------------------------------------------------\n";
        std::cout << "\tExecution time: " << total_time << " seconds\n";
        std::cout << "\tTotal energy: " << total_energy << " mJ\n";
        std::cout << "\t    GPU: " << gpu_energy << " mJ\n";
        std::cout << "\t    DDR: " << ddr_energy << " mJ\n";
        std::cout << "\tPower: " << power << " mW\n";
        std::cout << "\t    GPU: " << gpu_power << " mW\n";
        std::cout << "\t    DDR: " << ddr_power << " mW\n";
        std::cout << "-----------------------------------------------------\n";
    }
};

int main(int argc, char *argv[]) {
    if (argc < NUM_ARGS) {
        std::cout << "Usage: " << argv[0] << " <program> <iterations> <tensor output file> [power output file]\n";
        exit(1);
    }

    Profiler pp;
    pp.profile(argv[1], std::stoi(argv[2]));
    pp.dumpTensorInfo(argv[3]);

    if (argc > NUM_ARGS)
        pp.dumpPowerReadings(argv[4]);

    return 0;
}
should_stop_profiler_(false) { + // Open all streams. Not done in start_profiler() function bc the streams + // should be strictly opened once + cpu_stream_.open(cpu_power_rail, std::ifstream::in); + gpu_stream_.open(gpu_power_rail, std::ifstream::in); + ddr_stream_.open(ddr_power_rail, std::ifstream::in); + soc_stream_.open(soc_power_rail, std::ifstream::in); + sys_stream_.open(sys_power_rail, std::ifstream::in); + + if (!cpu_stream_.is_open() || !gpu_stream_.is_open() || !ddr_stream_.is_open() + || !soc_stream_.is_open() || !sys_stream_.is_open()) { + std::cout << "Failed to open one of the power rails for reading\n"; + exit(1); + } +} + +Profiler::~Profiler() { + cpu_stream_.close(); + gpu_stream_.close(); + ddr_stream_.close(); + soc_stream_.close(); + sys_stream_.close(); +} + +// Reinitializes boolean vars used for control flow and launches the profiler +// thread. DOES NOT reset other internal data structures. +void Profiler::start_profiler(){ + // Reinitialize in case the profiler object has been used before + should_run_profiler_ = false; + should_stop_profiler_ = false; + + // Launch profiler thread + profiler_thread_ = std::thread(&Profiler::run_profiler, this); + pin_thread(profiler_thread_, core1); +} + +// Resumes the profiling of whatever executable's currently running +// DOES NOT reset any data +void Profiler::resume_profiler() { + { + std::unique_lock<std::mutex> mutex_lock(mutex_); + if (should_run_profiler_){ + std::cout << "WARNING: resume_profiler was already called\n"; + } + should_run_profiler_ = true; + start_time_ = std::chrono::high_resolution_clock::now(); + } + cond_var_.notify_one(); +} + +// Stops profiler by putting profiler thread to sleep +void Profiler::pause_profiler() { + { + std::unique_lock<std::mutex> mutex_lock(mutex_); + if (!should_run_profiler_){ + std::cout << "WARNING: pause_profiler was already called\n"; + } + should_run_profiler_ = false; + } + cond_var_.notify_one(); +} + +// Gets the delta time and total GPU and 
DDR energy between the last two +// calls to resume_profiler and pause_profiler +// +// Returns this as a pair of <delta time in milliseconds, energy> +std::pair<double, double> Profiler::get_time_energy() const { + double total_energy = 0.0; + + std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_; + for (auto reading : power_readings_) { + std::chrono::duration<double> duration = reading.time_ - prev_time; + total_energy += reading.gpu_ * duration.count(); + total_energy += reading.ddr_ * duration.count(); + prev_time = reading.time_; + } + double delta_time = std::chrono::duration<double, std::milli>(prev_time + - start_time_).count(); + return std::make_pair(delta_time, total_energy); +} + +// Resets all internal data structures, including the vector storing all power_readings. +void Profiler::reset() { + should_stop_profiler_ = false; // Can call reset after calling pause_profiler() + should_run_profiler_ = false; // Can call reset after calling resume + power_readings_.clear(); +} + +// Exit the profiler and kill the thread +// Must call start_profiler() to reuse this object after calling pause_profiler() +void Profiler::stop_profiler() { + std::cout << "Exiting profiler\n"; + should_stop_profiler_ = true; + cond_var_.notify_one(); + profiler_thread_.join(); +} + +// Obtain's a single power reading from the GPU and DDR rails +void Profiler::obtain_power_reading() { + PowerReading reading; + + // The order matters here. All the reads have to happen together first + // and then all the seeks have to happen together at the end, otherwise + // there will be a significant time difference between the readings of + // the different rails. 
+ reading.time_ = std::chrono::high_resolution_clock::now(); + gpu_stream_ >> reading.gpu_; + ddr_stream_ >> reading.ddr_; + power_readings_.push_back(reading); + + // Reset the input position of the files + gpu_stream_.seekg(0); + ddr_stream_.seekg(0); +} + +// Pins the given thread to the specified core +void Profiler::pin_thread(std::thread &t, const unsigned core) const { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core, &cpuset); + if (pthread_setaffinity_np(t.native_handle(), sizeof(cpu_set_t), &cpuset) != 0) + std::cout << "Couldn't set thread affinity\n"; +} + +// Runs the profiler thread, keeping it alive by wrapping the functionality +// in an infinite loop +void Profiler::run_profiler(){ + while (true){ + if (should_stop_profiler_) { + break; + } + // Need to lock the mutex and check the condition var + { + std::unique_lock<std::mutex> mutex_lock(mutex_); + if (should_stop_profiler_) { + break; + } + // Wake the thread up when it's time to run the profiler or exit + // the profiler + cond_var_.wait(mutex_lock, [this]{return should_run_profiler_ + || should_stop_profiler_; }); + } + if (should_stop_profiler_) { + break; + } + obtain_power_reading(); + } +} + +/* +// TESTS +void resume_pause_profiler(Profiler& profile_wrapper, unsigned long sleep_millis){ + profile_wrapper.resume_profiler(); + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis)); + profile_wrapper.pause_profiler(); + + auto time_energy_pair = profile_wrapper.get_time_energy(); + profile_wrapper.reset(); + + printf("time: %f, energy: %f\n", time_energy_pair.first, time_energy_pair.second); + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millis)); +} + +int main(){ + Profiler profile_wrapper; + profile_wrapper.start_profiler(); + + unsigned long sleep_millis = 500; + resume_pause_profiler(profile_wrapper, sleep_millis); + resume_pause_profiler(profile_wrapper, sleep_millis); + resume_pause_profiler(profile_wrapper, sleep_millis); + 
resume_pause_profiler(profile_wrapper, sleep_millis); + + // IMPORTANT + profile_wrapper.stop_profiler(); + return 0; +} +*/