diff --git a/llvm/projects/gpu_profiler/.gitignore b/llvm/projects/gpu_profiler/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dd2c293453382269c150c372d926f287a74edea5
--- /dev/null
+++ b/llvm/projects/gpu_profiler/.gitignore
@@ -0,0 +1,3 @@
+*.swp
+jetsonTX2Power
+pp
diff --git a/llvm/projects/gpu_profiler/Makefile b/llvm/projects/gpu_profiler/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..412d38265ab5c9408d4ac444ded9d6bd8b72f1b7
--- /dev/null
+++ b/llvm/projects/gpu_profiler/Makefile
@@ -0,0 +1,5 @@
+# Build the standalone power profiler (pp). Declaring the targets .PHONY
+# keeps the build working even if files named "all" or "clean" ever exist.
+.PHONY: all clean
+
+all:
+	g++ -std=c++11 -O3 -Wall -Wextra profiler.cpp -o pp -lpthread
+
+clean:
+	rm -rf pp
diff --git a/llvm/projects/gpu_profiler/plot.sh b/llvm/projects/gpu_profiler/plot.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8e4573b10c2fab993b4998d2040d10b0f7e9f9c5
--- /dev/null
+++ b/llvm/projects/gpu_profiler/plot.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+input=$1
+gnuplot -p << EOF
+    #set terminal png
+    #set output "$input.png"
+    set xlabel "Time (s)"
+    set ylabel "Power (mW)"
+    set title "Power usage of GPU and DDR over time"
+    plot "$input" using 1:2 title 'GPU' with lines,"$input" using 1:3 title 'DDR' with lines
+EOF
diff --git a/llvm/projects/gpu_profiler/profiler.cpp b/llvm/projects/gpu_profiler/profiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f49993565ad651ed41f8ca443959b77ff803af2
--- /dev/null
+++ b/llvm/projects/gpu_profiler/profiler.cpp
@@ -0,0 +1,583 @@
+#include <cmath>
+#include <chrono>
+#include <cstdlib>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <boost/algorithm/string.hpp>
+
+#include <vector>
+#include <map>
+
+#include <thread>
+#include <atomic>
+#include <pthread.h>
+#include <sched.h>
+
+#define NUM_ARGS 4
+
+// This is a simple power profiler that can sample the power of the various
+// components in a Jetson TX2. The usage is simple: profile() measures power
+// for the specified program, and then dumpOutput() prints the readings to a
+// file. profile() can be called as many times as desired - the internal state
+// is reset each time and thus the measurements are not cumulative.
+class Profiler {
+private:
+    // Jetson's ARM cores' physical IDs. The two Denver cores are 1 and 2, and
+    // we can't use them.
+    const unsigned core0 = 0;
+    const unsigned core1 = 3;
+    const unsigned core2 = 4;
+    const unsigned core3 = 5;
+
+    // sysfs paths for i2c buses of various components
+    const char * const cpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power1_input";
+    const char * const gpu_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power0_input";
+    const char * const ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
+    const char * const soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
+    const char * const sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";
+
+    // It takes some time for the GPU's power to return to idle (ms)
+    const unsigned gpu_idle_time = 0;
+
+    // An individual power reading: one timestamped sample per power rail.
+    // NOTE(review): the sampling loop currently fills in only gpu_ and
+    // ddr_ (see getPowerReading); the other rails stay at their
+    // initialized value.
+    struct PowerReading {
+        std::chrono::time_point<std::chrono::high_resolution_clock> time_;
+        double cpu_;
+        double gpu_;
+        double ddr_;
+        double soc_;
+        double sys_;
+    };
+
+    // Individual tensor op: one timed occurrence of a named operation.
+    // Times are in seconds; energies in mJ and powers in mW (the sysfs
+    // rails report milliwatts). Energy/power fields start at zero and are
+    // filled in later by calculateTensorEP.
+    struct TensorOp {
+        std::string name_;
+
+        double start_;
+        double finish_;
+        double time_;
+
+        double energy_;
+        double gpu_energy_;
+        double ddr_energy_;
+
+        double power_;
+        double gpu_power_;
+        double ddr_power_;
+
+        // time_ is derived from the two timestamps; all measured
+        // quantities begin at 0 until the power trace is integrated.
+        TensorOp(std::string name, double start, double finish)
+            : name_(name), start_(start), finish_(finish), time_(finish - start),
+            energy_(0.0), gpu_energy_(0.0), ddr_energy_(0.0),
+            power_(0.0), gpu_power_(0.0), ddr_power_(0.0) {
+        }
+    };
+
+    // Aggregate tensor info: mean and standard deviation of every metric
+    // for one op name, computed across all profiled iterations
+    // (see calculateAggregateStats).
+    struct AggTensorInfo {
+        // Op name
+        std::string name_;
+
+        // Averages
+        double average_time_;
+
+        double average_energy_;
+        double average_gpu_energy_;
+        double average_ddr_energy_;
+
+        // Standard deviations
+        double average_power_;
+        double average_gpu_power_;
+        double average_ddr_power_;
+
+        // Standard deviations
+        double time_std_;
+
+        double energy_std_;
+        double gpu_energy_std_;
+        double ddr_energy_std_;
+
+        double power_std_;
+        double gpu_power_std_;
+        double ddr_power_std_;
+    };
+
+    // Total time, energy, and power accumulated across all profiled runs.
+    // Members carry in-class initializers so the struct is zeroed from
+    // construction, not only after the first clear().
+    struct TotalInfo {
+        double time_ = 0.0;
+
+        double energy_ = 0.0;
+        double gpu_energy_ = 0.0;
+        double ddr_energy_ = 0.0;
+
+        double power_ = 0.0;
+        double gpu_power_ = 0.0;
+        double ddr_power_ = 0.0;
+
+        // Reset every accumulator ahead of a fresh profiling session
+        void clear() {
+            time_ = 0.0;
+
+            energy_ = 0.0;
+            gpu_energy_ = 0.0;
+            ddr_energy_ = 0.0;
+
+            power_ = 0.0;
+            gpu_power_ = 0.0;
+            ddr_power_ = 0.0;
+        }
+    };
+
+    // For reading the i2c buses via sysfs
+    std::ifstream cpu_stream_;
+    std::ifstream gpu_stream_;
+    std::ifstream ddr_stream_;
+    std::ifstream soc_stream_;
+    std::ifstream sys_stream_;
+
+    // Start time (so graph begins from t=0)
+    std::chrono::time_point<std::chrono::high_resolution_clock> start_time_;
+
+    // Per-run info
+    std::vector<PowerReading> power_readings_;
+
+    // Aggregate (across all runs) info
+    std::map<std::string, std::vector<TensorOp>> tensor_info_;
+    std::vector<AggTensorInfo> agg_tensor_info_;
+    TotalInfo total_info_;
+    unsigned iterations_;
+
+    // Start and stop flags to synchronize the program and profiling threads
+    std::atomic_bool start_;
+    std::atomic_bool stop_;
+
+private:
+    // Drops all cross-run aggregates (per-op samples, aggregated stats,
+    // running totals) ahead of a fresh profiling session.
+    void resetGlobal() {
+        total_info_.clear();
+        agg_tensor_info_.clear();
+        tensor_info_.clear();
+    }
+
+    // Re-arms the per-run state: lowers both thread-synchronization flags
+    // and discards the previous run's power samples.
+    void resetLocal() {
+        start_ = false;
+        stop_ = false;
+        power_readings_.clear();
+    }
+
+    // Restricts the given thread to a single physical core so it cannot
+    // migrate while we are sampling (migration perturbs the sample rate).
+    void pinThread(std::thread &t, const unsigned core) const {
+        cpu_set_t mask;
+        CPU_ZERO(&mask);
+        CPU_SET(core, &mask);
+        const int rc = pthread_setaffinity_np(t.native_handle(), sizeof(mask), &mask);
+        if (rc != 0)
+            std::cout << "Couldn't set thread affinity\n";
+    }
+
+    // Records one measured instance of the given tensor op, keyed by name.
+    // std::map::operator[] default-constructs the vector on first use, so
+    // the explicit find/insert sequence is unnecessary.
+    void addTensorOp(std::string &op_name, TensorOp &top) {
+        tensor_info_[op_name].push_back(top);
+    }
+
+    // Obtains a single power reading from the GPU and DDR rails
+    void getPowerReading() {
+        PowerReading reading{};  // value-init: rails we don't read stay 0
+
+        // The order matters here. All the reads have to happen together first
+        // and then all the seeks have to happen together at the end, otherwise
+        // there will be a significant time difference between the readings of
+        // the different rails.
+        reading.time_ = std::chrono::high_resolution_clock::now();
+        gpu_stream_ >> reading.gpu_;
+        ddr_stream_ >> reading.ddr_;
+        power_readings_.push_back(reading);
+
+        // Rewind the sysfs streams for the next sample. clear() first:
+        // a transient read failure sets fail/eof bits, and without
+        // clearing them seekg() and every later extraction would fail
+        // for the rest of the run.
+        gpu_stream_.clear();
+        ddr_stream_.clear();
+        gpu_stream_.seekg(0);
+        ddr_stream_.seekg(0);
+    }
+
+    // Executes the program to be profiled
+    void runProgram(const char * const program) {
+        // Tell the profiling thread to start, execute the program that needs
+        // to be profiled, and then tell the profiling thread to stop.
+        start_ = true;
+        const int result = std::system(program);
+        stop_ = true;
+        // Surface failures instead of silently discarding the status:
+        // a crashed benchmark would otherwise yield bogus measurements.
+        if (result != 0)
+            std::cout << "Profiled program exited with non-zero status " << result << "\n";
+    }
+
+    // Records power while the program is running
+    void recordPower() {
+        // Wait for the start signal first and only then latch the start
+        // time (so the graph begins from t=0). Taking the timestamp
+        // before the wait, as done previously, inflated the first
+        // sample's interval - and hence its integrated energy - by
+        // however long the program thread took to launch.
+        while (!start_);
+        start_time_ = std::chrono::high_resolution_clock::now();
+        while (!stop_)
+            getPowerReading();
+    }
+
+    // Calculates stats for the entire execution (CPU+GPU phase).
+    // Energy is a left Riemann sum of the sampled power over time;
+    // power is the run's average (energy / elapsed time).
+    void updateTotalStats() {
+        double gpu_energy = 0.0;
+        double ddr_energy = 0.0;
+
+        auto prev_time = start_time_;
+        for (const auto &reading : power_readings_) {
+            const std::chrono::duration<double> duration = reading.time_ - prev_time;
+            gpu_energy += reading.gpu_ * duration.count();
+            ddr_energy += reading.ddr_ * duration.count();
+            prev_time = reading.time_;
+        }
+        const double energy = gpu_energy + ddr_energy;
+        const double time = std::chrono::duration<double>(prev_time - start_time_).count();
+
+        total_info_.time_ += time;
+        total_info_.energy_ += energy;
+        total_info_.gpu_energy_ += gpu_energy;
+        total_info_.ddr_energy_ += ddr_energy;
+
+        // Guard the division: a run that produced no samples has
+        // time == 0 and would otherwise poison the totals with NaN.
+        if (time > 0.0) {
+            total_info_.power_ += (energy / time);
+            total_info_.gpu_power_ += (gpu_energy / time);
+            total_info_.ddr_power_ += (ddr_energy / time);
+        }
+    }
+
+    // Calculates energy and power usage of the given tensor operation by
+    // integrating the power samples that fall inside [start_, finish_].
+    // Assumes the timestamp file and power_readings_ use the same clock
+    // epoch (both come from high_resolution_clock) - TODO confirm against
+    // the runtime that writes profile_data.txt.
+    void calculateTensorEP(TensorOp &top) const {
+        const auto stamp = [](const PowerReading &r) {
+            return std::chrono::duration<double>(r.time_.time_since_epoch()).count();
+        };
+
+        double prev_time = top.start_;
+        std::size_t i = 0;
+
+        // Skip readings taken before the op started. Bounds-checked: the
+        // op may lie entirely outside the sampled window, and the old
+        // unbounded scan ran off the end of the vector.
+        while (i < power_readings_.size() && stamp(power_readings_[i]) < top.start_)
+            i++;
+
+        // Accumulate until we pass the op's finish time or run out of
+        // readings. The bounds check must precede the element access
+        // (previously it was evaluated after the dereference).
+        for (; i < power_readings_.size(); i++) {
+            const double curr_time = stamp(power_readings_[i]);
+            if (curr_time > top.finish_)
+                break;
+            const double duration = curr_time - prev_time;
+            prev_time = curr_time;
+
+            top.gpu_energy_ += power_readings_[i].gpu_ * duration;
+            top.ddr_energy_ += power_readings_[i].ddr_ * duration;
+        }
+        top.energy_ = top.gpu_energy_ + top.ddr_energy_;
+
+        // Average power over the op's wall-clock time; leave 0 for a
+        // degenerate (zero-length) op instead of dividing by zero.
+        if (top.time_ > 0.0) {
+            top.power_ = top.energy_ / top.time_;
+            top.gpu_power_ = top.gpu_energy_ / top.time_;
+            top.ddr_power_ = top.ddr_energy_ / top.time_;
+        }
+    }
+
+    // Calculates stats for all the tensors in the timestamp file.
+    //
+    // There are two possibilities for the file format:
+    // If a line doesn't begin with #, we are looking at FP32 code
+    // without any conversions to/from FP16, and each operation occupies
+    // two consecutive lines in the timestamp file (name+start, finish).
+    // If the line does begin with #, we are looking at FP16 code with
+    // conversion routines in the middle. In this case, *after* the current
+    // line, there will be two lines for F2H, two lines for H2F, and then
+    // one line for the end of the operation.
+    // Assumes the file is well-formed as written by the runtime - TODO
+    // confirm; a truncated file makes std::stod throw.
+    void updatePerOpStats() {
+        const char * const op_file = "profile_data.txt";
+        std::ifstream ifs(op_file, std::ios::in);
+        if (!ifs.is_open()) {
+            std::cout << "Couldn't open " << op_file << " for reading\n";
+            return;
+        }
+
+        // Splits one timestamp line into its tab-separated tokens
+        const auto tokenize = [](const std::string &l) {
+            std::vector<std::string> tokens;
+            boost::split(tokens, l, boost::is_any_of("\t"));
+            return tokens;
+        };
+        // Reads the next line and returns its timestamp (second column)
+        const auto nextStamp = [&ifs, &tokenize]() {
+            std::string l;
+            std::getline(ifs, l);
+            return std::stod(tokenize(l)[1]);
+        };
+
+        std::string line;
+        while (std::getline(ifs, line)) {
+            // Blank lines would otherwise index past tokens[0]/tokens[1]
+            if (line.empty())
+                continue;
+            const auto tokens = tokenize(line);
+
+            if (tokens[0][0] != '#') {
+                // FP32: name+start on this line, finish on the next
+                std::string op_name = tokens[0];
+                const auto start = std::stod(tokens[1]);
+                const auto finish = nextStamp();
+
+                TensorOp top(op_name, start, finish);
+                calculateTensorEP(top);
+                addTensorOp(op_name, top);
+            } else {
+                // FP16: op start here, then f2h start/finish, h2f
+                // start/finish, and finally the op's own finish time
+                std::string op_name = tokens[0].substr(1);
+                const auto start = std::stod(tokens[1]);
+
+                std::string f2h_name = op_name + "_f2h";
+                const auto f2h_start = nextStamp();
+                const auto f2h_finish = nextStamp();
+                TensorOp f2h(f2h_name, f2h_start, f2h_finish);
+                calculateTensorEP(f2h);
+                addTensorOp(f2h_name, f2h);
+
+                std::string h2f_name = op_name + "_h2f";
+                const auto h2f_start = nextStamp();
+                const auto h2f_finish = nextStamp();
+                TensorOp h2f(h2f_name, h2f_start, h2f_finish);
+                calculateTensorEP(h2f);
+                addTensorOp(h2f_name, h2f);
+
+                const auto finish = nextStamp();
+
+                // Subtract f2h's and h2f's time and energy to get just
+                // the computation's info
+                TensorOp top(op_name, start, finish);
+                calculateTensorEP(top);
+
+                top.time_ -= (f2h.time_ + h2f.time_);
+                top.energy_ -= (f2h.energy_ + h2f.energy_);
+                top.gpu_energy_ -= (f2h.gpu_energy_ + h2f.gpu_energy_);
+                top.ddr_energy_ -= (f2h.ddr_energy_ + h2f.ddr_energy_);
+                top.power_ = top.energy_ / top.time_;
+                top.gpu_power_ = top.gpu_energy_ / top.time_;
+                top.ddr_power_ = top.ddr_energy_ / top.time_;
+
+                addTensorOp(op_name, top);
+            }
+        }
+        ifs.close();
+    }
+
+    // Post-run bookkeeping: fold this run's power trace into the per-op
+    // table (from profile_data.txt) and into the whole-program totals.
+    void updateStats() {
+        updatePerOpStats();
+        updateTotalStats();
+    }
+
+    // Calculates the average and standard deviation of each metric of each
+    // tensor op, across all profiled iterations.
+    // NOTE(review): every divisor is iterations_, which assumes each op
+    // name was recorded exactly once per iteration - confirm against the
+    // timestamp file if ops can repeat within a run.
+    void calculateAggregateStats() {
+        for (auto it = tensor_info_.begin(); it != tensor_info_.end(); it++) {
+            AggTensorInfo ati;
+            ati.name_ = it->first;
+            auto topv = it->second;
+
+            // Sums for the means
+            double total_time = 0.0;
+            double total_energy = 0.0;
+            double total_gpu_energy = 0.0;
+            double total_ddr_energy = 0.0;
+            double total_power = 0.0;
+            double total_gpu_power = 0.0;
+            double total_ddr_power = 0.0;
+
+            // Sums of squared deviations for the standard deviations
+            double time_sum = 0.0;
+            double energy_sum = 0.0;
+            double gpu_energy_sum = 0.0;
+            double ddr_energy_sum = 0.0;
+            double power_sum = 0.0;
+            double gpu_power_sum = 0.0;
+            double ddr_power_sum = 0.0;
+
+            // Calculate average
+            for (const auto &top : topv) {
+                total_time += top.time_;
+                total_energy += top.energy_;
+                total_gpu_energy += top.gpu_energy_;
+                total_ddr_energy += top.ddr_energy_;
+                total_power += top.power_;
+                total_gpu_power += top.gpu_power_;
+                total_ddr_power += top.ddr_power_;
+            }
+
+            ati.average_time_ = total_time / iterations_;
+            ati.average_energy_ = total_energy / iterations_;
+            ati.average_gpu_energy_ = total_gpu_energy / iterations_;
+            ati.average_ddr_energy_ = total_ddr_energy / iterations_;
+            ati.average_power_ = total_power / iterations_;
+            ati.average_gpu_power_ = total_gpu_power / iterations_;
+            ati.average_ddr_power_ = total_ddr_power / iterations_;
+
+            // Calculate standard deviation (population form: divide by N)
+            for (const auto &top : topv) {
+                auto time_diff = top.time_ - ati.average_time_;
+                time_sum += time_diff * time_diff;
+
+                auto energy_diff = top.energy_ - ati.average_energy_;
+                energy_sum += energy_diff * energy_diff;
+                auto gpu_energy_diff = top.gpu_energy_ - ati.average_gpu_energy_;
+                gpu_energy_sum += gpu_energy_diff * gpu_energy_diff;
+                auto ddr_energy_diff = top.ddr_energy_ - ati.average_ddr_energy_;
+                ddr_energy_sum += ddr_energy_diff * ddr_energy_diff;
+
+                auto power_diff = top.power_ - ati.average_power_;
+                power_sum += power_diff * power_diff;
+                auto gpu_power_diff = top.gpu_power_ - ati.average_gpu_power_;
+                gpu_power_sum += gpu_power_diff * gpu_power_diff;
+                auto ddr_power_diff = top.ddr_power_ - ati.average_ddr_power_;
+                ddr_power_sum += ddr_power_diff * ddr_power_diff;
+            }
+
+            ati.time_std_ = std::sqrt(time_sum / iterations_);
+            ati.energy_std_ = std::sqrt(energy_sum / iterations_);
+            ati.gpu_energy_std_ = std::sqrt(gpu_energy_sum / iterations_);
+            ati.ddr_energy_std_ = std::sqrt(ddr_energy_sum / iterations_);
+            ati.power_std_ = std::sqrt(power_sum / iterations_);
+            ati.gpu_power_std_ = std::sqrt(gpu_power_sum / iterations_);
+            ati.ddr_power_std_ = std::sqrt(ddr_power_sum / iterations_);
+
+            agg_tensor_info_.push_back(ati);
+        }
+    }
+
+public:
+    // Opens the five power-rail sysfs files; exits if any is unavailable
+    // (e.g. when not running on a Jetson TX2).
+    Profiler() {
+        cpu_stream_.open(cpu_power_rail, std::ifstream::in);
+        gpu_stream_.open(gpu_power_rail, std::ifstream::in);
+        ddr_stream_.open(ddr_power_rail, std::ifstream::in);
+        soc_stream_.open(soc_power_rail, std::ifstream::in);
+        sys_stream_.open(sys_power_rail, std::ifstream::in);
+
+        const bool all_open = cpu_stream_.is_open() && gpu_stream_.is_open()
+            && ddr_stream_.is_open() && soc_stream_.is_open()
+            && sys_stream_.is_open();
+        if (!all_open) {
+            std::cout << "Failed to open one of the power rails for reading\n";
+            exit(1);
+        }
+    }
+
+    // Closes the sysfs rail streams (the ifstream destructors would do
+    // this anyway; kept explicit for symmetry with the constructor).
+    ~Profiler() {
+        cpu_stream_.close();
+        gpu_stream_.close();
+        ddr_stream_.close();
+        soc_stream_.close();
+        sys_stream_.close();
+    }
+
+    // Profiles `program` end-to-end `iterations` times: each run spawns a
+    // program thread and a sampling thread, then folds the run's
+    // measurements into the aggregate state. Resets all aggregates first,
+    // so repeated calls are independent (not cumulative).
+    void profile(const char * const program, const int iterations) {
+        iterations_ = iterations;
+        resetGlobal();
+
+        for (unsigned i = 0; i < iterations_; i++) {
+            resetLocal();
+
+            // Launch two threads: one for running the program and one for
+            // profiling it. Pin the threads to specific cores to remove migration
+            // overhead. Profiling showed that the sampling rate increases slightly
+            // with pinning.
+            // NOTE(review): pinning happens after the threads start, so
+            // their first instructions may execute on another core -
+            // acceptable here since the sampler spins on start_ first.
+            std::thread prog(&Profiler::runProgram, this, program);
+            std::thread power(&Profiler::recordPower, this);
+            pinThread(prog, core1);
+            pinThread(power, core2);
+            prog.join();
+            power.join();
+
+            updateStats();
+
+            // Sleep for some time to bring the GPU back to idle
+            // (gpu_idle_time is currently 0, i.e. effectively disabled)
+            std::this_thread::sleep_for(std::chrono::milliseconds(gpu_idle_time));
+        }
+
+        calculateAggregateStats();
+    }
+
+    // Writes one CSV row per op with the aggregated averages and standard
+    // deviations (times converted s -> ms; energies already mJ, powers mW).
+    // Also echoes time,energy to stdout for quick inspection.
+    void dumpTensorInfo(const char * const filename) const {
+        const std::string header = "Op,Time (ms),Energy (mJ),GPU Energy (mJ),DDR Energy (mJ),Power (mW),GPU Power (mW),DDR Power (mW),Time std,Energy std,GPU Energy std,DDR Energy std,Power std,GPU Power std,DDR Power std\n";
+        std::ofstream ofs;
+        ofs.open(filename);
+        // Header row intentionally suppressed (downstream tooling expects
+        // data-only CSV); re-enable by uncommenting:
+        //ofs << header;
+        for (const auto &ati : agg_tensor_info_) {
+            ofs << ati.name_
+                << "," << ati.average_time_ * 1e3
+                << "," << ati.average_energy_
+                << "," << ati.average_gpu_energy_
+                << "," << ati.average_ddr_energy_
+                << "," << ati.average_power_
+                << "," << ati.average_gpu_power_
+                << "," << ati.average_ddr_power_
+                << "," << ati.time_std_ * 1e3
+                << "," << ati.energy_std_
+                << "," << ati.gpu_energy_std_
+                << "," << ati.ddr_energy_std_
+                << "," << ati.power_std_
+                << "," << ati.gpu_power_std_
+                << "," << ati.ddr_power_std_
+                << "\n";
+
+            std::cout << ati.average_time_ * 1e3 << "," << ati.average_energy_ << "\n";
+        }
+        ofs.close();
+    }
+
+    // Writes "elapsed_s gpu_mW ddr_mW" rows for the most recent run,
+    // with time relative to the run's start - the format plot.sh expects.
+    void dumpPowerReadings(const char * const filename) const {
+        std::ofstream ofs;
+        ofs.open(filename);
+        for (const auto &reading : power_readings_) {
+            const std::chrono::duration<double> elapsed = reading.time_ - start_time_;
+            ofs << std::to_string(elapsed.count())
+                << " " << reading.gpu_
+                << " " << reading.ddr_
+                << "\n";
+        }
+        ofs.close();
+    }
+
+    // Prints per-iteration averages of whole-program time, energy, and
+    // power to stdout (each accumulator is a sum over iterations_ runs).
+    void dumpTotalInfo() const {
+        auto total_time = total_info_.time_ / iterations_;
+
+        auto total_energy = total_info_.energy_ / iterations_;
+        auto gpu_energy = total_info_.gpu_energy_ / iterations_;
+        auto ddr_energy = total_info_.ddr_energy_ / iterations_;
+
+        auto power = total_info_.power_ / iterations_;
+        auto gpu_power = total_info_.gpu_power_ / iterations_;
+        auto ddr_power = total_info_.ddr_power_ / iterations_;
+
+        std::cout << "-----------------------------------------------------\n";
+        std::cout << "Program info (average)\n";
+        std::cout << "-----------------------------------------------------\n";
+        std::cout << "\tExecution time: " << total_time << " seconds\n";
+        std::cout << "\tTotal energy:   " << total_energy << " mJ\n";
+        std::cout << "\t    GPU:        " << gpu_energy << " mJ\n";
+        std::cout << "\t    DDR:        " << ddr_energy << " mJ\n";
+        std::cout << "\tPower:          " << power << " mW\n";
+        std::cout << "\t    GPU:        " << gpu_power << " mW\n";
+        std::cout << "\t    DDR:        " << ddr_power << " mW\n";
+        std::cout << "-----------------------------------------------------\n";
+    }
+};
+
+// Entry point: pp <program> <iterations> <tensor output file> [power output file]
+int main(int argc, char *argv[]) {
+    if (argc < NUM_ARGS) {
+        std::cout << "Usage: " << argv[0] << " <program> <iterations> <tensor output file> [power output file]\n";
+        exit(1);
+    }
+
+    // std::stoi throws on non-numeric or out-of-range input; fail with a
+    // clear message instead of an uncaught exception.
+    int iterations = 0;
+    try {
+        iterations = std::stoi(argv[2]);
+    } catch (...) {
+        std::cout << "<iterations> must be an integer, got '" << argv[2] << "'\n";
+        exit(1);
+    }
+    if (iterations <= 0) {
+        std::cout << "<iterations> must be positive\n";
+        exit(1);
+    }
+
+    Profiler pp;
+    pp.profile(argv[1], iterations);
+    pp.dumpTensorInfo(argv[3]);
+
+    // The raw power-readings dump (fourth argument) is optional
+    if (argc > NUM_ARGS)
+        pp.dumpPowerReadings(argv[4]);
+
+    return 0;
+}
+
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-1.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..af57723b4091da6feffa9ef8f789698837b90bfa
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-1.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-2.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..57c5597c28e1028fa643bc5b03db8fc51d0f4b6b
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-2.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-3.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c7e0e3b2e7ff9d52c66b208321ecfa858ef5d9da
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-3.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-1.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..85bba9ee4c6dea2b1a7356d3847acb9aa5ea85aa
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-1.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-2.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..a7ddb64b1e9e97f8ba93c52b38402dd2293725ef
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-2.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-3.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5865ac69d6c5187fac0476f87d20e3a5154d516f
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3fullCLK-ddr-3.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-1.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f235128927672fdb46c54dd357cafb3c275a7144
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-1.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-2.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e80e4cde621b99ef074e1c97aadd191d6b9777c1
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-2.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-3.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..61f5d13e1cd040dbf7bdef058f34387f83a7df23
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-3.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-1.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..fbd3ebd141c6eb0582496c85dcd5388a6a0bce7b
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-1.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-2.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f19c5a204ccc418c5af80c8953b9c28f39c3fd93
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-2.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-3.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4ae009ab08d8368139c09a4f51e5cadc3623fefb
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-ddr-3.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-gpu-ddr.pdf b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-gpu-ddr.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5996b934a55fd90d9cd773d42e9cfa89429cab68
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/lenet/lenet-m3normal-gpu-ddr.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/merge1.pdf b/llvm/projects/gpu_profiler/results/tests/merge1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..5d3c3540c2e67aa7943f5993cfebf0cca40412fe
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/merge1.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/merge3.pdf b/llvm/projects/gpu_profiler/results/tests/merge3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c38e37e2d610175311fb3221fd5fa26892fdf1e8
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/merge3.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-1.pdf b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..61bdcc890e0b5800ee4b3a8e19abdc724461a01b
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-1.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-2.pdf b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..bf80ebf1859640bdec386ef470fc04ada74ea822
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-2.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-3.pdf b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..557bc295b6386ea73a0fcbd140de63a326f6aecd
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-3.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-tool.pdf b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-tool.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..d00032f35f77934432832cef4c00124327a14169
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/mm300-gpu+ddr-tool.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/mm300-gpu-1.pdf b/llvm/projects/gpu_profiler/results/tests/mm300-gpu-1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..030d102341f58c57753d436554278c8d628137a8
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/mm300-gpu-1.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/mm300-gpu-2.pdf b/llvm/projects/gpu_profiler/results/tests/mm300-gpu-2.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..e6f97075597d8080bf223a1fd1a35b8969b9b141
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/mm300-gpu-2.pdf differ
diff --git a/llvm/projects/gpu_profiler/results/tests/mm300-gpu-3.pdf b/llvm/projects/gpu_profiler/results/tests/mm300-gpu-3.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..6898f235ef85bd5148f5fcfdcf50b4f14ca2ab19
Binary files /dev/null and b/llvm/projects/gpu_profiler/results/tests/mm300-gpu-3.pdf differ
diff --git a/llvm/projects/gpu_profiler/run.pl b/llvm/projects/gpu_profiler/run.pl
new file mode 100755
index 0000000000000000000000000000000000000000..8674e63d9453fbb1e07371d99cf22c4745f234b3
--- /dev/null
+++ b/llvm/projects/gpu_profiler/run.pl
@@ -0,0 +1,64 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# Drives the power profiler (pp) over the LeNet/FC/CIFAR benchmark
+# binaries, once in FP32 and once in FP16. Data-driven for consistency
+# with run_dnns.pl - adding a benchmark is a one-line change instead of
+# a four-line copy-pasted stanza.
+
+my $iterations = 100;
+my $profiler = "$ENV{HOME}/awesome_profiler/pp";
+
+# [ label printed to the console, benchmark binary, output csv ]
+my @fp32 = (
+    ["Lenet", "./lenet_tanh",  "lenet-fp32.csv"],
+    ["FC2",   "./fc2_clipped", "fc2-fp32.csv"],
+    ["FC3",   "./fc3_clipped", "fc3-fp32.csv"],
+    ["FC4",   "./fc4_clipped", "fc4-fp32.csv"],
+    ["CIFAR", "./cifar_keras", "cifar-fp32.csv"],
+);
+my @fp16 = (
+    ["Lenet", "./lenet_tanh_half",  "lenet-fp16.csv"],
+    ["FC2",   "./fc2_half",         "fc2-fp16.csv"],
+    ["FC3",   "./fc3_half",         "fc3-fp16.csv"],
+    ["FC4",   "./fc4_half",         "fc4-fp16.csv"],
+    ["CIFAR", "./cifar_keras_half", "cifar-fp16.csv"],
+);
+
+# Runs one precision suite: banner, then profile each benchmark in turn
+sub run_suite {
+    my ($title, @benches) = @_;
+    print "############### $title ##############\n";
+    for my $bench (@benches) {
+        my ($label, $binary, $csv) = @$bench;
+        print "Running $label\n";
+        print scalar `date`;
+        `$profiler $binary $iterations $csv`;
+    }
+}
+
+run_suite("FP32", @fp32);
+run_suite("FP16", @fp16);
diff --git a/llvm/projects/gpu_profiler/run_dnns.pl b/llvm/projects/gpu_profiler/run_dnns.pl
new file mode 100755
index 0000000000000000000000000000000000000000..041f3e3cae8598d34ac8d38f65cd37d51e8aa0ba
--- /dev/null
+++ b/llvm/projects/gpu_profiler/run_dnns.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# Profiles each DNN binary with pp, first in FP32 then in FP16. The two
+# passes differ only in the banner, the binary-name suffix, and the csv
+# tag, so a single nested loop covers both.
+
+my $iterations = 100;
+my @networks = ("alexnet", "alexnet2", "resnet18", "vgg16");
+
+# [ banner label, csv tag, binary-name suffix ]
+my @precisions = (["FP32", "fp32", ""], ["FP16", "fp16", "_half"]);
+
+foreach my $prec (@precisions) {
+    my ($banner, $tag, $suffix) = @$prec;
+    print "############### $banner ##############\n";
+    foreach my $network (@networks) {
+        print "Running $network\n";
+        print scalar `date`;
+        `~/awesome_profiler/pp ./${network}_cifar10${suffix} $iterations ${network}_${tag}.csv`;
+    }
+}
diff --git a/llvm/projects/gpu_profiler/run_image_pipelines.pl b/llvm/projects/gpu_profiler/run_image_pipelines.pl
new file mode 100755
index 0000000000000000000000000000000000000000..8e6df67d2e96d343cff3cc6a324693c14abaa3f3
--- /dev/null
+++ b/llvm/projects/gpu_profiler/run_image_pipelines.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# Profiles each image pipeline with pp, first in FP32 then in FP16. The
+# two passes differ only in the banner, the binary suffix, and the csv
+# tag, so one nested loop handles both precisions.
+
+my $iterations = 100;
+my @pipelines = ("pipeline_GEMO", "pipeline_GEO", "pipeline_GEOM", "pipeline_GSM", "pipeline_GSME");
+
+# [ banner label, csv tag, binary-name suffix ]
+my @precisions = (["FP32", "fp32", ""], ["FP16", "fp16", "_half"]);
+
+foreach my $prec (@precisions) {
+    my ($banner, $tag, $suffix) = @$prec;
+    print "############### $banner ##############\n";
+    foreach my $pipeline (@pipelines) {
+        print "Running $pipeline\n";
+        print scalar `date`;
+        `~/awesome_profiler/pp ./${pipeline}${suffix} $iterations ${pipeline}_${tag}.csv`;
+    }
+}