diff --git a/llvm/projects/gpu_profiler/include/profiler.h b/llvm/projects/gpu_profiler/include/profiler.h
index b776ed2b6642ee773783e48c9ba408d33d211f43..78c0f18071e916edfff435844fd990936855c410 100644
--- a/llvm/projects/gpu_profiler/include/profiler.h
+++ b/llvm/projects/gpu_profiler/include/profiler.h
@@ -65,6 +65,11 @@ private:
     const std::string ddr_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power2_input";
     const std::string soc_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0040/iio_device/in_power1_input";
     const std::string sys_power_rail = "/sys/devices/3160000.i2c/i2c-0/0-0041/iio_device/in_power0_input";
+    // Critical assumption: If this file doesn't exist, then the board isn't a Jetson
+    const std::string jetson_chip_id = "/sys/module/tegra_fuse/parameters/tegra_chip_id";
+
+    // True if running on Jetson, else false
+    bool on_jetson_;
 
     // An individual power reading
     struct PowerReading {
diff --git a/llvm/projects/gpu_profiler/src/profiler.cpp b/llvm/projects/gpu_profiler/src/profiler.cpp
index 188223a9059eede6d2a32e853dee22b95ecb719e..822d708d58f6c8468cb4b84ed23e988251b95159 100644
--- a/llvm/projects/gpu_profiler/src/profiler.cpp
+++ b/llvm/projects/gpu_profiler/src/profiler.cpp
@@ -1,4 +1,4 @@
-#include "profiler.h"
+#include "profiler.h"
 
 Profiler::Profiler() : should_run_profiler_(false), should_stop_profiler_(false) {
     // Open all streams. Not done in start_profiler() function bc the streams
@@ -9,8 +9,14 @@ Profiler::Profiler() : should_run_profiler_(false), should_stop_profiler_(false)
     soc_stream_.open(soc_power_rail, std::ifstream::in);
     sys_stream_.open(sys_power_rail, std::ifstream::in);
 
-    if (!cpu_stream_.is_open() || !gpu_stream_.is_open() || !ddr_stream_.is_open()
-                || !soc_stream_.is_open() || !sys_stream_.is_open()) {
+    // Check if the jetson chip id file exists to indirectly check the architecture
+    std::ifstream jetson_file(jetson_chip_id);
+    on_jetson_ = jetson_file.good();
+
+    if (on_jetson_ && 
+                (!cpu_stream_.is_open() || !gpu_stream_.is_open() 
+                || !ddr_stream_.is_open() || !soc_stream_.is_open() 
+                || !sys_stream_.is_open())) {
         std::cout << "Failed to open one of the power rails for reading\n";
         exit(1);
     }
@@ -68,16 +74,23 @@ void Profiler::pause_profiler() {
 // Returns this as a pair of <delta time in milliseconds, energy>
 std::pair<double, double> Profiler::get_time_energy() const {
     double total_energy = 0.0;
-
-    std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_;
-    for (auto reading : power_readings_) {
-        std::chrono::duration<double> duration = reading.time_ - prev_time;
-        total_energy += reading.gpu_ * duration.count();
-        total_energy += reading.ddr_ * duration.count();
-        prev_time = reading.time_;
+    double delta_time = 0.0;
+
+    if (on_jetson_) {
+        std::chrono::time_point<std::chrono::high_resolution_clock> prev_time = start_time_;
+        for (const auto& reading : power_readings_) {
+            std::chrono::duration<double> duration = reading.time_ - prev_time;
+            total_energy += reading.gpu_ * duration.count();
+            total_energy += reading.ddr_ * duration.count();
+            prev_time = reading.time_;
+        }
+        delta_time = std::chrono::duration<double, std::milli>(prev_time
+                    - start_time_).count();
+    } else {
+        auto last_reading_time = power_readings_.empty() ? start_time_ : power_readings_.back().time_;
+        delta_time = std::chrono::duration<double, std::milli>(last_reading_time
+                    - start_time_).count();
     }
-    double delta_time = std::chrono::duration<double, std::milli>(prev_time 
-                - start_time_).count();
     return std::make_pair(delta_time, total_energy);
 }
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
index e7784cb18e7e7c766f0ae27e6588d4851a2f2812..e8947881765637d68ca9d95d716c97d486e8380a 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
@@ -53,5 +53,6 @@ void dummyFunction(){
   void* TanhWrapper = (void*) &wrapper_tensorTanh;
   void* BatchNormWrapper = (void*) &wrapper_tensorBatchNorm;    
   void* PoolingWrapper = (void*) &wrapper_tensorPooling;    
+  void* SoftmaxWrapper = (void*) &wrapper_tensorSoftmax;    
 
 }
diff --git a/llvm/projects/soc_simulator/src/driver.py b/llvm/projects/soc_simulator/src/driver.py
index 1df46eec8fc34cee7c6a7683d1faaae4a94639ca..dbf2651bd3a9512c46d9e0a549c61290ad913ab0 100644
--- a/llvm/projects/soc_simulator/src/driver.py
+++ b/llvm/projects/soc_simulator/src/driver.py
@@ -14,6 +14,13 @@ class Driver:
     results_time_key = "Time"
     results_energy_key = "Energy"
 
+    def __get_str(self, appr):
+        if appr == Driver.ApproxTypes.FP16:
+            return "FP16"
+        elif appr == Driver.ApproxTypes.FP32:
+            return "FP32"
+        elif appr == Driver.ApproxTypes.PROMISE:
+            return "PROMISE"
 
     def driver(self):
         self.__parse_tensor_layer_file()
@@ -143,6 +150,7 @@ class Driver:
 
 
     def __quantize(self, curr_layer, prev_layer, h2f_f2h_operation_ind, layer_data):
+        print(self.__get_str(curr_layer), self.__get_str(prev_layer), h2f_f2h_operation_ind)
         if curr_layer == prev_layer or curr_layer == Driver.ApproxTypes.PROMISE \
                     or prev_layer == Driver.ApproxTypes.PROMISE: # No quantization needed
             return 0.0, 0.0
@@ -187,7 +195,7 @@ class Driver:
             exit(1)
         # Run promise simulator
         # TODO need to print time and energy in the ptm runner so we can pipe it
-        output = subprocess.Popen(["./ptm", str(rows_a), str(cols_a), str(rows_b), \
+        output = subprocess.Popen(["./ptm_new", str(rows_a), str(cols_a), str(rows_b), \
                     str(cols_b), str(patch_factor), str(swing)], \
                     stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0]
         total_time_energy = output.strip().split(',')
@@ -210,7 +218,6 @@ class Driver:
 
 
     def __run_simulations(self):
-        print("run sim")
         if not os.path.isfile(self.__config_filename):
             print("ERROR: %s was not found" % self.__config_filename)
             exit(1)
@@ -228,14 +235,12 @@ class Driver:
             for layer_ind, config_layer in enumerate(config_layers): # level
                 layer_data = self.__tensor_layers[layer_ind]  # layer
                 layer_name = layer_data["Name"]
-
                 if Driver.is_promise(config_layer):
                     print("Running layer %s on PROMISE" % layer_name)
                     curr_layer = Driver.ApproxTypes.PROMISE
                     quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, 0, layer_data)
                     # Compute 
                     time, energy = self.__run_promise_simulation(config_layer, layer_data)
-                    print(time, energy)
                     self.__aggregate_results[Driver.results_time_key][self.__config_count] += time
                     self.__aggregate_results[Driver.results_energy_key][self.__config_count] += energy 
                 else:
@@ -250,15 +255,17 @@ class Driver:
                             curr_layer = Driver.ApproxTypes.FP16
                         else:
                             curr_layer = Driver.ApproxTypes.FP32
+
                         quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, tensor_ind, layer_data)
                         conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, tensor_ind)
                         total_time += quant_time + conv_time 
                         total_energy += quant_energy + conv_energy
+                        prev_layer = curr_layer
 
                     self.__aggregate_results[Driver.results_time_key][self.__config_count] += total_time
                     self.__aggregate_results[Driver.results_energy_key][self.__config_count] += total_energy 
-
                 prev_layer = curr_layer
+
             self.__config_count += 1
             print("\n")
         config_file.close()
diff --git a/llvm/projects/soc_simulator/src/driver_new_config.py b/llvm/projects/soc_simulator/src/driver_new_config.py
index 115237dac51c96b47d02c84a603d98bdcf0b84a4..46af48c737b149f0665683bae9ffa023cf8d339e 100644
--- a/llvm/projects/soc_simulator/src/driver_new_config.py
+++ b/llvm/projects/soc_simulator/src/driver_new_config.py
@@ -15,13 +15,97 @@ class Driver:
     results_time_key = "Time"
     results_energy_key = "Energy"
 
+    def __get_str(self, appr):
+        if appr == Driver.ApproxTypes.FP16:
+            return "FP16"
+        elif appr == Driver.ApproxTypes.FP32:
+            return "FP32"
+        elif appr == Driver.ApproxTypes.PROMISE:
+            return "PROMISE"
 
     def driver(self):
         self.__parse_tensor_layer_file()
         self.__parse_tensor_table()
-        self.__run_simulations()
-        self.__display_results()
+        #self.__run_simulations()
+        #self.__display_results()
+        config_file = open(self.__config_filename, "r")
+        line = config_file.readline().strip()
+        config_count = 0 
+
+        prev_layer = Driver.ApproxTypes.FP32
+        curr_layer = None
+    
+        while line: 
+            assert(line == "+++++")
+            print("CONFIGURATION")
+
+            line = config_file.readline().strip()
+            # configuration data
+            conf_name = line.split(' ')[0]
+            assert(conf_name.startswith("conf"))
+            print(conf_name)
+            line = config_file.readline().strip()
+
+            # layers
+            layer_count = 0
+            while line != "-----":
+                print("LAYER", line ==  "-----")
+                print(layer_count, len(self.__tensor_layers)) 
+                if line.find("softmax") != -1:
+                    line = config_file.readline().strip()
+                    continue
+                layer_table_data = self.__tensor_layers[layer_count]
+                layer_name = layer_table_data["Name"]
+
+                layer_line = line.split(' ')
+
+                if layer_line[1] == "promise":
+                    print("PROMISE")
+                    curr_layer = Driver.ApproxTypes.PROMISE
+                    time, energy = self.__run_promise_simulation(layer_line[3], layer_table_data)
+                    print(time, energy)
+                    self.__aggregate_results[Driver.results_time_key][config_count] += time
+                    self.__aggregate_results[Driver.results_energy_key][config_count] += energy
+
+                elif layer_line[1] == "gpu":
+                    print("GPU")
+
+                    total_time = 0
+                    total_energy = 0
+                    tensor_count = 0 
+
+                    for i in range(2, len(layer_line), 3):
+                        op_type = layer_line[i]
+                        approx_type = layer_line[i + 1]
+                        op_number = layer_line[i + 2]
+                        #print(op_type, approx_type, op_number)
+        
+                        if approx_type == "fp16":
+                            curr_layer = Driver.ApproxTypes.FP16
+                        elif approx_type == "fp32":
+                            curr_layer = Driver.ApproxTypes.FP32
+                        elif approx_type == "perf":
+                            curr_layer = Driver.ApproxTypes.PERF
+                        quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, tensor_count, layer_table_data)
+                        if curr_layer == Driver.ApproxTypes.PERF:
+                            conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, tensor_count, op_number) # Only need op number for perf
+                        else:
+                            conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, tensor_count)
+                        total_time += quant_time + conv_time
+                        total_energy += quant_energy + conv_energy
+                        prev_layer = curr_layer
+                        tensor_count += 1
+
+                    self.__aggregate_results[Driver.results_time_key][config_count] += total_time
+                    self.__aggregate_results[Driver.results_energy_key][config_count] += total_energy 
+
+                layer_count += 1
+                line = config_file.readline().strip()
+                prev_layer = curr_layer
 
+            config_count += 1
+            line = config_file.readline().strip()
+        config_file.close()
 
     def __init__(self, layer_filename, table_filename, config_filename, results_filename):
         self.__layer_filename = layer_filename
@@ -144,8 +228,11 @@ class Driver:
 
 
     def __quantize(self, curr_layer, prev_layer, h2f_f2h_operation_ind, layer_data):
+        print(self.__get_str(curr_layer), self.__get_str(prev_layer), h2f_f2h_operation_ind)
         if curr_layer == prev_layer or curr_layer == Driver.ApproxTypes.PROMISE \
-                    or prev_layer == Driver.ApproxTypes.PROMISE: # No quantization needed
+                    or prev_layer == Driver.ApproxTypes.PROMISE \
+                    or curr_layer == Driver.ApproxTypes.PERF \
+                    or prev_layer == Driver.ApproxTypes.PERF: # No quantization needed
             return 0.0, 0.0
        
         layer_name = layer_data["Name"]
@@ -155,7 +242,6 @@ class Driver:
 
         # Get h2f/f2h data using the first tensor operation in the layer
         # (which is why order matters in the tensor table)
-        print(layer_name, self.__tensor_table[layer_name])
         tensor_op_row = self.__tensor_table[layer_name][h2f_f2h_operation_ind]  
         if curr_layer == Driver.ApproxTypes.FP32:
             time = tensor_op_row["h2f_time"]
@@ -189,7 +275,7 @@ class Driver:
             exit(1)
         # Run promise simulator
         # TODO need to print time and energy in the ptm runner so we can pipe it
-        output = subprocess.Popen(["./ptm", str(rows_a), str(cols_a), str(rows_b), \
+        output = subprocess.Popen(["./ptm_new", str(rows_a), str(cols_a), str(rows_b), \
                     str(cols_b), str(patch_factor), str(swing)], \
                     stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0]
         total_time_energy = output.strip().split(',')
@@ -199,98 +285,23 @@ class Driver:
         return float(total_time_energy[0]), float(total_time_energy[1])
 
 
-    def __run_simulations(self):
-        if not os.path.isfile(self.__config_filename):
-            print("ERROR: %s was not found" % self.__config_filename)
-            exit(1)
-
-        config_file = open(self.__config_filename, "r")
-     
-        line = config_file.readline().strip()
-			
-        while line:
-            assert(line.startswith("+++++"))
-            config_name = config_file.readline().strip().split(' ')[0] # Next line = configuration name 
-            print("CONFIGURATION")
-
-            line = config_file.readline().strip()
-            layer_ind = 0 # NOTE can also use the leftmost number in the currl ine  
-
-            prev_layer = Driver.ApproxTypes.FP32
-            curr_layer = None
-
-            while not line.startswith("-----"):
-                layer_info = line.split(' ')
-                layer_data = self.__tensor_layers[layer_ind]  
-                layer_name = layer_data["Name"]
-
-                if layer_info[1] == "promise":
-                    print("Running layer %s on PROMISE" % layer_name)
-                    curr_layer = Driver.ApproxTypes.PROMISE
-                    
-                    swing = int(layer_info[3])
-                    time, energy = self.__run_promise_simulation(swing, layer_data)
-                    print(time, energy)
-                    self.__aggregate_results[Driver.results_time_key][self.__config_count] += time
-                    self.__aggregate_results[Driver.results_energy_key][self.__config_count] += energy
-				
-                elif layer_info[1] == "gpu":
-                    # Parse each individual tensor operation
-                    # TODO not portable bc there can be multiple numbers after each approx later on 
-                    total_time = 0
-                    total_energy = 0
-
-                    tensor_ind = 0
-                    for i in range(2, len(layer_info), 3):
-                        tensor_op = layer_info[i]
-                        approx_type = layer_info[i + 1]
-                        approx_num = layer_info[i + 2] # only matters if perf
-
-                        if approx_type == "fp16":
-                            curr_layer = Driver.ApproxTypes.FP16
-                        elif approx_type == "fp32":
-                            curr_layer = Driver.ApproxTypes.FP32
-                        elif approx_type == "perf":
-                            curr_layer = DriverApproxTypes.PERF
-                        else: 
-                            assert(False) 
-
-                        quant_time, quant_energy = self.__quantize(curr_layer, prev_layer, tensor_ind, layer_data)
-                        time, energy = self.__run_gpu_simulation(curr_layer, layer_name, tensor_ind, approx_num)
-                        total_time += time
-                        total_energy += energy
-	
-                        tensor_ind += 1 
-
-                    self.__aggregate_results[Driver.results_time_key][self.__config_count] += total_time
-                    self.__aggregate_results[Driver.results_energy_key][self.__config_count] += total_energy
-
-                layer_ind += 1
-                line = config_file.readline().strip()
-
-            self.__config_count += 1
-            line = config_file.readline().strip()
-
-        config_file.close()
-
-
-    def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, approx_num):
+    def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, perf_number = None):
         tensor_info = self.__tensor_table[layer_name][tensor_ind]
-
         if curr_layer == Driver.ApproxTypes.FP32:
-			time = tensor_info["fp32_time"]
-			energy = tensor_info["fp32_energy"]
+            conversion_time = tensor_info["fp32_time"]
+            conversion_energy = tensor_info["fp32_energy"]
 
         elif curr_layer == Driver.ApproxTypes.FP16:
-			time = tensor_info["fp16_time"]
-			energy = tensor_info["fp16_energy"]
+            conversion_time = tensor_info["fp16_time"]
+            conversion_energy = tensor_info["fp16_energy"]
 
         elif curr_layer == Driver.ApproxTypes.PERF:
-			time = tensor_info["perf%s_energy" % approx_num]
-			energy = tensor_info["perf%s_energy" % approx_num]
-            
-        print("GPU: (%f, %f)" % (time, energy))
-        return time, energy
+            # Then we care about the following number 
+            conversion_time = tensor_info["perf%s_time" % perf_number]
+            conversion_energy = tensor_info["perf%s_energy" % perf_number]
+
+        print("GPU: (%f, %f)" % (conversion_time, conversion_energy))
+        return (conversion_time, conversion_energy)
 
 
     def __display_results(self):