diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9db1ed227291f8aa60c7c919162db26dbd0ea98
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc
@@ -0,0 +1,276 @@
+// Profiles the AlexNet2 CIFAR-10 network per tensor operation: one accumulated profiler total per op.
+
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+/*
+void add_to_map(std::unordered_map<std::string, std::pair<double, double> >& total_time_energies, double data, const std::string& key){
+    auto itr = total_time_energies.find(key);
+    if (itr == total_time_energies.end()){
+        total_time_energies.insert(std::make_pair(key, data));
+    } else {
+        itr->second += data;
+    }
+}
+*/
+
+// Pauses `profiler`, folds its get_time_energy_realzzzz() reading into the totals kept under `op_name`, then resets it.
+void add_data(std::unordered_map<std::string, std::pair<double, double> >& total_time_energies, Profiler& profiler, const std::string& op_name){
+    profiler.pause_profiler();
+    const auto reading = profiler.get_time_energy_realzzzz();
+    auto slot = total_time_energies.find(op_name);
+    if (slot != total_time_energies.end()){
+        slot->second.first += reading.first;
+        slot->second.second += reading.second;
+    } else {
+        total_time_energies.insert(std::make_pair(op_name, reading));
+    }
+    profiler.reset();
+}
+
+/* NOTE: Reference Architecture to use for profiling */
+void testCifarNet(){
+  // Runs the AlexNet2 CIFAR-10 forward pass total_runs times and records one profiler reading per
+  // tensor op through add_data(); per-op totals divided by total_runs are reported at the end.
+  printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
+  std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/");
+  std::string input_path =  dir_prefix + std::string("norm_cifar_input.bin");
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin");
+  // Trained parameters; the trailing four integers are the dimensions handed to readTrainedWeights.
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin",
+                                          float_type, 32, 3, 3, 3);
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin",
+                                        float_type, 1, 32, 1, 1);
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin",
+                                          float_type, 32, 32, 3, 3);
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin",
+                                        float_type, 1, 32, 1, 1);
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin",
+                                          float_type, 64, 32, 3, 3);
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin",
+                                        float_type, 1, 64, 1, 1);
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin",
+                                          float_type, 64, 64, 3, 3);
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin",
+                                        float_type, 1, 64, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin",
+                                          float_type, 128, 64, 3, 3);
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin",
+                                        float_type, 1, 128, 1, 1);
+  void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin",
+                                          float_type, 128, 128, 3, 3);
+  void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin",
+                                        float_type, 1, 128, 1, 1);
+  // Parameters of the final fully-connected layer.
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin",
+                                         float_type, 1, 1, 2048, 10);
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin",
+                                      float_type, 1, 10, 1, 1);
+
+
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+  // NOTE(review): online_output.txt is opened here and closed at the end, but nothing below writes to it.
+  std::ofstream online_profiler_output;
+  online_profiler_output.open("online_output.txt");
+
+  startMemTracking();
+
+  // NOTE: CHANGED INPUT TO STANDARDIZE
+  int total_runs = 50; // FOR NOW 100;
+
+  int test_input_size = 2500; //10000;
+  int batch_size = 2500;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+
+  Profiler profiler;
+  profiler.start_profiler();
+
+  // Get the total time and energy per tensor per run
+  std::unordered_map<std::string, std::pair<double, double> > total_time_energies;
+  // The outer loop repeats the whole pass; the inner loop walks the input batches of one pass.
+  for(int run = 0; run < total_runs; run++){
+    for(int batch = 0; batch < batch_count; batch++){
+      int start = batch * batch_size;
+      int end = (batch + 1) * batch_size;
+      void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+      // Every op below is bracketed by resume_profiler() and add_data(), which pauses and resets.
+      // FIRST Tensor Runtime CALL
+      profiler.resume_profiler();
+      void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1,
+                                         conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv1");
+
+      profiler.resume_profiler();
+      tensorAdd(conv1out, conv1_bias);
+      add_data(total_time_energies, profiler, "Add1");
+
+      profiler.resume_profiler();
+      void* conv1_tanh = tensorTanh(conv1out);
+      add_data(total_time_energies, profiler, "Tanh1");
+
+      // 2nd Layer
+      profiler.resume_profiler();
+      void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1,
+                                         conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv2");
+
+      profiler.resume_profiler();
+      tensorAdd(conv2out, conv2_bias);
+      add_data(total_time_energies, profiler, "Add2");
+
+      profiler.resume_profiler();
+      void* conv2_tanh = tensorTanh(conv2out);
+      add_data(total_time_energies, profiler, "Tanh2");
+
+      profiler.resume_profiler();
+      void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+      add_data(total_time_energies, profiler, "Pool1");
+
+      // 3rd Layer
+      profiler.resume_profiler();
+      void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1,
+                                         conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv3");
+
+      profiler.resume_profiler();
+      tensorAdd(conv3out, conv3_bias);
+      add_data(total_time_energies, profiler, "Add3");
+
+      profiler.resume_profiler();
+      void* conv3_tanh = tensorTanh(conv3out);
+      add_data(total_time_energies, profiler, "Tanh3");
+
+      // 4th Layer
+      profiler.resume_profiler();
+      void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+                                         conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv4");
+
+      profiler.resume_profiler();
+      tensorAdd(conv4out, conv4_bias);
+      add_data(total_time_energies, profiler, "Add4");
+
+      profiler.resume_profiler();
+      void* conv4_tanh = tensorTanh(conv4out);
+      add_data(total_time_energies, profiler, "Tanh4");
+
+      profiler.resume_profiler();
+      void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+      add_data(total_time_energies, profiler, "Pool2");
+
+      // 5th Layer
+      profiler.resume_profiler();
+      void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1,
+                                         conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv5");
+
+      profiler.resume_profiler();
+      tensorAdd(conv5out, conv5_bias);
+      add_data(total_time_energies, profiler, "Add5");
+
+      profiler.resume_profiler();
+      void* conv5_tanh = tensorTanh(conv5out);
+      add_data(total_time_energies, profiler, "Tanh5");
+
+      // 6th Layer
+      profiler.resume_profiler();
+      void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1,
+                                         conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv6");
+
+      profiler.resume_profiler();
+      tensorAdd(conv6out, conv6_bias);
+      add_data(total_time_energies, profiler, "Add6");
+
+      profiler.resume_profiler();
+      void* conv6_tanh = tensorTanh(conv6out);
+      add_data(total_time_energies, profiler, "Tanh6");
+
+      profiler.resume_profiler();
+      void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+      add_data(total_time_energies, profiler, "Pool3");
+
+      // final FC Layer
+      profiler.resume_profiler();
+      void* gemm1out = tensorGemmGPU(pool6out, fc1_weights);
+      add_data(total_time_energies, profiler, "Mul1"); // ASSUMING that this is mul1
+
+      std::cout<<"-----------------------------------ADD 7--------------------------------\n";
+      profiler.resume_profiler();
+      void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+      add_data(total_time_energies, profiler, "Add7");
+      std::cout<<"-----------------------------------ADD 7 ENDS --------------------------------\n";
+
+      profiler.resume_profiler();
+      void* result = tensorSoftmax(gemm1biasout);
+      add_data(total_time_energies, profiler, "Softmax1");
+
+      uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+      float accuracy = computeAccuracy2(labels, batch_size, result);
+      final_accuracy += accuracy;
+
+      freeBatchMemory();
+    }
+  }
+  auto total = profiler.get_time_energy_realzzzz();
+  std::cout << "total = " << total.first << ", " << total.second << std::endl;
+  profiler.get();
+  profiler.stop_profiler();
+
+  stopProfiling();
+  //online_profiler_output << "Total time: " << total_time << ", " << total_energy << "\n";
+  // Report each op's accumulated totals divided by total_runs (i.e. per-run figures).
+  std::ofstream ofs;
+  const std::string ordered_keys[] = {"Add1", "Add2", "Add3", "Add4", "Add5", "Add6", "Add7",
+                                      "Conv1", "Conv2", "Conv3", "Conv4", "Conv5", "Conv6",
+                                      "Mul1",
+                                      "Pool1", "Pool2", "Pool3",
+                                      "Softmax1",
+                                      "Tanh1", "Tanh2", "Tanh3", "Tanh4", "Tanh5", "Tanh6"};
+  ofs.open("online_profiler_tensor_data.txt");
+  // Keys are listed explicitly to fix the report order; unordered_map iteration order is unspecified.
+  for (const std::string& key : ordered_keys){
+    const auto& data_pair = total_time_energies[key]; // operator[] yields (0, 0) for a key never recorded
+    ofs << key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n';
+    std::cout<< key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n';
+  }
+
+  /*
+  ofs.open("online_profiler_tensor_data.txt");
+  for (const auto& tensor_data : total_time_energies){
+    ofs << tensor_data.first << ": " << tensor_data.second.first / total_runs << "\t" << tensor_data.second.second / total_runs << '\n';
+  }*/
+  ofs.close();
+  final_accuracy = (final_accuracy / batch_count) / total_runs;
+  dumpFinalAccuracy(final_accuracy);
+  online_profiler_output.close();
+}
+
+
+// Entry point: calls llvm_hpvm_initTensorRt(0), runs testCifarNet(), then calls
+// llvm_hpvm_cleanupTensorRt() and exits with status 0.
+int main(int argc, char* argv[]){
+  (void)argc;  // command-line arguments are accepted but not used
+  (void)argv;
+  llvm_hpvm_initTensorRt(0);
+  testCifarNet();
+  llvm_hpvm_cleanupTensorRt();
+  return 0;
+}
+