diff --git a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
index e8714c128f745d7949ce0897a57475bd070de4a5..8ccd087d49fc2171ebb95dbd723ed8e4723736db 100644
--- a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
@@ -918,7 +918,7 @@ errs() << "TensorII: " << *TensorII << "\n";
         Args.push_back(TensorII->getOperand(7));
     
         // Create wrapper API runtime function call
-        Constant* wrapper_tensorGroupConvolution;
+        Constant* wrapper_tensorGroupConvolution =
           M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"),
             RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType());
         CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution,
@@ -956,9 +956,9 @@ errs() << "TensorII: " << *TensorII << "\n";
         Args.push_back(TensorII->getOperand(3));
         Args.push_back(TensorII->getOperand(4));
         Args.push_back(TensorII->getOperand(5));
-    
+
         // Create wrapper API runtime function call
-        Constant* wrapper_tensorBatchNorm;
+        Constant* wrapper_tensorBatchNorm =
           M->getOrInsertFunction(StringRef("wrapper_tensorBatchNorm"),
             RtM->getFunction(StringRef("wrapper_tensorBatchNorm"))->getFunctionType());
         CallInst* CI = CallInst::Create(wrapper_tensorBatchNorm,
diff --git a/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py b/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b7f09d92e91894d284b40cc0bd2d346c08e36c7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py
@@ -0,0 +1,42 @@
+
+
+import sys
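+
+# Scans a file containing ConvLayer_PROMISE / FCLayer_PROMISE calls (e.g. a
+# generated source or output log, passed as sys.argv[1]) and writes the
+# float-valued operands of each call to quant_ranges.txt, one layer per line.
+# Assumed usage:  python2 extractQuantRange.py <input_file>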
+
+
+if __name__ == "__main__":
+
+    f = open(sys.argv[1], "r")
+    f2 = open("quant_ranges.txt", "w+")
+
+    layer_line = False
+    for x in f:
+        if "ConvLayer_PROMISE" in x or "FCLayer_PROMISE" in x or layer_line == True:
+            if layer_line == True:
+              layer_line = False
+            else:
+              layer_line = True
+            
+            print x 
+            toks = x.split(",")
+
+            for tok in toks:
+                tok = tok.strip()
+                tok_val = ""
+                try:
+                    tok_val = float(tok)
+                    try:
+                        # Integer tokens (e.g. layer dimensions) are skipped;
+                        # only values that parse as floats but not as ints are
+                        # treated as quantization range endpoints
+                        tok_val = int(tok)
+                    except:
+                        print (tok_val)
+                        f2.write(str(tok_val) + " ")
+                except:
+                    continue
+
+            f2.write("\n")
+    
+
+    f.close()
+    f2.close()
+
+        
diff --git a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
index cde03bd6d0ffa9969c785e17fe2f708c75396158..33a54cd0de626113e5cf11e2f6a6928d4fa384eb 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
+++ b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
@@ -3,11 +3,9 @@
 export HPVM_TENSOR_RT_HOME=/home/hsharif3/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/
 export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH
 
-clang++ -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
+clang++ -I/software/cuda-9.1/include -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
 llvm-dis --version
 llvm-dis ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
-cp ${HPVM_TENSOR_RT_HOME}/build/libtensor_runtime.a  ${HPVM_TENSOR_RT_HOME}/lib/libtensor_runtime.a
-cp ${HPVM_TENSOR_RT_HOME}/build_autotuner/libtensor_runtime.a  ${HPVM_TENSOR_RT_HOME}/lib/libtensor_autotuner.a
 
 
 
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
index eeca0ed8ed8ed407b9c84592b22820857678b311..7e969271c20031dab9f302b333a4f7feb0338871 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
@@ -7,9 +7,15 @@
 # Batch 12: Error Sens: 10, 25, 35, for Loss1, 2, 3, respectively, Min: P3. 1000 Runs for All
 # Batch 13: No Error Sens: Equal Runs (1000) for all. Min: P1
 # Batch 14: Reruning Batch12 with bugFix!
-# Batch 16: MAJOR CHANGE: 3 different skip levels for each Loss1,Loss2,Loss3
-# Batch 17: Baseline with 3000 runs. Compare with Batch16
+# Batch 15: MAJOR CHANGE: 3 different skip levels for each Loss1,Loss2,Loss3
 
+# Batch 18: Batch13 (Baseline) + ParetoCurve (1500 Runs) - BUGGY, IGNORE!!!
+
+# Batch 19: (Baseline) + ParetoCurve + 2 runs in Tuning Phase (1500 Runs)
+# Batch 20: 3 Skip levels + 2 runs + 1500 Runs + EnergyBandSize now % of Max (Compare against Batch19)
+
+
+batch_id = "batch201"
 
 class Benchmark:
   def __init__(self):
@@ -46,21 +52,23 @@ Alexnet1.skip_layers = 0
 Alexnet1.skip_layer_str = "5_0"
 
 Alexnet1.base_dir = "../build_tuner/tuner_results/alexnet_cifar10/"
-Alexnet1.result_dir_1 = "../build_tuner/tuner_results/alexnet_cifar10/loss_1/batch17"
-Alexnet1.result_dir_2 = "../build_tuner/tuner_results/alexnet_cifar10/loss_2/batch17"
-Alexnet1.result_dir_3 = "../build_tuner/tuner_results/alexnet_cifar10/loss_3/batch17"
+Alexnet1.result_dir_1 = "../build_tuner/tuner_results/alexnet_cifar10/loss_1/" + batch_id
+Alexnet1.result_dir_2 = "../build_tuner/tuner_results/alexnet_cifar10/loss_2/" + batch_id
+Alexnet1.result_dir_3 = "../build_tuner/tuner_results/alexnet_cifar10/loss_3/" + batch_id
 
 Alexnet1.tensor_desc_file = "tuner_results/alexnet_cifar10/alexnet_tensors.txt"
 Alexnet1.layer_file = "tuner_results/alexnet_cifar10/alexnet_layers.txt"
 Alexnet1.cost_file = "../build_tuner/tuner_results/alexnet_cifar10/op_cost.txt"
+Alexnet1.layer_knobs = "../opentuner/data/alexnet/knobs.txt"
 
 #Alexnet1.loss1_result_file = "tuner_results/alexnet2_cifar10/alexnet_layers.txt"
 Alexnet1.loss1_result_file = "tuner_results/alexnet_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet1.loss2_result_file = "tuner_results/alexnet_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 
-Alexnet1.autotuner_runs = 1000
+Alexnet1.autotuner_runs = 1500
 Alexnet1.tuner_accuracy = 79.9
-Alexnet1.promise_accuracy = 79.9
+#Alexnet1.promise_accuracy = 79.9
+Alexnet1.promise_accuracy = 79.5
 Alexnet1.validation_accuracy = 79.19
 
 bench_tuner_data["alexnet_cifar10"] = Alexnet1
@@ -79,17 +87,19 @@ Alexnet2.start_promise_range = 1
 Alexnet2.skip_layer_str = "6_1_0"
 
 Alexnet2.base_dir = "../build_tuner/tuner_results/alexnet2_cifar10/"
-Alexnet2.result_dir_1 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_1/batch17"
-Alexnet2.result_dir_2 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_2/batch17"
-Alexnet2.result_dir_3 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_3/batch17"
+Alexnet2.result_dir_1 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_1/" + batch_id
+Alexnet2.result_dir_2 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_2/" + batch_id
+Alexnet2.result_dir_3 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_3/" + batch_id
 Alexnet2.tensor_desc_file = "tuner_results/alexnet2_cifar10/alexnet2_tensors.txt"
 Alexnet2.layer_file = "tuner_results/alexnet2_cifar10/alexnet2_layers.txt"
 Alexnet2.cost_file = "../build_tuner/tuner_results/alexnet2_cifar10/op_cost.txt"
+Alexnet2.layer_knobs = "../opentuner/data/alexnet2/knobs.txt"
 #Alexnet2.loss1_result_file = "tuner_results/alexnet2_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 #Alexnet2.loss2_result_file = "tuner_results/alexnet2_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
-Alexnet2.autotuner_runs = 1000
+Alexnet2.autotuner_runs = 1500
 Alexnet2.tuner_accuracy = 84.19
-Alexnet2.promise_accuracy = 84.19
+#Alexnet2.promise_accuracy = 84.19
+Alexnet2.promise_accuracy = 84.8
 Alexnet2.validation_accuracy = 85.15
 
 bench_tuner_data["alexnet2_cifar10"] = Alexnet2
@@ -109,20 +119,22 @@ Alexnet3.start_promise_range = 1
 Alexnet3.skip_layer_str = "14_3_4_1_6"
 
 Alexnet3.base_dir = "../build_tuner/tuner_results/vgg16_cifar10/"
-Alexnet3.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar10/loss_1/batch17"
-Alexnet3.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar10/loss_2/batch17"
-Alexnet3.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar10/loss_3/batch17"
+Alexnet3.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar10/loss_1/" + batch_id
+Alexnet3.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar10/loss_2/" + batch_id
+Alexnet3.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar10/loss_3/" + batch_id
 
 Alexnet3.tensor_desc_file = "tuner_results/vgg16_cifar10/vgg16_tensors.txt"
 Alexnet3.layer_file = "tuner_results/vgg16_cifar10/vgg16_layers.txt"
 Alexnet3.cost_file = "../build_tuner/tuner_results/vgg16_cifar10/op_cost.txt"
+Alexnet3.layer_knobs = "../opentuner/data/vgg16_cifar10/knobs.txt"
 
 Alexnet3.loss1_result_file = "tuner_results/vgg16_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet3.loss2_result_file = "tuner_results/vgg16_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 
-Alexnet3.autotuner_runs = 1000
+Alexnet3.autotuner_runs = 1500
 Alexnet3.tuner_accuracy = 90.19
-Alexnet3.promise_accuracy = 90.19
+#Alexnet3.promise_accuracy = 90.19
+Alexnet3.promise_accuracy = 89.55
 Alexnet3.validation_accuracy = 89.05
 
 bench_tuner_data["vgg16_cifar10"] = Alexnet3
@@ -141,19 +153,21 @@ Alexnet4.start_promise_range = 1
 #Alexnet4.skip_layer_str = "0"
 Alexnet4.skip_layer_str = "0_1_2_14_15_17_18_21"
 Alexnet4.base_dir = "../build_tuner/tuner_results/resnet18_cifar10/"
-Alexnet4.result_dir_1 = "../build_tuner/tuner_results/resnet18_cifar10/loss_1/batch17"
-Alexnet4.result_dir_2 = "../build_tuner/tuner_results/resnet18_cifar10/loss_2/batch17"
-Alexnet4.result_dir_3 = "../build_tuner/tuner_results/resnet18_cifar10/loss_3/batch17"
+Alexnet4.result_dir_1 = "../build_tuner/tuner_results/resnet18_cifar10/loss_1/" + batch_id
+Alexnet4.result_dir_2 = "../build_tuner/tuner_results/resnet18_cifar10/loss_2/" + batch_id
+Alexnet4.result_dir_3 = "../build_tuner/tuner_results/resnet18_cifar10/loss_3/" + batch_id
 Alexnet4.tensor_desc_file = "tuner_results/resnet18_cifar10/resnet_tensors.txt"
-Alexnet4.layer_file = "tuner_results/resnet18_cifar10/resnet18_layers.txt"
+Alexnet4.layer_file = "tuner_results/resnet18_cifar10/resnet_layers.txt"
 Alexnet4.cost_file = "../build_tuner/tuner_results/resnet18_cifar10/op_cost.txt"
+Alexnet4.layer_knobs = "../opentuner/data/resnet/knobs.txt"
 
 Alexnet4.loss1_result_file = "tuner_results/resnet18_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet4.loss2_result_file = "tuner_results/resnet18_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 
-Alexnet4.autotuner_runs = 1000
+Alexnet4.autotuner_runs = 1500
 Alexnet4.tuner_accuracy = 89.6
-Alexnet4.promise_accuracy = 89.59
+#Alexnet4.promise_accuracy = 89.59  - 1000 images
+Alexnet4.promise_accuracy = 89.94
 Alexnet4.validation_accuracy = 89.65
 
 bench_tuner_data["resnet18_cifar10"] = Alexnet4
@@ -174,19 +188,21 @@ Alexnet5.start_promise_range = 1
 #Alexnet5.skip_layer_str = "0"
 Alexnet5.skip_layer_str = "0_1_2_3_4"
 Alexnet5.base_dir = "../build_tuner/tuner_results/vgg16_cifar100/"
-Alexnet5.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar100/loss_1/batch17"
-Alexnet5.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar100/loss_2/batch17"
-Alexnet5.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar100/loss_3/batch17"
+Alexnet5.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar100/loss_1/" + batch_id
+Alexnet5.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar100/loss_2/" + batch_id
+Alexnet5.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar100/loss_3/" + batch_id
 
 Alexnet5.tensor_desc_file = "../build_tuner/tuner_results/vgg16_cifar100/vgg16_tensors.txt"
 Alexnet5.layer_file = "../build_tuner/tuner_results/vgg16_cifar100/vgg16_layers.txt"
 Alexnet5.cost_file = "../build_tuner/tuner_results/vgg16_cifar100/op_cost.txt"
+Alexnet5.layer_knobs = "../opentuner/data/vgg16_cifar100/knobs.txt"
 
 Alexnet5.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet5.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt"
-Alexnet5.autotuner_runs = 1000
+Alexnet5.autotuner_runs = 1500
 Alexnet5.tuner_accuracy = 67.95
-Alexnet5.promise_accuracy = 66.8
+#Alexnet5.promise_accuracy = 66.8
+Alexnet5.promise_accuracy = 70.1
 Alexnet5.validation_accuracy = 68.65
 
 bench_tuner_data["vgg16_cifar100"] = Alexnet5
@@ -206,17 +222,18 @@ Alexnet6.start_promise_range = 1
 Alexnet6.skip_layer_str = "0"
 
 Alexnet6.base_dir = "../build_tuner/tuner_results/lenet_keras/"
-Alexnet6.result_dir_1 = "../build_tuner/tuner_results/lenet_keras/loss_1/batch17"
-Alexnet6.result_dir_2 = "../build_tuner/tuner_results/lenet_keras/loss_2/batch17"
-Alexnet6.result_dir_3 = "../build_tuner/tuner_results/lenet_keras/loss_3/batch17"
+Alexnet6.result_dir_1 = "../build_tuner/tuner_results/lenet_keras/loss_1/" + batch_id
+Alexnet6.result_dir_2 = "../build_tuner/tuner_results/lenet_keras/loss_2/" + batch_id
+Alexnet6.result_dir_3 = "../build_tuner/tuner_results/lenet_keras/loss_3/" + batch_id
 
 Alexnet6.tensor_desc_file = "tuner_results/lenet_keras/lenet_tensors.txt"
 Alexnet6.layer_file = "tuner_results/lenet_keras/lenet_layers.txt"
 Alexnet6.cost_file = "../build_tuner/tuner_results/lenet_keras/op_cost.txt"
+Alexnet6.layer_knobs = "../opentuner/data/lenet/knobs.txt"
 
 #Alexnet6.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt"
 #Alexnet6.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt"
-Alexnet6.autotuner_runs = 500
+Alexnet6.autotuner_runs = 900
 Alexnet6.tuner_accuracy = 98.9
 Alexnet6.promise_accuracy = 98.9
 Alexnet6.validation_accuracy = 99
@@ -239,20 +256,22 @@ Alexnet7.start_promise_range = 1
 #Alexnet7.skip_layer_str = "0"
 Alexnet7.skip_layer_str = "1_14_0_6_2"
 Alexnet7.base_dir = "../build_tuner/tuner_results/mobilenet/"
-Alexnet7.result_dir_1 = "../build_tuner/tuner_results/mobilenet/loss_1/batch17"
-Alexnet7.result_dir_2 = "../build_tuner/tuner_results/mobilenet/loss_2/batch17"
-Alexnet7.result_dir_3 = "../build_tuner/tuner_results/mobilenet/loss_3/batch17"
+Alexnet7.result_dir_1 = "../build_tuner/tuner_results/mobilenet/loss_1/" + batch_id
+Alexnet7.result_dir_2 = "../build_tuner/tuner_results/mobilenet/loss_2/" + batch_id
+Alexnet7.result_dir_3 = "../build_tuner/tuner_results/mobilenet/loss_3/" + batch_id
 
 Alexnet7.tensor_desc_file = "tuner_results/mobilenet/mobilenet_ops.txt"
 Alexnet7.layer_file = "tuner_results/mobilenet/mobilenet_layer_comp.txt"
 Alexnet7.cost_file = "../build_tuner/tuner_results/mobilenet/op_cost.txt"
+Alexnet7.layer_knobs = "../opentuner/data/mobilenet/knobs.txt"
 
 #--- Files below needed for VALIDATION experiment
 Alexnet7.loss1_result_file = "tuner_results/mobilenet/loss_1/batch1/promise_tuner/high_confidence/promise_confs.txt"
 Alexnet7.loss2_result_file = "tuner_results/mobilenet/loss_2/batch1/promise_tuner/high_confidence/promise_confs.txt"
-Alexnet7.autotuner_runs = 1000
+Alexnet7.autotuner_runs = 1500
 Alexnet7.tuner_accuracy = 84.8
-Alexnet7.promise_accuracy = 84.8
+#Alexnet7.promise_accuracy = 84.8
+Alexnet7.promise_accuracy = 83.65
 Alexnet7.validation_accuracy = 84.4
 
 bench_tuner_data["mobilenet_cifar10"] = Alexnet7
@@ -271,27 +290,29 @@ Alexnet8.start_promise_range = 1
 #Alexnet8.skip_layer_str = "0"
 Alexnet8.skip_layer_str = "7_0_1"
 Alexnet8.base_dir = "../build_tuner/tuner_results/mobilenet_shallow/"
-Alexnet8.result_dir_1 = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/batch17"
-Alexnet8.result_dir_2 = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/batch17"
-Alexnet8.result_dir_3 = "../build_tuner/tuner_results/mobilenet_shallow/loss_3/batch17"
+Alexnet8.result_dir_1 = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/" + batch_id
+Alexnet8.result_dir_2 = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/" + batch_id
+Alexnet8.result_dir_3 = "../build_tuner/tuner_results/mobilenet_shallow/loss_3/" + batch_id
 
 Alexnet8.tensor_desc_file = "../build_tuner/tuner_results/mobilenet_shallow/mobilenet_shallow_ops.txt"
 Alexnet8.layer_file = "../build_tuner/tuner_results/mobilenet_shallow/mobilenet_shallow_layer_comp.txt"
 Alexnet8.cost_file = "../build_tuner/tuner_results/mobilenet_shallow/op_cost.txt"
+Alexnet8.layer_knobs = "../opentuner/data/mobilenet_shallow/knobs.txt"
 
 Alexnet8.loss1_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/batch2/promise_tuner/high_confidence/promise_selected_confs.txt"
 Alexnet8.loss2_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/batch2/promise_tuner/high_confidence/promise_selected_confs.txt"
 
-Alexnet8.autotuner_runs = 1000
+Alexnet8.autotuner_runs = 1500
 Alexnet8.tuner_accuracy = 87.6
-Alexnet8.promise_accuracy = 87.59
+#Alexnet8.promise_accuracy = 87.59
+Alexnet8.promise_accuracy = 89.25
 Alexnet8.validation_accuracy = 88.5
 
 bench_tuner_data["mobilenet_shallow"] = Alexnet8
 
 
 
-
+"""
 Alexnet9 = Benchmark()
 Alexnet9.tuner_binary = "fc4_clipped"
 Alexnet9.promise_binary = ""
@@ -442,6 +463,6 @@ Pipeline5.validation_accuracy = 95
 
 bench_tuner_data["pipeline_GSM"] = Pipeline5
 
-
+"""
 
 
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
index ca1772637c0c294386c894238e457edc71c01ca5..6a07ef86e53d2b4b6372e1e253611ba6f018aaad 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
@@ -138,7 +138,7 @@ def loadConfigData(result_dir, baseline_accuracy):
           config.avg_loss = baseline_accuracy - avg_accuracy 
           config.speedup = speedup
           config.fname = fname
-          print ("acc = " + str(avg_accuracy) + "\n")
+          #print ("acc = " + str(avg_accuracy) + "\n")
         else:
           flag = int(x.strip())
           config.flags.append(flag)
@@ -242,7 +242,8 @@ def buildConfigStr(config, layer_desc):
 
 def dumpConfig(layer_desc, config_arrs, result_dir):
 
-  f = open(result_dir + "/tuner_confs.txt", "w+")
+  
+  f = open(result_dir + "/tuner_confs_11.txt", "w+")
 
   it = 1
   for config in config_arrs:
@@ -274,34 +275,82 @@ def generateConf(Bench):
 
 
 
+def dumpBaselineConfs(Bench):
+
+  layer_desc = loadLayerDesc(Bench.layer_file)
+
+  f = open(Bench.base_dir + "/tuner_confs_base.txt", "w+")
+ 
+  f.write("+++++\n")
+  f.write("conf" + str(1) + " " + str(1) + " 0 " + str(Bench.promise_accuracy) + " " + str(0) + "\n")
+
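+  # Two baseline configurations are written below: one mapping every layer to
+  # flag 11 and one mapping every layer to flag 10 (assumed here to be the
+  # fp32 and fp16 baseline knobs, respectively).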
+  config = Config()
+  flags = []
+  for i in range(Bench.num_layers):
+    flags.append(11)
+    
+  config.flags = flags
+  config_str = buildConfigStr(config, layer_desc)
+
+  f.write(config_str)  
+  f.write("-----\n")
+          
+
+  
+  f.write("+++++\n")
+  f.write("conf" + str(2) + " " + str(1.5) + " 0 " + str(Bench.promise_accuracy) + " " + str(0) + "\n")
+
+  config = Config()
+  flags = []
+  for i in range(Bench.num_layers):
+    flags.append(10)
+    
+  config.flags = flags
+  config_str = buildConfigStr(config, layer_desc)
+
+  f.write(config_str)    
+  f.write("-----\n")
+
+
+
+  
+
+
 if __name__ == "__main__":
 
-  """
   Bench = bench_tuner_data["alexnet_cifar10"]
-  generateConf(Bench)
-    
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["alexnet2_cifar10"]
-  generateConf(Bench)
-        
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["vgg16_cifar10"]
-  generateConf(Bench)
-    
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["vgg16_cifar100"]
-  generateConf(Bench)
-
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["resnet18_cifar10"]
-  generateConf(Bench)
-    
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["lenet_keras"]
-  generateConf(Bench)
-
-  """
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["mobilenet_cifar10"]
-  generateConf(Bench)
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
     
-  #Bench = bench_tuner_data["mobilenet_shallow"]
+  Bench = bench_tuner_data["mobilenet_shallow"]
   #generateConf(Bench)
+  dumpBaselineConfs(Bench)
 
 
 
 
+  
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_algo_tuner.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_algo_tuner.py
new file mode 100644
index 0000000000000000000000000000000000000000..2df75fbfc4e7568361747f75f06a4b818a8f99be
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_algo_tuner.py
@@ -0,0 +1,102 @@
+
+
+import os
+import subprocess
+from error_sensitivity import select_skip_layers
+
+
+def runAlgoTunerCmd(Bench, dir_prefix, result_dir, acc_threshold, autotuner_runs):
+
+  tuner_cmd = "python2  ../opentuner/autotuner/algo_tuner.py "
+  tuner_cmd += " --test-limit "
+  tuner_cmd += str(autotuner_runs)
+  tuner_cmd += " --binary ./"
+  tuner_cmd += Bench.promise_binary
+  tuner_cmd += " --num-layers "
+  tuner_cmd += str(Bench.num_layers)
+  tuner_cmd += " --result-dir "
+  tuner_cmd += dir_prefix
+  tuner_cmd += result_dir + "/algo_tuner/"
+  tuner_cmd += " --accuracy "
+  tuner_cmd += str(Bench.promise_accuracy - acc_threshold)
+  tuner_cmd += " --cost-file "
+  tuner_cmd += Bench.cost_file
+  tuner_cmd += " --knobs-config "
+  tuner_cmd += "../opentuner/data/global_knobs.txt"
+  tuner_cmd += " --layer-knobs "
+  tuner_cmd += Bench.layer_knobs
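+
+  # Illustrative assembled command (binary name and layer count are hypothetical):
+  #   python2 ../opentuner/autotuner/algo_tuner.py --test-limit 1500 \
+  #     --binary ./alexnet_promise --num-layers 6 \
+  #     --result-dir ../build_tuner/.../algo_tuner/ --accuracy 78.65 \
+  #     --cost-file <op_cost.txt> --knobs-config ../opentuner/data/global_knobs.txt \
+  #     --layer-knobs ../opentuner/data/alexnet/knobs.txt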
+
+  
+  print (tuner_cmd)
+
+  p = subprocess.Popen(tuner_cmd, shell=True)
+  p.wait()
+  
+
+"""
+
+def promiseTunerLoss1(Bench, dir_prefix):
+
+  tuner_runs = int(Bench.autotuner_runs / 3)
+  
+  skip_layers1 = "0"
+  skip_layers2 = "0_" + select_skip_layers(Bench, 30)
+  skip_layers3 = "0_" + select_skip_layers(Bench, 50)
+
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs, skip_layers1)
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs, skip_layers2)
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs, skip_layers3)
+
+
+def promiseTunerLoss2(Bench, dir_prefix):
+
+  tuner_runs = int(Bench.autotuner_runs / 3) 
+  
+  skip_layers1 = "0"
+  skip_layers2 = "0_" + select_skip_layers(Bench, 20)
+  skip_layers3 = "0_" + select_skip_layers(Bench, 40)
+
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs, skip_layers1)
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs, skip_layers2)
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs, skip_layers3)
+
+
+  
+def promiseTunerLoss3(Bench, dir_prefix):
+
+  tuner_runs = int (Bench.autotuner_runs / 3)
+  
+  skip_layers1 = "0"
+  skip_layers2 = "0_" + select_skip_layers(Bench, 10)
+  skip_layers3 = "0_" + select_skip_layers(Bench, 30)
+  
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_3,  2.5, tuner_runs, skip_layers1)
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_3,  2.5, tuner_runs, skip_layers2)
+  runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_3,  2.5, tuner_runs, skip_layers3)
+
+
+"""
+
+
+BASELINE = True
+
+  
+def runAlgoBench(Bench):
+
+  # NOTE-IMP: Changing current directory to one with promise binaries
+  dir_prefix = "../build_tuner/"
+  
+
+  if BASELINE:
+    tuner_runs = Bench.autotuner_runs 
+    runAlgoTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs)
+    runAlgoTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs)
+    runAlgoTunerCmd(Bench, dir_prefix, Bench.result_dir_3, 2.5, tuner_runs)
+    
+  else:
+    # NOTE: this path requires re-enabling the promiseTunerLoss* functions,
+    # which are currently disabled inside the docstring above
+    promiseTunerLoss1(Bench, dir_prefix)
+    promiseTunerLoss2(Bench, dir_prefix)
+    promiseTunerLoss3(Bench, dir_prefix)
+
+  
+  
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
index 2d4a3bb9ca0189e7889abeca2888f985d1bbe380..73d460be0c4091067c9d52e07ea7f4d421765ff3 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
@@ -5,16 +5,17 @@ import subprocess
 import shutil
 
 from swing_selection import loadLayerDesc
-from error_sensitivity import test_sensitivity, test_sensitivity2, test_sensitivity3 
+from error_sensitivity import test_sensitivity, test_sensitivity2, test_sensitivity3, test_sensitivity4  
 from benchmarks import bench_tuner_data
 from run_psnr import runPSNRTuner
 from run_ha_tuner import runTunerBench
 from run_hs_tuner import runPromiseBench
+from run_algo_tuner import runAlgoBench
 from compute_confs import computePSNRBenchSwings, computeBenchSwings
 from validation import runPromiseBenchValidation, runPromiseBenchValidation2, runBenchValidation
 from profiling import startProfile, stopProfile, dumpProfiles  
 from utils import createResultDirs
-
+from benchmarks import batch_id
   
 
   
@@ -52,44 +53,44 @@ def computeLayerSwings():
 
   
 
-gpu = 1
+
   
 def runPromiseTuner():
 
-  if gpu == 2:
-    start = startProfile("LeNet")  
-    runPromiseBench(bench_tuner_data["lenet_keras"])
-    stopProfile("LeNet", start)
-   
-    start = startProfile("Alexnet")  
-    runPromiseBench(bench_tuner_data["alexnet_cifar10"])
-    stopProfile("Alexnet", start)
+  
+  start = startProfile("MobileNet")  
+  runPromiseBench(bench_tuner_data["mobilenet_cifar10"])
+  stopProfile("MobileNet", start)
+  
+  start = startProfile("Alexnet")  
+  runPromiseBench(bench_tuner_data["alexnet_cifar10"])
+  stopProfile("Alexnet", start)
 
-    start = startProfile("Alexnet2")  
-    runPromiseBench(bench_tuner_data["alexnet2_cifar10"])
-    stopProfile("Alexnet2", start)  
+  start = startProfile("Alexnet2")  
+  runPromiseBench(bench_tuner_data["alexnet2_cifar10"])
+  stopProfile("Alexnet2", start)  
 
-    start = startProfile("ResNet")  
-    runPromiseBench(bench_tuner_data["resnet18_cifar10"])
-    stopProfile("ResNet", start)  
+  start = startProfile("VGG16_10")  
+  runPromiseBench(bench_tuner_data["vgg16_cifar10"])
+  stopProfile("VGG16_10", start)  
 
-  if gpu == 1:
-    
-    start = startProfile("VGG16_10")  
-    runPromiseBench(bench_tuner_data["vgg16_cifar10"])
-    stopProfile("VGG16_10", start)  
-  
-    start = startProfile("VGG16_100")  
-    runPromiseBench(bench_tuner_data["vgg16_cifar100"])
-    stopProfile("VGG16_100", start)
+  start = startProfile("VGG16_100")  
+  runPromiseBench(bench_tuner_data["vgg16_cifar100"])
+  stopProfile("VGG16_100", start)
+
+  start = startProfile("ResNet")  
+  runPromiseBench(bench_tuner_data["resnet18_cifar10"])
+  stopProfile("ResNet", start)  
 
-    start = startProfile("MobileNet")  
-    runPromiseBench(bench_tuner_data["mobilenet_cifar10"])
-    stopProfile("MobileNet", start)
+  start = startProfile("MobileNet-SH")  
+  runPromiseBench(bench_tuner_data["mobilenet_shallow"])
+  stopProfile("MobileNet-SH", start)  
+  
+  start = startProfile("LeNet")  
+  runPromiseBench(bench_tuner_data["lenet_keras"])
+  stopProfile("LeNet", start)
+  
 
-    start = startProfile("MobileNet-SH")  
-    runPromiseBench(bench_tuner_data["mobilenet_shallow"])
-    stopProfile("MobileNet-SH", start)  
 
   #runPSNRPromiseBench("pipeline_GEOM")
   #runPSNRPromiseBench("pipeline_GEMO")
@@ -97,20 +98,47 @@ def runPromiseTuner():
   #runPSNRPromiseBench("pipeline_GSM")
   #runPSNRPromiseBench("pipeline_GSME")
 
-  dumpProfiles("time_profile_17.txt")
+  dumpProfiles("time_profile" + batch_id + ".txt")
   
+
+
   
 def runPromiseValidation():
 
-  #runPromiseBenchValidation(bench_tuner_data["mobilenet_shallow"])
- 
-  #runPromiseBenchValidation("mobilenet_cifar10")
-  #runPromiseBenchValidation("resnet18_cifar10")
-  #runPromiseBenchValidation("alexnet2_cifar10")
-  #runPromiseBenchValidation("vgg_cifar100")
 
-  #runPromiseBenchValidation("vgg16_cifar10")
-  runPromiseBenchValidation2(bench_tuner_data["lenet_keras"])
+  start = startProfile("AlexNet")    
+  runPromiseBenchValidation2(bench_tuner_data["alexnet_cifar10"])
+  stopProfile("AlexNet", start)  
+
+  start = startProfile("AlexNet2")    
+  runPromiseBenchValidation2(bench_tuner_data["alexnet2_cifar10"])
+  stopProfile("AlexNet2", start)  
+
+  start = startProfile("VGG16_100")    
+  runPromiseBenchValidation2(bench_tuner_data["vgg16_cifar100"])
+  stopProfile("VGG16_100", start)  
+
+  start = startProfile("VGG16_10")    
+  runPromiseBenchValidation2(bench_tuner_data["vgg16_cifar10"])
+  stopProfile("VGG16_10", start)  
+  #runPromiseBenchValidation2(bench_tuner_data["lenet_keras"])
+
+  start = startProfile("ResNet")    
+  runPromiseBenchValidation2(bench_tuner_data["resnet18_cifar10"])
+  stopProfile("ResNet", start)  
+
+  start = startProfile("MobileNet_SH")  
+  runPromiseBenchValidation2(bench_tuner_data["mobilenet_shallow"])
+  stopProfile("MobileNet_SH", start)  
+
+  start = startProfile("MobileNet")    
+  runPromiseBenchValidation2(bench_tuner_data["mobilenet_cifar10"])
+  stopProfile("MobileNet", start)  
+
+  
+  dumpProfiles("validation_prof" + batch_id + ".txt")
+
+  
   
 
 def runAutotuner(): 
@@ -135,8 +163,44 @@ def runAutotuner():
 
 
 def runSensAnalysis():
+ 
+  start = startProfile("LeNet")  
+  test_sensitivity4(bench_tuner_data["lenet_keras"])
+  stopProfile("LeNet", start)  
 
   """
+  start = startProfile("AlexNet")  
+  test_sensitivity4(bench_tuner_data["alexnet_cifar10"])
+  stopProfile("AlexNet", start)  
+
+  start = startProfile("AlexNet2")  
+  test_sensitivity4(bench_tuner_data["alexnet2_cifar10"])
+  stopProfile("AlexNet2", start)  
+
+  start = startProfile("ResNet")  
+  test_sensitivity4(bench_tuner_data["resnet18_cifar10"])
+  stopProfile("ResNet", start)  
+
+  start = startProfile("MobileNet")  
+  test_sensitivity4(bench_tuner_data["mobilenet_cifar10"])
+  stopProfile("MobileNet", start)  
+
+  start = startProfile("MobileNet_SH")  
+  test_sensitivity4(bench_tuner_data["mobilenet_shallow"])
+  stopProfile("MobileNet_SH", start)  
+
+  start = startProfile("VGG_10")  
+  test_sensitivity4(bench_tuner_data["vgg16_cifar10"])
+  stopProfile("VGG16_10", start)  
+
+  start = startProfile("VGG_100")  
+  test_sensitivity4(bench_tuner_data["vgg16_cifar100"]) 
+  stopProfile("VGG16_100", start)  
+
+  dumpProfiles("sens_time_prof.txt")
+
+  """
+  
   start = startProfile("LeNet")  
   test_sensitivity3(bench_tuner_data["lenet_keras"])
   stopProfile("LeNet", start)  
@@ -148,8 +212,7 @@ def runSensAnalysis():
   start = startProfile("AlexNet2")  
   test_sensitivity3(bench_tuner_data["alexnet2_cifar10"])
   stopProfile("AlexNet2", start)  
-  """
-  
+
   start = startProfile("ResNet")  
   test_sensitivity3(bench_tuner_data["resnet18_cifar10"])
   stopProfile("ResNet", start)  
@@ -163,7 +226,6 @@ def runSensAnalysis():
   test_sensitivity3(bench_tuner_data["mobilenet_shallow"])
   stopProfile("MobileNet_SH", start)  
 
-  """
   start = startProfile("VGG_10")  
   test_sensitivity3(bench_tuner_data["vgg16_cifar10"])
   stopProfile("VGG16_10", start)  
@@ -171,9 +233,7 @@ def runSensAnalysis():
   start = startProfile("VGG_100")  
   test_sensitivity3(bench_tuner_data["vgg16_cifar100"]) 
   stopProfile("VGG16_100", start)  
-  
-  """
-  
+
   dumpProfiles("sens_time_prof.txt")
 
   
@@ -200,9 +260,39 @@ def runSensAnalysis():
   test_sensitivity(bench_tuner_data["vgg16_cifar100"]) 
   """
   
-  
+
+
+def runAlgoTuner():
+
+  Bench = bench_tuner_data["alexnet_cifar10"]  
+  runAlgoBench(Bench)
+
+  Bench = bench_tuner_data["mobilenet_shallow"]  
+  runAlgoBench(Bench)
+
+  Bench = bench_tuner_data["mobilenet_cifar10"]  
+  runAlgoBench(Bench)
+
+  Bench = bench_tuner_data["vgg16_cifar10"]  
+  runAlgoBench(Bench)
+
+  #Bench = bench_tuner_data["lenet_keras"]  
+  #runAlgoBench(Bench)
+
+  Bench = bench_tuner_data["alexnet2_cifar10"]  
+  runAlgoBench(Bench)
+
+
+  Bench = bench_tuner_data["vgg16_cifar100"]  
+  runAlgoBench(Bench)
+
+  Bench = bench_tuner_data["resnet18_cifar10"]  
+  runAlgoBench(Bench)
+
+
 
   
+  
 if __name__ == "__main__":
 
   createResultDirs(bench_tuner_data)
@@ -213,8 +303,10 @@ if __name__ == "__main__":
 
   #computeLayerSwings()
   
-  runPromiseTuner()    
+  #runPromiseTuner()    
 
+  runAlgoTuner()
+  
   #runPromiseValidation()
 
   #runSensAnalysis()
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_different_clock_frequencies_testing_automator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_different_clock_frequencies_testing_automator.py
new file mode 100644
index 0000000000000000000000000000000000000000..d787af8ec350b7fa2f2eeb2b0ed4c3ae4c015c95
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_different_clock_frequencies_testing_automator.py
@@ -0,0 +1,139 @@
+# Automates online benchmark testing with different clock speeds
+# Input: GPU clock speed, DDR clock speed, set of benchmark names to test
+# Set of benchmarks format: (full_bin_name, half_bin_name)
+import os
+import sys
+
+from subprocess import Popen, PIPE
+
+def set_clock_speeds(gpu_speed_mhz, ddr_speed_mhz):
+    def find_closest_clock_speed(goal_speed):
+        # Reads /sys/devices/17000000.gp10b/devfreq/17000000.gp10b/available_frequencies
+        # and finds the closest clock speed
+        AVAIL_FREQS = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/available_frequencies"
+        avail_freqs_file = open(AVAIL_FREQS, "r")
+        avail_speeds_lst = avail_freqs_file.read().strip().split()
+        avail_freqs_file.close()
+
+        # Use the goal_speed parameter rather than the enclosing gpu_speed so
+        # the helper works for any requested frequency
+        min_diff = abs(goal_speed - int(avail_speeds_lst[0]))
+        closest_speed = int(avail_speeds_lst[0])
+        for avail_speed in avail_speeds_lst[1:]:
+            avail_speed = int(avail_speed)
+            curr_diff = abs(goal_speed - avail_speed)
+            if curr_diff < min_diff:
+                min_diff = curr_diff
+                closest_speed = avail_speed
+        return closest_speed
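+
+    # Illustrative: a 1100 MHz goal with available frequencies
+    # [1122000000, 1300500000, ...] (Hz) resolves to 1122000000 (values hypothetical)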
+
+    new_conf_filename = 'jetson_clocks_conf%d_%d.txt' % (gpu_speed_mhz, ddr_speed_mhz)
+    curr_conf_filename = "jetson_clocks_conf_backup.txt"
+    if os.path.isfile(curr_conf_filename):
+        os.remove(curr_conf_filename)
+
+    # Get the current configurations in a file 
+    # Feed the password via stdin; sudo needs -S to read it from stdin
+    sudo_password = 'nvidia'
+    p = Popen(['sudo', '-S', '/home/nvidia/jetson_clocks.sh', '--store', curr_conf_filename], \
+            stdin=PIPE, universal_newlines=True)
+    p.communicate(sudo_password + '\n')
+    assert p.returncode == 0
+
+    # Read the current config file in 
+    curr_conf_file = open(curr_conf_filename, "r")
+    curr_confs = curr_conf_file.read().strip().split('\n')
+    curr_conf_file.close()
+    
+    GPU_MIN_FREQ = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"
+    GPU_MAX_FREQ = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"
+    GPU_CUR_FREQ = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"
+    
+    DDR_UPDATE_PATH = "/sys/kernel/debug/bpmp/debug/clk/emc/rate"
+
+    # Copy everything in the old configuration except for the GPU/DDR lines
+    new_conf_file = open(new_conf_filename, "w")
+    for line in curr_confs:
+        # Write the GPU clock frequencies at the end to configure the clocks even if
+        # the current configuration doesn't have one of the lines
+        if line.startswith(GPU_MIN_FREQ) or line.startswith(GPU_MAX_FREQ) or \
+                line.startswith(GPU_CUR_FREQ) or line.startswith(DDR_UPDATE_PATH):
+            continue
+        else:
+            new_conf_file.write("%s\n" % line)
+
+    MHZ_TO_HZ_MULT = 1000000
+    gpu_speed = gpu_speed_mhz * MHZ_TO_HZ_MULT
+    ddr_speed = ddr_speed_mhz * MHZ_TO_HZ_MULT
+
+    # Set GPU
+    closest_gpu_speed = find_closest_clock_speed(gpu_speed)
+    print("Setting GPU speed to %d" % closest_gpu_speed)
+    new_conf_file.write("%s:%d\n" % (GPU_MIN_FREQ, closest_gpu_speed))
+    new_conf_file.write("%s:%d\n" % (GPU_MAX_FREQ, closest_gpu_speed))
+    #new_conf_file.write("%s:%d\n" % (GPU_CUR_FREQ, closest_gpu_speed))
+
+    # Set DDR
+    new_conf_file.write("%s:%d\n" % (DDR_UPDATE_PATH, ddr_speed))
+    new_conf_file.close()
+
+    # Set the new configuration
+    p = Popen(['sudo', '-S', '/home/nvidia/jetson_clocks.sh', '--restore', new_conf_filename], \
+            stdin=PIPE, universal_newlines=True)
+    p.communicate(sudo_password + '\n')
+    assert p.returncode == 0
+    print("SUCCESSFULLY SET CLOCK SPEEDS")
+
+
+def run_benchmark(bin_name, should_print_bin_output):
+    print("RUNNING %s" % bin_name)
+    proc = Popen("./%s" % bin_name, stdout = PIPE, universal_newlines = True)
+    proc_output = proc.communicate()[0]
+    assert proc.returncode == 0
+    
+    if should_print_bin_output:
+        print(proc_output)
+    print("FINISHED RUNNING %s" % bin_name)
+    return proc_output    
+
+
+def parse_binary_output(proc_output):
+    avg_time_key_ind = proc_output.find("Average time:")
+    assert avg_time_key_ind >= 0
+    avg_time = proc_output[avg_time_key_ind : proc_output.find("\n", avg_time_key_ind)]
+    print(avg_time)
+    return avg_time
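+
+# The benchmark binaries are assumed to print a line beginning with
+# "Average time:"; parse_binary_output returns that entire line.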
+
+
+# Input: a file where each line is a tuple of benchmark binary names:
+# (full_bin_name, half_bin_name)
+def run_benchmarks(benchmarks_filename, output_filename, should_print_bin_output):
+    benchmarks_file = open(benchmarks_filename, "r")
+    output_file = open(output_filename, "w")
+
+    def parse_binary_names_tuple(tuple_line):
+        tuple_line = tuple_line.replace("(", "").replace(")", "").strip().split(',')
+        return tuple_line[0].strip(), tuple_line[1].strip()
+
+    for line in benchmarks_file:
+        full_bin_name, half_bin_name = parse_binary_names_tuple(line)
+        output_file.write("%s: %s\n" % (full_bin_name, \
+                parse_binary_output(run_benchmark(full_bin_name, should_print_bin_output))))
+        output_file.write("%s: %s\n" % (half_bin_name, \
+                parse_binary_output(run_benchmark(half_bin_name, should_print_bin_output))))    
+
+    benchmarks_file.close()
+    output_file.close()
+
+
+if __name__ == "__main__":
+    num_args = len(sys.argv)
+
+    if num_args != 5 and num_args != 6:
+        print("Usage: python online_benchmark_testing_automator.py <gpu freq in MHz> <ddr freq in MHz> <binary_names_file> <output_file> [1 to print binary output]")
+        print("Binary names file format: (full_binary_name, half_binary_name)<newline>")
+        exit(1)
+    print("GPU clock speed: %s" % sys.argv[1])
+    print("DDR clock speed: %s" % sys.argv[2])
+    print("Benchmarks file name: %s" % sys.argv[3])
+    print("Output file name: %s" % sys.argv[4])
+
+    set_clock_speeds(int(sys.argv[1]), int(sys.argv[2]))
+    run_benchmarks(sys.argv[3], sys.argv[4], num_args == 6 and sys.argv[-1] == "1")
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1f00f4e285fbf487fee03bfee72dbe1a84ea55a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py
@@ -0,0 +1,72 @@
+# Automates online benchmark testing with different clock speeds
+# Input: set of benchmark names to test
+# Set of benchmarks format: (full_bin_name, half_bin_name)
+import os
+import sys
+
+from collections import defaultdict
+from subprocess import Popen, PIPE
+
+def run_benchmark(bin_name, should_print_bin_output):
+    print("RUNNING %s" % bin_name)
+    proc = Popen("./%s" % bin_name, stdout = PIPE, universal_newlines = True)
+    proc_output = proc.communicate()[0]
+    assert proc.returncode == 0
+    
+    if should_print_bin_output:
+        print(proc_output)
+    print("FINISHED RUNNING %s" % bin_name)
+    return proc_output    
+
+
+def parse_binary_output(proc_output):
+    avg_time_key_ind = proc_output.find("Average time:")
+    assert avg_time_key_ind >= 0
+    avg_time = proc_output[avg_time_key_ind : proc_output.find("\n", avg_time_key_ind)]
+    print(avg_time)
+    return avg_time
+
+
+def get_sorted_binaries(builds_dir):
+    # dict of network names to lists of binaries
+    # list of binaries should be in sorted order (can do that when we run the benchmarks)
+    network_bins = defaultdict(list)
+    for bin_name in os.listdir(builds_dir):
+        if bin_name.find("profiling") == -1:
+            continue
+        network_name = bin_name[ : bin_name.rfind("_")]
+        network_bins[network_name].append(bin_name)
+    return network_bins
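+
+# Illustrative (hypothetical names): a builds dir with "lenet_profiling_1" and
+# "lenet_profiling_2" yields {"lenet_profiling": ["lenet_profiling_1", "lenet_profiling_2"]};
+# binaries without "profiling" in their name are skipped.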
+
+# Input: dict mapping network names to lists of binary names (from
+# get_sorted_binaries), plus the builds directory and the output filename
+def run_benchmarks(sorted_bins, builds_dir, output_filename, should_print_bin_output = False):
+    def get_knob_id(bin_name):
+        return int(bin_name[bin_name.rfind("_") + 1 : ])
+
+    output_file = open(output_filename, "w", buffering = 0)
+    for network_name in sorted_bins:
+        # Sort the binaries in order by knob id
+        sorted_bins[network_name].sort(key = get_knob_id)
+        print("--------------------------------------")
+        print(network_name)
+        # Go through all binaries
+        for bin_name in sorted_bins[network_name]:
+            print(bin_name)
+            output_file.write("%s results\n" % bin_name)
+            output_file.write("%s: %s\n" % (bin_name, \
+                parse_binary_output(run_benchmark(os.path.join(builds_dir, bin_name), \
+                should_print_bin_output))))
+        print("--------------------------------------\n")
+    output_file.close()
+
+
+if __name__ == "__main__":
+    num_args = len(sys.argv)
+
+    if num_args != 3:
+        print("Usage: python online_benchmark_testing_automator.py <builds dir> <outputs_file_name>")
+        exit(1)
+    print("Output file name: %s" % sys.argv[2])
+    sorted_bins = get_sorted_binaries(sys.argv[1])
+    run_benchmarks(sorted_bins, sys.argv[1], sys.argv[2])
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/cmakelists_generator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/cmakelists_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f6c5eec378276cd0c89fcc7013cb6996a90f2f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/cmakelists_generator.py
@@ -0,0 +1,109 @@
+# Generates a CMakeLists.txt file for all generated files in a specific directory
+# Input: Arbitrarily long list containing names of all generated files directories
+# Ex: alexnet_cifar10_autogenerated_knobs mobilenet_cifar10_autogenerated_knobs
+# If inputted 0 parameters: Generates CMakeLists.txt file for all generated files in CURRENT dir
+
+import sys
+import os
+
+def get_all_generated_directory_names(): 
+    '''
+    Returns a list of all generated source code directories (<>_autogenerated_knobs)
+    in the current directory. Called when program is run with 0 args
+    '''
+    generated_dir_names = []
+    for dir_name in os.listdir("."):
+        print(dir_name)
+        if dir_name.endswith("autogenerated_knobs"):
+            generated_dir_names.append(dir_name)
+    return generated_dir_names
+
+
+def generate_cmakelists_setup(cmakelists_file):
+    '''
+    Copies over all the setup instructions (ex: finding libraries) from a "base" CMakeLists.txt
+    file. Ends copying when we find the first instance of add_executable
+
+    Args:
+        cmakelists_file: File object to write cmake instructions to 
+
+    Assumption: All setup instructions come before any add_executable instructions
+    '''
+    BASE_CMAKELISTS_PATH = "/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt"
+    base_cmakelists_file = open(os.path.join(BASE_CMAKELISTS_PATH, "CMakeLists.txt"), "r")
+
+    find_lib_line = ""
+
+    for line in base_cmakelists_file:
+        if line.find("add_executable") != -1:
+            break
+
+        elif line.startswith("#"):
+            continue
+
+        # Special case: ignore / if -I flag exists
+        elif line.find("/") != -1 and line.find("-I") == -1: 
+            dot_dot_slash_ind = line.find("../")
+            dot_slash_ind = line.find("./")
+            if dot_dot_slash_ind != -1:
+                start_ind = dot_dot_slash_ind
+            elif dot_slash_ind != -1:
+                start_ind = dot_slash_ind
+            else:
+                slash_ind = line.find("/")
+                prev_space_ind = line[:slash_ind].rfind(" ")
+                start_ind = prev_space_ind + 1
+
+            old_rel_path = []
+            while start_ind < len(line):
+                if line[start_ind] == ")" or line[start_ind].isspace():
+                    break
+                old_rel_path.append(line[start_ind])
+                start_ind += 1
+            old_rel_path = ''.join(old_rel_path)
+            if os.path.isabs(old_rel_path):
+                cmakelists_file.write(line)
+            else:
+                new_path = os.path.join(BASE_CMAKELISTS_PATH, old_rel_path)
+                cmakelists_file.write(line.replace(old_rel_path, new_path))
+            continue
+        cmakelists_file.write(line)
+    base_cmakelists_file.close()
+
+
+def generate_cmakelists_file(cmakelists_file, source_file_dirs):
+    generate_cmakelists_setup(cmakelists_file)
+    LIBRARIES = "tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}"
+    cmake_instrs = []
+
+    for source_file_dir in source_file_dirs:
+        cmake_instrs.append("# %s" % source_file_dir)
+        for source_file in os.listdir(source_file_dir):
+            # Executable name = name of source code file without file extension
+            file_ext_ind = source_file.find(".cc")
+            if file_ext_ind == -1:
+                print("WARNING: Found file with wrong extension. Skipping. %s" % source_file)
+                continue
+            exec_name = source_file[ : file_ext_ind]
+            
+            source_file_path = os.path.join(source_file_dir, source_file)
+            cmake_instrs.append("add_executable(%s %s)" % (exec_name, source_file_path))
+            cmake_instrs.append("target_link_libraries(%s %s)\n" % (exec_name, LIBRARIES))
+        cmake_instrs.append("\n")
+    cmakelists_file.write('\n'.join(cmake_instrs))
+
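+# Illustrative generated instructions (file names hypothetical):
+#   add_executable(lenet_perf_1 lenet_perf_autogenerated_knobs/lenet_perf_1.cc)
+#   target_link_libraries(lenet_perf_1 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+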
+
+if __name__ == "__main__":
+    num_args = len(sys.argv)
+
+    if num_args >= 2 and sys.argv[1] == "--usage":
+        print("python cmakelists_generator.py <names of all generated files directories>")
+        print("If given no parameters: Generates CMakeLists.txt file for all generated files in CURRENT directory")
+        exit(1)
+
+    cmakelists_file = open("CMakeLists.txt", "w")
+    if num_args == 1:
+        generate_cmakelists_file(cmakelists_file, get_all_generated_directory_names())
+    else:
+        generate_cmakelists_file(cmakelists_file, sys.argv[1:])
+    cmakelists_file.close()
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/source_code_autogenerator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/source_code_autogenerator.py
new file mode 100644
index 0000000000000000000000000000000000000000..d587a3b7b57b96c8eb61b2e3e63709c7745ed466
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/source_code_autogenerator.py
@@ -0,0 +1,295 @@
+# Input: file of the following table format
+#   id    knob configurations (arbitrary # of columns)   orig_func_name     new_func_name 
+# Input: file containing list of filenames to generate modified sources for 
+# Generates:
+#   a new directory called <original_source_name>_autogenerated_knobs
+#   files named <original_source_name>_<id>.cc within their respective directories
+
+import glob
+import sys
+import os
+import re
+
+class Approx:
+    FP32 = 0
+    FP16 = 1
+    PERF = 2
+    SAMP = 3
+
+class KnobConfiguration:
+    '''
+    Stores the configurations as well as other useful information for each knob configuration
+    Stores: id (may factor out if ids are guaranteed to start at 0/1 and be consecutive)
+            original function name
+            modified function name
+            new function parameters (knobs)
+            new function call (modified function name(knobs)) 
+    '''
+    def __init__(self, raw_config):
+        '''
+        Args: raw_config = line of configuration file to parse
+        '''
+        line_as_lst = raw_config.strip().split()
+        # approx,<id> knob1,knob2,etc IGNORE old_fun_name new_fun_name
+
+        approx_id_lst = line_as_lst[0].split(',')
+        assert len(approx_id_lst) == 2
+
+        self.id = int(approx_id_lst[1])
+
+        if approx_id_lst[0] == "fp32":
+            self.approx = Approx.FP32
+            return # special case 
+        elif approx_id_lst[0] == "fp16":
+            self.approx = Approx.FP16
+            return # special case
+        elif approx_id_lst[0] == "perf":
+            self.approx = Approx.PERF
+        elif approx_id_lst[0] == "samp":
+            self.approx = Approx.SAMP
+
+        self.orig_func_name = line_as_lst[-2] # Second to last element
+        self.modified_func_name = line_as_lst[-1] # Last element  
+        self.params = line_as_lst[1].split(",") # First element = knob configuration 
+ 
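+    # Illustrative table lines (names and knob values hypothetical):
+    #   fp16,12
+    #   perf,23   1,2,0   IGNORE   tensorConvolution   tensorConvPerf
+    # fp32/fp16 rows need only "fp32,<id>" / "fp16,<id>"; parsing returns early.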
+
+    # DEBUG
+    def __repr__(self):
+        if self.approx == Approx.FP32:
+            return "FP32"
+        elif self.approx == Approx.FP16:
+            return "FP16"
+
+        approx_type = None
+        if self.approx == Approx.PERF:
+            approx_type = "PERF"
+        elif self.approx == Approx.SAMP:
+            approx_type = "SAMP"
+        return "Approx: %s, ID: %d, Orig func nane: %s, Modified func nane: %s, Params: %s" \
+                % (approx_type, self.id, self.orig_func_name, self.modified_func_name, \
+                   ', '.join(self.params))
+
+
+def get_new_path(old_path, orig_source_code_dir):
+    '''
+    Returns a path that's compatible with the location of the generated source code
+
+    Args:
+        old_path: Original path of file that's being included
+        orig_source_code_dir: Path to original source code dir wrt the current dir
+    '''
+    if os.path.isabs(old_path): # Old path works
+        return old_path 
+    # Adding an extra .. because the path should be wrt the generated directory
+    return os.path.join("..", orig_source_code_dir, old_path)
+
+
+# "complete_line" = a valid line of code  
+def get_new_function_calls(complete_line, knob_config):
+    '''
+    Returns a copy of an inputted line of code such that all instances of old 
+    function calls are replaced with newFunctionCall(old params, knobs)
+
+    Note: The old calls aren't completely overriden, as we still need the old parameters but
+    insert new parameters as well
+
+    Args:
+        complete_line: A complete line of code to process
+        knob_config: KnobConfiguration object representing current configuration 
+    '''
+    orig_func_ind = complete_line.find(knob_config.orig_func_name)
+    new_line = []
+    line_start_ind = 0
+    last_ind = 0
+
+    while orig_func_ind != -1:
+        new_line.append(complete_line[line_start_ind : orig_func_ind])
+        line_start_ind = complete_line.find(")", orig_func_ind) + 1 
+        
+        old_func_call = complete_line[complete_line.find("(", orig_func_ind): line_start_ind]
+        new_line.append("%s%s, %s)" % (knob_config.modified_func_name, old_func_call[:-1], ', '.join(knob_config.params)))
+        orig_func_ind = complete_line.find(knob_config.orig_func_name, line_start_ind)
+    # After the loop, append the remainder of the line past the last call
+    new_line.append(complete_line[line_start_ind : ])
+    return ''.join(new_line)
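+
+# Illustrative (hypothetical names/knobs): with orig_func_name "tensorConvolution",
+# modified_func_name "tensorConvPerf" and params ["2", "1"], the line
+#   "void* c = tensorConvolution(x, w);"
+# becomes
+#   "void* c = tensorConvPerf(x, w, 2, 1);"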
+
+
+def convert_local_paths(file_contents, orig_source_dir): 
+    '''
+    Converts all local paths wrt the original source file's directory to paths compatible
+    with the current source code directory
+
+    Args:
+        file_contents: String containing source code read from file
+        orig_source_dir: Path of original source code dir wrt the current directory 
+    '''
+    last_include_ind = file_contents.rfind("#include")
+    last_include_newline_ind = file_contents.find("\n", last_include_ind)
+    include_lines = file_contents[ : last_include_newline_ind].split("\n")
+    
+    new_file_contents = []
+    for line in include_lines:
+        if line.startswith("#"):
+            include_file = line.split()[1]
+            if include_file.startswith("\""):
+                new_include_path = get_new_path(include_file.replace("\"", ""), orig_source_dir.replace("\"", ""))
+                new_file_contents.append("#include \"%s\"\n" % new_include_path)
+            else:
+                new_file_contents.append(line)
+    new_file_contents.append(file_contents[last_include_newline_ind : ])
+    return '\n'.join(new_file_contents)
+
+
+def generate_fp32_source(new_file, source_file, orig_source_dir):
+    # Copy the source code over 
+    new_file_contents = convert_local_paths(source_file.read(), orig_source_dir)
+    new_file.write(new_file_contents)
+
+
+def generate_fp16_source(knob_config, new_file, source_file, orig_source_dir):
+    file_contents = source_file.read()
+
+    new_file_contents = convert_local_paths(file_contents, orig_source_dir)
+
+    # Replace all tensorOperation calls with tensorHalfOperation calls
+    # Derived from ../bin/replace_half_calls.py 
+    # NOTE: Not very portable but I don't see another way of ONLY replacing tensorOperation FUNCTION calls
+    new_file_contents = new_file_contents.replace("tensorConvolution", "tensorHalfConvolution")
+    new_file_contents = new_file_contents.replace("tensorAdd", "tensorHalfAdd")
+    new_file_contents = new_file_contents.replace("tensorRelu", "tensorHalfRelu")
+    new_file_contents = new_file_contents.replace("tensorRelu2", "tensorHalfRelu2")
+    new_file_contents = new_file_contents.replace("tensorTanh", "tensorHalfTanh")
+    new_file_contents = new_file_contents.replace("tensorPooling", "tensorHalfPooling")
+    new_file_contents = new_file_contents.replace("tensorGemmGPU", "tensorHalfGemmGPU")
+   
+    new_file.write(new_file_contents)
+
+
+def generate_approx_source(knob_config, new_file, source_file, orig_source_dir):
+    new_file_contents = []
+
+    # Store complete line to handle cases where one line of code is split into two lines
+    complete_line = ""
+    for line in source_file:
+        # Replace the current path of the local include with a path that's compatible
+        # with the location of the generated source code
+        if line.startswith("#"):
+            include_file = line.split()[1]
+            if include_file.startswith("\""):
+                new_include_path = get_new_path(include_file.replace("\"", ""), orig_source_dir.replace("\"", ""))
+                new_file_contents.append("#include \"%s\"\n" % new_include_path)
+            else:
+                new_file_contents.append(line)
+            continue
+        # Handles case where 1 actual line of code is split into 2 lines
+        elif line.find("}") != -1 or line.find("{") != -1:
+            complete_line += line
+            new_file_contents.append(complete_line)
+            complete_line = ""
+            continue
+        elif line.find(";") == -1: # Last char is always \n
+            complete_line += line
+            continue
+
+        complete_line += line
+        orig_func_ind = complete_line.find(knob_config.orig_func_name)
+        if orig_func_ind != -1:
+            new_file_contents.append(get_new_function_calls(complete_line, knob_config))
+        else:
+            new_file_contents.append(complete_line)
+        complete_line = ""
+    new_file.write(''.join(new_file_contents))
+
+
+def generate_source_code(table, dir_name, filename, source_name):
+    '''
+    Generates source code for all configurations in the table for one original source 
+    Args
+        table: List of KnobConfigurations
+        dir_name: Directory new sources should be placed in
+        filename: Filename of original source
+        source_name: Filename without the file extension (ex: foo/blah.cc --> blah)
+    '''
+    source_file = open(filename, "r") 
+    orig_source_dir = os.path.dirname(filename)
+
+    for knob_config in table:
+        source_file.seek(0, 0)
+        new_filename = os.path.join(dir_name, "%s_%s.cc" % (source_name, knob_config.id))
+        new_file = open(new_filename, "w")
+        if knob_config.approx == Approx.FP16:
+            generate_fp16_source(knob_config, new_file, source_file, orig_source_dir)
+        elif knob_config.approx == Approx.FP32:
+            generate_fp32_source(new_file, source_file, orig_source_dir)
+        else:
+            generate_approx_source(knob_config, new_file, source_file, orig_source_dir)
+
+        new_file.close()
+        print("Generated source code as %s" % new_filename)
+    source_file.close()
+
+
+def generate_all_sources(table, orig_files_filename):
+    '''
+    Generates directories and source code for every original source, for all knob configurations
+    Args:
+        table: List of KnobConfiguration objects
+        orig_files_filename: Name of a file listing the original source filenames to
+               generate new sources for
+    '''
+    orig_files = open(orig_files_filename, "r")
+    for orig_filename in orig_files:
+        orig_filename = orig_filename.strip()
+
+        # Source name = original filename without the .cc 
+        last_slash_ind = orig_filename.rfind("/")
+        file_ext_ind = orig_filename.find(".cc")
+        if last_slash_ind == -1:
+            source_name = orig_filename[ : file_ext_ind]
+        else:
+            source_name = orig_filename[last_slash_ind + 1 : file_ext_ind]
+        print("Source name: %s" % source_name)
+       
+        # Start with a clean directory
+        dir_name = "%s_autogenerated_knobs" % source_name
+        print("Setting up directory: %s" % dir_name)
+        if os.path.isdir(dir_name):
+            print("Directory exists: clearing everything")
+            for old_file in glob.glob(os.path.join(dir_name, "*")):
+                os.remove(old_file)
+
+        else:
+            print("Generating directory: %s" % dir_name)
+            os.makedirs(dir_name)
+            
+        generate_source_code(table, dir_name, orig_filename, source_name)
+        print("\n")
+    orig_files.close()
+
+
+def parse_table(table_filename):
+    '''
+    Given the filename of a table, parses the table into a list of KnobConfigurations 
+    '''
+    # If ids are guaranteed to start at 1 and be contiguous, the configurations
+    # could be indexed by id in a list; otherwise this would need a dict
+    table = []
+    table_file = open(table_filename, "r")
+    for raw_config in table_file:
+        table.append(KnobConfiguration(raw_config))
+    table_file.close()  
+    return table
+    
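+# A hypothetical table line, following the format printed under --usage below
+# (the exact knob syntax depends on KnobConfiguration's parser):
+#   1 2 2 1 tensorConvolution tensorConvPerfCuda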
+
+if __name__ == "__main__":
+    num_args = len(sys.argv)
+    if num_args != 3:
+        print("Usage: python source_code_autogenerator.py <table file> <original filenames file>")
+        if num_args >= 2 and sys.argv[1] == "--usage":
+            print("Table file format: <id> <knob configurations separated by spaces> <orig func name> <new func name>")
+            print("Original filenames file: <original_filename><newline> etc")
+        else:
+            print("Run with --usage flag for more detailed information")
+        exit(1)
+
+    table = parse_table(sys.argv[1])
+    generate_all_sources(table, sys.argv[2])
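+
+# Example invocation (filenames are illustrative):
+#   python source_code_autogenerator.py knob_table.txt orig_sources.txt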
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c9583f291ea908c4c89a8b56045e06585a4f83a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc
@@ -0,0 +1,185 @@
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+
+bool Opentuner_run = false;
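+// When Opentuner_run is true, each iteration handshakes with the autotuner
+// over the /tmp/myfifo pipe: block until a token arrives, abort on
+// "stop_run", and write a completion token once the iteration finishes.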
+
+int total_runs = 1;
+
+
+/* NOTE: Reference Architecture to use for profiling */
+void testLenetTanh(){
+
+  if(Opentuner_run){
+    total_runs = 1000000;
+  }
+
+  
+  printf("********* Lenet-2 Architecture ********** \n");
+  // FIXIT: Extend this to the full test set - currently a single batch of test_batch_size images
+
+  int test_batch_size = 1000;
+
+  uint8_t* labels = readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size);
+  
+  void* input = readInputTensor("../model_params/lenet_params/datasets/t10k-images-idx3-ubyte",
+				CUDNN_DATA_FLOAT,
+				test_batch_size, 1, 28, 28);
+
+  // NOTE: Filter descriptors do NOT have batch size
+  // NOTE: First two dims are output channels (configurable) and input channels (MUST match the input tensor's channels)
+  // IMP: The output channel count matches the trained model - not the Lenet arch proposed in Andrew Ng's class
+  void* conv1_filter = readTrainedWeights("../model_params/lenet_keras/conv1.bin",
+					  float_type, 32, 1, 5, 5);    
+  void* conv1_bias = readTrainedWeights("../model_params/lenet_keras/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/lenet_keras/conv2.bin",
+					  float_type, 64, 32, 5, 5);  
+  void* conv2_bias = readTrainedWeights("../model_params/lenet_keras/conv2_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* fc1_weights = readTrainedWeights("../model_params/lenet_keras/fc1.bin",
+					 float_type, 1, 1, 7*7*64, 1024);  
+  void* fc1_bias = readTrainedWeights("../model_params/lenet_keras/fc1_bias.bin",
+				      float_type, 1, 1024, 1, 1);  
+  void* fc2_weights = readTrainedWeights("../model_params/lenet_keras/fc2.bin",
+					 float_type, 1, 1, 1024, 10);  
+  void* fc2_bias = readTrainedWeights("../model_params/lenet_keras/fc2_bias.bin",
+				      float_type, 1, 10, 1, 1);  
+
+
+  
+  clearTensorMap();
+  
+  for(int i = 0; i < total_runs; i++){
+
+    if(Opentuner_run){
+
+      const char* myfifo = "/tmp/myfifo";
+      int fd = open(myfifo, O_RDONLY);
+
+      int ret_val = fcntl(fd, F_GETFD);
+      if(ret_val == -1){
+	printf("Invalid descriptor \n");
+	abort();
+      }
+
+      char str[100];
+      read(fd, str, 80);
+      if(strcmp(str, "stop_run") == 0){
+	abort();
+      }
+
+      close(fd);
+    }
+
+    
+    readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
+
+    // Start power and performance profiling 
+    startProfiling();
+  
+    int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
+    // NOTE: 'SAME' convolution
+    //void* conv1out = tensorConvPerfCuda(input, conv1_filter, 2, 2, 1, 1,
+    //				conv_mode, conv_precision, 2, 2, 1);
+
+    void* conv1out = tensorConvSampSim(input, conv1_filter, 2, 2, 1, 1,
+    				       conv_mode, conv_precision, 4, 0);
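+    // (tensorConvSampSim stands in for the perforated tensorConvPerfCuda
+    // call commented out above; the trailing integer args are presumably the
+    // sampling knob parameters from the knob table.)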
+
+    // NOTE: For tensorAdd, the only dimension that MUST match is channels  
+    tensorAdd(conv1out, conv1_bias); // NOTE: In place operation
+
+    void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
+
+    void* conv1_tanh = tensorTanh(pool1out);
+
+    // NOTE: input channels have to match between tensor op inputs and outputs 
+    //void* conv2out = tensorConvPerfCuda(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+    //				conv_mode, conv_precision, 1, 2, 1);
+
+    void* conv2out = tensorConvSampSim(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+				       conv_mode, conv_precision, 2, 0);
+    
+    tensorAdd(conv2out, conv2_bias); // NOTE: In place operation
+
+    void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+
+    void* conv2_tanh = tensorTanh(pool2out);
+
+    void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights);  
+
+    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+
+    void* tanh1out = tensorTanh(gemm1biasout);
+  
+    void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights);  
+  
+    void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias);
+
+    void* tanh2out = tensorTanh(gemm2_biasout);
+  
+    void* result = tensorSoftmax(tanh2out);
+
+    // End profiling and dump output to profile.txt
+    stopProfiling();
+  
+    float accuracy = computeAccuracy2(labels, test_batch_size, result);
+    dumpFinalAccuracy(accuracy); 
+
+    
+    // FIXME: uncomment dumpAccuracyNorms() below to use the piped autotuner
+    //dumpAccuracyNorms();
+    freeOutputTensors();  
+
+    if(Opentuner_run){
+
+      const char* myfifo = "/tmp/myfifo";
+      int fd_out = open(myfifo, O_WRONLY);
+      int ret_val = fcntl(fd_out, F_GETFD);
+      if(ret_val == -1){
+	printf("Invalid descriptor \n");
+	abort();
+      }
+      
+      const char* str = "completed***!\n\0";
+      write(fd_out, str, 80);
+      close(fd_out);
+    }
+    
+  }
+
+  dumpExecutionAccuracies();
+
+  
+}
+
+
+
+int main(int argc, char* argv[]){
+
+  if (argc > 1){
+    total_runs = atoi(argv[1]);
+  }
+
+  llvm_hpvm_initTensorRt(0);
+
+  testLenetTanh();
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6ab2aed33b13a249214d94508e193d0b6049aaf
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling.cc
@@ -0,0 +1,162 @@
+// Whole-network profiling: times the full forward pass over each batch
+
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+/* NOTE: Reference Architecture to use for profiling */
+void testCifarNet(){
+
+  printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
+ 
+  std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); 
+  std::string input_path =  dir_prefix + std::string("norm_cifar_input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin"); 
+
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin",
+					  float_type, 32, 3, 3, 3);  
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin",
+					  float_type, 32, 32, 3, 3);  
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin",
+					float_type, 1, 32, 1, 1);
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin",
+					  float_type, 64, 32, 3, 3);  
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin",
+					  float_type, 64, 64, 3, 3);  
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin",
+					float_type, 1, 64, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin",
+					  float_type, 128, 64, 3, 3);  
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin",
+					float_type, 1, 128, 1, 1);
+  void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin",
+					  float_type, 128, 128, 3, 3);  
+  void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin",
+					float_type, 1, 128, 1, 1);
+  
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin",
+					 float_type, 1, 1, 2048, 10);  
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin",
+				      float_type, 1, 10, 1, 1);  
+ 
+  
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
+  startMemTracking();
+
+  int total_runs = 100; 
+
+  int test_input_size = 5000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+
+  Profiler profiler;
+  profiler.start_profiler();
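+
+  // Profiler usage in this file: resume_profiler()/pause_profiler() bracket
+  // each measured region, get_time_energy() returns the (time, energy) pair
+  // for that region, and reset() clears it before the next sample.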
+
+  double total_time = 0.0;
+
+  for(int i = 0; i < total_runs; i++){
+    for(int i = 0; i < batch_count; i++){
+      int start = i * batch_size;
+      int end = (i + 1) * batch_size;
+      void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+
+      // FIRST Tensor Runtime CALL
+      profiler.resume_profiler();
+      void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      tensorAdd(conv1out, conv1_bias); 
+      void* conv1_tanh = tensorTanh(conv1out);
+
+      // 2nd Layer
+      void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      tensorAdd(conv2out, conv2_bias); 
+      void* conv2_tanh = tensorTanh(conv2out);
+      void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+
+      // 3rd Layer
+      void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      tensorAdd(conv3out, conv3_bias); 
+      void* conv3_tanh = tensorTanh(conv3out);
+
+      // 4th Layer
+      void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      tensorAdd(conv4out, conv4_bias); 
+      void* conv4_tanh = tensorTanh(conv4out);
+      void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+
+      // 5th Layer
+      void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      tensorAdd(conv5out, conv5_bias); 
+      void* conv5_tanh = tensorTanh(conv5out);
+
+      // 6th Layer
+      void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      tensorAdd(conv6out, conv6_bias); 
+      void* conv6_tanh = tensorTanh(conv6out);
+      void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+
+      // final FC Layer
+      void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); 
+      void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+      void* result = tensorSoftmax(gemm1biasout);
+
+      profiler.pause_profiler();
+      auto time_energy = profiler.get_time_energy();
+      total_time += time_energy.first;
+      profiler.reset();
+
+      uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+      float accuracy = computeAccuracy2(labels, batch_size, result); 
+      final_accuracy += accuracy;
+    
+      freeBatchMemory();
+    }
+  }
+  profiler.stop_profiler();
+
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+  
+  stopProfiling();
+  final_accuracy = (final_accuracy / batch_count) / total_runs;
+  dumpFinalAccuracy(final_accuracy);
+}
+
+
+int main(int argc, char* argv[]){
+
+  llvm_hpvm_initTensorRt(0);
+
+  testCifarNet();
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f95a7bda4fc581e4c40d4882304156f2420f22a5
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc
@@ -0,0 +1,262 @@
+// Per-tensor-operation profiling: time and energy are accumulated per op
+
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+// Pause the profiler, accumulate the (time, energy) sample under op_name
+// (inserting on first sight), then reset for the next sample
+void add_data(std::unordered_map<std::string, std::pair<double, double> >& total_time_energies, Profiler& profiler, const std::string& op_name){
+    profiler.pause_profiler();
+    auto time_energy = profiler.get_time_energy();
+
+    auto itr = total_time_energies.find(op_name);
+    if (itr == total_time_energies.end()){
+        total_time_energies.insert(std::make_pair(op_name, time_energy));
+    } else {
+        itr->second.first += time_energy.first;
+        itr->second.second += time_energy.second;
+    }
+    profiler.reset();
+}
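+
+// Typical wrapping pattern applied to every tensor call in testCifarNet below:
+//   profiler.resume_profiler();
+//   void* out = tensorOp(...);   // any tensor runtime call
+//   add_data(total_time_energies, profiler, "OpName");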
+
+/* NOTE: Reference Architecture to use for profiling */
+void testCifarNet(){
+
+  printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
+ 
+  std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); 
+  std::string input_path =  dir_prefix + std::string("norm_cifar_input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin"); 
+
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin",
+					  float_type, 32, 3, 3, 3);  
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin",
+					  float_type, 32, 32, 3, 3);  
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin",
+					float_type, 1, 32, 1, 1);
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin",
+					  float_type, 64, 32, 3, 3);  
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin",
+					  float_type, 64, 64, 3, 3);  
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin",
+					float_type, 1, 64, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin",
+					  float_type, 128, 64, 3, 3);  
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin",
+					float_type, 1, 128, 1, 1);
+  void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin",
+					  float_type, 128, 128, 3, 3);  
+  void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin",
+					float_type, 1, 128, 1, 1);
+  
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin",
+					 float_type, 1, 1, 2048, 10);  
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin",
+				      float_type, 1, 10, 1, 1);  
+ 
+  
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
+  std::ofstream online_profiler_output;
+  online_profiler_output.open("online_output.txt");
+
+  startMemTracking();
+
+  // NOTE: input size changed to standardize across benchmarks
+  int total_runs = 50; // NOTE: reduced from 100 for now
+
+  int test_input_size = 5000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+
+  Profiler profiler;
+  profiler.start_profiler();
+
+  // Get the total time and energy per tensor per run 
+  std::unordered_map<std::string, std::pair<double, double> > total_time_energies;
+
+  for(int i = 0; i < total_runs; i++){
+    for(int i = 0; i < batch_count; i++){
+      int start = i * batch_size;
+      int end = (i + 1) * batch_size;
+      void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+
+      // FIRST Tensor Runtime CALL
+      profiler.resume_profiler();
+      void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv1");
+       
+      profiler.resume_profiler();
+      tensorAdd(conv1out, conv1_bias); 
+      add_data(total_time_energies, profiler, "Add1");
+
+      profiler.resume_profiler();
+      void* conv1_tanh = tensorTanh(conv1out);
+      add_data(total_time_energies, profiler, "Tanh1");
+
+      // 2nd Layer
+      profiler.resume_profiler();
+      void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv2");
+
+      profiler.resume_profiler();
+      tensorAdd(conv2out, conv2_bias); 
+      add_data(total_time_energies, profiler, "Add2");
+
+      profiler.resume_profiler();
+      void* conv2_tanh = tensorTanh(conv2out);
+      add_data(total_time_energies, profiler, "Tanh2");
+
+      profiler.resume_profiler();
+      void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+      add_data(total_time_energies, profiler, "Pool1");
+
+      // 3rd Layer
+      profiler.resume_profiler();
+      void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv3");
+
+      profiler.resume_profiler();
+      tensorAdd(conv3out, conv3_bias); 
+      add_data(total_time_energies, profiler, "Add3");
+
+      profiler.resume_profiler();
+      void* conv3_tanh = tensorTanh(conv3out);
+      add_data(total_time_energies, profiler, "Tanh3");
+
+      // 4th Layer
+      profiler.resume_profiler();
+      void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv4");
+
+      profiler.resume_profiler();
+      tensorAdd(conv4out, conv4_bias); 
+      add_data(total_time_energies, profiler, "Add4");
+
+      profiler.resume_profiler();
+      void* conv4_tanh = tensorTanh(conv4out);
+      add_data(total_time_energies, profiler, "Tanh4");
+
+      profiler.resume_profiler();
+      void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+      add_data(total_time_energies, profiler, "Pool2");
+
+      // 5th Layer
+      profiler.resume_profiler();
+      void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv5");
+
+      profiler.resume_profiler();
+      tensorAdd(conv5out, conv5_bias); 
+      add_data(total_time_energies, profiler, "Add5");
+
+      profiler.resume_profiler();
+      void* conv5_tanh = tensorTanh(conv5out);
+      add_data(total_time_energies, profiler, "Tanh5");
+
+      // 6th Layer
+      profiler.resume_profiler();
+      void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1,
+					 conv_mode, conv_precision);
+      add_data(total_time_energies, profiler, "Conv6");
+
+      profiler.resume_profiler();
+      tensorAdd(conv6out, conv6_bias); 
+      add_data(total_time_energies, profiler, "Add6");
+
+      profiler.resume_profiler();
+      void* conv6_tanh = tensorTanh(conv6out);
+      add_data(total_time_energies, profiler, "Tanh6");
+
+      profiler.resume_profiler();
+      void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+      add_data(total_time_energies, profiler, "Pool3");
+
+      // final FC Layer
+      profiler.resume_profiler();
+      void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); 
+      add_data(total_time_energies, profiler, "Mul1"); // ASSUMING that this is mul1
+
+      std::cout<<"-----------------------------------ADD 7--------------------------------\n";
+      profiler.resume_profiler();
+      void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+      add_data(total_time_energies, profiler, "Add7");
+      std::cout<<"-----------------------------------ADD 7 ENDS --------------------------------\n";
+
+      profiler.resume_profiler();
+      void* result = tensorSoftmax(gemm1biasout);
+      add_data(total_time_energies, profiler, "Softmax1");
+
+      uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+      float accuracy = computeAccuracy2(labels, batch_size, result); 
+      final_accuracy += accuracy;
+    
+      freeBatchMemory();
+    }
+  }
+  profiler.stop_profiler();
+  
+  stopProfiling();
+  //online_profiler_output << "Total time: " << total_time << ", " << total_energy << "\n";
+  // Now dump the per-op totals averaged across runs
+  std::ofstream ofs;
+  std::string arr[] = {"Add1", "Add2", "Add3", "Add4", "Add5", "Add6", "Add7",
+                       "Conv1", "Conv2", "Conv3", "Conv4", "Conv5", "Conv6",
+                       "Mul1",
+                       "Pool1", "Pool2", "Pool3",
+                       "Softmax1",
+                       "Tanh1", "Tanh2", "Tanh3", "Tanh4", "Tanh5", "Tanh6"};
+  ofs.open("online_profiler_tensor_data.txt");
+  std::vector<std::string> ordered_keys(std::begin(arr), std::end(arr));
+  for (const std::string& key : ordered_keys){
+    const auto& data_pair = total_time_energies[key];
+    ofs << key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n';
+    std::cout<< key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n';
+  }
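+  // Each output line is "<op>: <avg time>\t<avg energy>": per-run totals
+  // (summed over the batches within one run) divided by total_runs.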
+   
+  /*
+  ofs.open("online_profiler_tensor_data.txt");
+  for (const auto& tensor_data : total_time_energies){
+    ofs << tensor_data.first << ": " << tensor_data.second.first / total_runs << "\t" << tensor_data.second.second / total_runs << '\n';
+  }*/
+  ofs.close();
+  final_accuracy = (final_accuracy / batch_count) / total_runs;
+  dumpFinalAccuracy(final_accuracy);
+  online_profiler_output.close();
+}
+
+
+int main(int argc, char* argv[]){
+
+  llvm_hpvm_initTensorRt(0);
+
+  testCifarNet();
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet_cifar10_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eee98920bdfde1de5e769b038c87432fc4d269e1
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet_cifar10_profiling.cc
@@ -0,0 +1,124 @@
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../../tensor_runtime/include/tensor_runtime.h" 
+#include "../../include/utils.h" 
+
+int main(){ 
+
+  llvm_hpvm_initTensorRt(0); 
+
+
+  std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); 
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv0.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv_bias0.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv3.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv_bias3.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv6.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv_bias6.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv7.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv_bias7.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv8.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv_bias8.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("fc12.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("fc_bias12.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+
+
+  startMemTracking();
+
+  int test_input_size = 5000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  int total_runs = 100;
+  Profiler profiler;
+  profiler.start_profiler();
+
+  double total_time = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+
+  for(int i = 0; i < total_runs; i++){
+      for(int i = 0; i < batch_count; i++){
+
+        int start = i * batch_size;
+        int end = (i + 1) * batch_size;
+        void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
+
+        profiler.resume_profiler();
+        void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
+        void* var_1 = tensorAdd(var_0, conv2d_1_b); 
+        void* var_2 = tensorTanh(var_1); 
+        void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
+        void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
+        void* var_6 = tensorAdd(var_5, conv2d_2_b); 
+        void* var_7 = tensorTanh(var_6); 
+        void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
+        void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
+        void* var_11 = tensorAdd(var_10, conv2d_3_b); 
+        void* var_12 = tensorTanh(var_11); 
+        void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
+        void* var_14 = tensorAdd(var_13, conv2d_4_b); 
+        void* var_15 = tensorTanh(var_14); 
+        void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
+        void* var_17 = tensorAdd(var_16, conv2d_5_b); 
+        void* var_18 = tensorTanh(var_17); 
+        void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
+        void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
+        void* var_23 = tensorAdd(var_22, dense_1_b); 
+        void* var_24 = tensorSoftmax(var_23); 
+
+        profiler.pause_profiler();
+        auto time_energy = profiler.get_time_energy();
+        total_time += time_energy.first;
+        profiler.reset();
+
+        uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+        float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+        final_accuracy += accuracy;
+        
+        freeBatchMemory();
+      } 
+  }
+  profiler.stop_profiler();
+
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+
+  stopProfiling();
+
+  final_accuracy = (final_accuracy / batch_count) / total_runs; // average over runs as well as batches
+  dumpFinalAccuracy(final_accuracy);
+
+
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+
+}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/lenet_keras_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/lenet_keras_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee323d068f60413090433ec013c985acafbd3406
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/lenet_keras_profiling.cc
@@ -0,0 +1,182 @@
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+
+bool Opentuner_run = false;
+
+
+/* NOTE: Reference Architecture to use for profiling */
+void testLenetTanh(){
+
+  int total_runs = 100;
+  
+  printf("********* Lenet-2 Architecture ********** \n");
+  // FIXIT: Extend this to the full test set - currently a single batch of test_batch_size images
+
+  int test_batch_size = 5000;
+
+  uint8_t* labels = readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size);
+  
+  void* input = readInputTensor("../model_params/lenet_params/datasets/t10k-images-idx3-ubyte",
+				CUDNN_DATA_FLOAT,
+				test_batch_size, 1, 28, 28);
+
+  // NOTE: Filter descriptors do NOT have batch size
+  // NOTE: First two dims are output channels (configurable) and input channels (MUST match the input tensor's channels)
+  // IMP: The output channel count matches the trained model - not the Lenet arch proposed in Andrew Ng's class
+  void* conv1_filter = readTrainedWeights("../model_params/lenet_keras/conv1.bin",
+					  float_type, 32, 1, 5, 5);    
+  void* conv1_bias = readTrainedWeights("../model_params/lenet_keras/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/lenet_keras/conv2.bin",
+					  float_type, 64, 32, 5, 5);  
+  void* conv2_bias = readTrainedWeights("../model_params/lenet_keras/conv2_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* fc1_weights = readTrainedWeights("../model_params/lenet_keras/fc1.bin",
+					 float_type, 1, 1, 7*7*64, 1024);  
+  void* fc1_bias = readTrainedWeights("../model_params/lenet_keras/fc1_bias.bin",
+				      float_type, 1, 1024, 1, 1);  
+  void* fc2_weights = readTrainedWeights("../model_params/lenet_keras/fc2.bin",
+					 float_type, 1, 1, 1024, 10);  
+  void* fc2_bias = readTrainedWeights("../model_params/lenet_keras/fc2_bias.bin",
+				      float_type, 1, 10, 1, 1);  
+
+
+  
+  clearTensorMap();
+ 
+  Profiler profiler;
+  profiler.start_profiler();
+
+  double total_time = 0.0;
+
+  for(int i = 0; i < total_runs; i++){
+
+    if(Opentuner_run){
+
+      const char* myfifo = "/tmp/myfifo";
+      int fd = open(myfifo, O_RDONLY);
+
+      int ret_val = fcntl(fd, F_GETFD);
+      if(ret_val == -1){
+	printf("Invalid descriptor \n");
+	abort();
+      }
+
+      char str[100];
+      read(fd, str, 80);
+      if(strcmp(str, "stop_run") == 0){
+	abort();
+      }
+
+      close(fd);
+    }
+
+    
+    readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
+
+    // Start power and performance profiling 
+    startProfiling();
+    profiler.resume_profiler();
+
+    int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
+    // NOTE: 'SAME' convolution
+    void* conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1,
+				       conv_mode, conv_precision);
+
+    // NOTE: For tensorAdd, the only dimension that MUST match is channels  
+    tensorAdd(conv1out, conv1_bias); // NOTE: In place operation
+
+    void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
+
+    void* conv1_tanh = tensorTanh(pool1out);
+
+    // NOTE: input channels have to match between tensor op inputs and outputs 
+    void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv2out, conv2_bias); // NOTE: In place operation
+
+    void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+
+    void* conv2_tanh = tensorTanh(pool2out);
+
+    void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights);  
+
+    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+
+    void* tanh1out = tensorTanh(gemm1biasout);
+  
+    void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights);  
+  
+    void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias);
+
+    void* tanh2out = tensorTanh(gemm2_biasout);
+  
+    void* result = tensorSoftmax(tanh2out);
+
+    profiler.pause_profiler();
+    auto time_energy = profiler.get_time_energy();
+    total_time += time_energy.first;
+    profiler.reset();
+    std::cout<<"---------------------------------------\n";
+    std::cout<<"ITERATION TIME: " << time_energy.first << '\n';
+    std::cout<<"---------------------------------------\n";
+
+    // End profiling and dump output to profile.txt
+    stopProfiling();
+  
+    computeAccuracy2(labels, test_batch_size, result);
+    
+    dumpAccuracyNorms();
+    freeOutputTensors();  
+
+    if(Opentuner_run){
+
+      const char* myfifo = "/tmp/myfifo";
+      int fd_out = open(myfifo, O_WRONLY);
+      int ret_val = fcntl(fd_out, F_GETFD);
+      if(ret_val == -1){
+	printf("Invalid descriptor \n");
+	abort();
+      }
+      
+      const char* str = "completed***!\n\0";
+      write(fd_out, str, 80);
+      close(fd_out);
+    }
+    
+  }
+
+  profiler.stop_profiler();
+
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+  
+}
+
+
+int main(int argc, char* argv[]){
+
+  llvm_hpvm_initTensorRt(0);
+
+  testLenetTanh();
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..66b7f2a6c4983a8e1f04dfe32f9b599340ea2d05
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc
@@ -0,0 +1,435 @@
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../../tensor_runtime/include/tensor_runtime.h" 
+#include "../../include/utils.h" 
+
+int main(){ 
+
+  llvm_hpvm_initTensorRt(0); 
+
+
+  std::string dir_prefix = std::string("../model_params/mobilenet_quant/"); 
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
+  std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
+  void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
+  void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
+  void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
+  void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); 
+  std::string depthwise_conv2d_1_w_path =  dir_prefix + std::string("depthwise_conv2d_1_w.bin"); 
+  void* depthwise_conv2d_1_w =  readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); 
+  std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
+  void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
+  void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
+  void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
+  void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); 
+  std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
+  void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
+  void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
+  void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
+  void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
+  std::string depthwise_conv2d_2_w_path =  dir_prefix + std::string("depthwise_conv2d_2_w.bin"); 
+  void* depthwise_conv2d_2_w =  readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); 
+  std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
+  void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
+  void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
+  void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
+  void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); 
+  std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
+  void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
+  void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
+  void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
+  void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); 
+  std::string depthwise_conv2d_3_w_path =  dir_prefix + std::string("depthwise_conv2d_3_w.bin"); 
+  void* depthwise_conv2d_3_w =  readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); 
+  std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
+  void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
+  void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
+  void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
+  void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); 
+  std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
+  void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
+  void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
+  void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
+  void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); 
+  std::string depthwise_conv2d_4_w_path =  dir_prefix + std::string("depthwise_conv2d_4_w.bin"); 
+  void* depthwise_conv2d_4_w =  readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); 
+  std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
+  void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
+  void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
+  void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
+  void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); 
+  std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
+  void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
+  void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
+  void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
+  void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); 
+  std::string depthwise_conv2d_5_w_path =  dir_prefix + std::string("depthwise_conv2d_5_w.bin"); 
+  void* depthwise_conv2d_5_w =  readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); 
+  std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
+  void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
+  void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
+  void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
+  void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
+  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); 
+  std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
+  void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
+  void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
+  void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
+  void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
+  std::string depthwise_conv2d_6_w_path =  dir_prefix + std::string("depthwise_conv2d_6_w.bin"); 
+  void* depthwise_conv2d_6_w =  readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); 
+  std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
+  void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
+  void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
+  void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
+  void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
+  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); 
+  std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
+  void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
+  void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
+  void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
+  void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); 
+  std::string depthwise_conv2d_7_w_path =  dir_prefix + std::string("depthwise_conv2d_7_w.bin"); 
+  void* depthwise_conv2d_7_w =  readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); 
+  std::string batch_normalization_14_gamma_path =  dir_prefix + std::string("batch_normalization_14_gamma.bin"); 
+  void* batch_normalization_14_gamma =  readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_14_beta_path =  dir_prefix + std::string("batch_normalization_14_beta.bin"); 
+  void* batch_normalization_14_beta =  readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_14_mean_path =  dir_prefix + std::string("batch_normalization_14_mean.bin"); 
+  void* batch_normalization_14_mean =  readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_14_variance_path =  dir_prefix + std::string("batch_normalization_14_variance.bin"); 
+  void* batch_normalization_14_variance =  readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
+  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); 
+  std::string batch_normalization_15_gamma_path =  dir_prefix + std::string("batch_normalization_15_gamma.bin"); 
+  void* batch_normalization_15_gamma =  readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_15_beta_path =  dir_prefix + std::string("batch_normalization_15_beta.bin"); 
+  void* batch_normalization_15_beta =  readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_15_mean_path =  dir_prefix + std::string("batch_normalization_15_mean.bin"); 
+  void* batch_normalization_15_mean =  readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_15_variance_path =  dir_prefix + std::string("batch_normalization_15_variance.bin"); 
+  void* batch_normalization_15_variance =  readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); 
+  std::string depthwise_conv2d_8_w_path =  dir_prefix + std::string("depthwise_conv2d_8_w.bin"); 
+  void* depthwise_conv2d_8_w =  readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); 
+  std::string batch_normalization_16_gamma_path =  dir_prefix + std::string("batch_normalization_16_gamma.bin"); 
+  void* batch_normalization_16_gamma =  readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_16_beta_path =  dir_prefix + std::string("batch_normalization_16_beta.bin"); 
+  void* batch_normalization_16_beta =  readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_16_mean_path =  dir_prefix + std::string("batch_normalization_16_mean.bin"); 
+  void* batch_normalization_16_mean =  readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_16_variance_path =  dir_prefix + std::string("batch_normalization_16_variance.bin"); 
+  void* batch_normalization_16_variance =  readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
+  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); 
+  std::string batch_normalization_17_gamma_path =  dir_prefix + std::string("batch_normalization_17_gamma.bin"); 
+  void* batch_normalization_17_gamma =  readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_17_beta_path =  dir_prefix + std::string("batch_normalization_17_beta.bin"); 
+  void* batch_normalization_17_beta =  readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_17_mean_path =  dir_prefix + std::string("batch_normalization_17_mean.bin"); 
+  void* batch_normalization_17_mean =  readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_17_variance_path =  dir_prefix + std::string("batch_normalization_17_variance.bin"); 
+  void* batch_normalization_17_variance =  readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); 
+  std::string depthwise_conv2d_9_w_path =  dir_prefix + std::string("depthwise_conv2d_9_w.bin"); 
+  void* depthwise_conv2d_9_w =  readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); 
+  std::string batch_normalization_18_gamma_path =  dir_prefix + std::string("batch_normalization_18_gamma.bin"); 
+  void* batch_normalization_18_gamma =  readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_18_beta_path =  dir_prefix + std::string("batch_normalization_18_beta.bin"); 
+  void* batch_normalization_18_beta =  readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_18_mean_path =  dir_prefix + std::string("batch_normalization_18_mean.bin"); 
+  void* batch_normalization_18_mean =  readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_18_variance_path =  dir_prefix + std::string("batch_normalization_18_variance.bin"); 
+  void* batch_normalization_18_variance =  readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
+  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); 
+  std::string batch_normalization_19_gamma_path =  dir_prefix + std::string("batch_normalization_19_gamma.bin"); 
+  void* batch_normalization_19_gamma =  readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_19_beta_path =  dir_prefix + std::string("batch_normalization_19_beta.bin"); 
+  void* batch_normalization_19_beta =  readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_19_mean_path =  dir_prefix + std::string("batch_normalization_19_mean.bin"); 
+  void* batch_normalization_19_mean =  readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_19_variance_path =  dir_prefix + std::string("batch_normalization_19_variance.bin"); 
+  void* batch_normalization_19_variance =  readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); 
+  std::string depthwise_conv2d_10_w_path =  dir_prefix + std::string("depthwise_conv2d_10_w.bin"); 
+  void* depthwise_conv2d_10_w =  readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); 
+  std::string batch_normalization_20_gamma_path =  dir_prefix + std::string("batch_normalization_20_gamma.bin"); 
+  void* batch_normalization_20_gamma =  readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_20_beta_path =  dir_prefix + std::string("batch_normalization_20_beta.bin"); 
+  void* batch_normalization_20_beta =  readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_20_mean_path =  dir_prefix + std::string("batch_normalization_20_mean.bin"); 
+  void* batch_normalization_20_mean =  readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_20_variance_path =  dir_prefix + std::string("batch_normalization_20_variance.bin"); 
+  void* batch_normalization_20_variance =  readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
+  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); 
+  std::string batch_normalization_21_gamma_path =  dir_prefix + std::string("batch_normalization_21_gamma.bin"); 
+  void* batch_normalization_21_gamma =  readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_21_beta_path =  dir_prefix + std::string("batch_normalization_21_beta.bin"); 
+  void* batch_normalization_21_beta =  readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_21_mean_path =  dir_prefix + std::string("batch_normalization_21_mean.bin"); 
+  void* batch_normalization_21_mean =  readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_21_variance_path =  dir_prefix + std::string("batch_normalization_21_variance.bin"); 
+  void* batch_normalization_21_variance =  readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); 
+  std::string depthwise_conv2d_11_w_path =  dir_prefix + std::string("depthwise_conv2d_11_w.bin"); 
+  void* depthwise_conv2d_11_w =  readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); 
+  std::string batch_normalization_22_gamma_path =  dir_prefix + std::string("batch_normalization_22_gamma.bin"); 
+  void* batch_normalization_22_gamma =  readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_22_beta_path =  dir_prefix + std::string("batch_normalization_22_beta.bin"); 
+  void* batch_normalization_22_beta =  readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_22_mean_path =  dir_prefix + std::string("batch_normalization_22_mean.bin"); 
+  void* batch_normalization_22_mean =  readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_22_variance_path =  dir_prefix + std::string("batch_normalization_22_variance.bin"); 
+  void* batch_normalization_22_variance =  readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
+  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); 
+  std::string batch_normalization_23_gamma_path =  dir_prefix + std::string("batch_normalization_23_gamma.bin"); 
+  void* batch_normalization_23_gamma =  readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_23_beta_path =  dir_prefix + std::string("batch_normalization_23_beta.bin"); 
+  void* batch_normalization_23_beta =  readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_23_mean_path =  dir_prefix + std::string("batch_normalization_23_mean.bin"); 
+  void* batch_normalization_23_mean =  readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_23_variance_path =  dir_prefix + std::string("batch_normalization_23_variance.bin"); 
+  void* batch_normalization_23_variance =  readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); 
+  std::string depthwise_conv2d_12_w_path =  dir_prefix + std::string("depthwise_conv2d_12_w.bin"); 
+  void* depthwise_conv2d_12_w =  readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); 
+  std::string batch_normalization_24_gamma_path =  dir_prefix + std::string("batch_normalization_24_gamma.bin"); 
+  void* batch_normalization_24_gamma =  readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_24_beta_path =  dir_prefix + std::string("batch_normalization_24_beta.bin"); 
+  void* batch_normalization_24_beta =  readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_24_mean_path =  dir_prefix + std::string("batch_normalization_24_mean.bin"); 
+  void* batch_normalization_24_mean =  readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_24_variance_path =  dir_prefix + std::string("batch_normalization_24_variance.bin"); 
+  void* batch_normalization_24_variance =  readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
+  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); 
+  std::string batch_normalization_25_gamma_path =  dir_prefix + std::string("batch_normalization_25_gamma.bin"); 
+  void* batch_normalization_25_gamma =  readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_25_beta_path =  dir_prefix + std::string("batch_normalization_25_beta.bin"); 
+  void* batch_normalization_25_beta =  readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_25_mean_path =  dir_prefix + std::string("batch_normalization_25_mean.bin"); 
+  void* batch_normalization_25_mean =  readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_25_variance_path =  dir_prefix + std::string("batch_normalization_25_variance.bin"); 
+  void* batch_normalization_25_variance =  readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); 
+  std::string depthwise_conv2d_13_w_path =  dir_prefix + std::string("depthwise_conv2d_13_w.bin"); 
+  void* depthwise_conv2d_13_w =  readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); 
+  std::string batch_normalization_26_gamma_path =  dir_prefix + std::string("batch_normalization_26_gamma.bin"); 
+  void* batch_normalization_26_gamma =  readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_26_beta_path =  dir_prefix + std::string("batch_normalization_26_beta.bin"); 
+  void* batch_normalization_26_beta =  readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_26_mean_path =  dir_prefix + std::string("batch_normalization_26_mean.bin"); 
+  void* batch_normalization_26_mean =  readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_26_variance_path =  dir_prefix + std::string("batch_normalization_26_variance.bin"); 
+  void* batch_normalization_26_variance =  readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); 
+  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
+  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); 
+  std::string batch_normalization_27_gamma_path =  dir_prefix + std::string("batch_normalization_27_gamma.bin"); 
+  void* batch_normalization_27_gamma =  readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_27_beta_path =  dir_prefix + std::string("batch_normalization_27_beta.bin"); 
+  void* batch_normalization_27_beta =  readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_27_mean_path =  dir_prefix + std::string("batch_normalization_27_mean.bin"); 
+  void* batch_normalization_27_mean =  readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); 
+  std::string batch_normalization_27_variance_path =  dir_prefix + std::string("batch_normalization_27_variance.bin"); 
+  void* batch_normalization_27_variance =  readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+
+
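+  // Track per-batch tensor allocations so that freeBatchMemory() at the end
+  // of each iteration can release them and keep GPU memory bounded.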
+  startMemTracking(); 
+  startProfiling();
+
+  int test_input_size = 5000; 
+  int batch_size = 1000; 
+  int batch_count = test_input_size / batch_size; 
+  float final_accuracy = 0.0; 
+
+  int total_runs = 100;
+  Profiler profiler;
+  profiler.start_profiler();
+
+  double total_time = 0.0;
+
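+  // Sweep the 5000-image test set total_runs times; only the region between
+  // resume_profiler() and pause_profiler() (the forward pass) is timed, so
+  // batch loading and accuracy computation are excluded from total_time.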
+  for(int run = 0; run < total_runs; run++){
+      for(int i = 0; i < batch_count; i++){ 
+
+        int start = i * batch_size; 
+        int end = (i + 1) * batch_size; 
+
+        void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
+
+        profiler.resume_profiler();
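+        // tensorConvolution argument order here: input, filter, pad_h, pad_w,
+        // stride_h, stride_w, conv mode, conv groups -- the depthwise layers
+        // pass their channel count as the group count, the pointwise (1x1)
+        // layers pass 1.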
+        void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
+        void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
+        void* var_2 = tensorRelu(var_1); 
+        void* var_4 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
+        void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
+        void* var_6 = tensorRelu(var_5); 
+        void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
+        void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
+        void* var_9 = tensorRelu(var_8); 
+        void* var_11 = tensorConvolution(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
+        void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
+        void* var_13 = tensorRelu(var_12); 
+        void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
+        void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
+        void* var_16 = tensorRelu(var_15); 
+        void* var_18 = tensorConvolution(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
+        void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
+        void* var_20 = tensorRelu(var_19); 
+        void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
+        void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
+        void* var_23 = tensorRelu(var_22); 
+        void* var_26 = tensorConvolution(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
+        void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
+        void* var_28 = tensorRelu(var_27); 
+        void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
+        void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
+        void* var_31 = tensorRelu(var_30); 
+        void* var_33 = tensorConvolution(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
+        void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
+        void* var_35 = tensorRelu(var_34); 
+        void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
+        void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
+        void* var_38 = tensorRelu(var_37); 
+        void* var_41 = tensorConvolution(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
+        void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
+        void* var_43 = tensorRelu(var_42); 
+        void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
+        void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
+        void* var_46 = tensorRelu(var_45); 
+        void* var_48 = tensorConvolution(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
+        void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
+        void* var_50 = tensorRelu(var_49); 
+        void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
+        void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
+        void* var_53 = tensorRelu(var_52); 
+        void* var_55 = tensorConvolution(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
+        void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
+        void* var_57 = tensorRelu(var_56); 
+        void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
+        void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
+        void* var_60 = tensorRelu(var_59); 
+        void* var_63 = tensorConvolution(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
+        void* var_64 = tensorBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
+        void* var_65 = tensorRelu(var_64); 
+        void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
+        void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
+        void* var_68 = tensorRelu(var_67); 
+        void* var_70 = tensorConvolution(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
+        void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
+        void* var_72 = tensorRelu(var_71); 
+        void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
+        void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
+        void* var_75 = tensorRelu(var_74); 
+        void* var_77 = tensorConvolution(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
+        void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
+        void* var_79 = tensorRelu(var_78); 
+        void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
+        void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
+        void* var_82 = tensorRelu(var_81); 
+        void* var_85 = tensorConvolution(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
+        void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
+        void* var_87 = tensorRelu(var_86); 
+        void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
+        void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
+        void* var_90 = tensorRelu(var_89); 
+        void* var_92 = tensorConvolution(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
+        void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
+        void* var_94 = tensorRelu(var_93); 
+        void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
+        void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
+        void* var_97 = tensorRelu(var_96); 
+        void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); 
+        void* var_101 = tensorGemmGPU(var_99, dense_1_w); 
+        void* var_102 = tensorAdd(var_101, dense_1_b); 
+        void* var_103 = tensorSoftmax(var_102); 
+
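+        // get_time_energy() returns a (time, energy) pair; only the time
+        // component is accumulated for the average reported below.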
+        profiler.pause_profiler();
+        auto time_energy = profiler.get_time_energy();
+        total_time += time_energy.first;
+        profiler.reset();
+
+        uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
+
+        float accuracy = computeAccuracy2(labels, batch_size, var_103); 
+        final_accuracy += accuracy; 
+        freeBatchMemory(); 
+      }
+  }
+  profiler.stop_profiler();
+
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+
+  stopProfiling();
+
+  // Accuracy was accumulated for every batch in every run, so average over
+  // batch_count * total_runs evaluations.
+  final_accuracy = final_accuracy / (batch_count * total_runs); 
+  dumpFinalAccuracy(final_accuracy); 
+
+
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+
+}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_shallow_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_shallow_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c8d402b78ddd65057e75fadc9acd0e1dd4b6170
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_shallow_profiling.cc
@@ -0,0 +1,224 @@
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../../tensor_runtime/include/tensor_runtime.h" 
+#include "../../include/utils.h" 
+
+int main(){ 
+
+  llvm_hpvm_initTensorRt(0); 
+
+  std::string dir_prefix = std::string("../model_params/mobilenet_shallow/"); 
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
+  std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
+  void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
+  void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
+  void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
+  void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); 
+  std::string depthwise_conv2d_1_w_path =  dir_prefix + std::string("depthwise_conv2d_1_w.bin"); 
+  void* depthwise_conv2d_1_w =  readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); 
+  std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
+  void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
+  void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
+  void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
+  void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); 
+  std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
+  void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
+  void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
+  void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
+  void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
+  std::string depthwise_conv2d_2_w_path =  dir_prefix + std::string("depthwise_conv2d_2_w.bin"); 
+  void* depthwise_conv2d_2_w =  readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); 
+  std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
+  void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
+  void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
+  void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
+  void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,64,1,1); 
+  std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
+  void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
+  void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
+  void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
+  void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,64,1,1); 
+  std::string depthwise_conv2d_3_w_path =  dir_prefix + std::string("depthwise_conv2d_3_w.bin"); 
+  void* depthwise_conv2d_3_w =  readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,64,1,3,3); 
+  std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
+  void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
+  void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
+  void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
+  void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,64,1,1); 
+  std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
+  void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
+  void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
+  void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
+  void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); 
+  std::string depthwise_conv2d_4_w_path =  dir_prefix + std::string("depthwise_conv2d_4_w.bin"); 
+  void* depthwise_conv2d_4_w =  readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); 
+  std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
+  void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
+  void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
+  void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
+  void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); 
+  std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
+  void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
+  void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
+  void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
+  void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); 
+  std::string depthwise_conv2d_5_w_path =  dir_prefix + std::string("depthwise_conv2d_5_w.bin"); 
+  void* depthwise_conv2d_5_w =  readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); 
+  std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
+  void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
+  void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
+  void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
+  void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
+  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); 
+  std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
+  void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
+  void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
+  void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
+  void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+
+
+  startMemTracking(); 
+
+  int test_input_size = 5000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  int total_runs = 100;
+  Profiler profiler;
+  profiler.start_profiler();
+
+  double total_time = 0.0;
+
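+  // Same scheme as the other profiling binaries: repeat the test sweep
+  // total_runs times and time only the forward pass of each batch.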
+  for(int run = 0; run < total_runs; run++){
+	  for(int i = 0; i < batch_count; i++){ 
+
+		int start = i * batch_size; 
+		int end = (i + 1) * batch_size; 
+
+		void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
+
+		profiler.resume_profiler();
+
+		void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
+		void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
+		void* var_2 = tensorRelu(var_1); 
+		void* var_4 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
+		void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
+		void* var_6 = tensorRelu(var_5); 
+		void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
+		void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
+		void* var_9 = tensorRelu(var_8); 
+		void* var_11 = tensorConvolution(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
+		void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
+		void* var_13 = tensorRelu(var_12); 
+		void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
+		void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
+		void* var_16 = tensorRelu(var_15); 
+		void* var_18 = tensorConvolution(var_16, depthwise_conv2d_3_w, 1, 1, 2, 2, 1, 64); 
+		void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
+		void* var_20 = tensorRelu(var_19); 
+		void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
+		void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
+		void* var_23 = tensorRelu(var_22); 
+		void* var_26 = tensorConvolution(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
+		void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
+		void* var_28 = tensorRelu(var_27); 
+		void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
+		void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
+		void* var_31 = tensorRelu(var_30); 
+		void* var_33 = tensorConvolution(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
+		void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
+		void* var_35 = tensorRelu(var_34); 
+		void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
+		void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
+		void* var_38 = tensorRelu(var_37); 
+		void* var_40 = tensorPooling(var_38,1,2,2,0,0,2,2); 
+		void* var_42 = tensorGemmGPU(var_40, dense_1_w); 
+		void* var_43 = tensorAdd(var_42, dense_1_b); 
+		void* var_44 = tensorSoftmax(var_43); 
+
+		profiler.pause_profiler();
+		auto time_energy = profiler.get_time_energy();
+		total_time += time_energy.first;
+		profiler.reset();
+
+		uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
+
+		float accuracy = computeAccuracy2(labels, batch_size, var_44); 
+		final_accuracy += accuracy; 
+		freeBatchMemory(); 
+	  } 
+  }
+
+  profiler.stop_profiler();
+
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+
+  // Average over every evaluated batch across all runs.
+  final_accuracy = final_accuracy / (batch_count * total_runs); 
+  dumpFinalAccuracy(final_accuracy); 
+
+
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+
+}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/resnet18_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/resnet18_cifar10_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30a8912ffbe71c69342e80af572db4fe4eea1289
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/resnet18_cifar10_profiling.cc
@@ -0,0 +1,243 @@
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../../tensor_runtime/include/tensor_runtime.h" 
+#include "../../include/utils.h" 
+
+int main(){ 
+
+  llvm_hpvm_initTensorRt(0); 
+  
+  std::string dir_prefix = std::string("../model_params/resnet18_cifar10_3/"); 
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
+  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
+  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
+  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
+  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
+  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); 
+  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
+  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
+  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); 
+  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
+  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
+  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
+  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
+  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
+  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
+  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
+  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
+  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
+  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
+  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
+  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
+  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); 
+  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
+  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
+  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); 
+  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
+  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
+  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
+  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
+  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
+  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
+  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
+  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
+  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
+  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
+  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
+  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+
+
+  startMemTracking();
+
+  int test_input_size = 5000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  int total_runs = 100;
+
+  Profiler profiler;
+  profiler.start_profiler();
+
+  double total_time = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
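+  // startProfiling()/stopProfiling() toggle the tensor runtime's own
+  // profiling, while the external Profiler below reports end-to-end time
+  // (and energy) for just the forward-pass region of each batch.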
+
+  for(int run = 0; run < total_runs; run++){
+      for(int i = 0; i < batch_count; i++){
+        int start = i * batch_size;
+        int end = (i + 1) * batch_size;
+        
+        void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+
+        profiler.resume_profiler();
+        
+        void* var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
+        void* var_3 = tensorAdd(var_2, conv2d_1_b); 
+        void* var_4 = tensorRelu(var_3); 
+        void* var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
+        void* var_7 = tensorAdd(var_6, conv2d_2_b); 
+        void* var_8 = tensorRelu(var_7); 
+        void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
+        void* var_11 = tensorAdd(var_10, conv2d_3_b); 
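+        // Residual (skip) connection: the block input var_4 is added back in
+        // before the ReLU; the same pattern repeats for every block below.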
+        void* var_12 = tensorAdd(var_4, var_11); 
+        void* var_13 = tensorRelu(var_12); 
+        void* var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
+        void* var_16 = tensorAdd(var_15, conv2d_4_b); 
+        void* var_17 = tensorRelu(var_16); 
+        void* var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
+        void* var_20 = tensorAdd(var_19, conv2d_5_b); 
+        void* var_21 = tensorAdd(var_13, var_20); 
+        void* var_22 = tensorRelu(var_21); 
+        void* var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
+        void* var_25 = tensorAdd(var_24, conv2d_6_b); 
+        void* var_26 = tensorRelu(var_25); 
+        void* var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
+        void* var_29 = tensorAdd(var_28, conv2d_7_b); 
+        void* var_30 = tensorAdd(var_22, var_29); 
+        void* var_31 = tensorRelu(var_30); 
+        void* var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); 
+        void* var_34 = tensorAdd(var_33, conv2d_8_b); 
+        void* var_35 = tensorRelu(var_34); 
+        void* var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
+        void* var_38 = tensorAdd(var_37, conv2d_9_b); 
+        void* var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); 
+        void* var_41 = tensorAdd(var_40, conv2d_10_b); 
+        void* var_42 = tensorAdd(var_41, var_38); 
+        void* var_43 = tensorRelu(var_42); 
+        void* var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
+        void* var_46 = tensorAdd(var_45, conv2d_11_b); 
+        void* var_47 = tensorRelu(var_46); 
+        void* var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
+        void* var_50 = tensorAdd(var_49, conv2d_12_b); 
+        void* var_51 = tensorAdd(var_43, var_50); 
+        void* var_52 = tensorRelu(var_51); 
+        void* var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
+        void* var_55 = tensorAdd(var_54, conv2d_13_b); 
+        void* var_56 = tensorRelu(var_55); 
+        void* var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); 
+        void* var_59 = tensorAdd(var_58, conv2d_14_b); 
+        void* var_60 = tensorAdd(var_52, var_59); 
+        void* var_61 = tensorRelu(var_60); 
+        void* var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); 
+        void* var_64 = tensorAdd(var_63, conv2d_15_b); 
+        void* var_65 = tensorRelu(var_64); 
+        void* var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); 
+        void* var_68 = tensorAdd(var_67, conv2d_16_b); 
+        void* var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); 
+        void* var_71 = tensorAdd(var_70, conv2d_17_b); 
+        void* var_72 = tensorAdd(var_71, var_68); 
+        void* var_73 = tensorRelu(var_72); 
+        void* var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); 
+        void* var_76 = tensorAdd(var_75, conv2d_18_b); 
+        void* var_77 = tensorRelu(var_76); 
+        void* var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); 
+        void* var_80 = tensorAdd(var_79, conv2d_19_b); 
+        void* var_81 = tensorAdd(var_73, var_80); 
+        void* var_82 = tensorRelu(var_81); 
+        void* var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); 
+        void* var_85 = tensorAdd(var_84, conv2d_20_b); 
+        void* var_86 = tensorRelu(var_85); 
+        void* var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); 
+        void* var_89 = tensorAdd(var_88, conv2d_21_b); 
+        void* var_90 = tensorAdd(var_82, var_89); 
+        void* var_91 = tensorRelu(var_90); 
+        void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); 
+        void* var_94 = tensorGemmGPU(var_92, dense_1_w); 
+        void* var_95 = tensorAdd(var_94, dense_1_b); 
+        void* var_96 = tensorSoftmax(var_95); 
+
+        profiler.pause_profiler();
+        auto time_energy = profiler.get_time_energy();
+        std::cout << "Batch forward-pass time: " << time_energy.first << "\n";
+        total_time += time_energy.first;
+        profiler.reset();
+
+        uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+        float accuracy = computeAccuracy2(labels,batch_size,var_96); 
+        final_accuracy += accuracy;
+        
+        freeBatchMemory();
+    }
+  }
+  profiler.stop_profiler();
+
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+
+  stopProfiling();
+
+  // Average over every evaluated batch across all runs.
+  final_accuracy = final_accuracy / (batch_count * total_runs);
+  dumpFinalAccuracy(final_accuracy);
+
+  
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+
+}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar100_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar100_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..160a97cb1437e3c31b82aefc1c055bd562ce48f9
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar100_profiling.cc
@@ -0,0 +1,181 @@
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../../tensor_runtime/include/tensor_runtime.h" 
+#include "../../include/utils.h" 
+
+int main(){ 
+
+  llvm_hpvm_initTensorRt(0); 
+
+  std::string dir_prefix = std::string("../model_params/vgg16_cifar100_front/"); 
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
+  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
+  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
+  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
+  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
+  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
+  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
+  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
+  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
+  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
+  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
+  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
+  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
+  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
+  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
+  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
+  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
+  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
+  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
+  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); 
+  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
+  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); 
+
+
+  startMemTracking(); 
+
+  int test_input_size = 5000; 
+  int batch_size = 1000; 
+  int batch_count = test_input_size / batch_size; 
+  float final_accuracy = 0.0; 
+
+  int total_runs = 100;
+  Profiler profiler;
+  profiler.start_profiler();
+  double total_time = 0.0;
+
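+  // Repeat the full 5-batch test set total_runs times; only the tensor ops between
+  // resume_profiler() and pause_profiler() are timed, so batch I/O is excluded.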
+  for (int run = 0; run < total_runs; run++){
+	  for(int i = 0; i < batch_count; i++){ 
+
+		int start = i * batch_size; 
+		int end = (i + 1) * batch_size; 
+
+		void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
+
+		profiler.resume_profiler();
+
+		void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
+		void* var_1 = tensorAdd(var_0, conv2d_1_b); 
+		void* var_2 = tensorRelu(var_1); 
+		void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
+		void* var_5 = tensorAdd(var_4, conv2d_2_b); 
+		void* var_6 = tensorRelu(var_5); 
+		void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
+		void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
+		void* var_9 = tensorAdd(var_8, conv2d_3_b); 
+		void* var_10 = tensorRelu(var_9); 
+		void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
+		void* var_13 = tensorAdd(var_12, conv2d_4_b); 
+		void* var_14 = tensorRelu(var_13); 
+		void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); 
+		void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
+		void* var_17 = tensorAdd(var_16, conv2d_5_b); 
+		void* var_18 = tensorRelu(var_17); 
+		void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
+		void* var_21 = tensorAdd(var_20, conv2d_6_b); 
+		void* var_22 = tensorRelu(var_21); 
+		void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
+		void* var_25 = tensorAdd(var_24, conv2d_7_b); 
+		void* var_26 = tensorRelu(var_25); 
+		void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); 
+		void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
+		void* var_29 = tensorAdd(var_28, conv2d_8_b); 
+		void* var_30 = tensorRelu(var_29); 
+		void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
+		void* var_33 = tensorAdd(var_32, conv2d_9_b); 
+		void* var_34 = tensorRelu(var_33); 
+		void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
+		void* var_37 = tensorAdd(var_36, conv2d_10_b); 
+		void* var_38 = tensorRelu(var_37); 
+		void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); 
+		void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
+		void* var_41 = tensorAdd(var_40, conv2d_11_b); 
+		void* var_42 = tensorRelu(var_41); 
+		void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
+		void* var_45 = tensorAdd(var_44, conv2d_12_b); 
+		void* var_46 = tensorRelu(var_45); 
+		void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
+		void* var_49 = tensorAdd(var_48, conv2d_13_b); 
+		void* var_50 = tensorRelu(var_49); 
+		void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); 
+		void* var_54 = tensorGemmGPU(var_51, dense_1_w); 
+		void* var_55 = tensorAdd(var_54, dense_1_b); 
+		void* var_56 = tensorRelu(var_55); 
+		void* var_58 = tensorGemmGPU(var_56, dense_2_w); 
+		void* var_59 = tensorAdd(var_58, dense_2_b); 
+		void* var_60 = tensorSoftmax(var_59); 
+
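+		// get_time_energy() appears to return a (time, energy) pair; only the time
+		// component (.first) is accumulated, and the profiler is reset per batch.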
+		profiler.pause_profiler();
+		auto time_energy = profiler.get_time_energy();
+		total_time += time_energy.first;
+		profiler.reset();
+
+		uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
+
+		float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); 
+		final_accuracy += accuracy; 
+		freeBatchMemory(); 
+
+	  }
+  }
+
+  profiler.stop_profiler();
+
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+
+  final_accuracy = final_accuracy / batch_count; 
+  dumpFinalAccuracy(final_accuracy); 
+
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar10_profiling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d9be320540fa2d1264004f35d16bc358a432413
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar10_profiling.cc
@@ -0,0 +1,182 @@
+#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h"
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../../tensor_runtime/include/tensor_runtime.h" 
+#include "../../include/utils.h" 
+
+int main(){ 
+
+  llvm_hpvm_initTensorRt(0); 
+
+  std::string dir_prefix = std::string("../model_params/vgg16_cifar10_2/"); 
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
+  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
+  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
+  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
+  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
+  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
+  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
+  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
+  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
+  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
+  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
+  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
+  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
+  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
+  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
+  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
+  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
+  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
+  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
+  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
+  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
+  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); 
+  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
+  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); 
+
+
+  startMemTracking();
+
+  int test_input_size = 5000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  Profiler profiler;
+  profiler.start_profiler();
+
+  double total_time = 0.0;
+
+  int total_runs = 100; 
+  for(int run = 0; run < total_runs; run++){
+      for(int i = 0; i < batch_count; i++){
+
+        int start = i * batch_size;
+        int end = (i + 1) * batch_size;
+        
+        void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); 
+
+        profiler.resume_profiler();
+     
+        void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
+        void* var_1 = tensorAdd(var_0, conv2d_1_b); 
+        void* var_2 = tensorRelu(var_1); 
+        void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
+        void* var_5 = tensorAdd(var_4, conv2d_2_b); 
+        void* var_6 = tensorRelu(var_5); 
+        void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
+        void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
+        void* var_9 = tensorAdd(var_8, conv2d_3_b); 
+        void* var_10 = tensorRelu(var_9); 
+        void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
+        void* var_13 = tensorAdd(var_12, conv2d_4_b); 
+        void* var_14 = tensorRelu(var_13); 
+        void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); 
+        void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
+        void* var_17 = tensorAdd(var_16, conv2d_5_b); 
+        void* var_18 = tensorRelu(var_17); 
+        void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
+        void* var_21 = tensorAdd(var_20, conv2d_6_b); 
+        void* var_22 = tensorRelu(var_21); 
+        void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
+        void* var_25 = tensorAdd(var_24, conv2d_7_b); 
+        void* var_26 = tensorRelu(var_25); 
+        void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); 
+        void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
+        void* var_29 = tensorAdd(var_28, conv2d_8_b); 
+        void* var_30 = tensorRelu(var_29); 
+        void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
+        void* var_33 = tensorAdd(var_32, conv2d_9_b); 
+        void* var_34 = tensorRelu(var_33); 
+        void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
+        void* var_37 = tensorAdd(var_36, conv2d_10_b); 
+        void* var_38 = tensorRelu(var_37); 
+        void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); 
+        void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
+        void* var_41 = tensorAdd(var_40, conv2d_11_b); 
+        void* var_42 = tensorRelu(var_41); 
+        void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
+        void* var_45 = tensorAdd(var_44, conv2d_12_b); 
+        void* var_46 = tensorRelu(var_45); 
+        void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
+        void* var_49 = tensorAdd(var_48, conv2d_13_b); 
+        void* var_50 = tensorRelu(var_49); 
+        void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); 
+        void* var_54 = tensorGemmGPU(var_51, dense_1_w); 
+        void* var_55 = tensorAdd(var_54, dense_1_b); 
+        void* var_56 = tensorRelu(var_55); 
+        void* var_58 = tensorGemmGPU(var_56, dense_2_w); 
+        void* var_59 = tensorAdd(var_58, dense_2_b); 
+        void* var_60 = tensorSoftmax(var_59); 
+
+        profiler.pause_profiler();
+        auto time_energy = profiler.get_time_energy();
+        total_time += time_energy.first;
+        profiler.reset();
+
+        uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+        float accuracy = computeAccuracy2(labels,batch_size,var_60); 
+        final_accuracy += accuracy;
+        
+        freeBatchMemory();
+    }
+  }
+  profiler.stop_profiler();
+
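+  // total_time accumulates over all batch_count batches in each run, so the value
+  // printed below is the mean inference time per full pass over the test set.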
+  std::cout<<"---------------------------------------\n";
+  std::cout<<"Average time: " << total_time / total_runs << '\n';
+  std::cout<<"---------------------------------------\n";
+
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
+  
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+
+}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
index 6074dacf3f56e672ac5ca80eda572a53a58f1044..66e824f6d098434e140d764edda7cdacd11e110f 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
@@ -30,7 +30,7 @@ int main(int argc, char* argv[]){
   }
 
   
-  llvm_hpvm_initTensorRt(1); 
+  llvm_hpvm_initTensorRt(0); 
 
   int missed = 0;
   for (int i = 0 ; i < total_runs; i++){ 
@@ -41,7 +41,7 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
+    int test_input_size = 2000; 
     int batch_size = 1000;
     int offset = 5000;
     
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
index 0513723b5a4a36984e736b94ee82b9fc3fb2d1f9..6b951cffcaf142bd917abc7f7c04a2c691c472d7 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
@@ -31,9 +31,9 @@ int main(int argc, char* argv[]){
   }
 
   
-  llvm_hpvm_initTensorRt(1); 
-
+  llvm_hpvm_initTensorRt(0); 
 
+  
   int missed = 0; 
   for (int i = 0 ; i < total_runs; i++){ 
 
@@ -43,15 +43,15 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
+    int test_input_size = 2000; 
     int batch_size = 1000;
     int offset = 5000;
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
-    for(int i = 0; i < batch_count; i++){ 
-
-      std::string dir_prefix = std::string("../model_params/alexnet_cifar10_test/"); 
+    for(int i = 0; i < batch_count; i++){
+      
+      std::string dir_prefix = std::string("../model_params/alexnet_cifar10_test/");   
       std::string input_path =  dir_prefix + std::string("input.bin"); 
       std::string labels_path =  dir_prefix + std::string("labels.bin"); 
       std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
@@ -79,6 +79,7 @@ int main(int argc, char* argv[]){
       std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
       void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
+      
 
       int start = i * batch_size + offset; 
       int end = (i + 1) * batch_size + offset; 
@@ -117,3 +118,4 @@ int main(int argc, char* argv[]){
   return 0; 
 
 }
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
index 1cf73cd92a39a14c6a1fdd3965e63bfabee634b1..052809f29b9d89534005e56125e66c5e4a0bd1cf 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
@@ -43,8 +43,10 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
-    int batch_size = 1000; 
+    int test_input_size = 2000; 
+    int batch_size = 1000;
+    int offset = 5000;
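+    // offset skips the first 5000 images, presumably held out for tuning, so
+    // accuracy here is measured on inputs the tuner did not see.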
+    
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
@@ -330,93 +332,93 @@ int main(int argc, char* argv[]){
       void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
 
-      int start = i * batch_size; 
-      int end = (i + 1) * batch_size; 
+      int start = i * batch_size + offset; 
+      int end = (i + 1) * batch_size + offset; 
 
       void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
 
       void* var_0 = ConvLayer_PROMISE(input, -1.9892114, 2.126797, conv2d_1_w, -2.196306920051575, 1.347581704139706, NULL, 0, 0, 1, 1, 1, 1, -1, 0, -1, -60.89275047302246, 51.99256916046146, 9); 
-      void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
+      void* var_1 = tensorHalfBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
       void* var_2 = tensorRelu(var_1); 
       void* var_3 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-      void* var_4 = tensorBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
+      void* var_4 = tensorHalfBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
       void* var_5 = tensorRelu(var_4); 
       void* var_6 = ConvLayer_PROMISE(var_5, 0.0, 5.713541553974245, conv2d_2_w, -0.9317721160650253, 1.0774258937835774, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -6.518589503288269, 6.810842518806449, 9); 
-      void* var_7 = tensorBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
+      void* var_7 = tensorHalfBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
       void* var_8 = tensorRelu(var_7); 
       void* var_9 = tensorConvolution(var_8, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-      void* var_10 = tensorBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
+      void* var_10 = tensorHalfBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
       void* var_11 = tensorRelu(var_10); 
       void* var_12 = ConvLayer_PROMISE(var_11, 0.0, 4.932139402866376, conv2d_3_w, -0.5316544661521911, 0.5753790403604531, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.482631235122681, 3.96730119752885, 9); 
-      void* var_13 = tensorBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
+      void* var_13 = tensorHalfBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
       void* var_14 = tensorRelu(var_13); 
       void* var_15 = tensorConvolution(var_14, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-      void* var_16 = tensorBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
+      void* var_16 = tensorHalfBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
       void* var_17 = tensorRelu(var_16); 
       void* var_18 = ConvLayer_PROMISE(var_17, 0.0, 4.103263397693674, conv2d_4_w, -0.36234098821878435, 0.4076913900375366, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.04261828327179, 3.88677932929993, 9); 
-      void* var_19 = tensorBatchNorm(var_18, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
+      void* var_19 = tensorHalfBatchNorm(var_18, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
       void* var_20 = tensorRelu(var_19); 
       void* var_21 = tensorConvolution(var_20, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-      void* var_22 = tensorBatchNorm(var_21, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
+      void* var_22 = tensorHalfBatchNorm(var_21, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
       void* var_23 = tensorRelu(var_22); 
       void* var_24 = ConvLayer_PROMISE(var_23, 0.0, 5.383221302509475, conv2d_5_w, -0.3131200549006462, 0.29357679939270065, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -5.921469215393066, 4.338679324150087, 9); 
-      void* var_25 = tensorBatchNorm(var_24, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
+      void* var_25 = tensorHalfBatchNorm(var_24, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
       void* var_26 = tensorRelu(var_25); 
       void* var_27 = tensorConvolution(var_26, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-      void* var_28 = tensorBatchNorm(var_27, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
+      void* var_28 = tensorHalfBatchNorm(var_27, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
       void* var_29 = tensorRelu(var_28); 
       void* var_30 = ConvLayer_PROMISE(var_29, 0.0, 4.316738154411368, conv2d_6_w, -0.23299247801303866, 0.2580290257930756, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.207789947509766, 3.932436970710759, 9); 
-      void* var_31 = tensorBatchNorm(var_30, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
+      void* var_31 = tensorHalfBatchNorm(var_30, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
       void* var_32 = tensorRelu(var_31); 
       void* var_33 = tensorConvolution(var_32, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-      void* var_34 = tensorBatchNorm(var_33, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
+      void* var_34 = tensorHalfBatchNorm(var_33, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
       void* var_35 = tensorRelu(var_34); 
       void* var_36 = ConvLayer_PROMISE(var_35, 0.0, 5.830408106803901, conv2d_7_w, -0.20233777219057084, 0.18998308175802117, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -6.298286915779113, 4.848135117530843, 9); 
-      void* var_37 = tensorBatchNorm(var_36, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
+      void* var_37 = tensorHalfBatchNorm(var_36, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
       void* var_38 = tensorRelu(var_37); 
       void* var_39 = tensorConvolution(var_38, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-      void* var_40 = tensorBatchNorm(var_39, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
+      void* var_40 = tensorHalfBatchNorm(var_39, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
       void* var_41 = tensorRelu(var_40); 
       void* var_42 = ConvLayer_PROMISE(var_41, 0.0, 4.446417809963227, conv2d_8_w, -0.17442735651135444, 0.17695830866694454, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.347910885810852, 3.6144364695549145, 9); 
-      void* var_43 = tensorBatchNorm(var_42, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
+      void* var_43 = tensorHalfBatchNorm(var_42, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
       void* var_44 = tensorRelu(var_43); 
       void* var_45 = tensorConvolution(var_44, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-      void* var_46 = tensorBatchNorm(var_45, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
+      void* var_46 = tensorHalfBatchNorm(var_45, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
       void* var_47 = tensorRelu(var_46); 
       void* var_48 = ConvLayer_PROMISE(var_47, 0.0, 4.518095604896667, conv2d_9_w, -0.14546796187758446, 0.15256431668996823, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -3.0287702755928043, 2.9487365779876953, 9); 
-      void* var_49 = tensorBatchNorm(var_48, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
+      void* var_49 = tensorHalfBatchNorm(var_48, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
       void* var_50 = tensorRelu(var_49); 
       void* var_51 = tensorConvolution(var_50, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-      void* var_52 = tensorBatchNorm(var_51, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
+      void* var_52 = tensorHalfBatchNorm(var_51, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
       void* var_53 = tensorRelu(var_52); 
       void* var_54 = ConvLayer_PROMISE(var_53, 0.0, 6.348575634956407, conv2d_10_w, -0.13025874522328376, 0.13558243343234128, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.2293100805282595, 3.5315046372413645, 9); 
-      void* var_55 = tensorBatchNorm(var_54, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
+      void* var_55 = tensorHalfBatchNorm(var_54, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
       void* var_56 = tensorRelu(var_55); 
       void* var_57 = tensorConvolution(var_56, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-      void* var_58 = tensorBatchNorm(var_57, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
+      void* var_58 = tensorHalfBatchNorm(var_57, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
       void* var_59 = tensorRelu(var_58); 
       void* var_60 = ConvLayer_PROMISE(var_59, 0.0, 5.221003110408843, conv2d_11_w, -0.11900172759592534, 0.12536374783515936, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.038203780174255, 4.004009407043483, 9); 
-      void* var_61 = tensorBatchNorm(var_60, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
+      void* var_61 = tensorHalfBatchNorm(var_60, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
       void* var_62 = tensorRelu(var_61); 
       void* var_63 = tensorConvolution(var_62, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-      void* var_64 = tensorBatchNorm(var_63, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
+      void* var_64 = tensorHalfBatchNorm(var_63, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
       void* var_65 = tensorRelu(var_64); 
       void* var_66 = ConvLayer_PROMISE(var_65, 0.0, 5.732498347759442, conv2d_12_w, -0.10839721685647964, 0.11625668607652187, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -3.3111015114784244, 4.462933233261136, 9); 
-      void* var_67 = tensorBatchNorm(var_66, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
+      void* var_67 = tensorHalfBatchNorm(var_66, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
       void* var_68 = tensorRelu(var_67); 
       void* var_69 = tensorConvolution(var_68, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-      void* var_70 = tensorBatchNorm(var_69, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-      void* var_71 = tensorRelu(var_70); 
+      void* var_70 = tensorHalfBatchNorm(var_69, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
+      void* var_71 = tensorHalfRelu(var_70); 
       void* var_72 = ConvLayer_PROMISE(var_71, 0.0, 7.240498211860681, conv2d_13_w, -0.08623744961619377, 0.08859449951350662, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.175431394577027, 6.2043294754027345, 9); 
-      void* var_73 = tensorBatchNorm(var_72, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-      void* var_74 = tensorRelu(var_73); 
+      void* var_73 = tensorHalfBatchNorm(var_72, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
+      void* var_74 = tensorHalfRelu(var_73); 
       void* var_75 = tensorConvolution(var_74, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-      void* var_76 = tensorBatchNorm(var_75, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
+      void* var_76 = tensorHalfBatchNorm(var_75, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
       void* var_77 = tensorRelu(var_76); 
       void* var_78 = ConvLayer_PROMISE(var_77, 0.0, 7.813958834648251, conv2d_14_w, -0.06813025139272214, 0.07002027779817581, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -10.920566423416137, 2.6442912578582534, 9); 
-      void* var_79 = tensorBatchNorm(var_78, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-      void* var_80 = tensorRelu(var_79); 
-      void* var_81 = tensorPooling(var_80,1,2,2,0,0,2,2); 
+      void* var_79 = tensorHalfBatchNorm(var_78, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
+      void* var_80 = tensorHalfRelu(var_79); 
+      void* var_81 = tensorHalfPooling(var_80,1,2,2,0,0,2,2); 
       void* var_82 = FCLayer_PROMISE(var_81, 0.0, 2.8692066650391013, dense_1_w, -0.22301019695401192, 0.1442659378200768, dense_1_b, -0.1654396, 0.23336112, -1, -12.245949958801269, 23.80532513427739, 9); 
       void* var_83 = tensorSoftmax(var_82); 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
index 394ec85390aa4248fd93aefa339ff196f39a5559..42d26d34e65939b410143485a61f23e705906bfc 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
@@ -42,8 +42,10 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
-    int batch_size = 1000; 
+    int test_input_size = 2000; 
+    int batch_size = 1000;
+    int offset = 5000;
+    
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
@@ -188,8 +190,8 @@ int main(int argc, char* argv[]){
       void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
 
-      int start = i * batch_size; 
-      int end = (i + 1) * batch_size; 
+      int start = i * batch_size + offset; 
+      int end = (i + 1) * batch_size + offset; 
 
       void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
index cc0981dc7d1d75ce56388f3135fa0f89f8c688e3..0e5cdd1d284e6c7621cd3331b924c06969be79db 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
@@ -30,7 +30,7 @@ int main(int argc, char* argv[]){
   }
 
 
-  llvm_hpvm_initTensorRt(1); 
+  llvm_hpvm_initTensorRt(0); 
 
   int missed = 0; 
   for (int i = 0 ; i < total_runs; i++){ 
@@ -41,9 +41,10 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
+    int test_input_size = 2000; 
     int batch_size = 1000;
     int offset = 5000;
+    
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
index ec5de9a5e2c2d66be44fdd99b83dd634d8f5b2f9..33c68eae84a075f50b2bc8e7484036c54ade5620 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
@@ -31,7 +31,7 @@ int main(int argc, char* argv[]){
   }
 
 
-  llvm_hpvm_initTensorRt(1); 
+  llvm_hpvm_initTensorRt(0); 
 
   
   int missed = 0; 
@@ -43,8 +43,10 @@ int main(int argc, char* argv[]){
 
    startMemTracking(); 
 
-   int test_input_size = 1000; 
-   int batch_size = 1000; 
+   int test_input_size = 2000; 
+   int batch_size = 1000;
+   int offset = 5000;
+
    int batch_count = test_input_size / batch_size; 
    float final_accuracy = 0.0; 
    
@@ -115,8 +117,9 @@ int main(int argc, char* argv[]){
      void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); 
 
 
-     int start = i * batch_size; 
-     int end = (i + 1) * batch_size; 
+     int start = i * batch_size + offset; 
+     int end = (i + 1) * batch_size + offset;
+     
 
      void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
index 798b5f67aa9636f8e7ad3b9d08b9fc8e53cb137d..ff767235e9d44139f97ad885aa89eef1c385ad33 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
@@ -29,7 +29,7 @@ int main(int argc, char* argv[]){
    to_skip = atoi(argv[3]);   
  }
 
- llvm_hpvm_initTensorRt(1); 
+ llvm_hpvm_initTensorRt(0); 
 
  int missed = 0; 
  for (int i = 0 ; i < total_runs; i++){ 
@@ -40,7 +40,7 @@ int main(int argc, char* argv[]){
    
    startMemTracking(); 
 
-   int test_input_size = 1000; 
+   int test_input_size = 2000; 
    int batch_size = 1000;
    int offset = 5000;
    
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/resnet18_cifar10_cudaperf.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/resnet18_cifar10_cudaperf.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e33715e8c6972966e7359a1e7b8fc5069e1f16f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/resnet18_cifar10_cudaperf.cc
@@ -0,0 +1,221 @@
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h" 
+
+int main(){ 
+
+  llvm_hpvm_initTensorRt(0); 
+  
+  std::string dir_prefix = std::string("../model_params/resnet18_cifar10_3/"); 
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
+  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
+  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
+  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); 
+  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
+  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); 
+  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
+  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); 
+  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
+  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
+  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); 
+  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
+  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
+  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
+  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
+  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
+  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
+  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
+  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
+  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
+  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
+  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); 
+  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
+  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
+  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); 
+  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
+  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
+  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); 
+  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
+  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
+  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
+  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
+  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
+  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
+  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
+  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
+  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
+  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
+  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); 
+  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
+  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+
+
+  startMemTracking();
+
+  int test_input_size = 10000;
+  int batch_size = 2000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+  
+  for(int i = 0; i < batch_count; i++){
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+    
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+    
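+    // tensorConvPerfCuda mirrors tensorConvolution but takes three extra trailing
+    // arguments, which appear to select row/column perforation rates and a start
+    // offset (an assumption based on the varying (3,1,2)-style values below).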
+    void* var_2 = tensorConvPerfCuda(input, conv2d_1_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_3 = tensorAdd(var_2, conv2d_1_b); 
+    void* var_4 = tensorRelu(var_3); 
+    void* var_6 = tensorConvPerfCuda(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_7 = tensorAdd(var_6, conv2d_2_b); 
+    void* var_8 = tensorRelu(var_7); 
+    void* var_10 = tensorConvPerfCuda(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
+    void* var_12 = tensorAdd(var_4, var_11); 
+    void* var_13 = tensorRelu(var_12); 
+    void* var_15 = tensorConvPerfCuda(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_16 = tensorAdd(var_15, conv2d_4_b); 
+    void* var_17 = tensorRelu(var_16); 
+    void* var_19 = tensorConvPerfCuda(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_20 = tensorAdd(var_19, conv2d_5_b); 
+    void* var_21 = tensorAdd(var_13, var_20); 
+    void* var_22 = tensorRelu(var_21); 
+    void* var_24 = tensorConvPerfCuda(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0, 3, 1, 2); 
+    void* var_25 = tensorAdd(var_24, conv2d_6_b); 
+    void* var_26 = tensorRelu(var_25); 
+    void* var_28 = tensorConvPerfCuda(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_29 = tensorAdd(var_28, conv2d_7_b); 
+    void* var_30 = tensorAdd(var_22, var_29); 
+    void* var_31 = tensorRelu(var_30); 
+    void* var_33 = tensorConvPerfCuda(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0, 1, 1, 0); 
+    void* var_34 = tensorAdd(var_33, conv2d_8_b); 
+    void* var_35 = tensorRelu(var_34); 
+    void* var_37 = tensorConvPerfCuda(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_38 = tensorAdd(var_37, conv2d_9_b); 
+    void* var_40 = tensorConvPerfCuda(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0, 1, 1, 0); 
+    void* var_41 = tensorAdd(var_40, conv2d_10_b); 
+    void* var_42 = tensorAdd(var_41, var_38); 
+    void* var_43 = tensorRelu(var_42); 
+    void* var_45 = tensorConvPerfCuda(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0, 3, 1, 0); 
+    void* var_46 = tensorAdd(var_45, conv2d_11_b); 
+    void* var_47 = tensorRelu(var_46); 
+    void* var_49 = tensorConvPerfCuda(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_50 = tensorAdd(var_49, conv2d_12_b); 
+    void* var_51 = tensorAdd(var_43, var_50); 
+    void* var_52 = tensorRelu(var_51); 
+    void* var_54 = tensorConvPerfCuda(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_55 = tensorAdd(var_54, conv2d_13_b); 
+    void* var_56 = tensorRelu(var_55); 
+    void* var_58 = tensorConvPerfCuda(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0, 1, 3, 1); 
+    void* var_59 = tensorAdd(var_58, conv2d_14_b); 
+    void* var_60 = tensorAdd(var_52, var_59); 
+    void* var_61 = tensorRelu(var_60); 
+    void* var_63 = tensorConvPerfCuda(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0, 1, 1, 0); 
+    void* var_64 = tensorAdd(var_63, conv2d_15_b); 
+    void* var_65 = tensorRelu(var_64); 
+    void* var_67 = tensorConvPerfCuda(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_68 = tensorAdd(var_67, conv2d_16_b); 
+    void* var_70 = tensorConvPerfCuda(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0, 3, 1, 2); 
+    void* var_71 = tensorAdd(var_70, conv2d_17_b); 
+    void* var_72 = tensorAdd(var_71, var_68); 
+    void* var_73 = tensorRelu(var_72); 
+    void* var_75 = tensorConvPerfCuda(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_76 = tensorAdd(var_75, conv2d_18_b); 
+    void* var_77 = tensorRelu(var_76); 
+    void* var_79 = tensorConvPerfCuda(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0, 1, 3, 0); 
+    void* var_80 = tensorAdd(var_79, conv2d_19_b); 
+    void* var_81 = tensorAdd(var_73, var_80); 
+    void* var_82 = tensorRelu(var_81); 
+    void* var_84 = tensorConvPerfCuda(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_85 = tensorAdd(var_84, conv2d_20_b); 
+    void* var_86 = tensorRelu(var_85); 
+    void* var_88 = tensorConvPerfCuda(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); 
+    void* var_89 = tensorAdd(var_88, conv2d_21_b); 
+    void* var_90 = tensorAdd(var_82, var_89); 
+    void* var_91 = tensorRelu(var_90); 
+    void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); 
+    void* var_94 = tensorGemmGPU(var_92, dense_1_w); 
+    void* var_95 = tensorAdd(var_94, dense_1_b); 
+    void* var_96 = tensorSoftmax(var_95); 
+
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+    float accuracy = computeAccuracy2(labels,batch_size,var_96); 
+    final_accuracy += accuracy;
+    
+    freeBatchMemory();
+  }
+
+  stopProfiling();
+
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
+
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+
+}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
index bc2d416ce655641c58b304bbc07384c6cada6f8a..dfa411126089849337929c7d9f631cf7e3cd3143 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
@@ -345,15 +345,15 @@ void testTensorGroupedConv(){
   void* x3 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
   // NOTE: Filter descriptors do NOT have batch size
   // NOTE: First two dims are output channels (configurable), input channels (MUST match input channels)
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
+  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
   fillTensorWithOnes(x3);
   fillTensorWithOnes(filter);
 
   int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
   int conv_groups = 2;
   
-  void* conv1 = tensorConvolution(x3, filter, 0, 0,
-				  1, 1, conv_mode, conv_groups);
+  void* conv1 = tensorConvolution(x3, filter, 2, 2,
+				  2, 2, conv_mode, conv_groups);
   printTensorValues(conv1);
 
   // NOTE: For cudnnTensorAdd, the only dimension that MUST match is channels  
@@ -474,6 +474,38 @@ void testQuantization(){
 
 
 
+void testSampleFilter(){
+
+  printf("***** Tensor Sample Filter ***** \n\n");
+  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 2, 2, 2);
+  fillTensorWithVal(input, 3);
+
+  /*  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  host_ptr[0] = -0.1;
+  host_ptr[1] = -25;
+  host_ptr[2] = 0.2;
+  host_ptr[3] = -0.4;
+  host_ptr[4] = 1.7;
+  host_ptr[5] = -2.9;
+  host_ptr[6] = 0.7;
+  host_ptr[7] = 0.99;
+  */
+
+  printTensorValues(input);
+
+  printf("\n\n");
+
+  hpvm_request_tensor(input, DEVICE);
+    
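+  // sampleFilter(input, 2, 1): the two trailing arguments appear to be a sampling
+  // rate and a start offset (assumption; verify against the runtime implementation).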
+  sampleFilter(input, 2, 1);
+
+  hpvm_request_tensor(input, HOST);
+  
+  printTensorValues(input);
+}
+
+
+
 int main(){
 
   llvm_hpvm_initTensorRt(0);
@@ -490,7 +522,7 @@ int main(){
   //testTensorConv();
   //testTensorGroupedConv();
 
-  testTensorBatchNorm();
+  //testTensorBatchNorm();
   
   //testTensorGemm();
   //testTensorGemmGPU();
@@ -499,6 +531,10 @@ int main(){
   //testTensorConv3();
   //testLRN();
 
+
+  testSampleFilter();
+
   stopProfiling();
 
   return 0;
diff --git a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
index 3e48a094b89ac506cf50f712a0d60b1bac95f75d..89c8da90f8ab740062bd84cdd365baa67311a7a4 100644
--- a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
+++ b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
@@ -8,8 +8,8 @@ define void @_Z13dummyFunctionv() #0 {
 entry:
   %initRT = alloca i8*, align 8
   %cleanRT = alloca i8*, align 8
-  %initApproxhpvmRT = alloca i8*, align 8
-  %cleaApproxhpvmRT = alloca i8*, align 8
+  %initApproxRT = alloca i8*, align 8
+  %cleanApproxRT = alloca i8*, align 8
   %initRTController = alloca i8*, align 8
   %cleanRTController = alloca i8*, align 8
   %request_tensorPtr = alloca i8*, align 8
@@ -44,17 +44,18 @@ entry:
   %ConvLayer = alloca i8*, align 8
   %FCLayer = alloca i8*, align 8
   %ConvLayer2 = alloca i8*, align 8
+  %ConvLayer3 = alloca i8*, align 8
   %FCLayer2 = alloca i8*, align 8
   %AddWrapper = alloca i8*, align 8
   %ReluWrapper = alloca i8*, align 8
   %TanhWrapper = alloca i8*, align 8
   %BatchNormWrapper = alloca i8*, align 8
   %PoolingWrapper = alloca i8*, align 8
-  %SoftmaxWrapper = alloca i8*, align 8
+  %softmaxWrapper = alloca i8*, align 8
   store i8* bitcast (void (i32)* @llvm_hpvm_initTensorRt to i8*), i8** %initRT, align 8
   store i8* bitcast (void ()* @llvm_hpvm_cleanupTensorRt to i8*), i8** %cleanRT, align 8
-  store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxhpvmRT, align 8
-  store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleaApproxhpvmRT, align 8
+  store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxRT, align 8
+  store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleanApproxRT, align 8
   store i8* bitcast (void (i8*, i8*)* @llvm_hpvm_initializeRuntimeController to i8*), i8** %initRTController, align 8
   store i8* bitcast (void ()* @llvm_hpvm_clearRuntimeController to i8*), i8** %cleanRTController, align 8
   store i8* bitcast (void (i8*, i32)* @hpvm_request_tensor to i8*), i8** %request_tensorPtr, align 8
@@ -89,13 +90,14 @@ entry:
   store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, i32, i32, i32, i32, i32, i32, float, float, i32)* @ConvLayer_PROMISE to i8*), i8** %ConvLayer, align 8
   store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, float, float, i32)* @FCLayer_PROMISE to i8*), i8** %FCLayer, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float)* @wrapper_ConvLayer to i8*), i8** %ConvLayer2, align 8
+  store i8* bitcast (i8* (i8*, i8*, i8*, i32, i32, i32, i32, i32, i32)* @wrapper_tensorGroupConvolution to i8*), i8** %ConvLayer3, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, float, float)* @wrapper_FCLayer to i8*), i8** %FCLayer2, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*)* @wrapper_tensorAdd to i8*), i8** %AddWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorRelu to i8*), i8** %ReluWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorTanh to i8*), i8** %TanhWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i8*, i8*, double)* @wrapper_tensorBatchNorm to i8*), i8** %BatchNormWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*, i32, i32, i32, i32, i32, i32, i32)* @wrapper_tensorPooling to i8*), i8** %PoolingWrapper, align 8
-  store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %SoftmaxWrapper, align 8
+  store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %softmaxWrapper, align 8
   ret void
 }
 
@@ -175,6 +177,8 @@ declare i8* @FCLayer_PROMISE(i8*, float, float, i8*, float, float, i8*, float, f
 
 declare i8* @wrapper_ConvLayer(i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float) #1
 
+declare i8* @wrapper_tensorGroupConvolution(i8*, i8*, i8*, i32, i32, i32, i32, i32, i32) #1
+
 declare i8* @wrapper_FCLayer(i8*, i8*, i8*, i8*, i32, float, float) #1
 
 declare i8* @wrapper_tensorAdd(i8*, i8*, i8*) #1
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8145e179893bc0db2631cf1f7ee0f11bcc9be0e
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py
@@ -0,0 +1,318 @@
+#!/usr/bin/env python
+#
+# Algorithmic Approximation Tuning
+# Purpose: Tunes for Perforation, Sampling, Numerical Precision (FP16)
+
+
+import adddeps  
+
+import argparse
+import opentuner
+from opentuner import ConfigurationManipulator
+from opentuner import MeasurementInterface
+from opentuner import Result
+from opentuner import EnumParameter
+from opentuner.search.objective import ThresholdAccuracyMinimizeTime
+from opentuner.measurement.inputmanager import FixedInputManager
+import shutil
+import os
+import sys
+import subprocess
+
+from measure_confidence2 import dump_promise_confidence_files3
+from measure_confidence2 import getConfidence, getMinAccuracy
+from select_top_results import select_top_results
+from time import sleep
+from pareto_curve import findParetoConfigs
+
+
+
+
+class TunerData:
+  def __init__(self):
+    self.binary_path = ""
+    self.output_dir = ""
+    self.num_layers = 0
+    self.knobs_list = []
+    self.knobs_speedup = {}
+    self.accuracy_threshold = 0
+    self.test_id = 0
+    self.layer_costs = []
+    self.tuning_flags = []
+    self.autotuner_runs = 0
+    
+
+
+tunerData = TunerData()
+
+
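+# Reads the per-layer cost file: one float (op count) per line, as in the
+# op_cost.txt files under opentuner/data/ (e.g. data/alexnet/op_cost.txt).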
+def readCostFile(file_path):
+
+  layer_costs = []
+  f = open(file_path)
+  for x in f:
+    cost = float(x.strip())
+    layer_costs.append(cost)
+
+  print ("len(layer_costs) = ", layer_costs)
+  f.close()
+
+  return layer_costs
+
+  
+
+def getAccuracy(file_name):
+  
+  file = open(file_name, "r")
+  acc_str = file.read()
+  file.close()
+  
+  try:
+    accuracy = float(acc_str)
+  except:
+    # Parsing failed - return a low fallback accuracy so this config is rejected
+    return 20
+    
+  print (accuracy)
+  return accuracy
+    
+
+
+def createFlagsFile(file_name, cfg):
+
+  f = open(file_name, "w+")
+  cmd_config = ""
+  for i in range(tunerData.num_layers):
+    flag = tunerData.tuning_flags[i]
+    flag_value = cfg[flag]
+    cmd_config += str(flag_value) + "\n"
+    
+  f.write(cmd_config)
+  f.close()
+
+  
+
+def readLayerKnobs(file_path):
+
+  f = open(file_path, "r")
+  knobs_list = []
+  for x in f:
+    knobs = []
+    vals = x.split(",")
+    for val in vals:
+      knobs.append(int(val))
+      
+    knobs_list.append(knobs)
+
+  print ("knobs_list = ", knobs_list)
+  
+  return knobs_list
+
+
+
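+# Parses the knob config file (e.g. data/global_knobs.txt), where each line
+# is tab-separated: "<kind>,<ID>\t<params>\t<speedup>\t<op>\t<approx_op>".
+# Returns a map from knob ID to its estimated speedup.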
+def readKnobConfig(file_path):
+
+  knobs_speedup = {}
+  f = open(file_path, "r")
+  for x in f:
+    toks = x.split("\t")
+    ID = int(toks[0].split(",")[1])
+
+    speedup = float(toks[2])
+    knobs_speedup[ID] = speedup
+  
+  print ("knobs_speedup = ", knobs_speedup)
+  
+  return knobs_speedup
+
+
+
+
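+# Estimates the cost of a configuration: each layer's op cost is divided by
+# the speedup of its selected knob. For example, two layers of cost 100 with
+# knob speedups 2.0 and 1.0 give total_cost = 150 and speedup = 200/150 ~= 1.33.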
+def getConfigCost(cfg):
+
+  orig_cost = 0.0
+  total_cost = 0.0
+  for it in range(tunerData.num_layers):
+    flag = tunerData.tuning_flags[it]
+    flag_value = cfg[flag]
+    op_cost = tunerData.layer_costs[it]
+    speedup = tunerData.knobs_speedup[flag_value]
+
+    total_cost += op_cost / speedup
+    orig_cost += op_cost
+
+  speedup = orig_cost / total_cost
+  
+  return total_cost, speedup
+
+
+
+def appendTopLine(f_path, accuracy, total_runs, total_comps, speedup):
+
+  f_str = open(f_path, "r").read()
+
+  f_out = open(f_path, "w+")
+
+  f_out.write("total_runs=" + str(total_runs) + "\tconfidence=100.0" + "\tavg_accuracy=" + str(accuracy) + "\tconfig_cost=" + str(total_comps) + "\tspeedup=" + str(speedup) + "\n" )
+  f_out.write(f_str)
+
+  f_out.close()
+      
+
+
+
+
+class ClangFlagsTuner(MeasurementInterface):
+
+  def __init__(self, args):
+    objective = ThresholdAccuracyMinimizeTime(tunerData.accuracy_threshold)
+    input_manager = FixedInputManager(size=tunerData.num_layers)
+    self.configs_list = []
+
+    super(ClangFlagsTuner, self).__init__(
+        args, program_name=args.binary,
+        program_version=self.file_hash(args.binary),
+        input_manager=input_manager, objective=objective)
+
+
+    
+
+  def manipulator(self):
+    """
+    Define the search space by creating a
+    ConfigurationManipulator
+    """
+    manipulator = ConfigurationManipulator()
+
+    for i in range(tunerData.num_layers):
+      tunerData.tuning_flags.append("flag" + str(i))
+
+         
+    for ind in range(tunerData.num_layers):
+        flag = tunerData.tuning_flags[ind]
+        manipulator.add_parameter(
+            EnumParameter(flag, tunerData.knobs_list[ind]))
+
+        print ("ind = ", ind, " len = ", len(tunerData.knobs_list))
+        print (tunerData.knobs_list[ind])
+      
+    return manipulator
+
+  
+  
+  def run(self, desired_result, input, limit):
+    
+    """
+    Run  a given configuration then
+    return performance
+    """
+    global test_id
+    
+    cfg = desired_result.configuration.data
+    
+    # NOTE: creates the file with flags read by the runtime
+    createFlagsFile("promise_flags", cfg)
+    
+    run_cmd = tunerData.binary_path
+    print ("\nbinary_path = ", run_cmd)
+
+
+    total_runs = 1 # NOTE: Single run sufficient in Algorithmic Approx Tuner
+    FNULL = open(os.devnull, 'wb')
+    p = subprocess.Popen([run_cmd, str(total_runs)], stdout = FNULL)
+    p.wait()
+
+       
+    accuracy = getAccuracy("final_accuracy")
+    
+    # getConfigCost returns the cost associated with the selected configuration
+    total_comps, speedup = getConfigCost(cfg)
+   
+    
+    # Use a lowercase name to avoid shadowing the imported Result class
+    result = opentuner.resultsdb.models.Result()
+    result.time = total_comps
+    min_accuracy = getMinAccuracy("run_accuracies.txt")
+    print ("min_accuracy = ", min_accuracy)
+    result.accuracy = min_accuracy
+    
+    if min_accuracy > tunerData.accuracy_threshold:
+      config_tuple = (total_comps, accuracy, cfg)
+      self.configs_list.append(config_tuple)
+      f_path = tunerData.output_dir + '/' + tunerData.binary_path + '_' + str(tunerData.test_id)
+      shutil.copy('promise_flags', f_path)
+
+      appendTopLine(f_path, accuracy, total_runs, total_comps, speedup)
+
+      f_acc = open(tunerData.output_dir + '/' + tunerData.binary_path + '_' + str(tunerData.test_id) + "_accuracy", "w")
+      f_acc.write(str(accuracy))
+      f_acc.close()
+                   
+      
+    tunerData.test_id += 1
+    
+    return result
+
+
+  def save_final_config(self, configuration):
+
+    print "Done with Autotuning Run \n"
+    sleep(2)
+
+    print "Final configuration", configuration.data
+
+    return
+
+  
+
+
+if __name__ == '__main__':
+
+  argparser = argparse.ArgumentParser(parents=opentuner.argparsers())
+  argparser.add_argument('--binary', help='path to target binary')
+  argparser.add_argument('--num-layers', type=int, help='num of flags to tune')
+  argparser.add_argument('--accuracy', type=float, help='accuracy threshold')
+  argparser.add_argument('--result-dir', help='result directory')
+  argparser.add_argument('--cost-file', help='per-layer op cost file (one float per line)')
+  argparser.add_argument('--knobs-config', help='knob settings and ID mapping')
+  argparser.add_argument('--layer-knobs', help='per-layer Knobs')
+  
+  
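+  # Example invocation (paths and the accuracy target are illustrative):
+  #   python algo_tuner.py --binary ./alexnet2_cifar10 --num-layers 7 \
+  #       --accuracy 84.0 --result-dir tuner_results \
+  #       --cost-file data/alexnet2/op_cost.txt \
+  #       --knobs-config data/global_knobs.txt \
+  #       --layer-knobs data/alexnet2/knobs.txt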
+  args = argparser.parse_args()
+
+  tunerData.binary_path = str(args.binary)
+  tunerData.num_layers = int(args.num_layers)
+  tunerData.accuracy_threshold = float(args.accuracy)
+
+
+  # NOTE: Read the per-layer cost file (op counts) to better guide the autotuner
+  cost_file_path = args.cost_file
+  tunerData.layer_costs = readCostFile(cost_file_path)
+
+  
+  tunerData.knobs_list = readLayerKnobs(args.layer_knobs)
+  tunerData.knobs_speedup = readKnobConfig(args.knobs_config)
+  
+  result_dir = args.result_dir
+  if not result_dir:
+    print ("Provide --result-dir ")
+    sys.exit(1)
+
+  tunerData.output_dir = result_dir + "/high_confidence/"
+  if not os.path.exists(result_dir):
+    os.mkdir(result_dir)
+    
+  if not os.path.exists(tunerData.output_dir):
+    print("Creating output directory = ", tunerData.output_dir)
+    os.mkdir(tunerData.output_dir)
+
+
+    
+  ClangFlagsTuner.main(argparser.parse_args())
+
+  
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d2fc2c9493453f55cb83094373b19a24b59135d4
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt
@@ -0,0 +1,6 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..04336fca2708d5e5d78849e1c12014f5ddbd1ad7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt
@@ -0,0 +1,6 @@
+11894784.000000
+39321600.000000
+21233664.000000
+28311552.000000
+18874368.000000
+20480.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..063ba473d6a7fa57d7572c86dde9beac0932163d
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt
@@ -0,0 +1,7 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a5722f202dde469dca94c71dd9c5fc1cd7aa32b
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt
@@ -0,0 +1,7 @@
+88473.601562
+943718.375000
+471859.187500
+943718.375000
+471859.187500
+943718.375000
+2048.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d0a974015d74c2a08659deb6e4f664bebbbe83a9
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt
@@ -0,0 +1,14 @@
+fp32,11	-1	1.0	tensorConvolution	tensorConvolution
+fp16,12	-1	1.5	tensorConvolution	tensorHalfConvolution
+perf,21	1,2,0	2.25	tensorConvolution	tensorConvPerfCuda
+perf,22	1,2,1	2.25	tensorConvolution	tensorConvPerfCuda
+perf,23	1,3,0	1.88	tensorConvolution	tensorConvPerfCuda
+perf,24	1,3,1	1.88	tensorConvolution	tensorConvPerfCuda
+perf,25	2,1,0	2.25	tensorConvolution	tensorConvPerfCuda
+perf,26	2,1,1	2.25	tensorConvolution	tensorConvPerfCuda
+perf,27	3,1,0	1.88	tensorConvolution	tensorConvPerfCuda
+perf,28	3,1,1	1.88	tensorConvolution	tensorConvPerfCuda
+samp,31	2,0	2.25	tensorConvolution	tensorConvSampSim
+samp,32	2,1	2.25	tensorConvolution	tensorConvSampSim
+samp,33	4,0	1.8	tensorConvolution	tensorConvSampSim
+samp,34	4,1	1.8	tensorConvolution	tensorConvSampSim
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be1ce58c95981535ec94a7f8badffe967cfed586
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt
@@ -0,0 +1,4 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..74b1b668e2f27f3ddb77dcac7fff9890c70a6f02
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt
@@ -0,0 +1,4 @@
+62720.000000
+1003520.000000
+321126.406250
+1024.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6719acb97a58bd7f3d9fbe428f755e13df98b3d0
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt
@@ -0,0 +1,15 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673e704b7e37e19c090e98799189a4411bad9f7c
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt
@@ -0,0 +1,28 @@
+88473.601562
+29491.199219
+209715.203125
+14745.599609
+209715.203125
+29491.199219
+419430.406250
+7372.799805
+209715.203125
+14745.599609
+419430.406250
+3686.399902
+209715.203125
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+1843.199951
+209715.203125
+3686.399902
+419430.406250
+1024.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..719d96e48168a477d6edfee1a02b80b554612ec7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt
@@ -0,0 +1,8 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7266441905a08c1ef1796dec8ee6c05660998378
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt
@@ -0,0 +1,8 @@
+265420.812500
+629145.625000
+629145.625000
+1258291.250000
+629145.625000
+1258291.250000
+629145.625000
+6144.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b7ff033cec2b85390ce6c7667fbbb04837a7eaf9
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt
@@ -0,0 +1,22 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdba070cfc5eac559c8384306993fb52a1eb2e04
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt
@@ -0,0 +1,22 @@
+44236.800781
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+117964.796875
+235929.593750
+13107.200195
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+117964.796875
+235929.593750
+13107.200195
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+64.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt
@@ -0,0 +1,15 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f58ebcc043915d28cf874a1f67e5b2637db1dfc
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt
@@ -0,0 +1,15 @@
+88473.601562
+1887436.750000
+943718.375000
+1887436.750000
+943718.375000
+1887436.750000
+1887436.750000
+943718.375000
+1887436.750000
+1887436.750000
+471859.187500
+471859.187500
+471859.187500
+13107.200195
+256.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt
@@ -0,0 +1,15 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8c6daad2e2902e3ac821d99ebbe12e21b6428cc7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt
@@ -0,0 +1,15 @@
+884736.000000
+18874368.000000
+9437184.000000
+18874368.000000
+9437184.000000
+18874368.000000
+18874368.000000
+9437184.000000
+18874368.000000
+18874368.000000
+4718592.000000
+4718592.000000
+4718592.000000
+131072.000000
+25600.000000
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
index 6b38acc6577c1f804ae47d1cb6539b35ea07cf0f..2dc985a0c14ebc18a68d5e54f78bd416f9d3b523 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
@@ -1,5 +1,7 @@
 
 
+#include "tensor.h"
+
 
 extern "C"{
 
@@ -25,4 +27,18 @@ extern "C"{
   void* tensorConvolutionKernelSamp(void* input, void* filter_ptr,
 				    int vertical_pad, int horizontal_pad, int vertical_stride,
 				    int horizontal_stride, int conv_mode, int conv_groups, int skip_every);
+
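+  // Perforated convolution on the GPU: 'row'/'col' choose the perforation
+  // pattern and 'start' is the skip offset, matching the (row, col, offset)
+  // perf-knob params in opentuner/data/global_knobs.txt.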
+  void* tensorConvPerfCuda(void* input, void* filter,
+			   int vertical_pad, int horizontal_pad,
+			   int vertical_stride, int horizontal_stride,
+			   int conv_mode, int conv_groups, int row, int col, int start);
+
+  
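+  // Zeroes every skip_rate-th filter element (offset by skip_offset) and
+  // rescales the rest; used by tensorConvSampSim below to simulate sampling.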
+  void sampleFilter(Tensor* filter, int skip_rate, int skip_offset);
+
+  void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
+			  int vertical_pad, int horizontal_pad,
+			  int vertical_stride, int horizontal_stride,
+			  int conv_mode, int conv_groups,
+			  int skip_rate, int skip_offset);
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
index 28a7f8c8fc7cadfed7e25840a2eb9308d5350336..66070f3058d840e4dbe25919e33aa8abc060b330 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
@@ -1,5 +1,24 @@
 
 
+#ifndef SIM_HEADER
+#define SIM_HEADER
+
+
+
+#include "tensor_runtime.h"
+#include "tensor_utils.cu"
+#include "debug.h"
+#include "profiling.h"
+#include "fp16_conversion.h"
+#include "global_data.h"
+#include "error.h"
+#include "tensor.h"
+#include "op_overheads.h"
+#include "half_precision_api.h"
+#include "approx_techniques2.h"
+#include <unordered_map>
+
+
 
 
 //N is new_data's size
@@ -49,11 +68,11 @@ void postInterpolateCol(int N, int n, int c, int h, int w, float* data, int int_
 
 
 
-// Perforated Tensor Conv with 'perforation_rate' parameter
-void* tensorConvPerf2(void* input_ptr, void* filter_ptr,
-		      int vertical_pad, int horizontal_pad,
-		      int vertical_stride, int horizontal_stride,
-		      int conv_mode, int conv_groups, int row, int col){
+// A 'Simulation' of perforated tensor convolution
+void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
+			int vertical_pad, int horizontal_pad,
+			int vertical_stride, int horizontal_stride,
+			int conv_mode, int conv_groups, int row, int col){
   
 
   INFO("*** TensorConvolution \n");
@@ -65,15 +84,14 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr,
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
+  
   if(conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
   else if(conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
-  // FIXIT: Need to be more aware of the implications of alpha and beta
   float alpha = 1.0f, beta = 0.0f;
 
-  // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
@@ -92,8 +110,7 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr,
   int new_v = vertical_stride + 0;
   int new_h = horizontal_stride + 0;
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  // FIXIT: Think if upscaling values need to be configurable?
-  // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
+  
   checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
 					     vertical_pad, horizontal_pad, // conv padding
 					     new_v, new_h, // conv strides
@@ -128,7 +145,8 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr,
   // NOTE: Necessary to insert the above call for every output tensor
 
   DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
+	output->data_type, output->data_format, output->dims.dim_sizes[0],
+	output->dims.dim_sizes[1],
 	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
 
   if(convDesc == NULL || input->tensor_desc == NULL ||
@@ -136,10 +154,6 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr,
     ERROR("NULL descriptor! \n");
 
 
-  // Debugging info prints
-  printTensorDescInfo(input);
-  printTensorDescInfo(filter);
-  printTensorDescInfo(output);
 
   // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
   checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
@@ -197,10 +211,1120 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr,
 				         (float *) output->gpu_data, col);
 
 
-  //cudaDeviceSynchronize();
+  profileEvent("tensorConv_end", true);
+
+  return output;
+}
+
+
+
+
+
+// N is the total number of filter elements
+// n, c, h, w are the filter dimensions
+__global__
+void sampleFilterElems(int N,
+		       int n, int c, int h, int w,
+		       float* data,
+		       int skip_elem, int skip_offset, float mul_factor){
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < N; i += stride){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n = i / (c * h * w);
+
+    //int local_index = row * w + col;
+    int local_index = (ch * (h * w)) + (row * w) + col;
+    
+    if(local_index % skip_elem  == skip_offset)
+       data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0;
+    else
+      data[n * (c * h * w) + ch * (h * w) + row * (w) + col] *= mul_factor;
+      
+  }
+}
+
+
+
+
+
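+// Samples the filter in place: every element whose per-image index equals
+// skip_offset (mod skip_rate) is zeroed, and the remaining elements are
+// scaled by skip_rate / (skip_rate - 1) to preserve the expected filter sum.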
+void sampleFilter(Tensor* filter, int skip_rate, int skip_offset){
+
+  int n = filter->dims.dim_sizes[0];
+  int c = filter->dims.dim_sizes[1];
+  int h = filter->dims.dim_sizes[2];
+  int w = filter->dims.dim_sizes[3];
+    
+  int numBlocks = (n * c * h * w  + 127) / 128;
+  int N = n * c * h * w;
+  // Cast to float - integer division would truncate (e.g. 4 / 3 -> 1)
+  float mul_factor = ((float) skip_rate) / (skip_rate - 1);
+
+  printf ("mul_factor = %f \n", mul_factor);
+  
+  sampleFilterElems<<<numBlocks,128>>>(N,
+				       n, c, h, w,
+				       (float *) filter->gpu_data,
+				       skip_rate, skip_offset, mul_factor);
+
+}
+
+
+
+// A 'Simulation' of sampled tensor convolution - samples (zeroes) filter elements
+void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
+			int vertical_pad, int horizontal_pad,
+			int vertical_stride, int horizontal_stride,
+			int conv_mode, int conv_groups,
+			int skip_rate, int skip_offset){
+  
+
+  INFO("*** TensorConvolution \n");
+  profileEvent("tensorConv");
+
+  Tensor* input = (Tensor*) input_ptr;
+  Tensor* filter = (Tensor*) filter_ptr;
+
+  
+  cudnnConvolutionDescriptor_t convDesc;
+  cudnnConvolutionFwdAlgo_t convAlgo;  
+  cudnnConvolutionMode_t mode;
+  
+  if(conv_mode == 0)
+    mode = CUDNN_CONVOLUTION;
+  else if(conv_mode == 1)
+    mode = CUDNN_CROSS_CORRELATION;
+
+  float alpha = 1.0f, beta = 0.0f;
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  convertToFP32(input);
+  convertToFP32(filter);
+
+  
+  // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
+  sampleFilter(filter, skip_rate, skip_offset);
+  
+
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
+
+  checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
+
+  //FIXME: Current hack to preserve backward compatibility
+  if(conv_groups == 0){
+    conv_groups = 1;
+  }
+
+  // NOTE: Adding support for grouped convolution
+  checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
+
+  int new_v = vertical_stride + 0;
+  int new_h = horizontal_stride + 0;
+  cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
+  
+  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
+					     vertical_pad, horizontal_pad, // conv padding
+					     new_v, new_h, // conv strides
+					     1, 1, // upscaling values
+					     mode , // mode is configurable
+					     computeType)); // defines compute precision
+
+  int n, c, h, w; // output dimensions
+  // Find dimension of convolution output
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
+						   input->tensor_desc,
+						   filter->filter_desc,
+						   &n, &c, &h, &w));
+
+
+  DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+
+  Tensor* output;
+  output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
+				      CUDNN_TENSOR_NCHW, n, c, h, w);
+  
+
+  // NOTE: Changing output tensor placement from host to device
+  changeTensorPlacement(output, DEVICE);
+  // NOTE: Necessary to insert the above call for every output tensor
+
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
+	output->data_type, output->data_format, output->dims.dim_sizes[0],
+	output->dims.dim_sizes[1],
+	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+
+  if(convDesc == NULL || input->tensor_desc == NULL ||
+     filter->filter_desc == NULL || output->tensor_desc == NULL)
+    ERROR("NULL descriptor! \n");
+
+
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
+						 input->tensor_desc,
+						 filter->filter_desc,
+						 convDesc,
+						 output->tensor_desc,
+						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+						 0,
+						 &convAlgo));
+
+
+  DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
+	CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+	CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
+
+
+  // NOTE: Using GEMM-based Algo
+  convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+
+  size_t workspace_size;
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
+						     input->tensor_desc,
+						     filter->filter_desc,
+						     convDesc,
+						     output->tensor_desc,
+						     convAlgo,
+						     &workspace_size));
+
+  // Allocating memory for the convolution workspace
+  void* workspace;
+  checkCudaErrors(cudaMalloc(&workspace, workspace_size));
+  DEBUG("workspace size = %d \n", workspace_size);
+
+
+  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
+				     input->gpu_data, filter->filter_desc, filter->gpu_data,
+				     convDesc, convAlgo, workspace, workspace_size,
+				     &beta, output->tensor_desc, output->gpu_data));
+
+
+ 
 
   profileEvent("tensorConv_end", true);
 
   return output;
 }
 
+
+
+
+
+
+
+
+
+
+
+/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ 
+
+
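+// PROMISE-simulated Conv layer: quantize input/filter/bias, inject aRead
+// error, then Conv -> (optional) BiasAdd -> (optional) MaxPool -> Activation,
+// and finally re-quantize the activation output (for swing < 8).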
+void* PROMISE_Conv(void* input, float i_min, float i_max,
+		   void* filter, float w_min, float w_max,
+		   void* bias, float b_min, float b_max,
+		   int conv_pad_h, int conv_pad_w,
+		   int conv_stride_h, int conv_stride_w,
+		   int pool_id, int pool_size,
+		   int activation_id, // Relu, Tanh, ClipRelu
+		   float out_min, float out_max, int swing){ 
+
+
+  Tensor* input_t = (Tensor*) input;
+  Tensor* filter_t = (Tensor*) filter;
+  Tensor* bias_t = (Tensor*) bias;
+  
+  int orig_type = input_t->cur_type;
+
+  DEBUG("FP32 conversions \n");
+  
+  convertToFP32(input_t);
+
+  convertToFP32(filter_t);
+  convertToFP32(bias_t);
+
+  DEBUG("DONE FP32 conversions \n");
+  
+
+  if(swing < 8){
+    input = quantizeTensorPromise(input, i_min, i_max);
+    filter = quantizeTensorPromise(filter, w_min, w_max);
+    if(bias != NULL)
+      bias = quantizeTensorPromise(bias, b_min, b_max);
+    // NOTE: Modelling aRead error in PROMISE
+    
+    input = addPromiseError(input, swing);
+  }
+
+  
+  void* conv_out;
+  conv_out = tensorConvolution(input, filter,
+			       conv_pad_h, conv_pad_w,
+			       conv_stride_h, conv_stride_w,
+			       1, 0);
+  
+  void* conv_add;
+  if(bias != NULL){
+    conv_add = tensorAdd(conv_out, bias);
+  }
+  else{
+    conv_add = conv_out;
+  }
+
+  void* pool_out;
+  // NOTE: Skip pooling when pool_size is not positive
+  if(pool_size > 0){
+    //FIXME: Currently only using MaxPooling
+    pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size);
+  }
+  else{
+    pool_out = conv_add;
+  }
+  
+  void* activation_out;  
+  switch(activation_id){
+  case -1:
+    activation_out = pool_out;
+    INFO("NO Activation Function \n");
+    break;
+  case 0:
+    activation_out = tensorTanh(pool_out);
+    break;
+  case 1:
+    activation_out = tensorRelu(pool_out);
+    break;
+  case 2:
+    activation_out = tensorRelu2(pool_out, out_min, out_max);
+    break;
+  default:
+    ERROR("Activation id %d NOT supported \n", activation_out);
+    break;
+  }
+
+
+  if(swing < 8 && activation_id != -1){
+    activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
+  }
+
+
+
+  //NOTE: Convert back to FP16 if original type
+  if (orig_type == half_type){
+    convertToFP16((Tensor*) activation_out);
+  }
+
+  
+  return activation_out;
+}
+
+
+
+void* PROMISE_FC(void* input, float i_min, float i_max,
+		 void* weights, float w_min, float w_max,
+		 void* bias, float b_min, float b_max,
+		 int activation_id,
+		 float out_min, float out_max, int swing){
+
+
+  Tensor* input_t = (Tensor*) input;
+  Tensor* weights_t = (Tensor*) weights;
+  Tensor* bias_t = (Tensor*) bias;
+  
+  int orig_type = input_t->cur_type;
+  
+  convertToFP32(input_t);
+  convertToFP32(weights_t);
+  convertToFP32(bias_t);
+  
+  
+  if(swing < 8){
+    input = quantizeTensorPromise(input, i_min, i_max);
+    weights = quantizeTensorPromise(weights, w_min, w_max);
+    if(bias != NULL)
+      bias = quantizeTensorPromise(bias, b_min, b_max);
+
+    // NOTE: Modelling aRead error in PROMISE
+    input = addPromiseError(input, swing);
+  }
+
+
+  
+  void* gemm_out;
+  gemm_out = tensorGemmGPU(input, weights);
+
+  
+  void* gemmbias_out;
+  if(bias != NULL){
+    gemmbias_out = tensorAdd(gemm_out, bias);
+  }
+  else{
+    gemmbias_out = gemm_out;
+  }
+ 
+  void* activation_out;
+  switch(activation_id){
+
+  case -1:
+    activation_out = gemmbias_out;
+    INFO("No Activation Function \n");
+    break;
+  case 0:
+    activation_out = tensorTanh(gemmbias_out);
+    break;
+  case 1:
+    activation_out = tensorRelu(gemmbias_out);
+    break;
+  case 2:
+    activation_out = tensorRelu2(gemmbias_out, out_min, out_max);
+    break;
+  default:
+    ERROR("Activation id %d NOT supported \n", activation_out);
+    break;
+  }
+  
+  
+  if(swing < 8 && activation_id != -1){
+    activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
+  }
+
+
+  //NOTE: Convert back to FP16 if original type
+  if (orig_type == half_type){
+    convertToFP16((Tensor*) activation_out);
+  }
+
+
+  
+  return activation_out;
+}
+
+
+
+
+
+// NOTE: Enable the macro below to test against the old PROMISE wrapper
+//#define OLD_MODEL
+
+#ifndef OLD_MODEL
+
+
+
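+// Knob ('swing') encoding used below: values < 8 select PROMISE analog
+// swings, 11 = FP32, 12 = FP16, 21-29 = perforation variants, and
+// 31-39 = sampling variants (cf. opentuner/data/global_knobs.txt).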
+bool isPromise(int swing){
+  return swing < 8;
+}
+
+
+bool isFullPrecision(int swing){
+  return swing == 11;
+}
+
+
+bool isHalfPrecision(int swing){
+  return swing == 12;
+}
+
+
+bool isPerforation(int swing){
+  return swing >= 21 && swing <= 29;
+}
+
+
+bool isSampling(int swing){
+  return swing >= 31 && swing <= 39;
+}
+
+
+int getSwing(int swing){
+
+  #ifdef PROMISE_TUNER_ENABLED
+
+  // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
+  if(!approxhpvm_runtime_mode){
+  
+    if(op_counter >= total_ops){
+      ERROR("No accuracy flag found \n");
+    }
+  
+    swing = op_accuracies[op_counter];
+    op_counter++;
+  }
+
+  #endif  
+
+   DEBUG("---- swing_value = %d \n", swing);  
+
+   return swing;
+}
+
+
+
+
+
+
+class PerfParams{
+
+ public:
+  int row;
+  int col;
+  int skip_offset;
+
+  PerfParams(){
+    row = 1;
+    col = 1;
+    skip_offset = 0;
+  }
+  
+  PerfParams(int row1, int col1, int skip_offset1){
+    row = row1;
+    col = col1;
+    skip_offset = skip_offset1;
+  }
+ 		
+};
+
+
+
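+// Maps perforation knob IDs (21-28) to their (row, col, skip_offset) params;
+// this table mirrors the perf entries in opentuner/data/global_knobs.txt.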
+PerfParams getPerfParams(int swing){
+
+  std::map<int, PerfParams> perf_knob_map;
+
+  PerfParams params21(1, 2, 0);
+  perf_knob_map[21] = params21;
+
+  PerfParams params22(1, 2, 1);
+  perf_knob_map[22] = params22;
+
+  PerfParams params23(1, 3, 0);
+  perf_knob_map[23] = params23;
+
+  PerfParams params24(1, 3, 1);
+  perf_knob_map[24] = params24;
+
+  PerfParams params25(2, 1, 0);
+  perf_knob_map[25] = params25;
+
+  PerfParams params26(2, 1, 1);
+  perf_knob_map[26] = params26;
+
+  PerfParams params27(3, 1, 0);
+  perf_knob_map[27] = params27;
+
+  PerfParams params28(3, 1, 1);
+  perf_knob_map[28] = params28;
+
+  
+  return perf_knob_map[swing];
+  
+}
+
+
+
+
+class SampParams{
+
+ public:  
+  int skip_rate;
+  int skip_offset;
+
+  SampParams(){
+    skip_rate = 1;
+    skip_offset = 0;
+  }
+  
+  SampParams(int skip_rate1, int skip_offset1){
+    skip_rate = skip_rate1;
+    skip_offset = skip_offset1;
+  }
+ 		
+};
+
+
+
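+// Maps sampling knob IDs (31-34) to their (skip_rate, skip_offset) params;
+// this table mirrors the samp entries in opentuner/data/global_knobs.txt.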
+SampParams getSampParams(int swing){
+
+  std::map<int, SampParams> samp_knob_map;
+
+  SampParams params31(2, 0);
+  samp_knob_map[31] = params31;
+
+  SampParams params32(2, 1);
+  samp_knob_map[32] = params32;
+
+  SampParams params33(4, 0);
+  samp_knob_map[33] = params33;
+
+  SampParams params34(4, 1);
+  samp_knob_map[34] = params34;
+
+  return samp_knob_map[swing];
+  
+}
+
+
+
+
+
+
+/***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */
+
+void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
+			void* filter, float w_min, float w_max,
+			void* bias, float b_min, float b_max,
+			int conv_pad_h, int conv_pad_w,
+			int conv_stride_h, int conv_stride_w,
+			int pool_id, int pool_size,
+			int activation_id, // Relu, Tanh, ClipRelu
+			float out_min, float out_max, int swing){ 
+
+  if(ONLINE_PROFILING){
+    ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n");
+  }
+  
+
+  swing = getSwing(swing);  
+ 
+  if(isPromise(swing)){
+    
+    return PROMISE_Conv(input, i_min, i_max,
+			filter, w_min, w_max,
+			bias, b_min, b_max,
+			conv_pad_h, conv_pad_w,
+			conv_stride_h, conv_stride_w,
+			pool_id, pool_size,
+			activation_id, 
+			out_min, out_max, swing);
+  }
+
+  
+  
+  void* conv_out;
+  if(isPerforation(swing)){
+ 
+    PerfParams params = getPerfParams(swing);
+    DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n",
+	  params.row, params.col, params.skip_offset);
+    
+    conv_out = tensorConvPerfCuda(input, filter,
+				  conv_pad_h, conv_pad_w,
+				  conv_stride_h, conv_stride_w, 1, 1,
+				  params.row, params.col, params.skip_offset);
+
+  }
+
+  if(isSampling(swing)){
+ 
+    SampParams params = getSampParams(swing);
+    DEBUG("params.skip_rate = %d, params.skip_offset = %d \n",
+	  params.skip_rate, params.skip_offset);
+    
+    conv_out = tensorConvSampSim(input, filter,
+				 conv_pad_h, conv_pad_w,
+				 conv_stride_h, conv_stride_w, 1, 1,
+				 params.skip_rate, params.skip_offset);
+
+  }
+  
+
+  if (isHalfPrecision(swing)){
+
+    conv_out = tensorHalfConvolution(input, filter,
+				     conv_pad_h, conv_pad_w,
+				     conv_stride_h, conv_stride_w,
+				     1, 0);
+  }
+
+  if (isFullPrecision(swing)){
+    conv_out = tensorConvolution(input, filter,
+				 conv_pad_h, conv_pad_w,
+				 conv_stride_h, conv_stride_w,
+				 1, 0);
+  }
+
+  
+  void* conv_add;
+  if(bias != NULL){
+    if( !isFullPrecision(swing) ){  
+      conv_add = tensorHalfAdd(conv_out, bias);
+    }
+    else{
+      conv_add = tensorAdd(conv_out, bias);
+    }
+  }
+  else{
+    conv_add = conv_out;
+  }
+
+  void* pool_out;
+  if(pool_size > 0){
+    //FIXME: Currently only using MaxPooling
+    pool_out = tensorHalfPooling(conv_add, 0, pool_size, pool_size,
+				 0, 0, pool_size, pool_size);
+  }
+  else{
+    pool_out = conv_add;
+  }
+  
+  void* activation_out;  
+  switch(activation_id){
+  case -1:
+    activation_out = pool_out;
+    INFO("NO Activation Function \n");
+    break;
+  case 0:
+    activation_out = tensorHalfTanh(pool_out);
+    break;
+  case 1:
+    activation_out = tensorHalfRelu(pool_out);
+    break;
+  case 2:
+    activation_out = tensorHalfRelu2(pool_out, out_min, out_max);
+    break;
+  default:
+    ERROR("Activation id %d NOT supported \n", activation_out);
+    break;
+  }
+
+  
+  return activation_out;
+}
+
+
+void* FCLayer_PROMISE(void* input, float i_min, float i_max,
+		      void* weights, float w_min, float w_max,
+		      void* bias, float b_min, float b_max,
+		      int activation_id,
+		      float out_min, float out_max, int swing){ //NOTE: out_min, out_max apply only to 'ClippedRelu'
+
+
+  swing = getSwing(swing);
+  
+  if(isPromise(swing)){
+
+    return PROMISE_FC(input, i_min, i_max,
+		      weights, w_min, w_max,
+		      bias, b_min, b_max,
+		      activation_id,
+		      out_min, out_max, swing);
+  }
+
+
+  
+  void* gemm_out;
+  if(!isFullPrecision(swing)){
+    gemm_out = tensorHalfGemm(input, weights);
+  }
+  else{
+    gemm_out = tensorGemmGPU(input, weights);
+  }
+
+  
+  void* gemmbias_out;
+  if(bias != NULL){
+    // Use the FP16 add unless running at full precision (swing == 11)
+    if(!isFullPrecision(swing)){
+      gemmbias_out = tensorHalfAdd(gemm_out, bias);
+    }
+    else{
+      gemmbias_out = tensorAdd(gemm_out, bias);
+    }
+  }
+  else{
+    gemmbias_out = gemm_out;
+  }
+ 
+  void* activation_out;
+  switch(activation_id){
+
+  case -1:
+    activation_out = gemmbias_out;
+    INFO("No Activation Function \n");
+    break;
+  case 0:
+    activation_out = tensorHalfTanh(gemmbias_out);
+    break;
+  case 1:
+    activation_out = tensorHalfRelu(gemmbias_out);
+    break;
+  case 2:
+    activation_out = tensorHalfRelu2(gemmbias_out, out_min, out_max);
+    break;
+  default:
+    ERROR("Activation id %d NOT supported \n", activation_out);
+    break;
+  }
+  
+  
+  
+  return activation_out;
+}
+
+#endif
+
+
+
+#ifdef OLD_MODEL
+
+#endif
+
+#endif 
+
+
+
+/************* NOTE: Outdated PROMISE routines - Used for Comparison ****/
+
+  
+
+
+/*
+
+
+
+void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
+			void* filter, float w_min, float w_max,
+			void* bias, float b_min, float b_max,
+			int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
+			int pool_id, int pool_size,
+			int activation_id, // Relu, Tanh, ClipRelu
+			float out_min, float out_max, int swing){ 
+
+
+  DEBUG("\n\n**** NOTE: Conv OLD MODEL *** \n\n");
+  
+  #ifdef PROMISE_TUNER_ENABLED
+
+  // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
+  if(!approxhpvm_runtime_mode){
+  
+    if(op_counter >= total_ops){
+      ERROR("No accuracy flag found \n");
+    }
+  
+    swing = op_accuracies[op_counter];
+    op_counter++;
+  }
+  
+  #endif  
+
+  
+  if (swing < 0 || swing > 20){
+    ERROR("Incorrect swing value");
+  }
+
+  
+
+  if(swing < 8){
+    input = quantizeTensorPromise(input, i_min, i_max);
+    filter = quantizeTensorPromise(filter, w_min, w_max);
+    if(bias != NULL)
+      bias = quantizeTensorPromise(bias, b_min, b_max);
+    // aRead error
+    
+    input = addPromiseError(input, swing);
+  }
+
+  
+  void* conv_out;
+  if(swing == 8 || (swing >= 12 && swing <= 15) ){
+    //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w,
+    //		              conv_stride_h, conv_stride_w, 1, 1, 1, 0);
+
+    int rows = 2;
+    switch(swing){
+
+    case 12: rows = 5; break;
+    case 13: rows = 4; break;
+    case 14: rows = 3; break;
+    case 15: rows = 2; break;    
+		   
+    default: rows = 2; break;
+    }
+    
+    conv_out = tensorConvPerfSim(input, filter, conv_pad_h, conv_pad_w,
+				 conv_stride_h, conv_stride_w, 1, 1, rows, 0);
+
+    /*void* gold = tensorConvolution(input, filter,
+				   conv_pad_h, conv_pad_w,
+				   conv_stride_h, conv_stride_w,
+				   1, 0);
+
+    Norm_t* norms = calculateNormsTreeReduction((struct Tensor*) conv_out, (struct Tensor*) gold);
+
+    DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
+    */
+
+/*-------------
+  }
+  else if(swing == 9 || (swing >= 16 && swing <= 19) ){
+    //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w,
+    //		              conv_stride_h, conv_stride_w, 1, 1, 0, 1);
+
+
+    int cols = 2;
+    switch(swing){
+
+    case 16: cols = 5; break;
+    case 17: cols = 4; break;
+    case 18: cols = 3; break;
+    case 19: cols = 2; break;    
+		   
+    default: cols = 2; break;
+    }
+
+    
+    conv_out = tensorConvPerfSim(input, filter, conv_pad_h, conv_pad_w,
+				 conv_stride_h, conv_stride_w, 1, 1, 0, cols);
+
+
+    /*void* gold = tensorConvolution(input, filter,
+				   conv_pad_h, conv_pad_w,
+				   conv_stride_h, conv_stride_w,
+				   1, 0);
+
+    Norm_t* norms = calculateNormsTreeReduction((struct Tensor*)conv_out, (struct Tensor*) gold);
+
+    DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
+    */
+
+/*-----
+
+  }
+  else if(swing == 10){  
+    conv_out = tensorHalfConvolution(input, filter,
+				     conv_pad_h, conv_pad_w,
+				     conv_stride_h, conv_stride_w,
+				     1, 0);
+  }
+  else{
+    conv_out = tensorConvolution(input, filter,
+				 conv_pad_h, conv_pad_w,
+				 conv_stride_h, conv_stride_w,
+				 1, 0);
+  }
+  
+  void* conv_add;
+  if(bias != NULL){
+    if(swing >= 8){  
+      conv_add = tensorHalfAdd(conv_out, bias);
+    }
+    else{
+      conv_add = tensorAdd(conv_out, bias);
+    }
+  }
+  else{
+    conv_add = conv_out;
+  }
+
+  void* pool_out;
+  // NOTE: Skip pooling on negative pool sizes
+  if(pool_size > 0){
+    //FIXME: Currently only using MaxPooling
+    pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size);
+  }
+  else{
+    pool_out = conv_add;
+  }
+  
+  void* activation_out;  
+  switch(activation_id){
+  case -1:
+    activation_out = pool_out;
+    INFO("NO Activation Function \n");
+    break;
+  case 0:
+    activation_out = tensorTanh(pool_out);
+    break;
+  case 1:
+    activation_out = tensorRelu(pool_out);
+    break;
+  case 2:
+    activation_out = tensorHalfRelu2(pool_out, out_min, out_max);
+    break;
+  default:
+    ERROR("Activation id %d NOT supported \n", activation_out);
+    break;
+  }
+
+
+  if(swing < 8 && activation_id != -1){
+    activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
+  }
+  
+  return activation_out;
+}
+
+
+void* FCLayer_PROMISE(void* input, float i_min, float i_max,
+		      void* weights, float w_min, float w_max,
+		      void* bias, float b_min, float b_max,
+		      int activation_id,
+		      float out_min, float out_max, int swing){ 
+
+
+  
+  #ifdef PROMISE_TUNER_ENABLED
+
+  // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
+  if(!approxhpvm_runtime_mode){
+
+    if(op_counter >= total_ops){
+      ERROR("No accuracy flag found \n");
+    }
+  
+    swing = op_accuracies[op_counter];
+    op_counter++;
+  }
+  
+  #endif
+ 
+  
+  if (swing < 0 || swing > 20){
+    ERROR("Incorrect swing value");
+  }
+  
+  if(swing < 8){
+    input = quantizeTensorPromise(input, i_min, i_max);
+    weights = quantizeTensorPromise(weights, w_min, w_max);
+    if(bias != NULL)
+      bias = quantizeTensorPromise(bias, b_min, b_max);
+
+    // NOTE: Modelling aRead error in PROMISE
+    input = addPromiseError(input, swing);
+  }
+
+
+  
+  void* gemm_out;
+  if(swing >= 8 && swing < 11){
+    gemm_out = tensorHalfGemm(input, weights);
+  }
+  else{
+    gemm_out = tensorGemmGPU(input, weights);
+  }
+
+  
+  void* gemmbias_out;
+  if(bias != NULL){
+    // Swing 8 corresponds to FP32
+    if(swing >= 8 && swing < 20){
+      gemmbias_out = tensorHalfAdd(gemm_out, bias);
+    }
+    else{
+      gemmbias_out = tensorAdd(gemm_out, bias);
+    }
+  }
+  else{
+    gemmbias_out = gemm_out;
+  }
+ 
+  void* activation_out;
+  switch(activation_id){
+
+  case -1:
+    activation_out = gemmbias_out;
+    INFO("No Activation Function \n");
+    break;
+  case 0:
+    activation_out = tensorTanh(gemmbias_out);
+    break;
+  case 1:
+    activation_out = tensorRelu(gemmbias_out);
+    break;
+  case 2:
+    activation_out = tensorRelu2(gemmbias_out, out_min, out_max);
+    break;
+  default:
+    ERROR("Activation id %d NOT supported \n", activation_out);
+    break;
+  }
+  
+  
+  if(swing < 8 && activation_id != -1){
+    activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
+  }
+  
+  return activation_out;
+}
+
+#endif
+
+
+
+
+
+
+#endif
+
+
+
+
+
+
+
+  /*void* gold = tensorConvolution(input, filter,
+				   conv_pad_h, conv_pad_w,
+				   conv_stride_h, conv_stride_w,
+				   1, 0);
+
+    Norm_t* norms = calculateNormsTreeReduction((struct Tensor*) conv_out, (struct Tensor*) gold);
+
+    DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
+    */
+  
+
+    /*void* gold = tensorConvolution(input, filter,
+				   conv_pad_h, conv_pad_w,
+				   conv_stride_h, conv_stride_w,
+				   1, 0);
+
+    Norm_t* norms = calculateNormsTreeReduction((struct Tensor*)conv_out, (struct Tensor*) gold);
+
+    DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
+    */
+
+
+
+
+
+
+  /*#ifdef PROMISE_TUNER_ENABLED
+
+  // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
+  if(!approxhpvm_runtime_mode){
+
+    if(op_counter >= total_ops){
+      ERROR("No accuracy flag found \n");
+    }
+  
+    swing = op_accuracies[op_counter];
+    op_counter++;
+  }
+  
+  #endif
+
+  */
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
index af0ed1e202017dde2cb96e9f8798aff1219c0695..9689c6fce91d3a4093d91b5006ef1beee969f8eb 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
@@ -819,7 +819,7 @@ __global__ void depthwise_conv4_half3(__half* const __restrict__ y,
 
 				__half t1;
 
-				int total = C_dim * H_dim * W_dim;
+				//int total = C_dim * H_dim * W_dim;
 				t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim +
 					start_w + q - bstart_w];
 
@@ -920,7 +920,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 			int vertical_stride, int horizontal_stride,
 			int conv_mode, int conv_groups){
 
-  llvm_hpvm_initTensorRt(0);
 
   INFO("*** TensorConvolution \n");
   profileEvent("Conv");
@@ -935,7 +934,13 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 
   Tensor* output;
 
-	
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  convertToFP32(input);
+  convertToFP32(filter);
+
+  
   if (conv_groups > 32) {
     // TODO: Support other cases;  
     hostToDeviceCopy(input);
@@ -949,7 +954,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
     w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
 
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
@@ -957,33 +962,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     // NOTE: Necessary to insert the above call for every output tensor
 
 
-    /*
-      if (c > 255) {
-      dim3 grid((n / 16), c);
-      dim3 block(h * w);
-      depthwise_conv << <grid, block >> > ((float*)output->gpu_data,
-      (float*)input->gpu_data, (float*)filter->gpu_data,
-      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-
-      }*/
-
-    /*
-      dim3 grid((n / 12), c);
-      dim3 block(h * w);
-      depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data,
-      (float*)input->gpu_data, (float*)filter->gpu_data,
-      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-      if(n % 12 > 0){ 
-      dim3 grid2((n % 12), c);
-      dim3 block(h * w);
-      depthwise_conv <<<grid, block >>> ((float*)output->gpu_data,
-      (float*)input->gpu_data, (float*)filter->gpu_data,
-      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12));
-      }
-    */
 		
     int blockSize;
     blockSize = 64;
@@ -994,7 +972,8 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 					 (float*)input->gpu_data, (float*)filter->gpu_data,
 					 input->dims.dim_sizes[0], input->dims.dim_sizes[1],
 					 input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					 KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+					 KH, KW, h, w, vertical_pad, horizontal_pad,
+					 vertical_stride, horizontal_stride);
 
   }
   else {
@@ -1043,11 +1022,11 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
     if (input->data_format == CUDNN_TENSOR_NCHW)
-      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 				       CUDNN_TENSOR_NCHW, n, c, h, w);
     else if (input->data_format == CUDNN_TENSOR_NHWC) {
       DEBUG("* NHWC Format \n");
-      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				       CUDNN_TENSOR_NHWC, n, h, w, c);
     }
     else
@@ -1137,6 +1116,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 
 }
 
+// FIXME: Need to properly fix the new HALF type conversion
 void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
 			    int vertical_pad, int horizontal_pad,
 			    int vertical_stride, int horizontal_stride,
@@ -1165,6 +1145,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
+  convertToFP16(input);
+  convertToFP16(filter);
+  
 
   /***** CONVERSIONS from FP32 to FP16 - on the GPU */
   size_t* input_dims = input->dims.dim_sizes;
@@ -1209,7 +1192,7 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
     DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
     
 
-    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type,
 				      CUDNN_TENSOR_NCHW, n, c, h, w);
     // FIXIT: more checks for data types needed
     output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF,
@@ -1797,7 +1780,7 @@ void* tensorConvPerf(void* input_ptr, void* filter_ptr,
 
   Tensor* new_output;
   if(input->data_format == CUDNN_TENSOR_NCHW)
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 					  CUDNN_TENSOR_NCHW, n, c, h, w);
   else if(input->data_format == CUDNN_TENSOR_NHWC){
     DEBUG("* NHWC Format \n");
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
new file mode 100644
index 0000000000000000000000000000000000000000..a81ffe296233178126555bbb53babdcd4192a7bf
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -0,0 +1,377 @@
+
+#ifndef APPROX_TECHNIQUES2_HEADER
+#define APPROX_TECHNIQUES2_HEADER
+#include "tensor_utils.cu"
+
+
+
+//This skips every xth row
+//H_eff is the number of rows calculated exactly
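+//Example (hypothetical sizes): with x = 3 and start = 0, output rows
+//2, 5, 8, ... are skipped, so for H_out = 9 only H_eff = 9 - 9/3 = 6 rows
+//are computed here; approxInterpolateRow reconstructs the skipped rows.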
+__global__
+void convToGemmPerfRow(float * const __restrict__ output,
+		       const float * const __restrict__ input, const int N, const int C,
+		       const int H, const int W, const int KH, const int KW, const int V_pad,
+		       const int H_pad, const int H_out, const int W_out, const int V_stride,
+		       const int H_stride, const int x, const int start, const int H_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_eff * W_out); //output image number
+  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
+  const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  int past_start = (h % (x - 1) >= (x - 1 - start));
+  const int inH = (h / (x - 1) * x + h % (x-1) +
+		   past_start) * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] = 0;
+
+      }
+    }
+  }
+
+}
+
+
+//For use in tensorConvPerfCuda
+//Interpolates every xth row starting from x - 1 - start
+//N is total number of elements in final output array
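+//Sketch of the scheme (assuming x = 3, start = 0): skipped rows 2, 5, 8, ...
+//are rebuilt as the mean of the computed rows directly above and below them;
+//the first and last rows are copied from the nearest computed row.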
+__global__
+void approxInterpolateRow(int N, int old_h, int n, int c, int h, int w,
+			  float *old_data, float *new_data, int x, int start){
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < N; i += stride){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n = i / (c * h * w);
+    int past_start = ((row % x) >= (x - 1 - start));
+
+    if(row == h-1)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
+    else if (row == 0)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    else if(row % x == x - 1 - start){
+      int past_startO = ((row - 1) % x) > (x - 1 - start);
+      int oldIdx1 = n * (c * old_h * w) + ch * (old_h * w) +
+	((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col;
+
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	(old_data[oldIdx1] + old_data[oldIdx1 + 1 * w]) / 2;
+    }
+    else
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[n * (c * old_h * w) + ch * (old_h * w) +
+		 ((x-1) * (row / x) + row % x - past_start )  * (w) + col];
+
+
+  }
+
+}
+
+
+//This skips every xth col
+//W_eff is the number of cols calculated exactly
+__global__
+void convToGemmPerfCol(float * const __restrict__ output,
+		       const float * const __restrict__ input, const int N, const int C,
+		       const int H, const int W, const int KH, const int KW, const int V_pad,
+		       const int H_pad, const int H_out, const int W_out, const int V_stride,
+		       const int H_stride, const int x, const int start, const int W_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_eff); //output image number
+  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
+  const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
+  const int w = tx % W_eff; //output width index (col number)
+  int past_start = (w % (x - 1)) >= (x - 1 - start);
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = (w / (x - 1) * x + w % (x-1) +
+		   past_start) * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
+
+      }
+    }
+  }
+
+}
+
+
+//For use in tensorConvPerfCuda
+//Interpolates every xth col starting from x - 1 - start
+//N is total number of elements in final output array
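+//Column analogue of approxInterpolateRow: each skipped column is rebuilt as
+//the mean of the computed columns to its left and right; the first and last
+//columns are copied from the nearest computed column.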
+__global__
+void approxInterpolateCol(int N, int old_w, int n, int c, int h, int w,
+			  float *old_data, float *new_data, int x, int start){
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < N; i += stride){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n = i / (c * h * w);
+    int past_start = ((col % x) >= (x - 1 - start));
+
+    if(col == w-1)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+    else if (col == 0)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    else if(col % x == x - 1 - start){
+      int past_startO = ((col - 1) % x) > (x - 1 - start);
+      int oldIdx1 = n * (c * h * old_w) + ch * (h * old_w) + row * old_w +
+	((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
+
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	(old_data[oldIdx1] + old_data[oldIdx1 + 1]) / 2;
+    }
+    else
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w +
+		 ((x-1) * (col / x) + col % x - past_start)];
+
+  }
+
+}
+
+
+
+//start has to be less than row (when perforating rows) or less than col (when perforating cols)
+//row and col have to be >= 1 (h / row and w / col are evaluated unconditionally)
+//row = col = 1 means no perforation
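+//
+//Usage sketch (hypothetical tensors; row = 3 computes two of every three
+//output rows exactly and interpolates the rest):
+//  void* out = tensorConvPerfCuda(input, filter,
+//                                 1, 1, 1, 1,  // pads and strides
+//                                 1, 1,        // conv_mode, conv_groups
+//                                 3, 1, 0);    // row, col, start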
+void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
+			 int vertical_pad, int horizontal_pad, int vertical_stride,
+			 int horizontal_stride, int conv_mode, int conv_groups,
+			 int row, int col, int start){
+
+  INFO("*** TensorConvolution (output perforation) \n");
+  profileEvent("Conv");
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+  
+  Tensor* output;
+  // TODO: Support other cases;
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  
+  convertToFP32(input);
+  convertToFP32(filter);
+  
+  
+  int n, c, h, w; // output dimensions
+  n = input->dims.dim_sizes[0];
+  c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+
+  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  int h_eff = h - h / row;
+  if(h % row > row - 1 - start)
+    h_eff = h_eff - 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  int w_eff = w - w / col;
+  if(w % col > col - 1 - start)
+    w_eff = w_eff - 1;
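+  //e.g. (hypothetical sizes): h = 10, row = 3, start = 0 gives
+  //h_eff = 10 - 10/3 = 7; 10 % 3 = 1 is not > row - 1 - start = 2, so h_eff stays 7.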
+
+
+  Tensor *new_output;
+  if(row > 1){
+    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elements (input channels x KH x KW) per output channel
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    float * convData;
+    size_t convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					       input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+					       input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w,
+					       vertical_stride, horizontal_stride, row, start, h_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      h_eff * w, c, num_filter_elem,
+					      &alpha,
+					      convData, h_eff * w, num_filter_elem * h_eff * w,
+					      (float *)filter->gpu_data, num_filter_elem, 0,
+					      &beta,
+					      (float *)output->gpu_data, h_eff * w, c * h_eff * w,
+					      n));
+
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 127) / 128;
+    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
+					    (float *) output->gpu_data, (float *) new_output->gpu_data,
+					    row, start);
+    cudaDeviceSynchronize();
+
+    cudaFree(output);
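+    //FIXME: 'output' is a host-side Tensor* struct, so this cudaFree likely
+    //fails silently; the gpu_data buffer it owns is tracked in tensors_ptr
+    //and reclaimed by freeBatchMemory (the column path below does the same).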
+    cudaFree(convData);
+  }
+  else if(col > 1){
+    
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elements (input channels x KH x KW) per output channel
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    float * convData;
+    size_t convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					       input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+					       input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w,
+					       vertical_stride, horizontal_stride, col, start, w_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      h * w_eff, c, num_filter_elem,
+					      &alpha,
+					      convData, h * w_eff, num_filter_elem * h * w_eff,
+					      (float *)filter->gpu_data, num_filter_elem, 0,
+					      &beta,
+					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
+					      n));
+
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 127) / 128;
+    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
+					    (float *)output->gpu_data, (float *)new_output->gpu_data,
+					    col, start);
+    cudaDeviceSynchronize();
+
+    cudaFree(output);
+    cudaFree(convData);
+  }
+  else{
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
+				     CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elements (input channels x KH x KW) per output channel
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    float * convData;
+    size_t convDataSize = sizeof(float) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 128;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApprox<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
+					      input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+					      input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w,
+					      vertical_stride, horizontal_stride, num_filter_elem, c * h * w);
+    checkCudaErrors(cudaDeviceSynchronize());
+    //Matrix multiplication: multiply convData (the im2col buffer) by the filter matrix in filter->gpu_data, once per image in the batch
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      h * w, c, num_filter_elem,
+					      &alpha,
+					      convData, h * w, num_filter_elem * h * w,
+					      (float *)filter->gpu_data, num_filter_elem, 0,
+					      &beta,
+					      (float *)output->gpu_data, h * w, c * h * w,
+					      n));
+
+    new_output = output;
+    cudaFree(convData);
+  }
+
+
+  profileEvent("Conv_end", true);
+ 
+  
+  return new_output;
+}
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index b482cef5377e0f879b43f06a7ebbfbe01b39be09..14dc8f20f2111e85e82630cdbcc0c695a39c5ecd 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -72,7 +72,9 @@ public:
     FP32,
     FP16,
     PERFORATION,
-//    INPUT_SAMPLING,
+    INPUT_SAMPLING,
+    REDUCTION_SAMPLING,
+//  ADDITIONAL_APPROXIMATION_METHOD
     APPROX_END
   };
 
@@ -91,6 +93,15 @@ public:
     POOL_MEAN,
     POOL_MIN,
     SOFTMAX,
+    FFT,
+    REDUCE,
+    PROJECTIVE_T,
+    MAP1,
+    MAP2,
+    MAP3,
+//    STENCIL,
+//    COSINE_T,
+//  ADDITIONAL_TENSOR_OPERATION
     TENSOR_OP_END
   };
 
@@ -269,6 +280,24 @@ void GPUNodeConfiguration::print() {
       case TENSOR_OP::SOFTMAX :
         DEBUG("softmax");
         break;
+      case TENSOR_OP::FFT :
+        DEBUG("fft");
+        break;
+      case TENSOR_OP::REDUCE :
+        DEBUG("reduce");
+        break;
+      case TENSOR_OP::PROJECTIVE_T :
+        DEBUG("projectiveT");
+        break;
+      case TENSOR_OP::MAP1 :
+        DEBUG("map1");
+        break;
+      case TENSOR_OP::MAP2 :
+        DEBUG("map2");
+        break;
+      case TENSOR_OP::MAP3 :
+        DEBUG("map3");
+        break;
       default :
         ERROR("Unknown tensor operation.");
         break;
@@ -288,6 +317,12 @@ void GPUNodeConfiguration::print() {
         case APPROX::PERFORATION :
           DEBUG("perf");
           break;
+        case APPROX::INPUT_SAMPLING :
+          DEBUG("input_samp");
+          break;
+        case APPROX::REDUCTION_SAMPLING :
+          DEBUG("red_samp");
+          break;
         default:
           ERROR("Unknown approximation option");
           break;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
index 9fd4a578318afe3d9f85097474396a351900354b..e2e78f1d10c048d73755df73d553b3932ab72d24 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h
@@ -422,7 +422,7 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){
   norms->mean_l2 = mean_l2;
   norms->orig_inf_norm = 0.0;
 
-  // Relative metrics (relative to distribution) - suitable for PROMISE
+  // Relative metrics (relative to distribution) 
   norms->l1_norm = relative_l1;
   norms->l2_norm = relative_l2;
   norms->inf_norm = 0.0;  
@@ -797,11 +797,11 @@ void* addGaussianError(void* x_ptr, int error_scale){
   Tensor* x = (Tensor*) x_ptr;
   
   size_t* dim_sizes = x->dims.dim_sizes;
-  Tensor* bias = (Tensor*) create4DTensor(x->data_type, x->data_format,
+  Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format,
 					  dim_sizes[0], dim_sizes[1],
 					  dim_sizes[2], dim_sizes[3]);
   
-  Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format,
+  Tensor* x_original = (Tensor*) create4DTensor(x->cur_type, x->data_format,
 					        dim_sizes[0], dim_sizes[1],
 						dim_sizes[2], dim_sizes[3]);
 
@@ -876,6 +876,7 @@ void initPromiseRandValues(Tensor* bias, int error_scale){
 }
 
 
+// NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16
 // Routine for Adding PROMISE bitline swing error
 void* addPromiseError(void* x_ptr, int error_scale){
 
@@ -889,7 +890,7 @@ void* addPromiseError(void* x_ptr, int error_scale){
   Tensor* x = (Tensor*) x_ptr;
   
   size_t* dim_sizes = x->dims.dim_sizes;
-  Tensor* bias = (Tensor*) create4DTensor(x->data_type, x->data_format,
+  Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format,
 					  dim_sizes[0], dim_sizes[1],
 					  dim_sizes[2], dim_sizes[3]);
  
@@ -955,6 +956,9 @@ void* quantizeTensorPromise(void* input_ptr, float min, float max){
 
   INFO("QuantizeTensorPROMISE \n");
   Tensor* input = (Tensor*) input_ptr;
+
   
   int quantize_range = 256;
   float input_range = max - min;
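+  // e.g. (hypothetical range): min = -2.0, max = 2.0 gives input_range = 4.0;
+  // quantizeAndClip below presumably snaps values onto 256 levels in [min, max].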
@@ -967,8 +969,10 @@ void* quantizeTensorPromise(void* input_ptr, float min, float max){
 
   hostToDeviceCopy(input);
 
-  quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data, input->num_elems, mul_factor, min, max);
+  quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data,
+					   input->num_elems, mul_factor, min, max);
 
+  
   return input;
 }
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h
index 252427c65379aa977237652eb4435e685dbc3403..4c2fbe806d1758118f6d55c079f9c75de42599d8 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h
@@ -31,6 +31,12 @@
 // 
 // It is recommended to use the more robust versions in production code.
 
+
+#ifndef FP16_CONV_HEADER
+#define FP16_CONV_HEADER
+
+
+
 typedef unsigned uint;
 
 union FP32
@@ -111,4 +117,8 @@ static float half_to_float(half hf)
 
     o.u |= (h.u & 0x8000) << 16;    // sign bit
     return o.f;
-}
\ No newline at end of file
+}
+
+
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
index cc1deccd058b02d1d2db6ef58c9be4ca48589231..230cb31f4de4740428737e52ad2834908566a07b 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
@@ -48,6 +48,8 @@ std::vector<void*> tensors_ptr;
 std::vector<void*> host_ptr;
 std::vector<void*> obj_ptr;
 
+std::unordered_map<void*, int> tracked_tensors;
+
 // Autotuning data
 std::unordered_map<int, int> skip_tensors;
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h
index f13e82f3aecf08c757341dda35d86a81b542180d..94e1a635b5a6baec9fec6c91509caee5cf287e01 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h
@@ -1,4 +1,9 @@
 
+
+#ifndef HALF_API_HEADER
+#define HALF_API_HEADER
+
+
 #include <stdio.h>
 #include <stdarg.h>
 #include <cstdio>
@@ -578,7 +583,6 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){
   hostToDeviceCopy(x);
   hostToDeviceCopy(bias);
 
-  size_t* x_dims = x->dims.dim_sizes;
 
   //**** Data conversion from float to half
   profileEvent("F2H_start");
@@ -611,3 +615,4 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){
 
 
 
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
index 911f42b955a72cb756aadc1fc78231187ef3394e..21c6df7f1749e891dba257bbb1933c3beefb8c4f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
@@ -735,6 +735,30 @@ void RuntimeController::readConfigurationFile(const char *str) {
 	  DEBUG ("Found softmax operation\n");
 	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::SOFTMAX);
 	  idx++;
+	} else if (tokens[idx] == "fft") {
+	  DEBUG ("Found fft operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::FFT);
+	  idx++;
+	} else if (tokens[idx] == "reduce") {
+	  DEBUG ("Found reduce operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::REDUCE);
+	  idx++;
+	} else if (tokens[idx] == "projectiveT") {
+	  DEBUG ("Found projectiveT operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T);
+	  idx++;
+	} else if (tokens[idx] == "map1") {
+	  DEBUG ("Found map1 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP1);
+	  idx++;
+	} else if (tokens[idx] == "map2") {
+	  DEBUG ("Found map2 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP2);
+	  idx++;
+	} else if (tokens[idx] == "map3") {
+	  DEBUG ("Found map3 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP3);
+	  idx++;
 	} else /*Not a new operation. This means an approximation option*/
 	  if (tokens[idx] == "fp32") {
 	    DEBUG("Found fp32 option\n");
@@ -756,6 +780,20 @@ void RuntimeController::readConfigurationFile(const char *str) {
 	    DEBUG("perf parameter: %d\n", perf);
         NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::PERFORATION, perf);
           idx += 2;
+        } else if (tokens[idx] == "input_samp") {
+	    DEBUG("Found input_samp option\n");
+        int input_samp = std::stoi(tokens[idx+1]);
+	    DEBUG("input_samp parameter: %d\n", input_samp);
+        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::INPUT_SAMPLING, input_samp);
+          idx += 2;
+        } else if (tokens[idx] == "red_samp") {
+	    DEBUG("Found red_samp option\n");
+        int red_samp = std::stoi(tokens[idx+1]);
+	    DEBUG("red_samp parameter: %d\n", red_samp);
+        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp);
+          idx += 2;
         }
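+	// Example knob tokens (hypothetical): "conv red_samp 2" would apply
+	// REDUCTION_SAMPLING with knob value 2 to the preceding conv operation.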
 	// TODO: other approximation options handled here
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu
index 9d4b7e0d03b766db8b4a0e0d5c1273bdd4ee74d8..282a0cbb68de4f033b46cdc5c4a8ad69aa1f20c0 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu
@@ -84,6 +84,8 @@ void freeTensor(void* tensor_ptr){
 
   
   cudaFree(tensor->gpu_data);
+  cudaFree(tensor->gpu_half_data);
+  tensor->gpu_half_data = NULL;
+
   tensor->gpu_data = NULL;
   free(tensor->host_data);
   tensor->host_data = NULL;
@@ -137,6 +139,13 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
   }
   
   tensors_ptr.push_back(tensor->gpu_data);
+  tensors_ptr.push_back(tensor->gpu_half_data);
+
+  tracked_tensors[tensor] = 1; // For FP16-FP32 data handling
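+  // convertToFP16/convertToFP32 consult tracked_tensors before registering any
+  // lazily allocated FP16/FP32 buffers with tensors_ptr, so freeBatchMemory
+  // can reclaim every representation of a batch-local tensor.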
+  
   host_ptr.push_back(tensor->host_data);
   obj_ptr.push_back(tensor);
   //host_ptr.push_back(tensor->host_data); 
@@ -323,12 +329,18 @@ extern "C"{
   void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){
 
     Tensor* tensor = (Tensor*) tensor_ptr;
-  
-    if(tensor->size_in_bytes != size_in_bytes){
+
+    size_t host_size_in_bytes = tensor->num_elems * 4;
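+    // Host-side data is always kept in FP32, hence 4 bytes per element.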
+    if(host_size_in_bytes != size_in_bytes){
       ERROR("The destination and source sizes don't match");
     }
   
     std::memcpy(tensor->host_data, data_ptr, size_in_bytes);
+
+    changeTensorPlacement(tensor, HOST);
+
+    tensor->cur_type = float_type;
   }
 
 		      
@@ -422,10 +434,13 @@ extern "C"{
 
 
 
-bool ONLINE_PROFILING = false;
+bool ONLINE_PROFILING = false; // set to true to enable online profiling
 
 
 void convertToFP16(struct Tensor* tensor){
+
+  if(tensor == NULL)
+    return;
   
   printf("**** cur_type = %d , half_type = %d \n", tensor->cur_type, half_type);
 
@@ -443,7 +458,10 @@ void convertToFP16(struct Tensor* tensor){
   if(tensor->gpu_half_data == NULL)
      checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU
   
-  tensors_ptr.push_back(tensor->gpu_half_data);
+
+  // If Tensor is one of Tracked (has to free per batch) then track all data types
+  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+    tensors_ptr.push_back(tensor->gpu_half_data);
   
   f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data);
 
@@ -454,6 +472,9 @@ void convertToFP16(struct Tensor* tensor){
 
 void convertToFP32(struct Tensor* tensor){
 
+  if(tensor == NULL)
+    return;
+  
   // Need this check for both offline and online profiling path
   if (tensor->cur_type == float_type)
     return;
@@ -468,7 +489,12 @@ void convertToFP32(struct Tensor* tensor){
     checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
     DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes);
   }
-  
+
+
+  // If Tensor is one of Tracked (has to free per batch) then track all data types
+  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+    tensors_ptr.push_back(tensor->gpu_data);
+
   h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);
 
   tensor->cur_type = float_type;
@@ -479,6 +505,9 @@ void convertToFP32(struct Tensor* tensor){
 
 void convertToFP32_offline(struct Tensor* tensor){
 
+  if(tensor == NULL)
+    return;
+  
   if(ONLINE_PROFILING){
     return;
   }
@@ -493,7 +522,11 @@ void convertToFP32_offline(struct Tensor* tensor){
     checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
     DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes);
   }
-  
+
+  // If Tensor is one of Tracked (has to free per batch) then track all data types
+  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+    tensors_ptr.push_back(tensor->gpu_data);
+
   h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);
 
   tensor->cur_type = float_type;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index eb7433afc59a862fbd6e7e0d7d153eb8080f459b..9e58f36a402844c33a1cb665ae4113e6e6a8534f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -134,6 +134,8 @@ void startMemTracking(){
   tensors_ptr.clear();
   host_ptr.clear();
   obj_ptr.clear();
+
+  tracked_tensors.clear();
 }
 
 
@@ -1287,8 +1289,7 @@ void* FCLayer_GPU(void* input,
 
 /*********** PROMISE API **************/
 
-
-
+/*
 void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
 			void* filter, float w_min, float w_max,
 			void* bias, float b_min, float b_max,
@@ -1359,6 +1360,10 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
 
     DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
     */
+
+
+
+  /* -----
   }
   else if(swing == 9 || (swing >= 16 && swing <= 19) ){
     //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w,
@@ -1390,7 +1395,8 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
 
     DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
     */
-    
+
+  /*------
   }
   else if(swing == 10){  
     conv_out = tensorHalfConvolution(input, filter,
@@ -1549,7 +1555,7 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
   return activation_out;
 }
 
-
+*****/
 
 
 
diff --git a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py b/llvm/projects/soc_simulator/src/table_generator.py
similarity index 100%
rename from llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py
rename to llvm/projects/soc_simulator/src/table_generator.py
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile
index 4e762ea9894405bb375f518b65c209b4129d9f70..83b4dc9431ee84051def8a0f6850e7f2c194f033 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile
@@ -1,5 +1,6 @@
 DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks
 # NOTE: can configure build directory
+#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/
 HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT)
 
 CC = $(HPVM_BUILD_DIR)/bin/clang++
@@ -15,9 +16,10 @@ APP = alexnet
 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
 TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
 TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a
+PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a
+SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a
 TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a
 
-
 CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH)  -fno-exceptions -ffast-math -std=c++11 -O3
 CCFLAGS += -DDEVICE=CUDNN_TARGET
 LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lOpenCL
@@ -58,15 +60,17 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_promise.ll -S -o  $(BUILD_DIR)/$(APP)_promise.visc.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_loop.ll -S -o  $(BUILD_DIR)/$(APP)_loop.visc.ll
 	$(OPT) $(VISC_OPTFLAGS)  $(BUILD_DIR)/$(APP).visc.ll  -o  $(BUILD_DIR)/$(APP)_cudnn.bc
-	$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_promise.bc
-	$(OPT) $(VISC_OPTFLAGS)  $(BUILD_DIR)/$(APP)_loop.visc.ll  -o  $(BUILD_DIR)/$(APP)_loop.bc
+	#$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_promise.bc
+	$(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_wrapperapi.bc
+	$(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll  -o  $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc
-	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
-	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_linked.bc
-	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_loop_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_linked $(LINKER_FLAGS)
-	#$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS)
+	#$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
+	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc
+	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc
+	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
+	#$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS)
 
 $(BUILD_DIR):
 	mkdir -p $@
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c3bc2335227cf06169b1f3d105314fdc9647d97d
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/tuner_confs_base.txt
@@ -0,0 +1,20 @@
++++++
+conf1 1 0 79.9 0
+1 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 
+2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 
+3 gpu conv fp32 1 add fp32 1 tanh fp32 1 
+4 gpu conv fp32 1 add fp32 1 tanh fp32 1 
+5 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 
+6 gpu mul fp32 1 add fp32 1 
+7 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 79.9 0
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu conv fp16 1 add fp16 1 tanh fp16 1 
+4 gpu conv fp16 1 add fp16 1 tanh fp16 1 
+5 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+6 gpu mul fp16 1 add fp16 1 
+7 gpu softmax fp32 1
+-----
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
index ee07bdd8f9901f1582d5f7642a2a86c099397a14..d92bc0c45d1115620d529aea4636ece8d3d62127 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
@@ -9,8 +9,10 @@
 #include <tensorTypes.h> 
 #include <tensorUtils.h> 
 
+
+
 void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 5, 5, 1, 1); 
@@ -18,7 +20,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -26,7 +28,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_2_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -34,7 +36,7 @@ void var_2_node(void* t1, size_t bytes_t1) {
 }
 
 void var_3_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); 
@@ -42,7 +44,7 @@ void var_3_node(void* t1, size_t bytes_t1) {
 }
 
 void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); 
@@ -50,7 +52,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -58,7 +60,7 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_6_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -66,7 +68,7 @@ void var_6_node(void* t1, size_t bytes_t1) {
 }
 
 void var_7_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); 
@@ -74,7 +76,7 @@ void var_7_node(void* t1, size_t bytes_t1) {
 }
 
 void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
@@ -82,7 +84,7 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -90,7 +92,7 @@ void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_10_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -98,7 +100,7 @@ void var_10_node(void* t1, size_t bytes_t1) {
 }
 
 void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
@@ -106,7 +108,7 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -114,7 +116,7 @@ void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_13_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -122,7 +124,7 @@ void var_13_node(void* t1, size_t bytes_t1) {
 }
 
 void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
@@ -130,7 +132,7 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -138,7 +140,7 @@ void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_16_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -146,7 +148,7 @@ void var_16_node(void* t1, size_t bytes_t1) {
 }
 
 void var_17_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); 
@@ -154,7 +156,7 @@ void var_17_node(void* t1, size_t bytes_t1) {
 }
 
 void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_mul(t1, t2); 
@@ -162,7 +164,7 @@ void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -177,6 +179,8 @@ void var_20_node(void* t1, size_t bytes_t1) {
   __visc__return(2, r, (size_t) 0); 
 }
 
+
+
 void root(void* input, size_t input_bytes, 
 	  void* conv2d_1_w, size_t conv2d_1_w_bytes, 
 	  void* conv2d_1_b, size_t conv2d_1_b_bytes, 
@@ -371,9 +375,12 @@ int main(){
 
   std::string dir_prefix = std::string("../../../../../../projects/hpvm-tensor-rt/model_params/alexnet_cifar10_test/");
 
-
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  //void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32); 
   std::string labels_path =  dir_prefix + std::string("labels32.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
+  uint8_t* labels = readLabels(labels_path.c_str(),5000); 
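+  // FIXME: 'labels' appears unused (llvm_hpvm_invokeRtControl below reads
+  // labels_path itself), and 5000 does not match test_input_size = 10000.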
   std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
   void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
   std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
@@ -404,6 +409,8 @@ int main(){
   __visc__init(); 
   RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); 
 
+  //args->input = input; 
+  //args->input_bytes = 0; 
   args->conv2d_1_w = conv2d_1_w; 
   args->conv2d_1_w_bytes = 0; 
   args->conv2d_1_b = conv2d_1_b; 
@@ -429,48 +436,38 @@ int main(){
   args->dense_1_b = dense_1_b; 
   args->dense_1_b_bytes = 0; 
 
-  int batch_size = 500;
-  int test_input_size = 10000;  
-  int batch_count = test_input_size / batch_size;
-
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  int batch_size = 500; 
+  int test_input_size = 10000;
+  int batch_count = test_input_size / batch_size; 
+  
   void* input = create4DTensor(0,nchw,batch_size,3,32,32);
 
-  
   startMemTracking();
-  for (int i = 0; i < batch_count; i++){
+  startProfiling();
 
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
+  for (int i = 0; i < batch_count; i++){
+  
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
     copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-
-    args->input = input; 
+  
+    args->input = input;
     args->input_bytes = 0; 
-
-    //void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32);
-
-    void* dfg = __visc__launch(0, root, (void*) args); 
+  
+    void* dfg = __visc__launch(0, root, (void*) args);
 
     __visc__wait(dfg); 
+  
+    void *result = static_cast<RootIn*>(args)->input;
+    hpvm_request_tensor(result, 0);
+  
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
 
-    void *result = static_cast<RootIn*>(args)->input; 
-    hpvm_request_tensor(result, 0); 
-
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    computeAccuracy3(labels, result);
-
-    llvm_hpvm_invokeRtControl2(result, labels);
-      
     freeBatchMemory();
   }
-
-
-  __visc__cleanup();
-
-
+  stopProfiling();
+  __visc__cleanup();  
+  
   return 0; 
-
-} 
+}
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36b4d8bcd26563a1f398df34800ad2b70f24a670
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/tuner_confs_base.txt
@@ -0,0 +1,16 @@
++++++
+conf1 1 0 98.9 0
+1 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 
+2 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 
+3 gpu mul fp32 1 add fp32 1 tanh fp32 1 
+4 gpu mul fp32 1 add fp32 1 tanh fp32 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 98.9 0
+1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
+2 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/quant_ranges_rt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..75211f858c1cc9eb6a186dc7f90c143ea820ef67
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/quant_ranges_rt.txt
@@ -0,0 +1,15 @@
+1 -1.9892114 2.126797 -2.19630692005 1.34758170414  0.0  0.0  -60.892750473 51.9925691605 
+2 0.0 5.71354155397 -0.931772116065 1.07742589378   0.0  0.0 -6.51858950329 6.81084251881 
+3 0.0 4.93213940287 -0.531654466152 0.57537904036   0.0  0.0  -4.48263123512 3.96730119753 
+4 0.0 4.10326339769 -0.362340988219 0.407691390038   0.0  0.0  -4.04261828327 3.8867793293 
+5 0.0 5.38322130251 -0.313120054901 0.293576799393   0.0  0.0  -5.92146921539 4.33867932415 
+6 0.0 4.31673815441 -0.232992478013 0.258029025793   0.0  0.0  -4.20778994751 3.93243697071 
+7 0.0 5.8304081068 -0.202337772191 0.189983081758   0.0  0.0  -6.29828691578 4.84813511753 
+8 0.0 4.44641780996 -0.174427356511 0.176958308667  0.0  0.0   -4.34791088581 3.61443646955 
+9 0.0 4.5180956049 -0.145467961878 0.15256431669   0.0  0.0   -3.02877027559 2.94873657799 
+10 0.0 6.34857563496 -0.130258745223 0.135582433432   0.0  0.0  -4.22931008053 3.53150463724 
+11 0.0 5.22100311041 -0.119001727596 0.125363747835   0.0  0.0  -4.03820378017 4.00400940704 
+12 0.0 5.73249834776 -0.108397216856 0.116256686077    0.0  0.0  -3.31110151148 4.46293323326 
+13 0.0 7.24049821186 -0.0862374496162 0.0885944995135   0.0  0.0  -4.17543139458 6.2043294754 
+14 0.0 7.81395883465 -0.0681302513927 0.0700202777982    0.0  0.0  -10.9205664234 2.64429125786 
+15 0.0 2.86920666504 -0.223010196954 0.14426593782 -0.1654396 0.23336112 -12.2459499588 23.8053251343
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b5c1727ad0dc9e24310e4c86e116894051c84b3
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/tuner_confs_base.txt
@@ -0,0 +1,174 @@
++++++
+conf1 1 0 84.8 0
+1 gpu conv fp32 1 
+2 gpu batchnorm fp32 1 
+3 gpu relu fp32 1 
+4 gpu group_conv fp32 1 
+5 gpu batchnorm fp32 1 
+6 gpu relu fp32 1 
+7 gpu conv fp32 1 
+8 gpu batchnorm fp32 1 
+9 gpu relu fp32 1 
+10 gpu group_conv fp32 1 
+11 gpu batchnorm fp32 1 
+12 gpu relu fp32 1 
+13 gpu conv fp32 1 
+14 gpu batchnorm fp32 1 
+15 gpu relu fp32 1 
+16 gpu group_conv fp32 1 
+17 gpu batchnorm fp32 1 
+18 gpu relu fp32 1 
+19 gpu conv fp32 1 
+20 gpu batchnorm fp32 1 
+21 gpu relu fp32 1 
+22 gpu group_conv fp32 1 
+23 gpu batchnorm fp32 1 
+24 gpu relu fp32 1 
+25 gpu conv fp32 1 
+26 gpu batchnorm fp32 1 
+27 gpu relu fp32 1 
+28 gpu group_conv fp32 1 
+29 gpu batchnorm fp32 1 
+30 gpu relu fp32 1 
+31 gpu conv fp32 1 
+32 gpu batchnorm fp32 1 
+33 gpu relu fp32 1 
+34 gpu group_conv fp32 1 
+35 gpu batchnorm fp32 1 
+36 gpu relu fp32 1 
+37 gpu conv fp32 1 
+38 gpu batchnorm fp32 1 
+39 gpu relu fp32 1 
+40 gpu group_conv fp32 1 
+41 gpu batchnorm fp32 1 
+42 gpu relu fp32 1 
+43 gpu conv fp32 1 
+44 gpu batchnorm fp32 1 
+45 gpu relu fp32 1 
+46 gpu group_conv fp32 1 
+47 gpu batchnorm fp32 1 
+48 gpu relu fp32 1 
+49 gpu conv fp32 1 
+50 gpu batchnorm fp32 1 
+51 gpu relu fp32 1 
+52 gpu group_conv fp32 1 
+53 gpu batchnorm fp32 1 
+54 gpu relu fp32 1 
+55 gpu conv fp32 1 
+56 gpu batchnorm fp32 1 
+57 gpu relu fp32 1 
+58 gpu group_conv fp32 1 
+59 gpu batchnorm fp32 1 
+60 gpu relu fp32 1 
+61 gpu conv fp32 1 
+62 gpu batchnorm fp32 1 
+63 gpu relu fp32 1 
+64 gpu group_conv fp32 1 
+65 gpu batchnorm fp32 1 
+66 gpu relu fp32 1 
+67 gpu conv fp32 1 
+68 gpu batchnorm fp32 1 
+69 gpu relu fp32 1 
+70 gpu group_conv fp32 1 
+71 gpu batchnorm fp32 1 
+72 gpu relu fp32 1 
+73 gpu conv fp32 1 
+74 gpu batchnorm fp32 1 
+75 gpu relu fp32 1 
+76 gpu group_conv fp32 1 
+77 gpu batchnorm fp32 1 
+78 gpu relu fp32 1 
+79 gpu conv fp32 1 
+80 gpu batchnorm fp32 1 
+81 gpu relu fp32 1 
+82 gpu pool_mean fp32 1 
+83 gpu mul fp32 1 add fp32 1 
+84 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 84.8 0
+1 gpu conv fp16 1 
+2 gpu batchnorm fp16 1 
+3 gpu relu fp16 1 
+4 gpu group_conv fp16 1 
+5 gpu batchnorm fp16 1 
+6 gpu relu fp16 1 
+7 gpu conv fp16 1 
+8 gpu batchnorm fp16 1 
+9 gpu relu fp16 1 
+10 gpu group_conv fp16 1 
+11 gpu batchnorm fp16 1 
+12 gpu relu fp16 1 
+13 gpu conv fp16 1 
+14 gpu batchnorm fp16 1 
+15 gpu relu fp16 1 
+16 gpu group_conv fp16 1 
+17 gpu batchnorm fp16 1 
+18 gpu relu fp16 1 
+19 gpu conv fp16 1 
+20 gpu batchnorm fp16 1 
+21 gpu relu fp16 1 
+22 gpu group_conv fp16 1 
+23 gpu batchnorm fp16 1 
+24 gpu relu fp16 1 
+25 gpu conv fp16 1 
+26 gpu batchnorm fp16 1 
+27 gpu relu fp16 1 
+28 gpu group_conv fp16 1 
+29 gpu batchnorm fp16 1 
+30 gpu relu fp16 1 
+31 gpu conv fp16 1 
+32 gpu batchnorm fp16 1 
+33 gpu relu fp16 1 
+34 gpu group_conv fp16 1 
+35 gpu batchnorm fp16 1 
+36 gpu relu fp16 1 
+37 gpu conv fp16 1 
+38 gpu batchnorm fp16 1 
+39 gpu relu fp16 1 
+40 gpu group_conv fp16 1 
+41 gpu batchnorm fp16 1 
+42 gpu relu fp16 1 
+43 gpu conv fp16 1 
+44 gpu batchnorm fp16 1 
+45 gpu relu fp16 1 
+46 gpu group_conv fp16 1 
+47 gpu batchnorm fp16 1 
+48 gpu relu fp16 1 
+49 gpu conv fp16 1 
+50 gpu batchnorm fp16 1 
+51 gpu relu fp16 1 
+52 gpu group_conv fp16 1 
+53 gpu batchnorm fp16 1 
+54 gpu relu fp16 1 
+55 gpu conv fp16 1 
+56 gpu batchnorm fp16 1 
+57 gpu relu fp16 1 
+58 gpu group_conv fp16 1 
+59 gpu batchnorm fp16 1 
+60 gpu relu fp16 1 
+61 gpu conv fp16 1 
+62 gpu batchnorm fp16 1 
+63 gpu relu fp16 1 
+64 gpu group_conv fp16 1 
+65 gpu batchnorm fp16 1 
+66 gpu relu fp16 1 
+67 gpu conv fp16 1 
+68 gpu batchnorm fp16 1 
+69 gpu relu fp16 1 
+70 gpu group_conv fp16 1 
+71 gpu batchnorm fp16 1 
+72 gpu relu fp16 1 
+73 gpu conv fp16 1 
+74 gpu batchnorm fp16 1 
+75 gpu relu fp16 1 
+76 gpu group_conv fp16 1 
+77 gpu batchnorm fp16 1 
+78 gpu relu fp16 1 
+79 gpu conv fp16 1 
+80 gpu batchnorm fp16 1 
+81 gpu relu fp16 1 
+82 gpu pool_mean fp16 1 
+83 gpu mul fp16 1 add fp16 1 
+84 gpu softmax fp32 1
+-----
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2b3c537c5fbe845dbf9c97e24e8841e45ed3084f
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges.txt
@@ -0,0 +1,8 @@
+-1.9892114 2.126797 -1.51646211648 1.64720817745 -9.86898064232 10.5609560184 
+0.0 6.82138112736 -1.18343908739 1.27315966272 -9.87599849701 7.51305247974 
+0.0 4.82606745577 -0.599876856983 0.681207345724 -5.63328983307 5.17789223576 
+0.0 4.02646304417 -0.455596786201 0.494261391461 -5.31680394173 4.60585025024 
+0.0 4.53264906311 -0.356576155901 0.338216508806 -6.1012511816 4.36305006886 
+0.0 3.98747043872 -0.285027833283 0.286046403348 -4.24385170364 3.48625040674 
+0.0 6.56306590176 -0.189464023232 0.190123907179 -4.93811571312 3.53836347675 
+0.0 1.89083880007 -0.351403944016 0.422872786462 -0.23878151 0.26507422 -14.6308162231 27.2725212326
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges_rt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3a83ec095ec99c762d5ff05e2749db13db47909a
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges_rt.txt
@@ -0,0 +1,8 @@
+1 -1.9892114 2.126797 -1.51646211648 1.64720817745 -9.86898064232 10.5609560184 
+2 0.0 6.82138112736 -1.18343908739 1.27315966272 -9.87599849701 7.51305247974 
+3 0.0 4.82606745577 -0.599876856983 0.681207345724 -5.63328983307 5.17789223576 
+4 0.0 4.02646304417 -0.455596786201 0.494261391461 -5.31680394173 4.60585025024 
+5 0.0 4.53264906311 -0.356576155901 0.338216508806 -6.1012511816 4.36305006886 
+6 0.0 3.98747043872 -0.285027833283 0.286046403348 -4.24385170364 3.48625040674 
+7 0.0 6.56306590176 -0.189464023232 0.190123907179 -4.93811571312 3.53836347675 
+8 0.0 1.89083880007 -0.351403944016 0.422872786462 -0.23878151 0.26507422 -14.6308162231 27.2725212326
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..501dfcc5e76d637d4e4136ac1c2486b6b4cbe639
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/tuner_confs_base.txt
@@ -0,0 +1,90 @@
++++++
+conf1 1 0 87.59 0
+1 gpu conv fp32 1 
+2 gpu batchnorm fp32 1 
+3 gpu relu fp32 1 
+4 gpu group_conv fp32 1 
+5 gpu batchnorm fp32 1 
+6 gpu relu fp32 1 
+7 gpu conv fp32 1 
+8 gpu batchnorm fp32 1 
+9 gpu relu fp32 1 
+10 gpu group_conv fp32 1 
+11 gpu batchnorm fp32 1 
+12 gpu relu fp32 1 
+13 gpu conv fp32 1 
+14 gpu batchnorm fp32 1 
+15 gpu relu fp32 1 
+16 gpu group_conv fp32 1 
+17 gpu batchnorm fp32 1 
+18 gpu relu fp32 1 
+19 gpu conv fp32 1 
+20 gpu batchnorm fp32 1 
+21 gpu relu fp32 1 
+22 gpu group_conv fp32 1 
+23 gpu batchnorm fp32 1 
+24 gpu relu fp32 1 
+25 gpu conv fp32 1 
+26 gpu batchnorm fp32 1 
+27 gpu relu fp32 1 
+28 gpu group_conv fp32 1 
+29 gpu batchnorm fp32 1 
+30 gpu relu fp32 1 
+31 gpu conv fp32 1 
+32 gpu batchnorm fp32 1 
+33 gpu relu fp32 1 
+34 gpu group_conv fp32 1 
+35 gpu batchnorm fp32 1 
+36 gpu relu fp32 1 
+37 gpu conv fp32 1 
+38 gpu batchnorm fp32 1 
+39 gpu relu fp32 1 
+40 gpu pool_mean fp32 1 
+41 gpu mul fp32 1 add fp32 1 
+42 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 87.59 0
+1 gpu conv fp16 1 
+2 gpu batchnorm fp16 1 
+3 gpu relu fp16 1 
+4 gpu group_conv fp16 1 
+5 gpu batchnorm fp16 1 
+6 gpu relu fp16 1 
+7 gpu conv fp16 1 
+8 gpu batchnorm fp16 1 
+9 gpu relu fp16 1 
+10 gpu group_conv fp16 1 
+11 gpu batchnorm fp16 1 
+12 gpu relu fp16 1 
+13 gpu conv fp16 1 
+14 gpu batchnorm fp16 1 
+15 gpu relu fp16 1 
+16 gpu group_conv fp16 1 
+17 gpu batchnorm fp16 1 
+18 gpu relu fp16 1 
+19 gpu conv fp16 1 
+20 gpu batchnorm fp16 1 
+21 gpu relu fp16 1 
+22 gpu group_conv fp16 1 
+23 gpu batchnorm fp16 1 
+24 gpu relu fp16 1 
+25 gpu conv fp16 1 
+26 gpu batchnorm fp16 1 
+27 gpu relu fp16 1 
+28 gpu group_conv fp16 1 
+29 gpu batchnorm fp16 1 
+30 gpu relu fp16 1 
+31 gpu conv fp16 1 
+32 gpu batchnorm fp16 1 
+33 gpu relu fp16 1 
+34 gpu group_conv fp16 1 
+35 gpu batchnorm fp16 1 
+36 gpu relu fp16 1 
+37 gpu conv fp16 1 
+38 gpu batchnorm fp16 1 
+39 gpu relu fp16 1 
+40 gpu pool_mean fp16 1 
+41 gpu mul fp16 1 add fp16 1 
+42 gpu softmax fp32 1
+-----
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/src/mobilenet_shallow_promise.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/src/mobilenet_shallow_promise.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..361fa0c1c44151cbefc98b6c983d17303d254eef
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/src/mobilenet_shallow_promise.cpp
@@ -0,0 +1,1225 @@
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/stat.h> 
+#include <cstring> 
+#include <visc.h> 
+#include <tensorTypes.h> 
+#include <tensorUtils.h> 
+
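+// Structure of this generated source: each var_N_node below wraps a single
+// tensor intrinsic and carries a target hint -- visc::PROMISE_TARGET appears
+// on the layers eligible for approximate execution, while visc::CUDNN_TARGET
+// appears to pin a node to the GPU library path (this is our reading of the
+// hints; the macros themselves come from visc.h). root() then instantiates
+// these nodes and wires them into a dataflow graph, and main() loads the
+// trained weights and populates the graph inputs.
+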
+void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_2_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_3_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 1, 1, 1, 32); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_5_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_6_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_8_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 2, 2, 1, 64); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_11_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_14_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 1, 1, 1, 128); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_16_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_17_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_20_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 2, 2, 1, 128); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_23_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_24_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_26_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_27_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 1, 1, 1, 256); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_28_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_29_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_30_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_31_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_32_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_33_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 2, 2, 1, 256); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_34_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_35_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_36_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_37_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(5, t1, t2, t3, t4, t5, 0); 
+
+  void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_38_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_39_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_pool_mean(t1, 2, 2, 0, 0, 2, 2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_40_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_mul(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_41_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::PROMISE_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_42_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_softmax(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
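+// root() assembles the static dataflow graph. Every tensor argument is passed
+// as a (pointer, byte-count) pair, so the indices used by __visc__bindIn in
+// the body are flattened positions in this signature: even index = pointer,
+// odd index = its size.
+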
+void root(void* input, size_t input_bytes, 
+	  void* conv2d_1_w, size_t conv2d_1_w_bytes, 
+	  void* batch_normalization_1_gamma, size_t batch_normalization_1_gamma_bytes, 
+	  void* batch_normalization_1_beta, size_t batch_normalization_1_beta_bytes, 
+	  void* batch_normalization_1_mean, size_t batch_normalization_1_mean_bytes, 
+	  void* batch_normalization_1_variance, size_t batch_normalization_1_variance_bytes, 
+	  void* depthwise_conv2d_1_w, size_t depthwise_conv2d_1_w_bytes, 
+	  void* batch_normalization_2_gamma, size_t batch_normalization_2_gamma_bytes, 
+	  void* batch_normalization_2_beta, size_t batch_normalization_2_beta_bytes, 
+	  void* batch_normalization_2_mean, size_t batch_normalization_2_mean_bytes, 
+	  void* batch_normalization_2_variance, size_t batch_normalization_2_variance_bytes, 
+	  void* conv2d_2_w, size_t conv2d_2_w_bytes, 
+	  void* batch_normalization_3_gamma, size_t batch_normalization_3_gamma_bytes, 
+	  void* batch_normalization_3_beta, size_t batch_normalization_3_beta_bytes, 
+	  void* batch_normalization_3_mean, size_t batch_normalization_3_mean_bytes, 
+	  void* batch_normalization_3_variance, size_t batch_normalization_3_variance_bytes, 
+	  void* depthwise_conv2d_2_w, size_t depthwise_conv2d_2_w_bytes, 
+	  void* batch_normalization_4_gamma, size_t batch_normalization_4_gamma_bytes, 
+	  void* batch_normalization_4_beta, size_t batch_normalization_4_beta_bytes, 
+	  void* batch_normalization_4_mean, size_t batch_normalization_4_mean_bytes, 
+	  void* batch_normalization_4_variance, size_t batch_normalization_4_variance_bytes, 
+	  void* conv2d_3_w, size_t conv2d_3_w_bytes, 
+	  void* batch_normalization_5_gamma, size_t batch_normalization_5_gamma_bytes, 
+	  void* batch_normalization_5_beta, size_t batch_normalization_5_beta_bytes, 
+	  void* batch_normalization_5_mean, size_t batch_normalization_5_mean_bytes, 
+	  void* batch_normalization_5_variance, size_t batch_normalization_5_variance_bytes, 
+	  void* depthwise_conv2d_3_w, size_t depthwise_conv2d_3_w_bytes, 
+	  void* batch_normalization_6_gamma, size_t batch_normalization_6_gamma_bytes, 
+	  void* batch_normalization_6_beta, size_t batch_normalization_6_beta_bytes, 
+	  void* batch_normalization_6_mean, size_t batch_normalization_6_mean_bytes, 
+	  void* batch_normalization_6_variance, size_t batch_normalization_6_variance_bytes, 
+	  void* conv2d_4_w, size_t conv2d_4_w_bytes, 
+	  void* batch_normalization_7_gamma, size_t batch_normalization_7_gamma_bytes, 
+	  void* batch_normalization_7_beta, size_t batch_normalization_7_beta_bytes, 
+	  void* batch_normalization_7_mean, size_t batch_normalization_7_mean_bytes, 
+	  void* batch_normalization_7_variance, size_t batch_normalization_7_variance_bytes, 
+	  void* depthwise_conv2d_4_w, size_t depthwise_conv2d_4_w_bytes, 
+	  void* batch_normalization_8_gamma, size_t batch_normalization_8_gamma_bytes, 
+	  void* batch_normalization_8_beta, size_t batch_normalization_8_beta_bytes, 
+	  void* batch_normalization_8_mean, size_t batch_normalization_8_mean_bytes, 
+	  void* batch_normalization_8_variance, size_t batch_normalization_8_variance_bytes, 
+	  void* conv2d_5_w, size_t conv2d_5_w_bytes, 
+	  void* batch_normalization_9_gamma, size_t batch_normalization_9_gamma_bytes, 
+	  void* batch_normalization_9_beta, size_t batch_normalization_9_beta_bytes, 
+	  void* batch_normalization_9_mean, size_t batch_normalization_9_mean_bytes, 
+	  void* batch_normalization_9_variance, size_t batch_normalization_9_variance_bytes, 
+	  void* depthwise_conv2d_5_w, size_t depthwise_conv2d_5_w_bytes, 
+	  void* batch_normalization_10_gamma, size_t batch_normalization_10_gamma_bytes, 
+	  void* batch_normalization_10_beta, size_t batch_normalization_10_beta_bytes, 
+	  void* batch_normalization_10_mean, size_t batch_normalization_10_mean_bytes, 
+	  void* batch_normalization_10_variance, size_t batch_normalization_10_variance_bytes, 
+	  void* conv2d_6_w, size_t conv2d_6_w_bytes, 
+	  void* batch_normalization_11_gamma, size_t batch_normalization_11_gamma_bytes, 
+	  void* batch_normalization_11_beta, size_t batch_normalization_11_beta_bytes, 
+	  void* batch_normalization_11_mean, size_t batch_normalization_11_mean_bytes, 
+	  void* batch_normalization_11_variance, size_t batch_normalization_11_variance_bytes, 
+	  void* depthwise_conv2d_6_w, size_t depthwise_conv2d_6_w_bytes, 
+	  void* batch_normalization_12_gamma, size_t batch_normalization_12_gamma_bytes, 
+	  void* batch_normalization_12_beta, size_t batch_normalization_12_beta_bytes, 
+	  void* batch_normalization_12_mean, size_t batch_normalization_12_mean_bytes, 
+	  void* batch_normalization_12_variance, size_t batch_normalization_12_variance_bytes, 
+	  void* conv2d_7_w, size_t conv2d_7_w_bytes, 
+	  void* batch_normalization_13_gamma, size_t batch_normalization_13_gamma_bytes, 
+	  void* batch_normalization_13_beta, size_t batch_normalization_13_beta_bytes, 
+	  void* batch_normalization_13_mean, size_t batch_normalization_13_mean_bytes, 
+	  void* batch_normalization_13_variance, size_t batch_normalization_13_variance_bytes, 
+	  void* dense_1_w, size_t dense_1_w_bytes, 
+	  void* dense_1_b, size_t dense_1_b_bytes){ 
+
+
+  __visc__hint(visc::CPU_TARGET); 
+  __visc__attributes(68, input, conv2d_1_w, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, depthwise_conv2d_1_w, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, conv2d_2_w, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, depthwise_conv2d_2_w, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, conv2d_3_w, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, depthwise_conv2d_3_w, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, conv2d_4_w, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, depthwise_conv2d_4_w, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, conv2d_5_w, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, depthwise_conv2d_5_w, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, conv2d_6_w, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, depthwise_conv2d_6_w, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, conv2d_7_w, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, dense_1_w, dense_1_b, 0); 
+
+
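+  // Graph wiring conventions, as best as can be inferred from the VISC API:
+  // __visc__createNodeND(0, fn) creates a single-instance (0-dimensional)
+  // node for fn; __visc__edge(src, dst, 1, sp, dp, 0) forwards output sp of
+  // src to input dp of dst; __visc__bindIn(node, ip, ic, 0) binds root
+  // argument position ip to node input ic. Each node forwards both the
+  // tensor and its size, hence the paired edge/bind calls throughout.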
+  void* var_0 = __visc__createNodeND(0, var_0_node); 
+
+  __visc__bindIn(var_0, 0, 0, 0); 
+  __visc__bindIn(var_0, 1, 1, 0); 
+  __visc__bindIn(var_0, 2, 2, 0); 
+  __visc__bindIn(var_0, 3, 3, 0); 
+
+  void* var_1 = __visc__createNodeND(0, var_1_node); 
+
+  __visc__edge(var_0, var_1, 1, 0, 0, 0); 
+  __visc__edge(var_0, var_1, 1, 1, 1, 0); 
+  __visc__bindIn(var_1, 4, 2, 0); 
+  __visc__bindIn(var_1, 5, 3, 0); 
+  __visc__bindIn(var_1, 6, 4, 0); 
+  __visc__bindIn(var_1, 7, 5, 0); 
+  __visc__bindIn(var_1, 8, 6, 0); 
+  __visc__bindIn(var_1, 9, 7, 0); 
+  __visc__bindIn(var_1, 10, 8, 0); 
+  __visc__bindIn(var_1, 11, 9, 0); 
+
+  void* var_2 = __visc__createNodeND(0, var_2_node); 
+
+  __visc__edge(var_1, var_2, 1, 0, 0, 0); 
+  __visc__edge(var_1, var_2, 1, 1, 1, 0); 
+
+  void* var_3 = __visc__createNodeND(0, var_3_node); 
+
+  __visc__edge(var_2, var_3, 1, 0, 0, 0); 
+  __visc__edge(var_2, var_3, 1, 1, 1, 0); 
+  __visc__bindIn(var_3, 12, 2, 0); 
+  __visc__bindIn(var_3, 13, 3, 0); 
+
+  void* var_4 = __visc__createNodeND(0, var_4_node); 
+
+  __visc__edge(var_3, var_4, 1, 0, 0, 0); 
+  __visc__edge(var_3, var_4, 1, 1, 1, 0); 
+  __visc__bindIn(var_4, 14, 2, 0); 
+  __visc__bindIn(var_4, 15, 3, 0); 
+  __visc__bindIn(var_4, 16, 4, 0); 
+  __visc__bindIn(var_4, 17, 5, 0); 
+  __visc__bindIn(var_4, 18, 6, 0); 
+  __visc__bindIn(var_4, 19, 7, 0); 
+  __visc__bindIn(var_4, 20, 8, 0); 
+  __visc__bindIn(var_4, 21, 9, 0); 
+
+  void* var_5 = __visc__createNodeND(0, var_5_node); 
+
+  __visc__edge(var_4, var_5, 1, 0, 0, 0); 
+  __visc__edge(var_4, var_5, 1, 1, 1, 0); 
+
+  void* var_6 = __visc__createNodeND(0, var_6_node); 
+
+  __visc__edge(var_5, var_6, 1, 0, 0, 0); 
+  __visc__edge(var_5, var_6, 1, 1, 1, 0); 
+  __visc__bindIn(var_6, 22, 2, 0); 
+  __visc__bindIn(var_6, 23, 3, 0); 
+
+  void* var_7 = __visc__createNodeND(0, var_7_node); 
+
+  __visc__edge(var_6, var_7, 1, 0, 0, 0); 
+  __visc__edge(var_6, var_7, 1, 1, 1, 0); 
+  __visc__bindIn(var_7, 24, 2, 0); 
+  __visc__bindIn(var_7, 25, 3, 0); 
+  __visc__bindIn(var_7, 26, 4, 0); 
+  __visc__bindIn(var_7, 27, 5, 0); 
+  __visc__bindIn(var_7, 28, 6, 0); 
+  __visc__bindIn(var_7, 29, 7, 0); 
+  __visc__bindIn(var_7, 30, 8, 0); 
+  __visc__bindIn(var_7, 31, 9, 0); 
+
+  void* var_8 = __visc__createNodeND(0, var_8_node); 
+
+  __visc__edge(var_7, var_8, 1, 0, 0, 0); 
+  __visc__edge(var_7, var_8, 1, 1, 1, 0); 
+
+  void* var_9 = __visc__createNodeND(0, var_9_node); 
+
+  __visc__edge(var_8, var_9, 1, 0, 0, 0); 
+  __visc__edge(var_8, var_9, 1, 1, 1, 0); 
+  __visc__bindIn(var_9, 32, 2, 0); 
+  __visc__bindIn(var_9, 33, 3, 0); 
+
+  void* var_10 = __visc__createNodeND(0, var_10_node); 
+
+  __visc__edge(var_9, var_10, 1, 0, 0, 0); 
+  __visc__edge(var_9, var_10, 1, 1, 1, 0); 
+  __visc__bindIn(var_10, 34, 2, 0); 
+  __visc__bindIn(var_10, 35, 3, 0); 
+  __visc__bindIn(var_10, 36, 4, 0); 
+  __visc__bindIn(var_10, 37, 5, 0); 
+  __visc__bindIn(var_10, 38, 6, 0); 
+  __visc__bindIn(var_10, 39, 7, 0); 
+  __visc__bindIn(var_10, 40, 8, 0); 
+  __visc__bindIn(var_10, 41, 9, 0); 
+
+  void* var_11 = __visc__createNodeND(0, var_11_node); 
+
+  __visc__edge(var_10, var_11, 1, 0, 0, 0); 
+  __visc__edge(var_10, var_11, 1, 1, 1, 0); 
+
+  void* var_12 = __visc__createNodeND(0, var_12_node); 
+
+  __visc__edge(var_11, var_12, 1, 0, 0, 0); 
+  __visc__edge(var_11, var_12, 1, 1, 1, 0); 
+  __visc__bindIn(var_12, 42, 2, 0); 
+  __visc__bindIn(var_12, 43, 3, 0); 
+
+  void* var_13 = __visc__createNodeND(0, var_13_node); 
+
+  __visc__edge(var_12, var_13, 1, 0, 0, 0); 
+  __visc__edge(var_12, var_13, 1, 1, 1, 0); 
+  __visc__bindIn(var_13, 44, 2, 0); 
+  __visc__bindIn(var_13, 45, 3, 0); 
+  __visc__bindIn(var_13, 46, 4, 0); 
+  __visc__bindIn(var_13, 47, 5, 0); 
+  __visc__bindIn(var_13, 48, 6, 0); 
+  __visc__bindIn(var_13, 49, 7, 0); 
+  __visc__bindIn(var_13, 50, 8, 0); 
+  __visc__bindIn(var_13, 51, 9, 0); 
+
+  void* var_14 = __visc__createNodeND(0, var_14_node); 
+
+  __visc__edge(var_13, var_14, 1, 0, 0, 0); 
+  __visc__edge(var_13, var_14, 1, 1, 1, 0); 
+
+  void* var_15 = __visc__createNodeND(0, var_15_node); 
+
+  __visc__edge(var_14, var_15, 1, 0, 0, 0); 
+  __visc__edge(var_14, var_15, 1, 1, 1, 0); 
+  __visc__bindIn(var_15, 52, 2, 0); 
+  __visc__bindIn(var_15, 53, 3, 0); 
+
+  void* var_16 = __visc__createNodeND(0, var_16_node); 
+
+  __visc__edge(var_15, var_16, 1, 0, 0, 0); 
+  __visc__edge(var_15, var_16, 1, 1, 1, 0); 
+  __visc__bindIn(var_16, 54, 2, 0); 
+  __visc__bindIn(var_16, 55, 3, 0); 
+  __visc__bindIn(var_16, 56, 4, 0); 
+  __visc__bindIn(var_16, 57, 5, 0); 
+  __visc__bindIn(var_16, 58, 6, 0); 
+  __visc__bindIn(var_16, 59, 7, 0); 
+  __visc__bindIn(var_16, 60, 8, 0); 
+  __visc__bindIn(var_16, 61, 9, 0); 
+
+  void* var_17 = __visc__createNodeND(0, var_17_node); 
+
+  __visc__edge(var_16, var_17, 1, 0, 0, 0); 
+  __visc__edge(var_16, var_17, 1, 1, 1, 0); 
+
+  void* var_18 = __visc__createNodeND(0, var_18_node); 
+
+  __visc__edge(var_17, var_18, 1, 0, 0, 0); 
+  __visc__edge(var_17, var_18, 1, 1, 1, 0); 
+  __visc__bindIn(var_18, 62, 2, 0); 
+  __visc__bindIn(var_18, 63, 3, 0); 
+
+  void* var_19 = __visc__createNodeND(0, var_19_node); 
+
+  __visc__edge(var_18, var_19, 1, 0, 0, 0); 
+  __visc__edge(var_18, var_19, 1, 1, 1, 0); 
+  __visc__bindIn(var_19, 64, 2, 0); 
+  __visc__bindIn(var_19, 65, 3, 0); 
+  __visc__bindIn(var_19, 66, 4, 0); 
+  __visc__bindIn(var_19, 67, 5, 0); 
+  __visc__bindIn(var_19, 68, 6, 0); 
+  __visc__bindIn(var_19, 69, 7, 0); 
+  __visc__bindIn(var_19, 70, 8, 0); 
+  __visc__bindIn(var_19, 71, 9, 0); 
+
+  void* var_20 = __visc__createNodeND(0, var_20_node); 
+
+  __visc__edge(var_19, var_20, 1, 0, 0, 0); 
+  __visc__edge(var_19, var_20, 1, 1, 1, 0); 
+
+  void* var_21 = __visc__createNodeND(0, var_21_node); 
+
+  __visc__edge(var_20, var_21, 1, 0, 0, 0); 
+  __visc__edge(var_20, var_21, 1, 1, 1, 0); 
+  __visc__bindIn(var_21, 72, 2, 0); 
+  __visc__bindIn(var_21, 73, 3, 0); 
+
+  void* var_22 = __visc__createNodeND(0, var_22_node); 
+
+  __visc__edge(var_21, var_22, 1, 0, 0, 0); 
+  __visc__edge(var_21, var_22, 1, 1, 1, 0); 
+  __visc__bindIn(var_22, 74, 2, 0); 
+  __visc__bindIn(var_22, 75, 3, 0); 
+  __visc__bindIn(var_22, 76, 4, 0); 
+  __visc__bindIn(var_22, 77, 5, 0); 
+  __visc__bindIn(var_22, 78, 6, 0); 
+  __visc__bindIn(var_22, 79, 7, 0); 
+  __visc__bindIn(var_22, 80, 8, 0); 
+  __visc__bindIn(var_22, 81, 9, 0); 
+
+  void* var_23 = __visc__createNodeND(0, var_23_node); 
+
+  __visc__edge(var_22, var_23, 1, 0, 0, 0); 
+  __visc__edge(var_22, var_23, 1, 1, 1, 0); 
+
+  void* var_24 = __visc__createNodeND(0, var_24_node); 
+
+  __visc__edge(var_23, var_24, 1, 0, 0, 0); 
+  __visc__edge(var_23, var_24, 1, 1, 1, 0); 
+  __visc__bindIn(var_24, 82, 2, 0); 
+  __visc__bindIn(var_24, 83, 3, 0); 
+
+  void* var_25 = __visc__createNodeND(0, var_25_node); 
+
+  __visc__edge(var_24, var_25, 1, 0, 0, 0); 
+  __visc__edge(var_24, var_25, 1, 1, 1, 0); 
+  __visc__bindIn(var_25, 84, 2, 0); 
+  __visc__bindIn(var_25, 85, 3, 0); 
+  __visc__bindIn(var_25, 86, 4, 0); 
+  __visc__bindIn(var_25, 87, 5, 0); 
+  __visc__bindIn(var_25, 88, 6, 0); 
+  __visc__bindIn(var_25, 89, 7, 0); 
+  __visc__bindIn(var_25, 90, 8, 0); 
+  __visc__bindIn(var_25, 91, 9, 0); 
+
+  void* var_26 = __visc__createNodeND(0, var_26_node); 
+
+  __visc__edge(var_25, var_26, 1, 0, 0, 0); 
+  __visc__edge(var_25, var_26, 1, 1, 1, 0); 
+
+  void* var_27 = __visc__createNodeND(0, var_27_node); 
+
+  __visc__edge(var_26, var_27, 1, 0, 0, 0); 
+  __visc__edge(var_26, var_27, 1, 1, 1, 0); 
+  __visc__bindIn(var_27, 92, 2, 0); 
+  __visc__bindIn(var_27, 93, 3, 0); 
+
+  void* var_28 = __visc__createNodeND(0, var_28_node); 
+
+  __visc__edge(var_27, var_28, 1, 0, 0, 0); 
+  __visc__edge(var_27, var_28, 1, 1, 1, 0); 
+  __visc__bindIn(var_28, 94, 2, 0); 
+  __visc__bindIn(var_28, 95, 3, 0); 
+  __visc__bindIn(var_28, 96, 4, 0); 
+  __visc__bindIn(var_28, 97, 5, 0); 
+  __visc__bindIn(var_28, 98, 6, 0); 
+  __visc__bindIn(var_28, 99, 7, 0); 
+  __visc__bindIn(var_28, 100, 8, 0); 
+  __visc__bindIn(var_28, 101, 9, 0); 
+
+  void* var_29 = __visc__createNodeND(0, var_29_node); 
+
+  __visc__edge(var_28, var_29, 1, 0, 0, 0); 
+  __visc__edge(var_28, var_29, 1, 1, 1, 0); 
+
+  void* var_30 = __visc__createNodeND(0, var_30_node); 
+
+  __visc__edge(var_29, var_30, 1, 0, 0, 0); 
+  __visc__edge(var_29, var_30, 1, 1, 1, 0); 
+  __visc__bindIn(var_30, 102, 2, 0); 
+  __visc__bindIn(var_30, 103, 3, 0); 
+
+  void* var_31 = __visc__createNodeND(0, var_31_node); 
+
+  __visc__edge(var_30, var_31, 1, 0, 0, 0); 
+  __visc__edge(var_30, var_31, 1, 1, 1, 0); 
+  __visc__bindIn(var_31, 104, 2, 0); 
+  __visc__bindIn(var_31, 105, 3, 0); 
+  __visc__bindIn(var_31, 106, 4, 0); 
+  __visc__bindIn(var_31, 107, 5, 0); 
+  __visc__bindIn(var_31, 108, 6, 0); 
+  __visc__bindIn(var_31, 109, 7, 0); 
+  __visc__bindIn(var_31, 110, 8, 0); 
+  __visc__bindIn(var_31, 111, 9, 0); 
+
+  void* var_32 = __visc__createNodeND(0, var_32_node); 
+
+  __visc__edge(var_31, var_32, 1, 0, 0, 0); 
+  __visc__edge(var_31, var_32, 1, 1, 1, 0); 
+
+  void* var_33 = __visc__createNodeND(0, var_33_node); 
+
+  __visc__edge(var_32, var_33, 1, 0, 0, 0); 
+  __visc__edge(var_32, var_33, 1, 1, 1, 0); 
+  __visc__bindIn(var_33, 112, 2, 0); 
+  __visc__bindIn(var_33, 113, 3, 0); 
+
+  void* var_34 = __visc__createNodeND(0, var_34_node); 
+
+  __visc__edge(var_33, var_34, 1, 0, 0, 0); 
+  __visc__edge(var_33, var_34, 1, 1, 1, 0); 
+  __visc__bindIn(var_34, 114, 2, 0); 
+  __visc__bindIn(var_34, 115, 3, 0); 
+  __visc__bindIn(var_34, 116, 4, 0); 
+  __visc__bindIn(var_34, 117, 5, 0); 
+  __visc__bindIn(var_34, 118, 6, 0); 
+  __visc__bindIn(var_34, 119, 7, 0); 
+  __visc__bindIn(var_34, 120, 8, 0); 
+  __visc__bindIn(var_34, 121, 9, 0); 
+
+  void* var_35 = __visc__createNodeND(0, var_35_node); 
+
+  __visc__edge(var_34, var_35, 1, 0, 0, 0); 
+  __visc__edge(var_34, var_35, 1, 1, 1, 0); 
+
+  void* var_36 = __visc__createNodeND(0, var_36_node); 
+
+  __visc__edge(var_35, var_36, 1, 0, 0, 0); 
+  __visc__edge(var_35, var_36, 1, 1, 1, 0); 
+  __visc__bindIn(var_36, 122, 2, 0); 
+  __visc__bindIn(var_36, 123, 3, 0); 
+
+  void* var_37 = __visc__createNodeND(0, var_37_node); 
+
+  __visc__edge(var_36, var_37, 1, 0, 0, 0); 
+  __visc__edge(var_36, var_37, 1, 1, 1, 0); 
+  __visc__bindIn(var_37, 124, 2, 0); 
+  __visc__bindIn(var_37, 125, 3, 0); 
+  __visc__bindIn(var_37, 126, 4, 0); 
+  __visc__bindIn(var_37, 127, 5, 0); 
+  __visc__bindIn(var_37, 128, 6, 0); 
+  __visc__bindIn(var_37, 129, 7, 0); 
+  __visc__bindIn(var_37, 130, 8, 0); 
+  __visc__bindIn(var_37, 131, 9, 0); 
+
+  void* var_38 = __visc__createNodeND(0, var_38_node); 
+
+  __visc__edge(var_37, var_38, 1, 0, 0, 0); 
+  __visc__edge(var_37, var_38, 1, 1, 1, 0); 
+
+  void* var_39 = __visc__createNodeND(0, var_39_node); 
+
+  __visc__edge(var_38, var_39, 1, 0, 0, 0); 
+  __visc__edge(var_38, var_39, 1, 1, 1, 0); 
+
+  void* var_40 = __visc__createNodeND(0, var_40_node); 
+
+  __visc__edge(var_39, var_40, 1, 0, 0, 0); 
+  __visc__edge(var_39, var_40, 1, 1, 1, 0); 
+  __visc__bindIn(var_40, 132, 2, 0); 
+  __visc__bindIn(var_40, 133, 3, 0); 
+
+  void* var_41 = __visc__createNodeND(0, var_41_node); 
+
+  __visc__edge(var_40, var_41, 1, 0, 0, 0); 
+  __visc__edge(var_40, var_41, 1, 1, 1, 0); 
+  __visc__bindIn(var_41, 134, 2, 0); 
+  __visc__bindIn(var_41, 135, 3, 0); 
+
+  void* var_42 = __visc__createNodeND(0, var_42_node); 
+
+  __visc__edge(var_41, var_42, 1, 0, 0, 0); 
+  __visc__edge(var_41, var_42, 1, 1, 1, 0); 
+
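+  // Bind the final softmax node's (tensor, size) outputs as the outputs of
+  // the graph itself.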
+  __visc__bindOut(var_42, 0, 0, 0); 
+  __visc__bindOut(var_42, 1, 1, 0); 
+
+}
+
+struct ret_t {
+  void* tensor; 
+  size_t bytes; 
+}; 
+
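+// RootIn mirrors root()'s parameter list field-for-field and embeds ret_t to
+// receive the returned tensor; it is packed presumably so that field offsets
+// line up with the positional argument indices used by __visc__bindIn above.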
+typedef struct __attribute__((__packed__)) {
+  void* input; 
+  size_t input_bytes; 
+  void* conv2d_1_w; 
+  size_t conv2d_1_w_bytes; 
+  void* batch_normalization_1_gamma; 
+  size_t batch_normalization_1_gamma_bytes; 
+  void* batch_normalization_1_beta; 
+  size_t batch_normalization_1_beta_bytes; 
+  void* batch_normalization_1_mean; 
+  size_t batch_normalization_1_mean_bytes; 
+  void* batch_normalization_1_variance; 
+  size_t batch_normalization_1_variance_bytes; 
+  void* depthwise_conv2d_1_w; 
+  size_t depthwise_conv2d_1_w_bytes; 
+  void* batch_normalization_2_gamma; 
+  size_t batch_normalization_2_gamma_bytes; 
+  void* batch_normalization_2_beta; 
+  size_t batch_normalization_2_beta_bytes; 
+  void* batch_normalization_2_mean; 
+  size_t batch_normalization_2_mean_bytes; 
+  void* batch_normalization_2_variance; 
+  size_t batch_normalization_2_variance_bytes; 
+  void* conv2d_2_w; 
+  size_t conv2d_2_w_bytes; 
+  void* batch_normalization_3_gamma; 
+  size_t batch_normalization_3_gamma_bytes; 
+  void* batch_normalization_3_beta; 
+  size_t batch_normalization_3_beta_bytes; 
+  void* batch_normalization_3_mean; 
+  size_t batch_normalization_3_mean_bytes; 
+  void* batch_normalization_3_variance; 
+  size_t batch_normalization_3_variance_bytes; 
+  void* depthwise_conv2d_2_w; 
+  size_t depthwise_conv2d_2_w_bytes; 
+  void* batch_normalization_4_gamma; 
+  size_t batch_normalization_4_gamma_bytes; 
+  void* batch_normalization_4_beta; 
+  size_t batch_normalization_4_beta_bytes; 
+  void* batch_normalization_4_mean; 
+  size_t batch_normalization_4_mean_bytes; 
+  void* batch_normalization_4_variance; 
+  size_t batch_normalization_4_variance_bytes; 
+  void* conv2d_3_w; 
+  size_t conv2d_3_w_bytes; 
+  void* batch_normalization_5_gamma; 
+  size_t batch_normalization_5_gamma_bytes; 
+  void* batch_normalization_5_beta; 
+  size_t batch_normalization_5_beta_bytes; 
+  void* batch_normalization_5_mean; 
+  size_t batch_normalization_5_mean_bytes; 
+  void* batch_normalization_5_variance; 
+  size_t batch_normalization_5_variance_bytes; 
+  void* depthwise_conv2d_3_w; 
+  size_t depthwise_conv2d_3_w_bytes; 
+  void* batch_normalization_6_gamma; 
+  size_t batch_normalization_6_gamma_bytes; 
+  void* batch_normalization_6_beta; 
+  size_t batch_normalization_6_beta_bytes; 
+  void* batch_normalization_6_mean; 
+  size_t batch_normalization_6_mean_bytes; 
+  void* batch_normalization_6_variance; 
+  size_t batch_normalization_6_variance_bytes; 
+  void* conv2d_4_w; 
+  size_t conv2d_4_w_bytes; 
+  void* batch_normalization_7_gamma; 
+  size_t batch_normalization_7_gamma_bytes; 
+  void* batch_normalization_7_beta; 
+  size_t batch_normalization_7_beta_bytes; 
+  void* batch_normalization_7_mean; 
+  size_t batch_normalization_7_mean_bytes; 
+  void* batch_normalization_7_variance; 
+  size_t batch_normalization_7_variance_bytes; 
+  void* depthwise_conv2d_4_w; 
+  size_t depthwise_conv2d_4_w_bytes; 
+  void* batch_normalization_8_gamma; 
+  size_t batch_normalization_8_gamma_bytes; 
+  void* batch_normalization_8_beta; 
+  size_t batch_normalization_8_beta_bytes; 
+  void* batch_normalization_8_mean; 
+  size_t batch_normalization_8_mean_bytes; 
+  void* batch_normalization_8_variance; 
+  size_t batch_normalization_8_variance_bytes; 
+  void* conv2d_5_w; 
+  size_t conv2d_5_w_bytes; 
+  void* batch_normalization_9_gamma; 
+  size_t batch_normalization_9_gamma_bytes; 
+  void* batch_normalization_9_beta; 
+  size_t batch_normalization_9_beta_bytes; 
+  void* batch_normalization_9_mean; 
+  size_t batch_normalization_9_mean_bytes; 
+  void* batch_normalization_9_variance; 
+  size_t batch_normalization_9_variance_bytes; 
+  void* depthwise_conv2d_5_w; 
+  size_t depthwise_conv2d_5_w_bytes; 
+  void* batch_normalization_10_gamma; 
+  size_t batch_normalization_10_gamma_bytes; 
+  void* batch_normalization_10_beta; 
+  size_t batch_normalization_10_beta_bytes; 
+  void* batch_normalization_10_mean; 
+  size_t batch_normalization_10_mean_bytes; 
+  void* batch_normalization_10_variance; 
+  size_t batch_normalization_10_variance_bytes; 
+  void* conv2d_6_w; 
+  size_t conv2d_6_w_bytes; 
+  void* batch_normalization_11_gamma; 
+  size_t batch_normalization_11_gamma_bytes; 
+  void* batch_normalization_11_beta; 
+  size_t batch_normalization_11_beta_bytes; 
+  void* batch_normalization_11_mean; 
+  size_t batch_normalization_11_mean_bytes; 
+  void* batch_normalization_11_variance; 
+  size_t batch_normalization_11_variance_bytes; 
+  void* depthwise_conv2d_6_w; 
+  size_t depthwise_conv2d_6_w_bytes; 
+  void* batch_normalization_12_gamma; 
+  size_t batch_normalization_12_gamma_bytes; 
+  void* batch_normalization_12_beta; 
+  size_t batch_normalization_12_beta_bytes; 
+  void* batch_normalization_12_mean; 
+  size_t batch_normalization_12_mean_bytes; 
+  void* batch_normalization_12_variance; 
+  size_t batch_normalization_12_variance_bytes; 
+  void* conv2d_7_w; 
+  size_t conv2d_7_w_bytes; 
+  void* batch_normalization_13_gamma; 
+  size_t batch_normalization_13_gamma_bytes; 
+  void* batch_normalization_13_beta; 
+  size_t batch_normalization_13_beta_bytes; 
+  void* batch_normalization_13_mean; 
+  size_t batch_normalization_13_mean_bytes; 
+  void* batch_normalization_13_variance; 
+  size_t batch_normalization_13_variance_bytes; 
+  void* dense_1_w; 
+  size_t dense_1_w_bytes; 
+  void* dense_1_b; 
+  size_t dense_1_b_bytes; 
+
+  struct ret_t r; 
+} RootIn;
+
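+// main() loads the trained weights with readTrainedWeights(path, offset,
+// N, C, H, W) -- dimensions are in NCHW order -- then fills a RootIn
+// instance for the graph launch after __visc__init().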
+int main(){ 
+
+  std::string dir_prefix = std::string("../../../../../../projects/hpvm-tensor-rt/model_params/mobilenet_shallow/");
+
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
+  std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
+  void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
+  void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
+  void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
+  void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); 
+  std::string depthwise_conv2d_1_w_path =  dir_prefix + std::string("depthwise_conv2d_1_w.bin"); 
+  void* depthwise_conv2d_1_w =  readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); 
+  std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
+  void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
+  void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
+  void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); 
+  std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
+  void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); 
+  std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
+  void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
+  void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
+  void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
+  void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
+  std::string depthwise_conv2d_2_w_path =  dir_prefix + std::string("depthwise_conv2d_2_w.bin"); 
+  void* depthwise_conv2d_2_w =  readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); 
+  std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
+  void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
+  void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
+  void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); 
+  std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
+  void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); 
+  std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
+  void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
+  void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
+  void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
+  void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); 
+  std::string depthwise_conv2d_3_w_path =  dir_prefix + std::string("depthwise_conv2d_3_w.bin"); 
+  void* depthwise_conv2d_3_w =  readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); 
+  std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
+  void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
+  void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
+  void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
+  void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); 
+  std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
+  void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
+  void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
+  void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
+  void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); 
+  std::string depthwise_conv2d_4_w_path =  dir_prefix + std::string("depthwise_conv2d_4_w.bin"); 
+  void* depthwise_conv2d_4_w =  readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); 
+  std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
+  void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
+  void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
+  void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); 
+  std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
+  void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); 
+  std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
+  void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
+  void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
+  void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
+  void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); 
+  std::string depthwise_conv2d_5_w_path =  dir_prefix + std::string("depthwise_conv2d_5_w.bin"); 
+  void* depthwise_conv2d_5_w =  readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); 
+  std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
+  void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
+  void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
+  void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
+  void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
+  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); 
+  std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
+  void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
+  void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
+  void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
+  void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
+  std::string depthwise_conv2d_6_w_path =  dir_prefix + std::string("depthwise_conv2d_6_w.bin"); 
+  void* depthwise_conv2d_6_w =  readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); 
+  std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
+  void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
+  void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
+  void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); 
+  std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
+  void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
+  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); 
+  std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
+  void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
+  void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
+  void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); 
+  std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
+  void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+  void* input = readTrainedWeights(input_path.c_str(), 0, 5000,3,32,32); 
+  uint8_t* labels = readLabels(labels_path.c_str(), 5000); 
+
+  __visc__init(); 
+  RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); 
+
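+  // All *_bytes fields are deliberately left 0 here; the runtime appears to
+  // track tensor sizes internally rather than relying on these values.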
+  args->input = input; 
+  args->input_bytes = 0; 
+  args->conv2d_1_w = conv2d_1_w; 
+  args->conv2d_1_w_bytes = 0; 
+  args->batch_normalization_1_gamma = batch_normalization_1_gamma; 
+  args->batch_normalization_1_gamma_bytes = 0; 
+  args->batch_normalization_1_beta = batch_normalization_1_beta; 
+  args->batch_normalization_1_beta_bytes = 0; 
+  args->batch_normalization_1_mean = batch_normalization_1_mean; 
+  args->batch_normalization_1_mean_bytes = 0; 
+  args->batch_normalization_1_variance = batch_normalization_1_variance; 
+  args->batch_normalization_1_variance_bytes = 0; 
+  args->depthwise_conv2d_1_w = depthwise_conv2d_1_w; 
+  args->depthwise_conv2d_1_w_bytes = 0; 
+  args->batch_normalization_2_gamma = batch_normalization_2_gamma; 
+  args->batch_normalization_2_gamma_bytes = 0; 
+  args->batch_normalization_2_beta = batch_normalization_2_beta; 
+  args->batch_normalization_2_beta_bytes = 0; 
+  args->batch_normalization_2_mean = batch_normalization_2_mean; 
+  args->batch_normalization_2_mean_bytes = 0; 
+  args->batch_normalization_2_variance = batch_normalization_2_variance; 
+  args->batch_normalization_2_variance_bytes = 0; 
+  args->conv2d_2_w = conv2d_2_w; 
+  args->conv2d_2_w_bytes = 0; 
+  args->batch_normalization_3_gamma = batch_normalization_3_gamma; 
+  args->batch_normalization_3_gamma_bytes = 0; 
+  args->batch_normalization_3_beta = batch_normalization_3_beta; 
+  args->batch_normalization_3_beta_bytes = 0; 
+  args->batch_normalization_3_mean = batch_normalization_3_mean; 
+  args->batch_normalization_3_mean_bytes = 0; 
+  args->batch_normalization_3_variance = batch_normalization_3_variance; 
+  args->batch_normalization_3_variance_bytes = 0; 
+  args->depthwise_conv2d_2_w = depthwise_conv2d_2_w; 
+  args->depthwise_conv2d_2_w_bytes = 0; 
+  args->batch_normalization_4_gamma = batch_normalization_4_gamma; 
+  args->batch_normalization_4_gamma_bytes = 0; 
+  args->batch_normalization_4_beta = batch_normalization_4_beta; 
+  args->batch_normalization_4_beta_bytes = 0; 
+  args->batch_normalization_4_mean = batch_normalization_4_mean; 
+  args->batch_normalization_4_mean_bytes = 0; 
+  args->batch_normalization_4_variance = batch_normalization_4_variance; 
+  args->batch_normalization_4_variance_bytes = 0; 
+  args->conv2d_3_w = conv2d_3_w; 
+  args->conv2d_3_w_bytes = 0; 
+  args->batch_normalization_5_gamma = batch_normalization_5_gamma; 
+  args->batch_normalization_5_gamma_bytes = 0; 
+  args->batch_normalization_5_beta = batch_normalization_5_beta; 
+  args->batch_normalization_5_beta_bytes = 0; 
+  args->batch_normalization_5_mean = batch_normalization_5_mean; 
+  args->batch_normalization_5_mean_bytes = 0; 
+  args->batch_normalization_5_variance = batch_normalization_5_variance; 
+  args->batch_normalization_5_variance_bytes = 0; 
+  args->depthwise_conv2d_3_w = depthwise_conv2d_3_w; 
+  args->depthwise_conv2d_3_w_bytes = 0; 
+  args->batch_normalization_6_gamma = batch_normalization_6_gamma; 
+  args->batch_normalization_6_gamma_bytes = 0; 
+  args->batch_normalization_6_beta = batch_normalization_6_beta; 
+  args->batch_normalization_6_beta_bytes = 0; 
+  args->batch_normalization_6_mean = batch_normalization_6_mean; 
+  args->batch_normalization_6_mean_bytes = 0; 
+  args->batch_normalization_6_variance = batch_normalization_6_variance; 
+  args->batch_normalization_6_variance_bytes = 0; 
+  args->conv2d_4_w = conv2d_4_w; 
+  args->conv2d_4_w_bytes = 0; 
+  args->batch_normalization_7_gamma = batch_normalization_7_gamma; 
+  args->batch_normalization_7_gamma_bytes = 0; 
+  args->batch_normalization_7_beta = batch_normalization_7_beta; 
+  args->batch_normalization_7_beta_bytes = 0; 
+  args->batch_normalization_7_mean = batch_normalization_7_mean; 
+  args->batch_normalization_7_mean_bytes = 0; 
+  args->batch_normalization_7_variance = batch_normalization_7_variance; 
+  args->batch_normalization_7_variance_bytes = 0; 
+  args->depthwise_conv2d_4_w = depthwise_conv2d_4_w; 
+  args->depthwise_conv2d_4_w_bytes = 0; 
+  args->batch_normalization_8_gamma = batch_normalization_8_gamma; 
+  args->batch_normalization_8_gamma_bytes = 0; 
+  args->batch_normalization_8_beta = batch_normalization_8_beta; 
+  args->batch_normalization_8_beta_bytes = 0; 
+  args->batch_normalization_8_mean = batch_normalization_8_mean; 
+  args->batch_normalization_8_mean_bytes = 0; 
+  args->batch_normalization_8_variance = batch_normalization_8_variance; 
+  args->batch_normalization_8_variance_bytes = 0; 
+  args->conv2d_5_w = conv2d_5_w; 
+  args->conv2d_5_w_bytes = 0; 
+  args->batch_normalization_9_gamma = batch_normalization_9_gamma; 
+  args->batch_normalization_9_gamma_bytes = 0; 
+  args->batch_normalization_9_beta = batch_normalization_9_beta; 
+  args->batch_normalization_9_beta_bytes = 0; 
+  args->batch_normalization_9_mean = batch_normalization_9_mean; 
+  args->batch_normalization_9_mean_bytes = 0; 
+  args->batch_normalization_9_variance = batch_normalization_9_variance; 
+  args->batch_normalization_9_variance_bytes = 0; 
+  args->depthwise_conv2d_5_w = depthwise_conv2d_5_w; 
+  args->depthwise_conv2d_5_w_bytes = 0; 
+  args->batch_normalization_10_gamma = batch_normalization_10_gamma; 
+  args->batch_normalization_10_gamma_bytes = 0; 
+  args->batch_normalization_10_beta = batch_normalization_10_beta; 
+  args->batch_normalization_10_beta_bytes = 0; 
+  args->batch_normalization_10_mean = batch_normalization_10_mean; 
+  args->batch_normalization_10_mean_bytes = 0; 
+  args->batch_normalization_10_variance = batch_normalization_10_variance; 
+  args->batch_normalization_10_variance_bytes = 0; 
+  args->conv2d_6_w = conv2d_6_w; 
+  args->conv2d_6_w_bytes = 0; 
+  args->batch_normalization_11_gamma = batch_normalization_11_gamma; 
+  args->batch_normalization_11_gamma_bytes = 0; 
+  args->batch_normalization_11_beta = batch_normalization_11_beta; 
+  args->batch_normalization_11_beta_bytes = 0; 
+  args->batch_normalization_11_mean = batch_normalization_11_mean; 
+  args->batch_normalization_11_mean_bytes = 0; 
+  args->batch_normalization_11_variance = batch_normalization_11_variance; 
+  args->batch_normalization_11_variance_bytes = 0; 
+  args->depthwise_conv2d_6_w = depthwise_conv2d_6_w; 
+  args->depthwise_conv2d_6_w_bytes = 0; 
+  args->batch_normalization_12_gamma = batch_normalization_12_gamma; 
+  args->batch_normalization_12_gamma_bytes = 0; 
+  args->batch_normalization_12_beta = batch_normalization_12_beta; 
+  args->batch_normalization_12_beta_bytes = 0; 
+  args->batch_normalization_12_mean = batch_normalization_12_mean; 
+  args->batch_normalization_12_mean_bytes = 0; 
+  args->batch_normalization_12_variance = batch_normalization_12_variance; 
+  args->batch_normalization_12_variance_bytes = 0; 
+  args->conv2d_7_w = conv2d_7_w; 
+  args->conv2d_7_w_bytes = 0; 
+  args->batch_normalization_13_gamma = batch_normalization_13_gamma; 
+  args->batch_normalization_13_gamma_bytes = 0; 
+  args->batch_normalization_13_beta = batch_normalization_13_beta; 
+  args->batch_normalization_13_beta_bytes = 0; 
+  args->batch_normalization_13_mean = batch_normalization_13_mean; 
+  args->batch_normalization_13_mean_bytes = 0; 
+  args->batch_normalization_13_variance = batch_normalization_13_variance; 
+  args->batch_normalization_13_variance_bytes = 0; 
+  args->dense_1_w = dense_1_w; 
+  args->dense_1_w_bytes = 0; 
+  args->dense_1_b = dense_1_b; 
+  args->dense_1_b_bytes = 0; 
+
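+  // Launch the dataflow graph rooted at root() and block until it finishes.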
+  void* dfg = __visc__launch(0, root, (void*) args); 
+
+  __visc__wait(dfg); 
+
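+  // The graph hands its output back through the RootIn struct; request the
+  // result tensor on the host before computing accuracy over the 5000 labels.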
+  void* result = args->input; 
+  hpvm_request_tensor(result, 0); 
+
+  __visc__cleanup(); 
+  computeAccuracy2(labels, 5000, result); 
+  return 0; 
+
+} 
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/quant_ranges_rt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a7b14d7348f424556ba5e7bb52b6fdf9bbbd89c
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/quant_ranges_rt.txt
@@ -0,0 +1,22 @@
+1 -0.5500815 0.60786617 -1.0248864 1.2929907 -0.36291853 0.2533059 0.0 0.753551840782 
+2 0.0 0.753551840782 -0.69884616 0.71849966 -0.2781147 0.45571187 0.0 1.01057458043 
+3 0.0 1.01057458043 -0.59568167 0.7714691 -0.8602873 0.19743633 -1.84771883726 1.87930787086 
+4 0.0 2.33981014252 -0.41976976 0.43748936 -0.7021962 0.3033103 0.0 1.04317724705 
+5 0.0 1.04317724705 -0.46757826 0.4635873 -0.20662616 0.1778044 -0.829483509064 0.786805033684 
+6 0.0 2.49733686686 -0.64404047 0.45383143 -0.819547 0.38550296 0.0 0.897360802293 
+7 0.0 0.897360802293 -0.41986948 0.33654243 -0.3563013 0.22371122 -0.957150224447 0.54919362247 
+8 0.0 2.37362146616 -0.4805263 0.50655717 -0.296758 0.7742441 0.0 3.01592136621 
+9 0.0 3.01592136621 -0.52083415 0.45517674 -0.20242067 0.8236838 -5.2759475708 5.79733039856 
+10 0.0 2.37362146616 -0.5338656 1.3395424 -0.20242067 0.8236838 -0.738995380998 2.33600783587 
+11 0.0 7.07933432579 -0.34429058 0.43629733 -1.0744808 0.056708273 0.0 1.58645607233 
+12 0.0 1.58645607233 -0.30342352 0.39493486 -0.44630566 0.6492069 -1.49672914267 1.29970229745 
+13 0.0 7.11914063454 -0.38351893 0.45775774 -1.4733055 -0.014426912 0.0 1.52876508832 
+14 0.0 1.52876508832 -0.25695276 0.45372736 -0.5259744 0.26591402 -1.59576894164 1.08074297309 
+15 0.0 6.94405080318 -0.55299705 0.5443531 -0.71790683 1.2730768 0.0 10.3651468277 
+16 0.0 10.3651468277 -0.4203967 0.48641303 -0.90653443 1.3546854 -22.372925148 17.2033731079 
+17 0.0 6.94405080318 -0.4365755 0.84913826 -0.90653443 1.3546851 -3.66810325861 4.87814051151 
+18 0.0 18.8401451111 -0.38657624 0.5228989 -1.2083547 0.76361173 0.0 19.1229192352 
+19 0.0 19.1229192352 -0.40857902 0.575035 -1.8731614 1.0960501 -31.3229312897 14.8234729958 
+20 0.0 23.7382488823 -0.33079496 0.5893278 -1.0234511 1.0016295 0.0 19.5892774963 
+21 0.0 19.5892774963 -0.27897888 0.38280907 -2.2086356 1.0066502 -34.4416886902 20.9890329933 
+22 0.0 10.8541981602 -1.5092047 1.0279838 -0.49379802 0.61032647 -40.9121678543 25.7082381058
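The quantization-range files above are plain whitespace-separated text: each line carries a layer index followed by four (min, max) pairs. The first pair is the layer's input range and the last pair its output range, since each layer's output range reappears as the next layer's input range; reading the middle two pairs as the weight and bias ranges is an assumption. The vgg16_cifar10 file below follows the same nine-column layout. A minimal reader under those assumptions (filename hardcoded for illustration):

#include <fstream>
#include <iostream>
#include <vector>

// One record per line of quant_ranges_rt.txt. The input/output
// interpretation is grounded in the data (layer N's output range equals
// layer N+1's input range); the weight/bias reading is assumed.
struct QuantRange { int layer; float in[2], wt[2], bias[2], out[2]; };

int main() {
  std::ifstream f("quant_ranges_rt.txt");
  std::vector<QuantRange> ranges;
  QuantRange q;
  while (f >> q.layer >> q.in[0] >> q.in[1] >> q.wt[0] >> q.wt[1]
           >> q.bias[0] >> q.bias[1] >> q.out[0] >> q.out[1])
    ranges.push_back(q);
  std::cout << "read " << ranges.size() << " layer ranges\n";
  return 0;
}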
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6307de9ab85096d6934a2772507d802859b5ceb9
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/tuner_confs_base.txt
@@ -0,0 +1,90 @@
++++++
+conf1 1 0 89.59 0
+1 gpu conv fp32 1 add fp32 1 relu fp32 1 
+2 gpu conv fp32 1 add fp32 1 relu fp32 1 
+3 gpu conv fp32 1 add fp32 1 
+4 gpu add fp32 1 
+5 gpu relu fp32 1 
+6 gpu conv fp32 1 add fp32 1 relu fp32 1 
+7 gpu conv fp32 1 add fp32 1 
+8 gpu add fp32 1 
+9 gpu relu fp32 1 
+10 gpu conv fp32 1 add fp32 1 relu fp32 1 
+11 gpu conv fp32 1 add fp32 1 
+12 gpu add fp32 1 
+13 gpu relu fp32 1 
+14 gpu conv fp32 1 add fp32 1 relu fp32 1 
+15 gpu conv fp32 1 add fp32 1 
+16 gpu conv fp32 1 add fp32 1 
+17 gpu add fp32 1 
+18 gpu relu fp32 1 
+19 gpu conv fp32 1 add fp32 1 relu fp32 1 
+20 gpu conv fp32 1 add fp32 1 
+21 gpu add fp32 1 
+22 gpu relu fp32 1 
+23 gpu conv fp32 1 add fp32 1 relu fp32 1 
+24 gpu conv fp32 1 add fp32 1 
+25 gpu add fp32 1 
+26 gpu relu fp32 1 
+27 gpu conv fp32 1 add fp32 1 relu fp32 1 
+28 gpu conv fp32 1 add fp32 1 
+29 gpu conv fp32 1 add fp32 1 
+30 gpu add fp32 1 
+31 gpu relu fp32 1 
+32 gpu conv fp32 1 add fp32 1 relu fp32 1 
+33 gpu conv fp32 1 add fp32 1 
+34 gpu add fp32 1 
+35 gpu relu fp32 1 
+36 gpu conv fp32 1 add fp32 1 relu fp32 1 
+37 gpu conv fp32 1 add fp32 1 
+38 gpu add fp32 1 
+39 gpu relu fp32 1 
+40 gpu pool_mean fp32 1 
+41 gpu mul fp32 1 add fp32 1 
+42 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 89.59 0
+1 gpu conv fp16 1 add fp16 1 relu fp16 1 
+2 gpu conv fp16 1 add fp16 1 relu fp16 1 
+3 gpu conv fp16 1 add fp16 1 
+4 gpu add fp16 1 
+5 gpu relu fp16 1 
+6 gpu conv fp16 1 add fp16 1 relu fp16 1 
+7 gpu conv fp16 1 add fp16 1 
+8 gpu add fp16 1 
+9 gpu relu fp16 1 
+10 gpu conv fp16 1 add fp16 1 relu fp16 1 
+11 gpu conv fp16 1 add fp16 1 
+12 gpu add fp16 1 
+13 gpu relu fp16 1 
+14 gpu conv fp16 1 add fp16 1 relu fp16 1 
+15 gpu conv fp16 1 add fp16 1 
+16 gpu conv fp16 1 add fp16 1 
+17 gpu add fp16 1 
+18 gpu relu fp16 1 
+19 gpu conv fp16 1 add fp16 1 relu fp16 1 
+20 gpu conv fp16 1 add fp16 1 
+21 gpu add fp16 1 
+22 gpu relu fp16 1 
+23 gpu conv fp16 1 add fp16 1 relu fp16 1 
+24 gpu conv fp16 1 add fp16 1 
+25 gpu add fp16 1 
+26 gpu relu fp16 1 
+27 gpu conv fp16 1 add fp16 1 relu fp16 1 
+28 gpu conv fp16 1 add fp16 1 
+29 gpu conv fp16 1 add fp16 1 
+30 gpu add fp16 1 
+31 gpu relu fp16 1 
+32 gpu conv fp16 1 add fp16 1 relu fp16 1 
+33 gpu conv fp16 1 add fp16 1 
+34 gpu add fp16 1 
+35 gpu relu fp16 1 
+36 gpu conv fp16 1 add fp16 1 relu fp16 1 
+37 gpu conv fp16 1 add fp16 1 
+38 gpu add fp16 1 
+39 gpu relu fp16 1 
+40 gpu pool_mean fp16 1 
+41 gpu mul fp16 1 add fp16 1 
+42 gpu softmax fp32 1
+-----
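Each tuner configuration above is bracketed by +++++ and ----- markers. The header line appears to list the configuration name, a speedup, an energy figure, the accuracy, and an accuracy-loss column (conf1 is the fp32 baseline at 89.59%, conf2 an fp16 variant credited with a 1.5x speedup at the same accuracy); every following line names a layer, its target (gpu), and the operation/precision/knob triples applied to that layer. A minimal splitter, assuming only the +++++/----- framing:

#include <fstream>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::ifstream f("tuner_confs_base.txt");
  std::vector<std::vector<std::string>> confs;  // lines of each configuration
  std::string line;
  while (std::getline(f, line)) {
    if (line.rfind("+++++", 0) == 0) confs.emplace_back();  // new block opens
    else if (line.rfind("-----", 0) == 0) continue;         // block end marker
    else if (!confs.empty()) confs.back().push_back(line);  // header or layer line
  }
  std::cout << confs.size() << " configurations parsed\n";
  return 0;
}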
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/quant_ranges_rt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19f5523523f3b9fc7b8f81c69112630003d5597e
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/quant_ranges_rt.txt
@@ -0,0 +1,15 @@
+1 -1.8816367 2.0934217 -0.53275156 0.49437004 -0.6403629 0.2490165 0.0 1.35908746719 
+2 0.0 1.35908746719 -0.2688396 0.20639156 -0.7745511 0.82006615 0.0 2.52123117924 
+3 0.0 2.52123117924 -0.16776876 0.14878987 -0.35283303 0.5154362 0.0 1.20119857848 
+4 0.0 1.20119857848 -0.088948585 0.114222586 -0.30250227 0.36856708 0.0 1.03598809302 
+5 0.0 1.03598809302 -0.07739562 0.10973293 -0.15568458 0.17634983 0.0 0.300495595038 
+6 0.0 0.300495595038 -0.051649556 0.05435231 -0.07395447 0.07996062 0.0 0.11490475405 
+7 0.0 0.11490475405 -0.043513633 0.07577866 -0.06921874 0.02660573 0.0 0.16232508488 
+8 0.0 0.16232508488 -0.033842053 0.045218028 -0.022827804 0.023845317 0.0 0.124249965735 
+9 0.0 0.124249965735 -0.02211613 0.032084666 -0.02699063 0.03773564 0.0 0.174634486511 
+10 0.0 0.174634486511 -0.01979376 0.034854397 -0.036107242 0.07056531 0.0 0.575175762177 
+11 0.0 0.575175762177 -0.03452098 0.046055835 -0.051925894 0.07039055 0.0 0.771875114441 
+12 0.0 0.771875114441 -0.025946895 0.040090334 -0.06049362 0.12658806 0.0 1.17285169065 
+13 0.0 1.17285169065 -0.021766115 0.03315237 -0.20705001 0.117947325 0.0 2.00157693863 
+14 0.0 2.00157693863 -0.042597745 0.046707444 -0.21937433 0.2545502 0.0 2.00236111879 
+15 0.0 2.00236111879 -0.32550547 0.30829763 -1.1787822 1.2378151 -18.2514705467 24.1736344528
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c9a6612a5df150f58c69e1a7faeaf83ed5c7d605
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/tuner_confs_base.txt
@@ -0,0 +1,38 @@
++++++
+conf1 1 0 90.19 0
+1 gpu conv fp32 1 add fp32 1 relu fp32 1 
+2 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 
+3 gpu conv fp32 1 add fp32 1 relu fp32 1 
+4 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 
+5 gpu conv fp32 1 add fp32 1 relu fp32 1 
+6 gpu conv fp32 1 add fp32 1 relu fp32 1 
+7 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 
+8 gpu conv fp32 1 add fp32 1 relu fp32 1 
+9 gpu conv fp32 1 add fp32 1 relu fp32 1 
+10 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 
+11 gpu conv fp32 1 add fp32 1 relu fp32 1 
+12 gpu conv fp32 1 add fp32 1 relu fp32 1 
+13 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 
+14 gpu mul fp32 1 add fp32 1 relu fp32 1 
+15 gpu mul fp32 1 add fp32 1 
+16 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 90.19 0
+1 gpu conv fp16 1 add fp16 1 relu fp16 1 
+2 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 
+3 gpu conv fp16 1 add fp16 1 relu fp16 1 
+4 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 
+5 gpu conv fp16 1 add fp16 1 relu fp16 1 
+6 gpu conv fp16 1 add fp16 1 relu fp16 1 
+7 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 
+8 gpu conv fp16 1 add fp16 1 relu fp16 1 
+9 gpu conv fp16 1 add fp16 1 relu fp16 1 
+10 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 
+11 gpu conv fp16 1 add fp16 1 relu fp16 1 
+12 gpu conv fp16 1 add fp16 1 relu fp16 1 
+13 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 
+14 gpu mul fp16 1 add fp16 1 relu fp16 1 
+15 gpu mul fp16 1 add fp16 1 
+16 gpu softmax fp32 1
+-----
diff --git a/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h b/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
index 095b3d6f89a670b16467db4d0e695adb2fe207d6..55f16e4d8d176e1709e2c6525c3cd47b2bb8da1c 100644
--- a/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
+++ b/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
@@ -96,6 +96,8 @@ void* __visc__tensor_add(void*, void*);
 void* __visc__tensor_mul(void*, void*);
 void* __visc__tensor_convolution(void*, void*, int, int, int, int);
 void* __visc__tensor_group_convolution(void*, void*, int, int, int, int, int, int);
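+// Batch normalization intrinsic; operands presumably: input, gamma, beta, mean, variance, epsilon.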
+void* __visc__tensor_batchnorm(void*, void*, void*, void*, void*, double);
 void* __visc__tensor_pool_max(void*, int, int, int, int, int, int);
 void* __visc__tensor_pool_mean(void*, int, int, int, int, int, int);
 void* __visc__tensor_relu(void*);
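For reference, a leaf node exercising the new intrinsic would follow the same shape as the generated node functions in these benchmarks: a target hint, an attributes declaration over the tensor inputs, the tensor intrinsic call, and a return. The sketch below is hypothetical; the node name, the CUDNN_TARGET hint, and the 0.001 epsilon are illustrative assumptions, not part of this patch.

// Hypothetical batch-norm leaf node (illustrative; not part of this patch).
void batch_norm_1_node(void* t, size_t bytes_t,
                       void* gamma, size_t bytes_gamma,
                       void* beta, size_t bytes_beta,
                       void* mean, size_t bytes_mean,
                       void* variance, size_t bytes_variance) {
  __visc__hint(visc::CUDNN_TARGET);                // assumed target
  __visc__attributes(5, t, gamma, beta, mean, variance, 0);
  void* r = __visc__tensor_batchnorm(t, gamma, beta, mean, variance, 0.001);
  __visc__return(2, r, (size_t) 0);
}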