diff --git a/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py b/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b7f09d92e91894d284b40cc0bd2d346c08e36c7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py
@@ -0,0 +1,42 @@
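+# Extract quantization ranges from a PROMISE-instrumented DNN source file.
+# Scans for ConvLayer_PROMISE / FCLayer_PROMISE calls and writes their
+# floating-point (range) arguments to quant_ranges.txt, one line per layer.
+# Usage: python extractQuantRange.py <path-to-promise-source.cc>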
+
+
+import sys
+
+
+if __name__ == "__main__":
+
+    f = open(sys.argv[1], "r")
+    f2 = open("quant_ranges.txt", "w+")
+
+    layer_line = False
+    for x in f:
+        if "ConvLayer_PROMISE" in x or "FCLayer_PROMISE" in x or layer_line == True:
+            if layer_line == True:
+              layer_line = False
+            else:
+              layer_line = True
+            
+            print(x)
+            toks = x.split(",")
+
+            for tok in toks:
+                tok = tok.strip()
+                tok_val = ""
+                try:
+                    tok_val = float(tok)
+                    try:
+                        tok_val = int(tok)
+                    except: 
+                        print (tok_val)
+                        f2.write(str(tok_val) + " ")
+                        #f2.write("tok_val = ", tok_val + " ")
+                except:
+                    continue
+
+            f2.write("\n")
+
+    f.close()
+    f2.close()
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
index 9281773acf6f64b0a0d6b7b4a830b54dfaac6c54..7e969271c20031dab9f302b333a4f7feb0338871 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
@@ -15,7 +15,7 @@
 # Batch 20: 3-Skip levels + + 2 runs + 1500 Runs + EnergyBandSize now % of Max (Compare against Batch19
 
 
-batch_id = "batch200"
+batch_id = "batch201"
 
 class Benchmark:
   def __init__(self):
@@ -67,7 +67,8 @@ Alexnet1.loss2_result_file = "tuner_results/alexnet_cifar10/loss_2/promise_tuned
 
 Alexnet1.autotuner_runs = 1500
 Alexnet1.tuner_accuracy = 79.9
-Alexnet1.promise_accuracy = 79.9
+#Alexnet1.promise_accuracy = 79.9
+Alexnet1.promise_accuracy = 79.5
 Alexnet1.validation_accuracy = 79.19
 
 bench_tuner_data["alexnet_cifar10"] = Alexnet1
@@ -97,7 +98,8 @@ Alexnet2.layer_knobs = "../opentuner/data/alexnet2/knobs.txt"
 #Alexnet2.loss2_result_file = "tuner_results/alexnet2_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 Alexnet2.autotuner_runs = 1500
 Alexnet2.tuner_accuracy = 84.19
-Alexnet2.promise_accuracy = 84.19
+#Alexnet2.promise_accuracy = 84.19
+Alexnet2.promise_accuracy = 84.8
 Alexnet2.validation_accuracy = 85.15
 
 bench_tuner_data["alexnet2_cifar10"] = Alexnet2
@@ -131,7 +133,8 @@ Alexnet3.loss2_result_file = "tuner_results/vgg16_cifar10/loss_2/promise_tuned_c
 
 Alexnet3.autotuner_runs = 1500
 Alexnet3.tuner_accuracy = 90.19
-Alexnet3.promise_accuracy = 90.19
+#Alexnet3.promise_accuracy = 90.19
+Alexnet3.promise_accuracy = 89.55
 Alexnet3.validation_accuracy = 89.05
 
 bench_tuner_data["vgg16_cifar10"] = Alexnet3
@@ -163,7 +166,8 @@ Alexnet4.loss2_result_file = "tuner_results/resnet18_cifar10/loss_2/promise_tune
 
 Alexnet4.autotuner_runs = 1500
 Alexnet4.tuner_accuracy = 89.6
-Alexnet4.promise_accuracy = 89.59
+#Alexnet4.promise_accuracy = 89.59   (measured on 1000 test inputs)
+Alexnet4.promise_accuracy = 89.94
 Alexnet4.validation_accuracy = 89.65
 
 bench_tuner_data["resnet18_cifar10"] = Alexnet4
@@ -197,7 +201,8 @@ Alexnet5.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_co
 Alexnet5.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt"
 Alexnet5.autotuner_runs = 1500
 Alexnet5.tuner_accuracy = 67.95
-Alexnet5.promise_accuracy = 66.8
+#Alexnet5.promise_accuracy = 66.8
+Alexnet5.promise_accuracy = 70.1
 Alexnet5.validation_accuracy = 68.65
 
 bench_tuner_data["vgg16_cifar100"] = Alexnet5
@@ -265,7 +270,8 @@ Alexnet7.loss1_result_file = "tuner_results/mobilenet/loss_1/batch1/promise_tune
 Alexnet7.loss2_result_file = "tuner_results/mobilenet/loss_2/batch1/promise_tuner/high_confidence/promise_confs.txt"
 Alexnet7.autotuner_runs = 1500
 Alexnet7.tuner_accuracy = 84.8
-Alexnet7.promise_accuracy = 84.8
+#Alexnet7.promise_accuracy = 84.8
+Alexnet7.promise_accuracy = 83.65
 Alexnet7.validation_accuracy = 84.4
 
 bench_tuner_data["mobilenet_cifar10"] = Alexnet7
@@ -298,7 +304,8 @@ Alexnet8.loss2_result_file = "../build_tuner/tuner_results/mobilenet_shallow/los
 
 Alexnet8.autotuner_runs = 1500
 Alexnet8.tuner_accuracy = 87.6
-Alexnet8.promise_accuracy = 87.59
+#Alexnet8.promise_accuracy = 87.59
+Alexnet8.promise_accuracy = 89.25
 Alexnet8.validation_accuracy = 88.5
 
 bench_tuner_data["mobilenet_shallow"] = Alexnet8
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
index ca1772637c0c294386c894238e457edc71c01ca5..6a07ef86e53d2b4b6372e1e253611ba6f018aaad 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
@@ -138,7 +138,7 @@ def loadConfigData(result_dir, baseline_accuracy):
           config.avg_loss = baseline_accuracy - avg_accuracy 
           config.speedup = speedup
           config.fname = fname
-          print ("acc = " + str(avg_accuracy) + "\n")
+          #print ("acc = " + str(avg_accuracy) + "\n")
         else:
           flag = int(x.strip())
           config.flags.append(flag)
@@ -242,7 +242,8 @@ def buildConfigStr(config, layer_desc):
 
 def dumpConfig(layer_desc, config_arrs, result_dir):
 
-  f = open(result_dir + "/tuner_confs.txt", "w+")
+  f = open(result_dir + "/tuner_confs_11.txt", "w+")
 
   it = 1
   for config in config_arrs:
@@ -274,34 +275,78 @@ def generateConf(Bench):
 
 
 
+def dumpBaselineConfs(Bench):
+
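+  # Write two fixed baseline configurations to tuner_confs_base.txt.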
+  layer_desc = loadLayerDesc(Bench.layer_file)
+
+  f = open(Bench.base_dir + "/tuner_confs_base.txt", "w+")
+ 
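+  # Baseline 1: every layer mapped to knob 11.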
+  f.write("+++++\n")
+  f.write("conf" + str(1) + " " + str(1) + " 0 " + str(Bench.promise_accuracy) + " " + str(0) + "\n")
+
+  config = Config()
+  flags = []
+  for i in range(Bench.num_layers):
+    flags.append(11)
+    
+  config.flags = flags
+  config_str = buildConfigStr(config, layer_desc)
+
+  f.write(config_str)  
+  f.write("-----\n")
+
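+  # Baseline 2: every layer mapped to knob 10.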
+  f.write("+++++\n")
+  f.write("conf" + str(2) + " " + str(1.5) + " 0 " + str(Bench.promise_accuracy) + " " + str(0) + "\n")
+
+  config = Config()
+  flags = []
+  for i in range(Bench.num_layers):
+    flags.append(10)
+    
+  config.flags = flags
+  config_str = buildConfigStr(config, layer_desc)
+
+  f.write(config_str)    
+  f.write("-----\n")
+
+
 if __name__ == "__main__":
 
-  """
   Bench = bench_tuner_data["alexnet_cifar10"]
-  generateConf(Bench)
-    
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["alexnet2_cifar10"]
-  generateConf(Bench)
-        
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["vgg16_cifar10"]
-  generateConf(Bench)
-    
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["vgg16_cifar100"]
-  generateConf(Bench)
-
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["resnet18_cifar10"]
-  generateConf(Bench)
-    
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["lenet_keras"]
-  generateConf(Bench)
-
-  """
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
+  
   Bench = bench_tuner_data["mobilenet_cifar10"]
-  generateConf(Bench)
+  #generateConf(Bench)
+  dumpBaselineConfs(Bench)
     
-  #Bench = bench_tuner_data["mobilenet_shallow"]
+  Bench = bench_tuner_data["mobilenet_shallow"]
   #generateConf(Bench)
+  dumpBaselineConfs(Bench)
 
 
 
 
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
index 8f2ded4680af9351fa4e3b571d16eb3725316af1..73d460be0c4091067c9d52e07ea7f4d421765ff3 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
@@ -264,6 +264,9 @@ def runSensAnalysis():
 
 def runAlgoTuner():
 
+  Bench = bench_tuner_data["alexnet_cifar10"]  
+  runAlgoBench(Bench)
+
   Bench = bench_tuner_data["mobilenet_shallow"]  
   runAlgoBench(Bench)
 
@@ -273,14 +276,12 @@ def runAlgoTuner():
   Bench = bench_tuner_data["vgg16_cifar10"]  
   runAlgoBench(Bench)
 
-  Bench = bench_tuner_data["lenet_keras"]  
-  runAlgoBench(Bench)
+  #Bench = bench_tuner_data["lenet_keras"]  
+  #runAlgoBench(Bench)
 
   Bench = bench_tuner_data["alexnet2_cifar10"]  
   runAlgoBench(Bench)
 
-  Bench = bench_tuner_data["alexnet_cifar10"]  
-  runAlgoBench(Bench)
 
   Bench = bench_tuner_data["vgg16_cifar100"]  
   runAlgoBench(Bench)
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
index 6074dacf3f56e672ac5ca80eda572a53a58f1044..66e824f6d098434e140d764edda7cdacd11e110f 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
@@ -30,7 +30,7 @@ int main(int argc, char* argv[]){
   }
 
   
-  llvm_hpvm_initTensorRt(1); 
+  llvm_hpvm_initTensorRt(0); 
 
   int missed = 0;
   for (int i = 0 ; i < total_runs; i++){ 
@@ -41,7 +41,8 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
+    int test_input_size = 2000; 
     int batch_size = 1000;
     int offset = 5000;
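+    // Batches now cover test images [offset, offset + test_input_size) = [5000, 7000).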
     
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
index 0513723b5a4a36984e736b94ee82b9fc3fb2d1f9..6b951cffcaf142bd917abc7f7c04a2c691c472d7 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
@@ -31,9 +31,9 @@ int main(int argc, char* argv[]){
   }
 
   
-  llvm_hpvm_initTensorRt(1); 
-
+  llvm_hpvm_initTensorRt(0); 
 
+  
   int missed = 0; 
   for (int i = 0 ; i < total_runs; i++){ 
 
@@ -43,15 +43,15 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
+    int test_input_size = 2000; 
     int batch_size = 1000;
     int offset = 5000;
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
     for(int i = 0; i < batch_count; i++){ 

       std::string dir_prefix = std::string("../model_params/alexnet_cifar10_test/"); 
       std::string input_path =  dir_prefix + std::string("input.bin"); 
       std::string labels_path =  dir_prefix + std::string("labels.bin"); 
       std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
index 1cf73cd92a39a14c6a1fdd3965e63bfabee634b1..052809f29b9d89534005e56125e66c5e4a0bd1cf 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
@@ -43,8 +43,10 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
-    int batch_size = 1000; 
+    int test_input_size = 2000; 
+    int batch_size = 1000;
+    int offset = 5000;
+    
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
@@ -330,93 +332,93 @@ int main(int argc, char* argv[]){
       void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
 
-      int start = i * batch_size; 
-      int end = (i + 1) * batch_size; 
+      int start = i * batch_size + offset; 
+      int end = (i + 1) * batch_size + offset; 
 
       void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
 
       void* var_0 = ConvLayer_PROMISE(input, -1.9892114, 2.126797, conv2d_1_w, -2.196306920051575, 1.347581704139706, NULL, 0, 0, 1, 1, 1, 1, -1, 0, -1, -60.89275047302246, 51.99256916046146, 9); 
-      void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
+      void* var_1 = tensorHalfBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
       void* var_2 = tensorRelu(var_1); 
       void* var_3 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-      void* var_4 = tensorBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
+      void* var_4 = tensorHalfBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
       void* var_5 = tensorRelu(var_4); 
       void* var_6 = ConvLayer_PROMISE(var_5, 0.0, 5.713541553974245, conv2d_2_w, -0.9317721160650253, 1.0774258937835774, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -6.518589503288269, 6.810842518806449, 9); 
-      void* var_7 = tensorBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
+      void* var_7 = tensorHalfBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
       void* var_8 = tensorRelu(var_7); 
       void* var_9 = tensorConvolution(var_8, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-      void* var_10 = tensorBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
+      void* var_10 = tensorHalfBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
       void* var_11 = tensorRelu(var_10); 
       void* var_12 = ConvLayer_PROMISE(var_11, 0.0, 4.932139402866376, conv2d_3_w, -0.5316544661521911, 0.5753790403604531, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.482631235122681, 3.96730119752885, 9); 
-      void* var_13 = tensorBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
+      void* var_13 = tensorHalfBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
       void* var_14 = tensorRelu(var_13); 
       void* var_15 = tensorConvolution(var_14, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-      void* var_16 = tensorBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
+      void* var_16 = tensorHalfBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
       void* var_17 = tensorRelu(var_16); 
       void* var_18 = ConvLayer_PROMISE(var_17, 0.0, 4.103263397693674, conv2d_4_w, -0.36234098821878435, 0.4076913900375366, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.04261828327179, 3.88677932929993, 9); 
-      void* var_19 = tensorBatchNorm(var_18, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
+      void* var_19 = tensorHalfBatchNorm(var_18, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
       void* var_20 = tensorRelu(var_19); 
       void* var_21 = tensorConvolution(var_20, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-      void* var_22 = tensorBatchNorm(var_21, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
+      void* var_22 = tensorHalfBatchNorm(var_21, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
       void* var_23 = tensorRelu(var_22); 
       void* var_24 = ConvLayer_PROMISE(var_23, 0.0, 5.383221302509475, conv2d_5_w, -0.3131200549006462, 0.29357679939270065, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -5.921469215393066, 4.338679324150087, 9); 
-      void* var_25 = tensorBatchNorm(var_24, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
+      void* var_25 = tensorHalfBatchNorm(var_24, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
       void* var_26 = tensorRelu(var_25); 
       void* var_27 = tensorConvolution(var_26, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-      void* var_28 = tensorBatchNorm(var_27, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
+      void* var_28 = tensorHalfBatchNorm(var_27, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
       void* var_29 = tensorRelu(var_28); 
       void* var_30 = ConvLayer_PROMISE(var_29, 0.0, 4.316738154411368, conv2d_6_w, -0.23299247801303866, 0.2580290257930756, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.207789947509766, 3.932436970710759, 9); 
-      void* var_31 = tensorBatchNorm(var_30, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
+      void* var_31 = tensorHalfBatchNorm(var_30, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
       void* var_32 = tensorRelu(var_31); 
       void* var_33 = tensorConvolution(var_32, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-      void* var_34 = tensorBatchNorm(var_33, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
+      void* var_34 = tensorHalfBatchNorm(var_33, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
       void* var_35 = tensorRelu(var_34); 
       void* var_36 = ConvLayer_PROMISE(var_35, 0.0, 5.830408106803901, conv2d_7_w, -0.20233777219057084, 0.18998308175802117, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -6.298286915779113, 4.848135117530843, 9); 
-      void* var_37 = tensorBatchNorm(var_36, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
+      void* var_37 = tensorHalfBatchNorm(var_36, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
       void* var_38 = tensorRelu(var_37); 
       void* var_39 = tensorConvolution(var_38, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-      void* var_40 = tensorBatchNorm(var_39, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
+      void* var_40 = tensorHalfBatchNorm(var_39, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
       void* var_41 = tensorRelu(var_40); 
       void* var_42 = ConvLayer_PROMISE(var_41, 0.0, 4.446417809963227, conv2d_8_w, -0.17442735651135444, 0.17695830866694454, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.347910885810852, 3.6144364695549145, 9); 
-      void* var_43 = tensorBatchNorm(var_42, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
+      void* var_43 = tensorHalfBatchNorm(var_42, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
       void* var_44 = tensorRelu(var_43); 
       void* var_45 = tensorConvolution(var_44, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-      void* var_46 = tensorBatchNorm(var_45, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
+      void* var_46 = tensorHalfBatchNorm(var_45, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
       void* var_47 = tensorRelu(var_46); 
       void* var_48 = ConvLayer_PROMISE(var_47, 0.0, 4.518095604896667, conv2d_9_w, -0.14546796187758446, 0.15256431668996823, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -3.0287702755928043, 2.9487365779876953, 9); 
-      void* var_49 = tensorBatchNorm(var_48, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
+      void* var_49 = tensorHalfBatchNorm(var_48, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
       void* var_50 = tensorRelu(var_49); 
       void* var_51 = tensorConvolution(var_50, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-      void* var_52 = tensorBatchNorm(var_51, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
+      void* var_52 = tensorHalfBatchNorm(var_51, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
       void* var_53 = tensorRelu(var_52); 
       void* var_54 = ConvLayer_PROMISE(var_53, 0.0, 6.348575634956407, conv2d_10_w, -0.13025874522328376, 0.13558243343234128, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.2293100805282595, 3.5315046372413645, 9); 
-      void* var_55 = tensorBatchNorm(var_54, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
+      void* var_55 = tensorHalfBatchNorm(var_54, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
       void* var_56 = tensorRelu(var_55); 
       void* var_57 = tensorConvolution(var_56, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-      void* var_58 = tensorBatchNorm(var_57, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
+      void* var_58 = tensorHalfBatchNorm(var_57, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
       void* var_59 = tensorRelu(var_58); 
       void* var_60 = ConvLayer_PROMISE(var_59, 0.0, 5.221003110408843, conv2d_11_w, -0.11900172759592534, 0.12536374783515936, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.038203780174255, 4.004009407043483, 9); 
-      void* var_61 = tensorBatchNorm(var_60, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
+      void* var_61 = tensorHalfBatchNorm(var_60, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
       void* var_62 = tensorRelu(var_61); 
       void* var_63 = tensorConvolution(var_62, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-      void* var_64 = tensorBatchNorm(var_63, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
+      void* var_64 = tensorHalfBatchNorm(var_63, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
       void* var_65 = tensorRelu(var_64); 
       void* var_66 = ConvLayer_PROMISE(var_65, 0.0, 5.732498347759442, conv2d_12_w, -0.10839721685647964, 0.11625668607652187, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -3.3111015114784244, 4.462933233261136, 9); 
-      void* var_67 = tensorBatchNorm(var_66, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
+      void* var_67 = tensorHalfBatchNorm(var_66, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
       void* var_68 = tensorRelu(var_67); 
       void* var_69 = tensorConvolution(var_68, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-      void* var_70 = tensorBatchNorm(var_69, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-      void* var_71 = tensorRelu(var_70); 
+      void* var_70 = tensorHalfBatchNorm(var_69, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
+      void* var_71 = tensorHalfRelu(var_70); 
       void* var_72 = ConvLayer_PROMISE(var_71, 0.0, 7.240498211860681, conv2d_13_w, -0.08623744961619377, 0.08859449951350662, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.175431394577027, 6.2043294754027345, 9); 
-      void* var_73 = tensorBatchNorm(var_72, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-      void* var_74 = tensorRelu(var_73); 
+      void* var_73 = tensorHalfBatchNorm(var_72, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
+      void* var_74 = tensorHalfRelu(var_73); 
       void* var_75 = tensorConvolution(var_74, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-      void* var_76 = tensorBatchNorm(var_75, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
+      void* var_76 = tensorHalfBatchNorm(var_75, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
       void* var_77 = tensorRelu(var_76); 
       void* var_78 = ConvLayer_PROMISE(var_77, 0.0, 7.813958834648251, conv2d_14_w, -0.06813025139272214, 0.07002027779817581, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -10.920566423416137, 2.6442912578582534, 9); 
-      void* var_79 = tensorBatchNorm(var_78, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-      void* var_80 = tensorRelu(var_79); 
-      void* var_81 = tensorPooling(var_80,1,2,2,0,0,2,2); 
+      void* var_79 = tensorHalfBatchNorm(var_78, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
+      void* var_80 = tensorHalfRelu(var_79); 
+      void* var_81 = tensorHalfPooling(var_80,1,2,2,0,0,2,2); 
       void* var_82 = FCLayer_PROMISE(var_81, 0.0, 2.8692066650391013, dense_1_w, -0.22301019695401192, 0.1442659378200768, dense_1_b, -0.1654396, 0.23336112, -1, -12.245949958801269, 23.80532513427739, 9); 
       void* var_83 = tensorSoftmax(var_82); 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
index 394ec85390aa4248fd93aefa339ff196f39a5559..42d26d34e65939b410143485a61f23e705906bfc 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
@@ -42,8 +42,10 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
-    int batch_size = 1000; 
+    int test_input_size = 2000; 
+    int batch_size = 1000;
+    int offset = 5000;
+    
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
@@ -188,8 +190,8 @@ int main(int argc, char* argv[]){
       void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
 
 
-      int start = i * batch_size; 
-      int end = (i + 1) * batch_size; 
+      int start = i * batch_size + offset; 
+      int end = (i + 1) * batch_size + offset; 
 
       void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
index cc0981dc7d1d75ce56388f3135fa0f89f8c688e3..0e5cdd1d284e6c7621cd3331b924c06969be79db 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
@@ -30,7 +30,7 @@ int main(int argc, char* argv[]){
   }
 
 
-  llvm_hpvm_initTensorRt(1); 
+  llvm_hpvm_initTensorRt(0); 
 
   int missed = 0; 
   for (int i = 0 ; i < total_runs; i++){ 
@@ -41,9 +41,10 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 1000; 
+    int test_input_size = 2000; 
     int batch_size = 1000;
     int offset = 5000;
+    
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
index ec5de9a5e2c2d66be44fdd99b83dd634d8f5b2f9..33c68eae84a075f50b2bc8e7484036c54ade5620 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
@@ -31,7 +31,7 @@ int main(int argc, char* argv[]){
   }
 
 
-  llvm_hpvm_initTensorRt(1); 
+  llvm_hpvm_initTensorRt(0); 
 
   
   int missed = 0; 
@@ -43,8 +43,10 @@ int main(int argc, char* argv[]){
 
    startMemTracking(); 
 
-   int test_input_size = 1000; 
-   int batch_size = 1000; 
+   int test_input_size = 2000; 
+   int batch_size = 1000;
+   int offset = 5000;
+
    int batch_count = test_input_size / batch_size; 
    float final_accuracy = 0.0; 
    
@@ -115,8 +117,8 @@ int main(int argc, char* argv[]){
      void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); 
 
 
-     int start = i * batch_size; 
-     int end = (i + 1) * batch_size; 
+     int start = i * batch_size + offset; 
+     int end = (i + 1) * batch_size + offset;
 
      void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
index 798b5f67aa9636f8e7ad3b9d08b9fc8e53cb137d..ff767235e9d44139f97ad885aa89eef1c385ad33 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
@@ -29,7 +29,7 @@ int main(int argc, char* argv[]){
    to_skip = atoi(argv[3]);   
  }
 
- llvm_hpvm_initTensorRt(1); 
+ llvm_hpvm_initTensorRt(0); 
 
  int missed = 0; 
  for (int i = 0 ; i < total_runs; i++){ 
@@ -40,7 +40,7 @@ int main(int argc, char* argv[]){
    
    startMemTracking(); 
 
-   int test_input_size = 1000; 
+   int test_input_size = 2000; 
    int batch_size = 1000;
    int offset = 5000;
    
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py
index 0d2f1ff481258b7d2605e98468cf6ebd66bffd64..b8145e179893bc0db2631cf1f7ee0f11bcc9be0e 100644
--- a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py
@@ -132,6 +132,7 @@ def readKnobConfig(file_path):
 
 def getConfigCost(cfg):
 
+  orig_cost = 0.0
   total_cost = 0.0
   for it in range(tunerData.num_layers):
     flag = tunerData.tuning_flags[it]
@@ -140,9 +141,27 @@ def getConfigCost(cfg):
     speedup = tunerData.knobs_speedup[flag_value]
 
     total_cost += (op_cost * 1.0 / speedup * 1.0)
-    it += 1
+    orig_cost += op_cost
     
-  return total_cost
+
+  # Speedup of this configuration relative to its unapproximated cost.
+  speedup = (orig_cost * 1.0) / (total_cost * 1.0)
+
+  return total_cost, speedup
+
+
+
+def appendTopLine(f_path, accuracy, total_runs, total_comps, speedup):
+
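+  # Note: despite the name, this prepends the summary line to the config file.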
+  f_str = open(f_path, "r").read()
+
+  f_out = open(f_path, "w+")
+
+  f_out.write("total_runs=" + str(total_runs) + "\tconfidence=100.0" + "\tavg_accuracy=" + str(accuracy) + "\tconfig_cost=" + str(total_comps) + "\tspeedup=" + str(speedup) + "\n" )
+  f_out.write(f_str)
+
+  f_out.close()
 
 
 
@@ -213,7 +232,7 @@ class ClangFlagsTuner(MeasurementInterface):
     accuracy = getAccuracy("final_accuracy")
     
     # getConfigCost returns the cost associated with the selected configuration
-    total_comps = getConfigCost(cfg)
+    total_comps, speedup = getConfigCost(cfg)
    
     
     Result = opentuner.resultsdb.models.Result()
@@ -226,7 +245,10 @@ class ClangFlagsTuner(MeasurementInterface):
     if min_accuracy > tunerData.accuracy_threshold:
       config_tuple = (total_comps, accuracy, cfg)
       self.configs_list.append(config_tuple)
-      shutil.copy('promise_flags', tunerData.output_dir + '/' + tunerData.binary_path + '_' + str(tunerData.test_id))
+      f_path = tunerData.output_dir + '/' + tunerData.binary_path + '_' + str(tunerData.test_id)
+      shutil.copy('promise_flags', f_path)
+
+      appendTopLine(f_path, accuracy, total_runs, total_comps, speedup)
 
       f_acc = open(tunerData.output_dir + '/' + tunerData.binary_path + '_' + str(tunerData.test_id) + "_accuracy", "w")
       f_acc.write(str(accuracy))