diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
index 7e969271c20031dab9f302b333a4f7feb0338871..41bd4c73483e19472053406104f8590e54bd5d36 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
@@ -12,10 +12,18 @@
 # Batch 18: Batch13 (Baseline) + ParetoCurve (1500 Runs) - BUGGY IGNORE!!!
 
 # Batch 19: (Baseline) + ParetoCurve + 2 runs in Tuning Phase (1500 Runs)
+
 # Batch 20: 3-Skip levels + 2 runs + 1500 Runs + EnergyBandSize now % of Max (Compare against Batch19)
 
 
-batch_id = "batch201"
+# Batch 200: AlgoTuner - 1000 images - 1500 runs (IGNORE)
+# Batch 201: AlgoTuner - 2000 images - 1500 runs
+# Batch 202: AlgoTuner - 2000 images - 500 runs
+
+batch_id = "batch202"
+
 
 class Benchmark:
   def __init__(self):
@@ -65,7 +73,7 @@ Alexnet1.layer_knobs = "../opentuner/data/alexnet/knobs.txt"
 Alexnet1.loss1_result_file = "tuner_results/alexnet_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet1.loss2_result_file = "tuner_results/alexnet_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 
-Alexnet1.autotuner_runs = 1500
+Alexnet1.autotuner_runs = 500
 Alexnet1.tuner_accuracy = 79.9
 #Alexnet1.promise_accuracy = 79.9
 Alexnet1.promise_accuracy = 79.5
@@ -96,7 +104,7 @@ Alexnet2.cost_file = "../build_tuner/tuner_results/alexnet2_cifar10/op_cost.txt"
 Alexnet2.layer_knobs = "../opentuner/data/alexnet2/knobs.txt"
 #Alexnet2.loss1_result_file = "tuner_results/alexnet2_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 #Alexnet2.loss2_result_file = "tuner_results/alexnet2_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
-Alexnet2.autotuner_runs = 1500
+Alexnet2.autotuner_runs = 500
 Alexnet2.tuner_accuracy = 84.19
 #Alexnet2.promise_accuracy = 84.19
 Alexnet2.promise_accuracy = 84.8
@@ -131,7 +139,7 @@ Alexnet3.layer_knobs = "../opentuner/data/vgg16_cifar10/knobs.txt"
 Alexnet3.loss1_result_file = "tuner_results/vgg16_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet3.loss2_result_file = "tuner_results/vgg16_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 
-Alexnet3.autotuner_runs = 1500
+Alexnet3.autotuner_runs = 500
 Alexnet3.tuner_accuracy = 90.19
 #Alexnet3.promise_accuracy = 90.19
 Alexnet3.promise_accuracy = 89.55
@@ -164,7 +172,7 @@ Alexnet4.layer_knobs = "../opentuner/data/resnet/knobs.txt"
 Alexnet4.loss1_result_file = "tuner_results/resnet18_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet4.loss2_result_file = "tuner_results/resnet18_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 
-Alexnet4.autotuner_runs = 1500
+Alexnet4.autotuner_runs = 500
 Alexnet4.tuner_accuracy = 89.6
 #Alexnet4.promise_accuracy = 89.59  - 1000 images
 Alexnet4.promise_accuracy = 89.94
@@ -199,7 +207,7 @@ Alexnet5.layer_knobs = "../opentuner/data/vgg16_cifar100/knobs.txt"
 
 Alexnet5.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet5.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt"
-Alexnet5.autotuner_runs = 1500
+Alexnet5.autotuner_runs = 500
 Alexnet5.tuner_accuracy = 67.95
 #Alexnet5.promise_accuracy = 66.8
 Alexnet5.promise_accuracy = 70.1
@@ -233,7 +241,7 @@ Alexnet6.layer_knobs = "../opentuner/data/lenet/knobs.txt"
 
 #Alexnet6.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt"
 #Alexnet6.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt"
-Alexnet6.autotuner_runs = 900
+Alexnet6.autotuner_runs = 500
 Alexnet6.tuner_accuracy = 98.9
 Alexnet6.promise_accuracy = 98.9
 Alexnet6.validation_accuracy = 99
@@ -268,7 +276,7 @@ Alexnet7.layer_knobs = "../opentuner/data/mobilenet/knobs.txt"
 #--- Files below needed for VALIDATION experiment
 Alexnet7.loss1_result_file = "tuner_results/mobilenet/loss_1/batch1/promise_tuner/high_confidence/promise_confs.txt"
 Alexnet7.loss2_result_file = "tuner_results/mobilenet/loss_2/batch1/promise_tuner/high_confidence/promise_confs.txt"
-Alexnet7.autotuner_runs = 1500
+Alexnet7.autotuner_runs = 500
 Alexnet7.tuner_accuracy = 84.8
 #Alexnet7.promise_accuracy = 84.8
 Alexnet7.promise_accuracy = 83.65
@@ -302,7 +310,7 @@ Alexnet8.layer_knobs = "../opentuner/data/mobilenet_shallow/knobs.txt"
 Alexnet8.loss1_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/batch2/promise_tuner/high_confidence/promise_selected_confs.txt"
 Alexnet8.loss2_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/batch2/promise_tuner/high_confidence/promise_selected_confs.txt"
 
-Alexnet8.autotuner_runs = 1500
+Alexnet8.autotuner_runs = 500
 Alexnet8.tuner_accuracy = 87.6
 #Alexnet8.promise_accuracy = 87.59
 Alexnet8.promise_accuracy = 89.25
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
index 6a07ef86e53d2b4b6372e1e253611ba6f018aaad..f82b1a24f3982802c249f502161d944110b389e0 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py
@@ -89,6 +89,9 @@ def skipFile(fname):
   skip_files["confidence_summary.txt"] = 1
   skip_files["promise_confs.txt"] = 1
 
+  if "accuracy" in fname: # skip any file with "accuracy" in its name (e.g. *_accuracy files)
+    return True
+  
   if fname in skip_files:
     return True
   else:
@@ -117,7 +120,8 @@ def loadConfigData(result_dir, baseline_accuracy):
 
   config_arr = []
   
-  result_dir += "/promise_tuner/high_confidence/"
+  #result_dir += "/promise_tuner/high_confidence/"
+  result_dir += "/algo_tuner/high_confidence/"
   file_names = os.listdir(result_dir)
 
   
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
index 73d460be0c4091067c9d52e07ea7f4d421765ff3..1e533b8702139966166f860b72a3df1ccae03ee6 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py
@@ -12,7 +12,7 @@ from run_ha_tuner import runTunerBench
 from run_hs_tuner import runPromiseBench
 from run_algo_tuner import runAlgoBench
 from compute_confs import computePSNRBenchSwings, computeBenchSwings
-from validation import runPromiseBenchValidation, runPromiseBenchValidation2, runBenchValidation
+from validation import runPromiseBenchValidation2, runBenchValidation, runAlgoBenchValidate  
 from profiling import startProfile, stopProfile, dumpProfiles  
 from utils import createResultDirs
 from benchmarks import batch_id
@@ -276,8 +276,8 @@ def runAlgoTuner():
   Bench = bench_tuner_data["vgg16_cifar10"]  
   runAlgoBench(Bench)
 
-  #Bench = bench_tuner_data["lenet_keras"]  
-  #runAlgoBench(Bench)
+  Bench = bench_tuner_data["lenet_keras"]  
+  runAlgoBench(Bench)
 
   Bench = bench_tuner_data["alexnet2_cifar10"]  
   runAlgoBench(Bench)
@@ -292,6 +292,36 @@ def runAlgoTuner():
 
 
   
+def runAlgoTunerValidation():
+
+  validation_benchmarks = ["alexnet_cifar10", "mobilenet_shallow",
+                           "mobilenet_cifar10", "vgg16_cifar10",
+                           "lenet_keras", "alexnet2_cifar10",
+                           "vgg16_cifar100", "resnet18_cifar10"]
+
+  for bench_name in validation_benchmarks:
+    Bench = bench_tuner_data[bench_name]
+    runAlgoBenchValidate(Bench)
+
+
   
 if __name__ == "__main__":
 
@@ -305,7 +335,9 @@ if __name__ == "__main__":
   
   #runPromiseTuner()    
 
-  runAlgoTuner()
+  #runAlgoTuner()
+
+  runAlgoTunerValidation()
   
   #runPromiseValidation()
 
diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py
index 586d23d70a661558bad0cb8ac75c2367e3e73e2f..06a076103b8052d7321951f39bab738ec6d58358 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py
@@ -488,7 +488,7 @@ def runPromiseBenchValidation(Bench):
 def copyValidatedConf(result_dir, validated_confs):
 
   src_dir = result_dir + "/promise_tuner/high_confidence/"
-  dest_dir = result_dir + "/promise_tuner/validated/"
+  dest_dir = result_dir + "/promise_tuner/validated_test/"
 
   if not os.path.isdir(dest_dir):
     os.mkdir(dest_dir)
@@ -501,7 +501,7 @@ def copyValidatedConf(result_dir, validated_confs):
 def copyFailedConf(result_dir, failed_confs):
 
   src_dir = result_dir + "/promise_tuner/high_confidence/"
-  dest_dir = result_dir + "/promise_tuner/failed/"
+  dest_dir = result_dir + "/promise_tuner/failed_test/"
 
   if not os.path.isdir(dest_dir):
     os.mkdir(dest_dir)
@@ -529,6 +529,9 @@ def validateConfigs(Bench, result_dir, configs_arr, acc_thresh):
   copyFailedConf(result_dir, failed_confs) 
 
 
                  
 
 def runPromiseBenchValidation2(Bench):
@@ -543,3 +546,94 @@ def runPromiseBenchValidation2(Bench):
   validateConfigs(Bench, Bench.result_dir_2, config_arr2, 2.0)
   validateConfigs(Bench, Bench.result_dir_3, config_arr3, 3.0)
   
+
+
+
+### NOTE: Algo Tuner Validation routines
+
+
+def readConfidence(target_acc):
+
+  f = open("run_accuracies.txt")
+  index = 0.0
+  unsuccessful = 0.0
+  sum_acc = 0.0
+  for x in f:
+    x = x.strip()
+    acc = float(x)
+    if acc < target_acc:
+      unsuccessful += 1
+    index += 1
+    sum_acc += acc
+
+  f.close()
+  
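+  # Worked example: 20 runs with 2 below target_acc gives
+  # confidence = (20 - 2) / 20 * 100 = 90.0, i.e. 90% of runs met the target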
+  confidence = ((index - unsuccessful) / index) * 100.0
+  avg_acc = sum_acc / index
+  print ("run_confidence = ", confidence, " avg_acc = ", avg_acc)
+
+  return confidence
+
+
+
+def invokeBinary(Bench, layer_swings, target_acc, runs):
+
+  validation_binary = Bench.promise_binary
+
+  # Write to promise_flags
+  fout = open("promise_flags", "w+")
+  for swing in layer_swings:
+    int_swing = int(swing)
+    if int_swing > 0:
+      fout.write(str(swing) + "\n")
+  fout.close()
+  
+  # Execute Validation Run
+  p = subprocess.Popen("./" + validation_binary, shell=True)
+  p.wait()
+  
+
+
+
+
+def validateAlgoConfigs(Bench, result_dir, configs_arr, acc_thresh):
+
+  # NOTE: Use confidence target as 95%
+  confidence_target = 95
+  # NOTE: 1 run sufficient for software approximations
+  runs = 1
+  
+  validated_confs = []
+  failed_confs = []
+
+  validation_acc = Bench.validation_accuracy
+  target_acc = validation_acc - acc_thresh
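+  # e.g. for lenet_keras (validation_accuracy = 99) with acc_thresh = 1.0,
+  # target_acc = 98.0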
+  
+  for conf in configs_arr:
+    layer_swings = conf.flags
+    invokeBinary(Bench, layer_swings, target_acc, runs)
+    confidence = readConfidence(target_acc)
+    
+    if confidence >= confidence_target:
+      validated_confs.append(conf.fname)
+    else:
+      failed_confs.append(conf.fname)
+
+    
+  copyValidatedConf(result_dir, validated_confs)                    
+  copyFailedConf(result_dir, failed_confs) 
+  
+
+
+
+def runAlgoBenchValidate(Bench):
+  
+  config_arr1 = loadConfigData(Bench.result_dir_1, 100)
+  config_arr2 = loadConfigData(Bench.result_dir_2, 100)
+  config_arr3 = loadConfigData(Bench.result_dir_3, 100)
+
+  
+  validateAlgoConfigs(Bench, Bench.result_dir_1, config_arr1, 1.0)
+  validateAlgoConfigs(Bench, Bench.result_dir_2, config_arr2, 2.0)
+  validateAlgoConfigs(Bench, Bench.result_dir_3, config_arr3, 3.0)
+  
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
index 66e824f6d098434e140d764edda7cdacd11e110f..241eb4cea8795af05983eb7e7ea7e645b42b9edb 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc
@@ -16,6 +16,9 @@ int to_skip = 5;
 
 int main(int argc, char* argv[]){ 
 
+  int test_input_size = 2000; 
+  int batch_size = 1000;
+  int offset = 5000;
 
   if (argc > 1){
     total_runs = atoi(argv[1]);
@@ -29,6 +32,14 @@ int main(int argc, char* argv[]){
     to_skip = atoi(argv[3]);   
   }
 
+  if(argc > 4){
+    test_input_size = atoi(argv[4]);   
+  }
+
+  if(argc > 5){
+    offset = atoi(argv[5]);   
+  }
+
   
   llvm_hpvm_initTensorRt(0); 
 
@@ -41,9 +52,6 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 2000; 
-    int batch_size = 1000;
-    int offset = 5000;
     
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
index 6b951cffcaf142bd917abc7f7c04a2c691c472d7..3777d11718adf573f17348ceb0262641293fd2a5 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc
@@ -17,6 +17,9 @@ int to_skip = 5;
 
 int main(int argc, char* argv[]){ 
 
+  int test_input_size = 2000; 
+  int batch_size = 1000;
+  int offset = 5000;
 
   if (argc > 1){
     total_runs = atoi(argv[1]);
@@ -29,23 +32,29 @@ int main(int argc, char* argv[]){
   if(argc > 3){
     to_skip = atoi(argv[3]);   
   }
+  
+  if(argc > 4){
+    test_input_size = atoi(argv[4]);   
+  }
+
+  if(argc > 5){
+    offset = atoi(argv[5]);   
+  }
 
   
   llvm_hpvm_initTensorRt(0); 
 
+
   
   int missed = 0; 
   for (int i = 0 ; i < total_runs; i++){ 
-
+    
     if (missed >= to_skip){
      break;           
     }
 
     startMemTracking(); 
 
-    int test_input_size = 2000; 
-    int batch_size = 1000;
-    int offset = 5000;
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc
index 37a30c55002bd2fea2ac4054649869e5cf4b5c6f..2247ccba9fcf08d2f24368907c328f4b77b173b0 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc
@@ -11,24 +11,24 @@
 #include "../../include/utils.h"
 
 
-// Piped OpenTuner usage is now deprecated
-bool Opentuner_run = false;
 int total_runs = 1;
 float bench_acc = 0;
 int to_skip = 5;
 
+int test_input_size = 2000; 
+int batch_size = 2000;
+int offset = 5000;
+
 
 /* NOTE: Reference Architecture to use for profiling */
 void testLenetTanh(){
 
-
-  printf("********* Lenet-2 Architecture ********** \n");
-  // FIXIT: Extend this to batch of images - currently 5 images
-
-  int test_batch_size = 1000;
-
-  uint8_t* labels = readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size);
-
+  printf("********* Lenet-5 Architecture ********** \n");
+  
+  std::string dir_prefix = std::string("../model_params/lenet_params/");   
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  
   clearTensorMap();
 
   int missed = 0;
@@ -38,10 +38,13 @@ void testLenetTanh(){
       break;           
     }   
 
-    void* input = readInputTensor("../model_params/lenet_params/datasets/t10k-images-idx3-ubyte",
-				  CUDNN_DATA_FLOAT,
-				  test_batch_size, 1, 28, 28);
+    int start = offset; 
+    int end = batch_size + offset; 
+
+    // Loading Input Batch
+    void* input = readInputBatch(input_path.c_str(),0,start,end,1,28,28); 
 
+    // Loading Weights
     void* conv1_filter = readTrainedWeights("../model_params/lenet_keras/conv1.bin",
 					    float_type, 32, 1, 5, 5);    
     void* conv1_bias = readTrainedWeights("../model_params/lenet_keras/conv1_bias.bin",
@@ -60,27 +63,25 @@ void testLenetTanh(){
 					float_type, 1, 10, 1, 1);  
 
     
-
-    
-    readOpenTunerFlags("promise_flags"); // Resets the OpenTuner counters
-
-      
+    // DNN Operations
     void* conv1_out = ConvLayer_PROMISE(input, 0,1, conv1_filter, -1,1, conv1_bias, -1,1,
 					2, 2, 1, 1, 0, 2, 0, -1,1, 9);
-
     void* conv2_out = ConvLayer_PROMISE(conv1_out, -1,1, conv2_filter, -1,1,
 					conv2_bias, -1,1,
 					2, 2, 1, 1, 0, 2, 0, -1,1, 9);
 
     void* fc1_out = FCLayer_PROMISE(conv2_out, -1,1, fc1_weights, -1,1, fc1_bias, -1,1,
-				    0, -1,1, 9);
-    
+				    0, -1,1, 9);    
     void* fc2_out = FCLayer_PROMISE(fc1_out, -1,1, fc2_weights, -1,1, fc2_bias, -1,1,
 				    0, -1,1, 9);
 
     void* result = tensorSoftmax(fc2_out);
 
-    float accuracy = computeAccuracy2(labels, test_batch_size, result);
+    
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
+
+    float accuracy = computeAccuracy2(labels, batch_size, result);
+
     
     freeOutputTensors();  
 
@@ -110,8 +111,18 @@ int main(int argc, char* argv[]){
   if(argc > 3){
     to_skip = atoi(argv[3]);   
   }
+    
+  if(argc > 4){
+    test_input_size = atoi(argv[4]);
+    batch_size = atoi(argv[4]);
+  }
+
+  if(argc > 5){
+    offset = atoi(argv[5]);   
+  }
 
 
+  
   llvm_hpvm_initTensorRt(0);
 
   testLenetTanh();
@@ -124,43 +135,3 @@ int main(int argc, char* argv[]){
 
 
 
-
-
-
-/*    if(Opentuner_run){
-
-      char* myfifo = "/tmp/myfifo";
-      int fd = open(myfifo, O_RDONLY);
-
-      int ret_val = fcntl(fd, F_GETFD);
-      if(ret_val == -1){
-      printf("Invalid descriptor \n");
-      abort();
-      }
-
-      char str[100];
-      read(fd, str, 80);
-      if(strcmp(str, "stop_run") == 0){
-      abort();
-      }
-
-      close(fd);
-      }
-*/
-
-
-/*    if(Opentuner_run){
-
-      char* myfifo = "/tmp/myfifo";
-      int fd_out = open(myfifo, O_WRONLY);
-      int ret_val = fcntl(fd_out, F_GETFD);
-      if(ret_val == -1){
-      printf("Invalid descriptor \n");
-      abort();
-      }
-      
-      const char* str = "completed***!\n\0";
-      write(fd_out, str, 80);
-      close(fd_out);
-      }
-*/
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
index 052809f29b9d89534005e56125e66c5e4a0bd1cf..45abde0c285c858904dafc54104aec797ca0abf7 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc
@@ -17,6 +17,9 @@ int to_skip = 5;
 
 int main(int argc, char* argv[]){ 
 
+  int test_input_size = 2000; 
+  int batch_size = 1000;
+  int offset = 5000;
   
   if (argc > 1){
     total_runs = atoi(argv[1]);
@@ -30,6 +33,14 @@ int main(int argc, char* argv[]){
     to_skip = atoi(argv[3]);   
   }
 
+  if(argc > 4){
+    test_input_size = atoi(argv[4]);   
+  }
+
+  if(argc > 5){
+    offset = atoi(argv[5]);   
+  }
+
     
   llvm_hpvm_initTensorRt(0); 
 
@@ -43,9 +54,6 @@ int main(int argc, char* argv[]){
 
     startMemTracking(); 
 
-    int test_input_size = 2000; 
-    int batch_size = 1000;
-    int offset = 5000;
     
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
index 42d26d34e65939b410143485a61f23e705906bfc..2585d96530a1c089beb3db8c15ade0a99be25718 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc
@@ -17,6 +17,9 @@ int to_skip = 5;
 
 int main(int argc, char* argv[]){ 
 
+  int test_input_size = 2000; 
+  int batch_size = 1000;
+  int offset = 5000;
 
   if (argc > 1){
     total_runs = atoi(argv[1]);
@@ -30,6 +33,14 @@ int main(int argc, char* argv[]){
     to_skip = atoi(argv[3]);   
   }
 
+  if(argc > 4){
+    test_input_size = atoi(argv[4]);   
+  }
+
+  if(argc > 5){
+    offset = atoi(argv[5]);   
+  }
+
   
   llvm_hpvm_initTensorRt(0); 
 
@@ -41,10 +52,6 @@ int main(int argc, char* argv[]){
     }
 
     startMemTracking(); 
-
-    int test_input_size = 2000; 
-    int batch_size = 1000;
-    int offset = 5000;
     
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
index 0e5cdd1d284e6c7621cd3331b924c06969be79db..d2e852664e931957b518902881d813acf6692408 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc
@@ -17,6 +17,10 @@ int to_skip = 5;
 
 int main(int argc, char* argv[]){ 
 
+  int test_input_size = 2000; 
+  int batch_size = 1000;
+  int offset = 5000;
+
   if (argc > 1){
     total_runs = atoi(argv[1]);
   }
@@ -29,6 +33,15 @@ int main(int argc, char* argv[]){
     to_skip = atoi(argv[3]);   
   }
 
+  if(argc > 4){
+    test_input_size = atoi(argv[4]);   
+  }
+
+  if(argc > 5){
+    offset = atoi(argv[5]);   
+  }
+
+  
 
   llvm_hpvm_initTensorRt(0); 
 
@@ -40,10 +53,6 @@ int main(int argc, char* argv[]){
     }
 
     startMemTracking(); 
-
-    int test_input_size = 2000; 
-    int batch_size = 1000;
-    int offset = 5000;
     
     int batch_count = test_input_size / batch_size; 
     float final_accuracy = 0.0; 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
index 33c68eae84a075f50b2bc8e7484036c54ade5620..0f4c9cd62adee6df3c93de9d99812fad96f4d650 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc
@@ -18,6 +18,11 @@ int to_skip = 5;
 
 int main(int argc, char* argv[]){ 
 
+  int test_input_size = 2000; 
+  int batch_size = 1000;
+  int offset = 5000;
+
+  
   if (argc > 1){
     total_runs = atoi(argv[1]);
   }
@@ -30,6 +35,14 @@ int main(int argc, char* argv[]){
     to_skip = atoi(argv[3]);   
   }
 
+  if(argc > 4){
+    test_input_size = atoi(argv[4]);   
+  }
+
+  if(argc > 5){
+    offset = atoi(argv[5]);   
+  }
+
 
   llvm_hpvm_initTensorRt(0); 
 
@@ -43,9 +56,6 @@ int main(int argc, char* argv[]){
 
    startMemTracking(); 
 
-   int test_input_size = 2000; 
-   int batch_size = 1000;
-   int offset = 5000;
 
    int batch_count = test_input_size / batch_size; 
    float final_accuracy = 0.0; 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
index ff767235e9d44139f97ad885aa89eef1c385ad33..a9363acd7614c83f8def2e4df55e23c6f767733e 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc
@@ -16,7 +16,12 @@ int to_skip = 5;
 
 
 int main(int argc, char* argv[]){ 
-   
+
+ int test_input_size = 2000; 
+ int batch_size = 1000;
+ int offset = 5000;
+
+ 
  if (argc > 1){
    total_runs = atoi(argv[1]);
  }
@@ -29,6 +34,15 @@ int main(int argc, char* argv[]){
    to_skip = atoi(argv[3]);   
  }
 
+ if(argc > 4){
+   test_input_size = atoi(argv[4]);   
+ }
+
+ if(argc > 5){
+   offset = atoi(argv[5]);   
+ }
+
+
  llvm_hpvm_initTensorRt(0); 
 
  int missed = 0; 
@@ -40,9 +54,6 @@ int main(int argc, char* argv[]){
    
    startMemTracking(); 
 
-   int test_input_size = 2000; 
-   int batch_size = 1000;
-   int offset = 5000;
    
    int batch_count = test_input_size / batch_size; 
    float final_accuracy = 0.0; 
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/input.bin b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/input.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4d2423f74188cfe0364185ccb66837785ccf4c4e
Binary files /dev/null and b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/input.bin differ
diff --git a/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/labels.bin b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/labels.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5e1f3881897f4729d6d90ff208a08ccdabb8fe7c
Binary files /dev/null and b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/labels.bin differ
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
index 2dc985a0c14ebc18a68d5e54f78bd416f9d3b523..ff0b2a8c68d44b26982423d02ea891136ae87d8e 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h
@@ -31,8 +31,14 @@ extern "C"{
   void* tensorConvPerfCuda(void* input, void* filter,
 			   int vertical_pad, int horizontal_pad,
 			   int vertical_stride, int horizontal_stride,
-			   int conv_mode, int conv_groups, int row, int col, int start);
-
+			   int conv_mode, int conv_groups,
+			   int row, int col, int start);
+
+  void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
+			       int vertical_pad, int horizontal_pad,
+			       int vertical_stride, int horizontal_stride,
+			       int conv_mode, int conv_groups,
+			       int row, int col, int start);
   
   void sampleFilter(Tensor* filter, int skip_rate, int skip_offset);
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index a81ffe296233178126555bbb53babdcd4192a7bf..0a741316682324ca6270aab2066ebc9f0b48bcdf 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -2,6 +2,36 @@
 #include "tensor_utils.cu"
 
 
+//produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem cols
+__global__ void convToGemmApproxHalf(__half * const __restrict__ output,
+				     const __half * const __restrict__ input, const int N, const int C,
+				     const int H, const int W, const int KH, const int KW, const int V_pad,
+				     const int H_pad, const int H_out, const int W_out, const int V_stride,
+				     const int H_stride, const int reduced_filter_elem,
+				     const int skip_every) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
+	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
+	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	  else
+	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
+	}
+      }
+    }
+  }
+}
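+
+// Example of the skip pattern above: with skip_every = 3, filter elements
+// 2, 5, 8, ... (those with index % 3 == 2) are dropped, and the survivors
+// are packed into reduced_filter_elem contiguous columns (0->0, 1->1, 3->2, 4->3, ...).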
+
 
 //This skips every xth row
 //H_eff is the number of rows calculated exactly
@@ -350,3 +380,477 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   
   return new_output;
 }
+
+__global__
+void convToGemmPerfRowHalf(__half * const __restrict__ output,
+			   const __half * const __restrict input, const int N, const int C,
+			   const __half * const __restrict__ input, const int N, const int C,
+			   const int H_pad, const int H_out, const int W_out, const int V_stride,
+			   const int H_stride, const int x, const int start, const int H_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_eff * W_out); //output image number
+  const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
+  const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  int past_start = (h % (x - 1) >= (x - 1 - start));
+  const int inH = (h / (x - 1) * x + h % (x-1) +
+		   past_start) * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0;
+
+      }
+    }
+  }
+
+}
+
+
+//For use in tensorConvPerfCuda
+//Interpolates every xth row starting from x - 1 - start
+//N is total number of elements in final output array
+__global__
+void approxInterpolateRowHalf(int N, int old_h, int b, int c, int h, int w,
+			      __half *old_data, __half *new_data, int x, int start){
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < N; i += stride){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n = i / (c * h * w);
+    int past_start = ((row % x) >= (x - 1 - start));
+
+    if(row == h-1)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
+    else if (row == 0)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+    else if(row % x == x - 1 - start){
+      int past_startO = ((row - 1) % x) > (x - 1 - start);
+      int oldIdx1 = ch * (b * old_h * w) + n * (old_h * w) +
+	((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col;
+
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	__hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1 * w]), 2);
+    }
+    else
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * old_h * w) + n * (old_h * w) +
+		 ((x-1) * (row / x) + row % x - past_start )  * (w) + col];
+
+
+  }
+
+}
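+
+// e.g. x = 2, start = 0: odd output rows (row % 2 == 1) are rebuilt as the
+// average of the two adjacent computed rows; rows 0 and h-1 fall back to a
+// nearest-row copy.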
+
+
+//This skips every xth row
+//W_eff is the number of cols calculated exactly
+__global__
+void convToGemmPerfColHalf(__half * const __restrict__ output,
+			   const __half * const __restrict input, const int N, const int C,
+			   const __half * const __restrict__ input, const int N, const int C,
+			   const int H_pad, const int H_out, const int W_out, const int V_stride,
+			   const int H_stride, const int x, const int start, const int W_eff){
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_eff); //output image number
+  const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
+  const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
+  const int w = tx % W_eff; //output width index (col number)
+  int past_start = (w % (x - 1)) >= (x - 1 - start);
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = (w / (x - 1) * x + w % (x-1) +
+		   past_start) * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = 0;
+
+      }
+    }
+  }
+
+}
+
+
+//For use in tensorConvPerfCuda
+//Interpolates every xth col starting from x - 1 - start
+//N is total number of elements in final output array
+__global__
+void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
+			      __half *old_data, __half *new_data, int x, int start){
+
+
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < N; i += stride){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n = i / (c * h * w);
+    int past_start = ((col % x) >= (x - 1 - start));
+
+    if(col == w-1)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
+    else if (col == 0)
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
+    else if(col % x == x - 1 - start){
+      int past_startO = ((col - 1) % x) > (x - 1 - start);
+      int oldIdx1 = ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
+	((x-1) * ((col - 1) / x) + (col-1) % x - past_startO);
+
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	__hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2);
+    }
+    else
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+	old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w +
+		 ((x-1) * (col / x) + col % x - past_start)];
+
+  } 
+}
+
+__global__
+void switchMatrix(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
+
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if(i < N){
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
+
+    new_data[((n_new * c + ch) * h + row ) * w + col] =
+      old_data[((ch * n + n_new) * h + row ) * w + col];
+  }
+
+}
+
+__global__
+void createNewFilter(__half *new_filter, __half *old_filter,
+		     int newFilterSize, int oldFilterSize){
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < newFilterSize; i += stride){
+    new_filter[i] = old_filter[i % oldFilterSize];
+  }
+}
+
+__global__
+void createBatches(int n, const __half * matA[], const __half * matB[], __half * matC[],
+		   __half * convData, __half * newFilter, __half * output,
+		   int aStride, int bStride, int cStride){
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+
+  for(int i = index; i < n; i += stride){
+    matA[i] = &convData[i * aStride];
+    matB[i] = &newFilter[i * bStride];
+    matC[i] = &output[i * cStride];
+  }
+}
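+
+// Sketch of intended use (an assumption -- createBatches is not called in this
+// patch): the matA/matB/matC pointer arrays are laid out to serve as the
+// Aarray/Barray/Carray arguments of a cuBLAS batched GEMM, e.g.
+//   cublasHgemmBatched(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
+//                      alpha_half, matA, lda, matB, ldb,
+//                      beta_half, matC, ldc, batch_count);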
+
+//produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem cols
+__global__ void convToGemmApproxHalfN(__half * const __restrict__ output,
+				     const __half * const __restrict__ input, const int N, const int C,
+				     const int H, const int W, const int KH, const int KW, const int V_pad,
+				     const int H_pad, const int H_out, const int W_out, const int V_stride,
+				     const int H_stride, const int reduced_filter_elem,
+				     const int skip_every) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int n = tx / (C * H_out * W_out); //output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
+  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
+  const int w = tx % W_out; //output width index (col number)
+  const int inH = h * V_stride - V_pad; //input height index (row number)
+  const int inW = w * H_stride - H_pad; //input width index (col number)
+  if(n < N) { //is thread id within bounds?
+    for(int i = 0; i < KH; i++) {
+      for(int j = 0; j < KW; j++) {
+	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+	const int output_col = filter_elem_num; //no skipping in this variant: output column equals filter element index
+	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+	  output[((output_col * N + n) * H_out + h) * W_out + w] =
+	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+	else
+	  output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+
+      }
+    }
+  }
+}
+
+//start has to be less than row or less than col
+//row and col have to be >= 0
+//row = col = 1 means no perforation
+void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr,
+			     int vertical_pad, int horizontal_pad, int vertical_stride,
+			     int horizontal_stride, int conv_mode, int conv_groups,
+			     int row, int col, int start){
+
+  INFO("*** TensorConvolution half perforation \n");
+
+  Tensor* input = (Tensor*)input_ptr;
+  Tensor* filter = (Tensor*)filter_ptr;
+  //FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
+    conv_groups = 1;
+  }
+
+  profileEvent("F2H_start");
+
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  convertToFP16(input);
+  convertToFP16(filter);
+
+  /******* END OF INPUT DATA CONVERSIONS*/
+  profileEvent("F2H_end");
+
+  profileEvent("Conv");
+
+  Tensor* output_half;
+  int n, c, h, w; // output dimensions
+  n = input->dims.dim_sizes[0];
+  c = filter->dims.dim_sizes[0]; //number of filters
+  const int KH = filter->dims.dim_sizes[2];
+  const int KW = filter->dims.dim_sizes[3];
+
+  h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  int h_eff = h - h / row;
+  if(h % row > row - 1 - start)
+    h_eff = h_eff - 1;
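+  // e.g. h = 10, row = 2, start = 0: h_eff = 10 - 10/2 = 5 rows are computed
+  // exactly and the other 5 are interpolated back in approxInterpolateRowHalf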
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  int w_eff = w - w / col;
+  if(w % col > col - 1 - start)
+    w_eff = w_eff - 1;
+
+
+  Tensor *new_output;
+  if(row > 1){
+    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, CUDNN_TENSOR_NCHW,
+					  n, c, h_eff, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
+						   horizontal_pad, h, w,
+						   vertical_stride, horizontal_stride, row, start, h_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h_eff * w, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h_eff * w,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+    
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w,
+						(__half *)output_half->gpu_half_data,
+						(__half *)new_output->gpu_half_data,
+						row, start);
+    cudaDeviceSynchronize();
+
+    cudaFree(output_half);
+    cudaFree(convData);
+  }
+  else if(col > 1){
+    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						   input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
+						   horizontal_pad, h, w,
+						   vertical_stride, horizontal_stride, col, start, w_eff);
+
+
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+    
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h * w_eff, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h * w_eff,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+    
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					 CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(new_output, DEVICE);
+
+    //interpolate
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w,
+						(__half *)output_half->gpu_half_data,
+						(__half *)new_output->gpu_half_data,
+						col, start);
+    
+    cudaDeviceSynchronize();
+
+    cudaFree(output_half);
+    cudaFree(convData);
+
+  }
+  else{
+    output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					  CUDNN_TENSOR_NCHW, c, n, h, w);
+
+    // NOTE: Changing output tensor placement from host to device
+    changeTensorPlacement(output_half, DEVICE);
+    // NOTE: Necessary to insert the above call for every output tensor
+    //total number of filter elem
+    const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
+
+    __half * convData;
+    int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApproxHalfN<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
+						  input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+						  input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w,
+						  vertical_stride, horizontal_stride, num_filter_elem, c * h * w);
+    checkCudaErrors(cudaDeviceSynchronize());
+    //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+
+    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
+				 n * h * w, c, num_filter_elem,
+				 alpha_half,
+				 convData, CUDA_R_16F, n * h * w,
+				 (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+				 beta_half,
+				 (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
+				 CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+
+
+
+    // profileEvent("gemm_end", true);
+    new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
+					  CUDNN_TENSOR_NCHW, n, c, h, w);
+    changeTensorPlacement(new_output, DEVICE);
+
+    
+    int numBlocks = (n * c * h * w  + 255) / 256;
+    switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
+				    (__half *)output_half->gpu_half_data,
+				    (__half *)new_output->gpu_half_data);
+
+    checkCudaErrors(cudaDeviceSynchronize());
+    
+    cudaFree(convData);
+    cudaFree(output_half);
+  }
+
+  profileEvent("Conv_end", true);
+
+  profileEvent("H2F_start");
+
+  convertToFP32(new_output);
+
+  profileEvent("H2F_end");
+
+
+  #ifdef ERROR_INJECTION_ENABLED
+  if (op_counter >= total_ops) {
+    ERROR("No accuracy flag found \n");
+  }
+  int op_acc = op_accuracies[op_counter];
+  // Skip errorInjection if explicitly requested
+  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
+    op_acc = 0;
+  }
+  void* error_norms = tensorAddError(new_output, op_acc);
+  add_norms(error_norms, "tensorConv", op_acc);
+  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
+  op_counter++;
+  #endif
+  return new_output;
+}
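+
+// Hypothetical call site (illustration only -- not exercised by this patch),
+// following the row/col/start contract documented above; this perforates every
+// 2nd row:
+//   void* out = tensorConvPerfCudaHalf(input, filter,
+//                                      1, 1,     // vertical/horizontal pad
+//                                      1, 1,     // vertical/horizontal stride
+//                                      1, 1,     // conv_mode, conv_groups
+//                                      2, 1, 0); // row, col, start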
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc5cddf8a2121a937dbe8cd4582fe1022fd99f48
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
@@ -0,0 +1,238 @@
+#ifndef APPROXHPVM_IMG_RUNTIME_UTILS
+#define APPROXHPVM_IMG_RUNTIME_UTILS
+
+#include "configuration.h"
+#include "hpvm-rt-controller.h"
+
+#include "img_tensor_runtime.h"
+
+
+// Utilities header for ApproxHPVM image runtime API (wrapper runtime API)
+
+void* handleTensorFftApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorFft(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorFft", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorFft", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
+    }
+  return NULL;
+}
+
+void* handleTensorReduceApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorReduce(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second); // and here
+        return t_out;
+        }
+      case GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorReductionSamplingReduce(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorReductionSamplingReduce",
+                                              pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorReductionSamplingReduce",
+                                                pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
+    }
+  return NULL;
+}
+
+void* handleTensorProjectiveTApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorProjectiveT(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorProjectiveT", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorProjectiveT", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
+    }
+  return NULL;
+}
+
+void* handleTensorMap1ApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorMap1(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorMap1", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorMap1", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
+    }
+  return NULL;
+}
+
+void* handleTensorMap2ApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorMap2(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorMap2", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorMap2", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
+    }
+  return NULL;
+}
+
+void* handleTensorMap3ApproximationTuples(
+  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
+   void* input) {
+  if (approxTuples.size() == 1) {
+    enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
+    int param = approxTuples[0].second;
+    switch (approx) {
+      case GPUNodeConfiguration::APPROX::FP32 :
+        {
+        void* t_out;
+        RC->resume_profiler();
+        t_out = tensorMap3(input); //TODO: correct name here
+        RC->pause_profiler();
+        std::pair<double, double> pinfo = RC->get_time_energy();
+        RC->reset_profiler();
+        RC->addToCurrentIterationComputeTime("tensorMap3", pinfo.first); // and here
+        RC->addToCurrentIterationComputeEnergy("tensorMap3", pinfo.second); // and here
+        return t_out;
+        }
+      default :
+        CUSTOM_ASSERT(false && "Unknown approximation type");
+        ERROR("Unknown approximation type");
+        abort();
+      // TODO additional approx methods implemented here
+      }
+    } else if (approxTuples.size() == 2) {
+      ERROR("Currently unsupported case");
+      abort();
+    } else {
+      ERROR("Unsupported case");
+      abort();
+    }
+  return NULL;
+}
+
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c098719e52e31fcd06b6425964c8e1d48a15210
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
@@ -0,0 +1,26 @@
+#ifndef IMG_TENSOR_RUNTIME_H
+#define IMG_TENSOR_RUNTIME_H
+
+// ***                        Runtime declaration                        *** //
+void* tensorFft(void* input);
+void* tensorReduce(void* input);
+void* tensorReductionSamplingReduce(void* input);
+void* tensorProjectiveT(void* input);
+void* tensorMap1(void* input);
+void* tensorMap2(void* input);
+void* tensorMap3(void* input);
+
+// ***                      Wrapper API declaration                      *** //
+void* wrapper_tensorFft(const char* hpvm_node_id, void* input);
+void* wrapper_tensorReduce(const char* hpvm_node_id, void* input);
+void* wrapper_tensorProjectiveT(const char* hpvm_node_id, void* input);
+void* wrapper_tensorMap1(const char* hpvm_node_id, void* input);
+void* wrapper_tensorMap2(const char* hpvm_node_id, void* input);
+void* wrapper_tensorMap3(const char* hpvm_node_id, void* input);
+
+// Tentative
+void* wrapper_tensorStencil(const char* hpvm_node_id, void* input);
+void* wrapper_tensorCosineT(const char* hpvm_node_id, void* input);
+
+
+#endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
index 6041bb4de989be20acef973ac7f632b838a097a4..06c492c9e8fb45e0a51de153e8cf434a79a50e23 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
@@ -10,6 +10,7 @@
 #include "tensor.h"
 #include "rt-controller-api.h"
 
+#include "img_tensor_runtime.h"
 
 #ifndef CUDNN_HEADER
 #define CUDNN_HEADER
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0460e490fd2b188b85f53cf9b109f09ac3d6b83a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
@@ -0,0 +1,137 @@
+#include "../include/debug.h"
+#include "../include/img_tensor_runtime.h"
+#include "../include/approxhpvm_img_runtime_utils.h" 
+
+// ***                       Runtime implementation                      *** //
+// The operations below are not implemented yet. Each stub fails loudly
+// instead of falling off the end of a void*-returning function, which is
+// undefined behavior in C++.
+
+void* tensorFft(void* input) {
+  ERROR("tensorFft not implemented.\n");
+  abort();
+}
+
+void* tensorReduce(void* input) {
+  ERROR("tensorReduce not implemented.\n");
+  abort();
+}
+
+void* tensorReductionSamplingReduce(void* input) {
+  ERROR("tensorReductionSamplingReduce not implemented.\n");
+  abort();
+}
+
+void* tensorProjectiveT(void* input) {
+  ERROR("tensorProjectiveT not implemented.\n");
+  abort();
+}
+
+void* tensorMap1(void* input) {
+  ERROR("tensorMap1 not implemented.\n");
+  abort();
+}
+
+void* tensorMap2(void* input) {
+  ERROR("tensorMap2 not implemented.\n");
+  abort();
+}
+
+void* tensorMap3(void* input) {
+  ERROR("tensorMap3 not implemented.\n");
+  abort();
+}
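+
+// A sketch of the shape tensorFft might eventually take, built on cuFFT.
+// Everything below is an assumption, not part of this patch: that cuFFT is
+// linked in, that the Tensor struct exposes dims.dim_sizes, gpu_data, and a
+// hostToDeviceCopy helper as elsewhere in hpvm-tensor-rt, and that the
+// tensor already holds interleaved complex values (a real-valued input
+// would need an R2C plan and a separate output buffer instead).
+//
+//   #include <cufft.h>
+//
+//   void* tensorFft(void* input) {
+//     Tensor* t = (Tensor*)input;
+//     hostToDeviceCopy(t);                 // ensure the data is on the GPU
+//     int n = t->dims.dim_sizes[0] * t->dims.dim_sizes[1]; // batch * chans
+//     int h = t->dims.dim_sizes[2];
+//     int w = t->dims.dim_sizes[3];
+//     int dims[2] = {h, w};
+//     cufftHandle plan;
+//     cufftPlanMany(&plan, 2, dims, NULL, 1, h * w,   // one batched plan
+//                   NULL, 1, h * w, CUFFT_C2C, n);    // over n 2D slices
+//     cufftExecC2C(plan, (cufftComplex*)t->gpu_data,
+//                  (cufftComplex*)t->gpu_data, CUFFT_FORWARD);
+//     cufftDestroy(plan);
+//     return t;
+//   }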
+
+
+// ***                     Wrapper API implementation                    *** //
+
+void* wrapper_tensorFft(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a fft operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::FFT &&
+         "Invalid configuration generated for tensor fft wrapper operation");
+  return handleTensorFftApproximationTuples(ApproxChoices[0].second, input);
+}
+
+void* wrapper_tensorReduce(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a reduce operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::REDUCE &&
+         "Invalid configuration generated for tensor reduce wrapper operation");
+  return handleTensorReduceApproximationTuples(ApproxChoices[0].second, input);
+}
+
+void* wrapper_tensorProjectiveT(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a projectiveT operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T &&
+         "Invalid configuration generated for tensor projectiveT wrapper operation");
+  return handleTensorProjectiveTApproximationTuples(ApproxChoices[0].second, input);
+}
+
+void* wrapper_tensorMap1(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a map1 operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP1 &&
+         "Invalid configuration generated for tensor map1 wrapper operation");
+  return handleTensorMap1ApproximationTuples(ApproxChoices[0].second, input);
+}
+
+void* wrapper_tensorMap2(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a map2 operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP2 &&
+         "Invalid configuration generated for tensor map2 wrapper operation");
+  return handleTensorMap2ApproximationTuples(ApproxChoices[0].second, input);
+}
+
+void* wrapper_tensorMap3(const char* hpvm_node_id, void* input) {
+  GPUNodeConfiguration *GPUConf =
+    (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
+  std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
+                          std::vector< std::pair<GPUNodeConfiguration::APPROX,
+                                                 int> > > > &ApproxChoices =
+    GPUConf->getApproxChoices();
+  // Approximation choices must be for a map3 operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+         ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP3 &&
+         "Invalid configuration generated for tensor map3 wrapper operation");
+  return handleTensorMap3ApproximationTuples(ApproxChoices[0].second, input);
+}
+
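+// The six wrappers above differ only in the TENSOR_OP tag they assert and
+// the handler they dispatch to. If more single-input image operations are
+// added, the boilerplate could be factored through a small helper -- a
+// sketch only, with a hypothetical name, not part of this patch:
+//
+//   template <typename Handler>
+//   void* dispatchUnaryImgOp(const char* hpvm_node_id,
+//                            GPUNodeConfiguration::TENSOR_OP op,
+//                            Handler handle, void* input) {
+//     GPUNodeConfiguration* conf =
+//       (GPUNodeConfiguration*)RC->getNodeConfiguration(hpvm_node_id);
+//     auto& choices = conf->getApproxChoices();
+//     CUSTOM_ASSERT(choices.size() == 1 && choices[0].first == op &&
+//                   "Invalid configuration for unary image tensor wrapper");
+//     return handle(choices[0].second, input);
+//   }
+//
+//   void* wrapper_tensorFft(const char* hpvm_node_id, void* input) {
+//     return dispatchUnaryImgOp(hpvm_node_id,
+//                               GPUNodeConfiguration::TENSOR_OP::FFT,
+//                               handleTensorFftApproximationTuples, input);
+//   }
+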
+// Tentative
+void* wrapper_tensorStencil(const char* hpvm_node_id, void* input) {
+  ERROR("Stencil operation currently unsupported.\n");
+  abort();
+}
+
+void* wrapper_tensorCosineT(const char* hpvm_node_id, void* input) {
+  ERROR("CosineT operation currently unsupported.\n");
+  abort();
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 9e58f36a402844c33a1cb665ae4113e6e6a8534f..cc2a5d6ba91ff3c7bae29980340142656e2ef62b 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -49,6 +49,8 @@
 
 #include "../include/approx_simulation.h"
 
+// Image tensor runtime implementation, included as source so it is
+// compiled as part of this translation unit
+#include "img_tensor_runtime.cu"
 
 //** Potential Improvements:
 //   1) Add support for dataypes beyond floats and half 
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
index d92bc0c45d1115620d529aea4636ece8d3d62127..50732550db8c8f02c940e485702c3253a7bb9760 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
@@ -440,7 +440,7 @@ int main(){
   int test_input_size = 10000;
   int batch_count = test_input_size / batch_size; 
   
-  void* input = create4DTensor(0,nchw,batch_size,3,32,32);
+  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
 
   startMemTracking();
   startProfiling();
@@ -450,8 +450,13 @@ int main(){
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
 
-    copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-  
+    // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
+
+    // Replaced create4DTensor and copyInputBatch with readInputBatch
+    void* input = readInputBatch(input_path.c_str(), 0,
+                                 start, end,
+                                 3, 32, 32);
+
     args->input = input;
     args->input_bytes = 0; 
   
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile
index 30c80f2a1a65ad122681b76e531ed5d99ec8a12b..914c9817d64cd0e55baababe78fd7ae6e33c34ab 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile
@@ -60,18 +60,17 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_promise.ll -S -o  $(BUILD_DIR)/$(APP)_promise.visc.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_loop.ll -S -o  $(BUILD_DIR)/$(APP)_loop.visc.ll
 	$(OPT) $(VISC_OPTFLAGS)  $(BUILD_DIR)/$(APP).visc.ll  -o  $(BUILD_DIR)/$(APP)_cudnn.bc
-	$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_promise.bc
+	#$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_promise.bc
 	$(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_wrapperapi.bc
 	$(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll  -o  $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc
-	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
+	#$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc
 	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
+	#$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
 	$(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS)
 	$(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS)
-	#$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS)
 
 $(BUILD_DIR):
 	mkdir -p $@
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp
index 03ffd82ee7df6240e565397d099e37aecd9dad9e..91bf3b0c4523e7239d7f11f6ad350f9dbb454a91 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp
@@ -491,16 +491,21 @@ int main(){
   int batch_count = test_input_size / batch_size;
 
   std::string input_path =  dir_prefix + std::string("input.bin"); 
-  void* input = create4DTensor(0,nchw,batch_size,3,32,32);
+  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
 
-  
   startMemTracking();
+  startProfiling();
+
   for (int i = 0; i < batch_count; i++){
 
     int start = i * batch_size; 
     int end = (i + 1) * batch_size; 
 
-    copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
+    // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
+
+    void* input = readInputBatch(input_path.c_str(), 0,
+                                 start, end,
+                                 3, 32, 32);
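+
+    // The tensor returned by readInputBatch is released by freeBatchMemory()
+    // at the end of this loop iteration, so per-batch reads do not
+    // accumulate memory.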
 
     args->input = input; 
     args->input_bytes = 0; 
@@ -512,19 +517,12 @@ int main(){
     void *result = static_cast<RootIn*>(args)->input; 
     hpvm_request_tensor(result, 0); 
 
-
-//    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-//
-//    computeAccuracy3(labels, result);
-
     llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
       
     freeBatchMemory();
   }
-
-  
+  stopProfiling();
   __visc__cleanup(); 
 
-
   return 0; 
 }