diff --git a/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py b/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
index e007ca9277f9e584708488ee57fd08c693a00279..88f74c9194bb105a2a731ed063aed7e2ac875e6a 100644
--- a/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
+++ b/hpvm/projects/hpvm-profiler/hpvm_profiler/__init__.py
@@ -1,8 +1,11 @@
 from pathlib import Path
+from subprocess import CalledProcessError, PIPE, check_output
 from typing import Iterable, List, Tuple, Union
 from dataclasses import dataclass
+from tqdm import trange
 
 PathLike = Union[Path, str]
+conf_opening, conf_closing = "+++++", "-----"
 
 
 def profile_configs(
@@ -43,12 +46,18 @@ def profile_configs(
         raise ValueError("Config file with no configs is unsupported.")
     temp_file = NamedTemporaryFile("w")
     baseline_time, baseline_acc = None, None
-    for idx, config in enumerate(configs):
+    for idx in trange(len(configs), desc="Configs profiled"):
+        config = configs[idx]
         # Write config to temp config file
         write_hpvm_config(header, [config], Path(temp_file.name))
         # Run binary_path binary,
         # which generates `profile_filename` and `qos_filename` file in cwd.
-        check_call(str(binary_path))
+        try:
+            # check_output drains stdout (no full-pipe stall) and keeps it for the error report.
+            check_output([str(binary_path), "-c", temp_file.name], universal_newlines=True)
+        except CalledProcessError as e:
+            print(f"Output from the program:\n{e.output}")
+            raise
         # Read these two files for time and QoS info.
         time = _read_profile_file(Path(profile_filename))
         acc = _read_qos_file(Path(qos_filename))
@@ -112,7 +121,7 @@ class Config:
 
     def update_profile_results(self, speedup: float, qos: float, base_qos: float):
         recorded_base_qos = self.qos + self.qos_loss
-        if abs(recorded_base_qos - base_qos) > 1e-3:
+        if abs(recorded_base_qos - base_qos) > 0.02:
             raise ValueError(
                 f"Baseline QoS mismatch. Original: {recorded_base_qos}, measured: {base_qos}"
             )
@@ -129,7 +138,8 @@ class Config:
             self.qos_loss,
         ]
         header = " ".join(str(field) for field in header_fields)
-        return f"{header}\n{self.config_body}"
+        lines = [conf_opening, header, *self.config_body, conf_closing]
+        return "\n".join(lines)
 
     __str__ = __repr__
 
@@ -139,13 +149,12 @@ def read_hpvm_configs(config_file: PathLike) -> Tuple[str, List[Config]]:
     ret_configs = []
     with open(config_file) as f:
         text = f.read()
-    opening, closing = "+++++", "-----"
     # There's 1 float sitting on the first line of config file.
     # We don't use it, but want to keep that intact.
-    header, *configs = text.split(opening)
+    header, *configs = text.split(conf_opening)
     header = header.strip()
     for config_text in configs:
-        config_text = config_text.replace(closing, "").strip()
+        config_text = config_text.replace(conf_closing, "").strip()
         config_header, *config_body = config_text.splitlines()
         conf_name, *number_fields = config_header.split(" ")
         speedup, energy, qos, qos_drop = [float(s) for s in number_fields]
diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
index 1c4a386ce2fd9e50953a49377df20c9d3ebf75da..1f6dd875ffa6b39ab57609d7690c9a9ad3944b44 100644
--- a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
+++ b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in
@@ -93,7 +93,9 @@ int main(int argc, char *argv[]){
   #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++){
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, {{input_shape|join(', ')}}, {{input_name}});
+    void *{{input_name}} = readInputBatch(input_path.c_str(), 0, start, end, {{input_shape|join(', ')}});
+    args->{{input_name}} = {{input_name}};
+    args->{{input_name}}_bytes = 0;
 
     void* dfg = __hpvm__launch(0, root, (void*) args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
index 35f8188f785d023264c31a20480f661f066fb9f5..860e3b6423bc78d073096a981f765bed10fb73a7 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
@@ -405,10 +405,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -488,11 +508,17 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
index 5bcc5b627b546d714404c89d9a775856c647e7bc..f44e19dece121cb01a1f3e6a8bf9e27ea945e6ce 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
@@ -410,10 +410,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -493,12 +513,18 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
index 51e0dd137db1cd835412bc6ee5694795718e739d..6d8973ad982b1aa3b206a0cf40ee1888c37e293f 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
@@ -356,10 +356,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -430,11 +450,17 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
index 74c5420fd9b77aa2deab656204e43b164a241304..b2a940d501d8b1c2e29dbe7240012ace8197bbb4 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
@@ -362,10 +362,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -436,12 +456,18 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
index 16bcecf939051ef7490d58a196a12786b0d4f465..474ab64cadf3eac158d39e6e1e6686765c3bac36 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
@@ -454,10 +454,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 100, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -548,11 +568,17 @@ int main() {
   args->dense_3_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 224, 224);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
index 5ddd9694328db6d892c8c23b44b2e165afe77953..10e95202f2e2188a9dcd1c12a168a612f897fcf9 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
@@ -459,10 +459,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 100, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -553,12 +573,18 @@ int main() {
   args->dense_3_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 224, 224);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
index ee81665ec94a4c9cc634c16bdda5bbea96e120df..5c42f6953cfd9256cea73b39868a7ec571f18565 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
@@ -258,10 +258,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 1000, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -311,11 +331,17 @@ int main() {
   args->dense_2_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 1, 28, 28, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 1, 28, 28);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
index eecc7f5d60cf63b10ea5af098156a0dfa2890f80..0c2568f81b701cb474a257b190be61b4bba45f3e 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
@@ -263,10 +263,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 1000, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -316,12 +336,18 @@ int main() {
   args->dense_2_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 1, 28, 28, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 1, 28, 28);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
index 58051e0993e8b8893a398eee3a0358556a18c2f4..01d027341686291c83e605bdeee1bbcffa68d6e9 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
@@ -1959,10 +1959,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
 
@@ -2780,11 +2800,17 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
index 482a37d4c4be22eda1079b0a900c762fdb4d1001..e51e85dd980dd910389ec4415174e6e005f75c41 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
@@ -1964,10 +1964,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
@@ -2785,12 +2805,18 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
index a254a625709f13ec08b403c26eac126a09df6daa..fa83c534d0639241205758018f8f7c37401e6b22 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
@@ -1294,10 +1294,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -1526,11 +1546,17 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
index da1ce91ba3fdb4dc7d74e6b854dad7fc1c2d412e..c7b789c2343a8dfd1e847652af2bd1d6adfd51f1 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
@@ -1225,10 +1225,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -1458,12 +1478,18 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
index a3ece5fedec57a73537d870199b6b4270b541b42..91d07e30469e675fd2027f29290e35a0db888174 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
@@ -5126,10 +5126,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 25, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
@@ -6953,11 +6973,17 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 224, 224);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
index 03674b50a5b6b9dcde87fd1e32b0520362ca8ca3..932580e03e7ccc4495d8d76be2f7147369e36d68 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
@@ -4902,10 +4902,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) {
+  std::cerr << "Usage: " << bin_name << " [-c CONF_FILE]\n";
+}
+
 const int batch_size = 50, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
@@ -6730,12 +6750,18 @@ int main() {
   args->dense_1_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 224, 224);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
index cad22649fdfe4fd6271f5202aa524cea2f3f1383..195c676c11d53b19e0d18ed4908198a929d188aa 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
@@ -821,10 +821,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) { // Writes the command-line usage summary to stderr.
+  std::cerr << "Usage: " << bin_name << " [-h] [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -983,11 +1003,17 @@ int main() {
   args->dense_2_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
index 662520282892f852fd8f634061cc0f6f72e465f9..c304237ea57ba15d48cff0773860cdc469fc2a04 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
@@ -826,10 +826,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) { // Writes the command-line usage summary to stderr.
+  std::cerr << "Usage: " << bin_name << " [-h] [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -989,12 +1009,18 @@ int main() {
   args->dense_2_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
index 54417171fbcda003e27d7662a11f35499f7c0cc8..4cd5c134293d85983146352175e278915ab1d2ba 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
@@ -821,10 +821,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) { // Writes the command-line usage summary to stderr.
+  std::cerr << "Usage: " << bin_name << " [-h] [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -983,11 +1003,17 @@ int main() {
   args->dense_2_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
index 9f989e361051a8623657d11224cbb898f061032e..532fca6b856f296624c21e9a18421763c4b70f48 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
@@ -826,10 +826,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) { // Writes the command-line usage summary to stderr.
+  std::cerr << "Usage: " << bin_name << " [-h] [-c CONF_FILE]\n";
+}
+
 const int batch_size = 500, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -988,12 +1008,18 @@ int main() {
   args->dense_2_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 32, 32);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
index 12f7870a152d8f42fa01b90429bc1102059861ae..8e299f40e6ddd04a3ce9f8d9dffff49b1de36189 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
@@ -869,10 +869,30 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+void printUsage(const std::string &bin_name) { // Writes the command-line usage summary to stderr.
+  std::cerr << "Usage: " << bin_name << " [-h] [-c CONF_FILE]\n";
+}
+
 const int batch_size = 10, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -1043,11 +1063,17 @@ int main() {
   args->dense_3_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 224, 224);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
index 189460c928d65ed989201dc715df5cbe0ccd5bde..930a33e43c706e6e91475fc97671c39c23f63387 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
@@ -874,10 +874,30 @@ void write_accuracy(float accuracy) {
   fout << std::fixed << accuracy;
 }
 
+void printUsage(const std::string &bin_name) { // Writes the command-line usage summary to stderr.
+  std::cerr << "Usage: " << bin_name << " [-h] [-c CONF_FILE]\n";
+}
+
 const int batch_size = 25, input_size = 5000,
           batch_count = input_size / batch_size;
 
-int main() {
+int main(int argc, char *argv[]) {
+  std::string config_path = "";
+  int flag;
+  while ((flag = getopt(argc, argv, "hc:")) != -1) {
+    switch (flag) {
+    case 'c':
+      config_path = std::string(optarg);
+      break;
+    case 'h':
+      printUsage(argv[0]);
+      return 0;
+    default:
+      printUsage(argv[0]);
+      return 1;
+    }
+  }
+
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -1049,12 +1069,18 @@ int main() {
   args->dense_3_b_bytes = 0;
 
   __hpvm__init();
+  if (config_path != "") {
+    llvm_hpvm_initializeRuntimeController(config_path.c_str());
+  }
+
   float total_accuracy = 0;
   startMemTracking();
 #pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
     int start = i * batch_size, end = start + batch_size;
-    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+    void* input = readInputBatch(input_path.c_str(), nchw, start, end, 3, 224, 224);
+    args->input = input;
+    args->input_bytes = 0;
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
     __hpvm__wait(dfg);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h b/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
index 05d9157a6473fb74061e6edefc4455080368f706..1d5ac7d908b0990f21de885c645786997640264c 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
@@ -329,46 +329,6 @@ struct Tensor *readInputBatch(const char *file_name, long data_type, long start,
   return weights;
 }
 
-void *copyInputBatch(const char *file_name, long start, long end,
-                     long dim2_size, long dim3_size, long dim4_size,
-                     void *inputTensor_ptr) {
-
-  struct Tensor *inputTensor = (struct Tensor *)inputTensor_ptr;
-
-  int dim1_size = end - start;
-  // FIXIT: Don't assume floating point types
-  int type_size = 4; // NOTE: Assuming floating point tensors
-  long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes =
-      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
-  long int file_header_size =
-      type_size * start * dim2_size * dim3_size * dim4_size;
-
-  FILE *file = fopen(file_name, "rb");
-  if (file == NULL) {
-    printf("Data file %s is not found. Aborting... \n", file_name);
-    abort();
-  }
-
-  fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
-
-  fclose(file);
-
-  initTensorData(inputTensor, tensor_data, size_in_bytes);
-  free(tensor_data);
-
-  printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims);
-  if (inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
-    printf("ERROR: NULL data pointers \n");
-
-  // Chaning Tensor Placement to HOST
-  changeTensorPlacement(inputTensor, HOST);
-
-  return inputTensor;
-}
-
 uint8_t *readLabels(const char *labels_file, int num_labels) {
 
   uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
diff --git a/hpvm/test/dnn_benchmarks/profiling/CMakeLists.txt b/hpvm/test/dnn_benchmarks/profiling/CMakeLists.txt
index 712741c0e347acfc84e37bc2c91d998f549c7077..23e0e9161884ce95152e2feffe19d6b1acfcf381 100644
--- a/hpvm/test/dnn_benchmarks/profiling/CMakeLists.txt
+++ b/hpvm/test/dnn_benchmarks/profiling/CMakeLists.txt
@@ -11,4 +11,5 @@ configure_lit_site_cfg(
 add_lit_testsuite(check-hpvm-profiler "Run tests for package hpvm-profiler"
   ${CMAKE_CURRENT_BINARY_DIR}
   DEPENDS dnn_benchmarks  # Requires all dnn benchmarks
+  ARGS "-j1"  # Run DNN benchmarks sequentially
 )
diff --git a/hpvm/test/dnn_benchmarks/profiling/lit.cfg.py b/hpvm/test/dnn_benchmarks/profiling/lit.cfg.py
index c3584478209402a308ed17ba2c3e5994a49dab76..5c11f61baf0d3d3ec8464d15828d24d7c54f22c0 100644
--- a/hpvm/test/dnn_benchmarks/profiling/lit.cfg.py
+++ b/hpvm/test/dnn_benchmarks/profiling/lit.cfg.py
@@ -28,7 +28,5 @@ config.test_exec_root = current_binary_dir
 # Tweak the PATH to include the tools dir.
 llvm_config.with_environment("PATH", config.llvm_tools_dir, append_path=True)
 
-llvm_config.use_default_substitutions()
-
 # Add substitution for our main script in this directory.
 llvm_config.add_tool_substitutions(["test_hpvm_c_profiling.py"], config.test_source_root)