From 03425866e7a72c1bf50cef375c75d26460f8c13e Mon Sep 17 00:00:00 2001
From: Yifan Zhao <yifanz16@illinois.edu>
Date: Thu, 25 Mar 2021 03:51:03 -0500
Subject: [PATCH] Fixed bugs in hpvm-c benchmarks

---
 .../alexnet2_cifar10/alexnet2_cifar10.cpp     |  72 +-
 .../alexnet2_cifar10_cudnn.cpp                |  54 +-
 .../alexnet_cifar10/alexnet_cifar10.cpp       |  69 +-
 .../alexnet_cifar10/alexnet_cifar10_cudnn.cpp |  48 +-
 .../alexnet_imagenet/alexnet_imagenet.cpp     |  74 +-
 .../alexnet_imagenet_cudnn.cpp                |  51 +-
 .../benchmarks/lenet_mnist/lenet_mnist.cpp    |  48 +-
 .../lenet_mnist/lenet_mnist_cudnn.cpp         |  53 +-
 .../mobilenet_cifar10/mobilenet_cifar10.cpp   |  49 +-
 .../mobilenet_cifar10_cudnn.cpp               |  49 +-
 .../resnet18_cifar10/resnet18_cifar10.cpp     |  74 +-
 .../resnet18_cifar10_cudnn.cpp                |  51 +-
 .../resnet50_imagenet/resnet50_imagenet.cpp   |  64 +-
 .../resnet50_imagenet_cudnn.cpp               |  48 +-
 .../vgg16_cifar10/vgg16_cifar10.cpp           |  67 +-
 .../vgg16_cifar10/vgg16_cifar10_cudnn.cpp     |  60 +-
 .../vgg16_cifar100/vgg16_cifar100.cpp         |  68 +-
 .../vgg16_cifar100/vgg16_cifar100_cudnn.cpp   |  51 +-
 .../vgg16_imagenet/vgg16_imagenet.cpp         |  65 +-
 .../vgg16_imagenet/vgg16_imagenet_cudnn.cpp   |  48 +-
 .../hpvm-c/include/tensorUtils.h              | 663 ++++++++----------
 21 files changed, 771 insertions(+), 1055 deletions(-)

diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
index dd689d202a..35f8188f78 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -411,10 +405,12 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
-
+  std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
@@ -458,12 +454,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-  // void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32);
-  // uint8_t* labels = readLabels(labels_path.c_str(),10000);
-
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -493,45 +487,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
-  std::string input_path = dir_prefix + std::string("test_input.bin");
-  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
index 448db0b4c8..5bcc5b627b 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -411,10 +405,17 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/";
+  std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
@@ -458,9 +459,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -490,41 +492,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
-  std::string input_path = dir_prefix + std::string("test_input.bin");
-  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
-
+  __hpvm__init();
+  float total_accuracy = 0;
   startMemTracking();
+#pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
-
-    int start = i * batch_size;
-    int end = (i + 1) * batch_size;
-
+    int start = i * batch_size, end = start + batch_size;
     copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
 
-    args->input = input;
-    args->input_bytes = 0;
-
     void *dfg = __hpvm__launch(0, root, (void *)args);
-
     __hpvm__wait(dfg);
-
     void *result = static_cast<RootIn *>(args)->r.tensor;
     hpvm_request_tensor(result, 0);
 
     uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
-
-    computeAccuracy3(labels, result);
-
-    // llvm_hpvm_invokeRtControl2(result, labels);
-
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
     freeBatchMemory();
   }
-
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
index ae5f31b7dc..51e0dd137d 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -362,12 +356,12 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
-
   std::string input_path = dir_prefix + std::string("test_input.bin");
-  // void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32);
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint8_t *labels = readLabels(labels_path.c_str(), 5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -406,11 +400,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -436,43 +429,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-
-      // Replaced create4DTensor and copyInputBatch with readInputBatch
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
index ab4f6a2402..74c5420fd9 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -363,12 +357,17 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32);
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -407,9 +406,8 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -436,14 +434,26 @@ int main() {
   args->dense_1_w_bytes = 0;
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-
-  hpvm_request_tensor(result, 0);
 
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
index d49c0d2d06..16bcecf939 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp
@@ -1,14 +1,8 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <config.h>
 #include <hpvm.h>
+#include <string>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
-#include <config.h>
 
 void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
   __hpvm__hint(hpvm::TENSOR_TARGET);
@@ -460,11 +454,11 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
-
-  std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
+const int batch_size = 100, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -515,16 +509,11 @@ int main() {
   std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0, 1000,3,224,224);
-  // uint32_t* labels = readLabels2(labels_path.c_str(),6000);
 
-  // uint32_t* labels = readLabels3(labels_path.c_str(), 1000);
-
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -558,40 +547,21 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
 
-  int batch_size = 200;
-  int test_input_size = 4000;
-  int batch_count = test_input_size / batch_size;
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
index 4acba95173..5ddd969432 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -460,8 +454,15 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
+const int batch_size = 100, input_size = 5000,
+          batch_count = input_size / batch_size;
+
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -513,14 +514,9 @@ int main() {
   std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 1000, 3, 224, 224);
-  // uint32_t* labels = readLabels2(labels_path.c_str(),6000);
-
-  uint32_t *labels = readLabels3(labels_path.c_str(), 1000);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -556,14 +552,25 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
 
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
index b67d585d01..ee81665ec9 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -264,13 +258,13 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+const int batch_size = 1000, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
-
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
-
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 1, 5, 5);
@@ -294,15 +288,11 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  //  void* input = readTrainedWeights(input_path.c_str(), 0, 5000,1,28,28);
 
-  //  uint32_t* labels = readLabels3(labels_path.c_str(), 5000);
-
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
-  //  args->input = input;
-  //  args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 1, 28, 28);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -320,37 +310,21 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
+#pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
-
-    int start = i * batch_size;
-    int end = (i + 1) * batch_size;
-
-    void *input = readInputBatch(input_path.c_str(), 0, start, end, 1, 28, 28);
-
-    args->input = input;
-    args->input_bytes = 0;
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 1, 28, 28, input);
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
-
     __hpvm__wait(dfg);
-
     void *result = static_cast<RootIn *>(args)->r.tensor;
     hpvm_request_tensor(result, 0);
 
     llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
     freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
index 2159cfe286..eecc7f5d60 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -264,13 +258,18 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
+const int batch_size = 1000, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
-
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 1, 5, 5);
@@ -294,13 +293,9 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 1, 28, 28);
-
-  uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 1, 28, 28);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -320,15 +315,25 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
 
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 1, 28, 28, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
index a4de282621..58051e0993 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1965,8 +1959,12 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+
 int main() {
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
+  std::string dir_prefix =
+      std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
 
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -2501,14 +2499,11 @@ int main() {
       readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0, 5000,3,32,32);
-  // uint8_t* labels = readLabels(labels_path.c_str(), 5000);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->batch_normalization_1_gamma = batch_normalization_1_gamma;
@@ -2784,39 +2779,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
-  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
+#pragma clang loop unroll(disable)
   for (int i = 0; i < batch_count; i++) {
-
-    int start = i * batch_size;
-    int end = (i + 1) * batch_size;
-
-    // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-
-    args->input = input;
-    args->input_bytes = 0;
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
 
     void *dfg = __hpvm__launch(0, root, (void *)args);
-
     __hpvm__wait(dfg);
-
     void *result = static_cast<RootIn *>(args)->r.tensor;
     hpvm_request_tensor(result, 0);
 
     llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
     freeBatchMemory();
   }
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
index 9f4069b34b..482a37d4c4 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1965,11 +1959,17 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+
+int main() {
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
-
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -2503,12 +2503,10 @@ int main() {
       readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32);
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -2786,14 +2784,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
index 66ab37cd33..a254a62570 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
@@ -1,11 +1,5 @@
 
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1300,14 +1294,13 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+
 int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
-
   std::string input_path = dir_prefix + std::string("test_input.bin");
-  // void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32);
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
-  // uint32_t* labels = readLabels3(labels_path.c_str(),5000);
-
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
@@ -1439,11 +1432,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -1533,47 +1525,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
-  // NOTE-HASHIM: commented out
-  // void* input = create4DTensor(0,nchw,batch_size,3,32,32);
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-
-      // NOTE-HASHIM: Commented out above line and line that does create4DTensor
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      printf("RUNNING BATCH = %d \n", i);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
index 1b6c98b886..da1ce91ba3 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
@@ -1,11 +1,5 @@
 
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -1226,12 +1220,17 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
-  void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32);
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -1365,9 +1364,8 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -1459,16 +1457,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-
-  computeAccuracy3(labels, result);
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
index db6b64daa0..a3ece5fede 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -5132,8 +5126,10 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+const int batch_size = 25, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
@@ -6311,12 +6307,10 @@ int main() {
   void *dense_1_b =
       readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1);
 
-  // void* input = readTrainedWeights(input_path.c_str(), 0,100,3,224,224);
-  // uint32_t* labels = readLabelsBatch3(labels_path.c_str(),0,100);
-
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -6958,39 +6952,21 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  unsigned int batch_size = 50;
-  unsigned int test_input_size = 1000;
-  unsigned int batch_count = test_input_size / batch_size;
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
index c7817caf53..03674b50a5 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -4903,8 +4897,15 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
+const int batch_size = 50, input_size = 5000,
+          batch_count = input_size / batch_size;
+
+int main() {
   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
@@ -6081,12 +6082,10 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b =
       readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224);
   uint32_t *labels = readLabels3(labels_path.c_str(), 100);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -6730,14 +6729,25 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
index 39c2ffc876..cad22649fd 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,8 +821,10 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -920,14 +916,11 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  // void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32);
-  // uint32_t* labels = readLabels3(labels_path.c_str(),2000);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
-  // args->input = input;
-  // args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -989,41 +982,21 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      // copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
index 2f18dc17c6..6625202828 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,8 +821,15 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
+
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -920,12 +921,10 @@ int main() {
       readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32);
   uint32_t *labels = readLabels3(labels_path.c_str(), 2000);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -989,28 +988,25 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
 
+  __hpvm__init();
+  float total_accuracy = 0;
   startMemTracking();
-  startProfiling();
-
-  input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32);
-
-  args->input = input;
-  args->input_bytes = 0;
-
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
-  computeAccuracy3(labels, result);
-
-  freeBatchMemory();
-
-  stopProfiling();
-
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
index ce899cd0a2..54417171fb 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,10 +821,11 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
-
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -922,14 +917,10 @@ int main() {
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
 
-  //  void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32);
-  //  uint32_t* labels = readLabels3(labels_path.c_str(),2000);
-
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
-  //  args->input = input;
-  //  args->input_bytes = 0;
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -991,40 +982,21 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
 
-  int batch_size = 500;
-  int test_input_size = 5000;
-  int batch_count = test_input_size / batch_size;
-
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
index a3e147cb1a..9f989e3610 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -827,10 +821,16 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
+const int batch_size = 500, input_size = 5000,
+          batch_count = input_size / batch_size;
 
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -922,12 +922,8 @@ int main() {
   std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
   void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
 
-  void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32);
-  uint32_t *labels = readLabels3(labels_path.c_str(), 2000);
-
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -991,14 +987,25 @@ int main() {
   args->dense_2_b = dense_2_b;
   args->dense_2_b_bytes = 0;
 
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 32, 32, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
index 91af01fe8e..12f7870a15 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -875,10 +869,11 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+const int batch_size = 10, input_size = 5000,
+          batch_count = input_size / batch_size;
 
-  std::string dir_prefix =
-      std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
+int main() {
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
@@ -978,9 +973,10 @@ int main() {
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -1046,40 +1042,21 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
 
+  __hpvm__init();
   startMemTracking();
-  startProfiling();
-
-  unsigned int batch_size = 50;
-  unsigned int test_input_size = 1000;
-  unsigned int batch_count = test_input_size / batch_size;
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
-
-      args->input = input;
-      args->input_bytes = 0;
-
-      void *dfg = __hpvm__launch(0, root, (void *)args);
-
-      __hpvm__wait(dfg);
-
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+    freeBatchMemory();
   }
-
-  stopProfiling();
   __hpvm__cleanup();
-
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
index 995502f907..189460c928 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp
@@ -1,10 +1,4 @@
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <cstring>
+#include <string>
 #include <hpvm.h>
 #include <tensorTypes.h>
 #include <tensorUtils.h>
@@ -875,8 +869,15 @@ typedef struct __attribute__((__packed__)) {
   struct ret_t r;
 } RootIn;
 
-int main() {
+void write_accuracy(float accuracy) {
+  std::ofstream fout("final_accuracy");
+  fout << std::fixed << accuracy;
+}
 
+const int batch_size = 25, input_size = 5000,
+          batch_count = input_size / batch_size;
+
+int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/";
   std::string input_path = dir_prefix + std::string("test_input.bin");
   std::string labels_path = dir_prefix + std::string("test_labels.bin");
@@ -976,12 +977,10 @@ int main() {
   std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
   void *dense_3_b =
       readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224);
   uint32_t *labels = readLabelsBatch3(labels_path.c_str(), 0, 100);
 
-  __hpvm__init();
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
-
+  void *input = create4DTensor(0, nchw, batch_size, 3, 224, 224);
   args->input = input;
   args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
@@ -1049,14 +1048,25 @@ int main() {
   args->dense_3_b = dense_3_b;
   args->dense_3_b_bytes = 0;
 
-  void *dfg = __hpvm__launch(0, root, (void *)args);
-
-  __hpvm__wait(dfg);
-
-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
-
+  __hpvm__init();
+  float total_accuracy = 0;
+  startMemTracking();
+#pragma clang loop unroll(disable)
+  for (int i = 0; i < batch_count; i++) {
+    int start = i * batch_size, end = start + batch_size;
+    copyInputBatch(input_path.c_str(), start, end, 3, 224, 224, input);
+
+    void *dfg = __hpvm__launch(0, root, (void *)args);
+    __hpvm__wait(dfg);
+    void *result = static_cast<RootIn *>(args)->r.tensor;
+    hpvm_request_tensor(result, 0);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+    float accuracy = computeAccuracy3(labels, result);
+    total_accuracy += accuracy * batch_size;
+    freeBatchMemory();
+  }
+  write_accuracy(total_accuracy / input_size);
   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h b/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
index 71e1c26872..05d9157a64 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/include/tensorUtils.h
@@ -3,7 +3,6 @@
 #ifndef UTILS_HEADER
 #define UTILS_HEADER
 
-
 #include <sstream>
 #include <vector>
 #include <bits/stdc++.h>
@@ -11,15 +10,13 @@
 #include <tensor.h>
 #include <cmath>
 
-
 std::vector<float> run_accuracies;
 
+void printTensorInfo(void *tensor_ptr) {
 
-void printTensorInfo(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-
-  if(tensor->gpu_data != NULL){
+  if (tensor->gpu_data != NULL) {
     printf("Successful cudaMalloc \n");
   }
 
@@ -29,376 +26,354 @@ void printTensorInfo(void* tensor_ptr){
   printf("num_elems = %lu \n", tensor->num_elems);
 }
 
-
 // FIXIT: Move this to debug.h and include in all files
-void dumpWeightsToFile(char* file_name, void* weights_ptr){
+void dumpWeightsToFile(char *file_name, void *weights_ptr) {
 
-  struct Tensor* weights = (Tensor*) weights_ptr;
+  struct Tensor *weights = (Tensor *)weights_ptr;
   // Move data back to host
   hpvm_request_tensor(weights, 0);
-  
-  FILE* fp = fopen(file_name, "wb");
-  if(fp == NULL){
-    printf("File %s could not be created. Check if directory exists \n", file_name);
+
+  FILE *fp = fopen(file_name, "wb");
+  if (fp == NULL) {
+    printf("File %s could not be created. Check if directory exists \n",
+           file_name);
     abort();
   }
 
-  //printf("size_in_bytes = %lu \n", weights->size_in_bytes);
-  size_t bytes_written = fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
-  //printf("bytes_written = %lu \n", bytes_written);
+  // printf("size_in_bytes = %lu \n", weights->size_in_bytes);
+  size_t bytes_written =
+      fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
+  // printf("bytes_written = %lu \n", bytes_written);
   fclose(fp);
 }
 
+void fillTensorWithOnes(void *tensor_ptr) {
 
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-void fillTensorWithOnes(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = 1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = 1.0;
     }
   }
 }
 
+void fillWithOnesAndTwos(void *tensor_ptr) {
 
-void fillWithOnesAndTwos(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-  
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems/2; i++){
-      data_arr[i] = 1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems / 2; i++) {
+      data_arr[i] = 1.0;
     }
 
-    for(unsigned int i = tensor->num_elems/2; i < tensor->num_elems; i++){
-      data_arr[i] = 2.0;    
+    for (unsigned int i = tensor->num_elems / 2; i < tensor->num_elems; i++) {
+      data_arr[i] = 2.0;
     }
- 
   }
 }
 
+void fillTensorWithVal(void *tensor_ptr, float target_value) {
 
-void fillTensorWithVal(void* tensor_ptr, float target_value){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = target_value;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = target_value;
     }
   }
 }
 
+void fillTensorWithNegOnes(void *tensor_ptr) {
 
-void fillTensorWithNegOnes(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = -1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = -1.0;
     }
   }
 }
 
+void fillTensorVals(void *tensor_ptr) {
 
-void fillTensorVals(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = i + 1;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = i + 1;
     }
   }
 }
 
+void printTensorValues(void *tensor_ptr) {
 
-void printTensorValues(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   hpvm_request_tensor(tensor, 0);
-  
+
   // printing is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      printf("%f,", data_arr[i]);    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      printf("%f,", data_arr[i]);
     }
   }
 
   printf("\n");
 }
 
+void printTensorDims(void *tensor_ptr) {
 
-void printTensorDims(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   printf("Num_elems = %lu \n", tensor->num_elems);
-  for (int i = 0; i < tensor->dims.num_dims; i++){
+  for (int i = 0; i < tensor->dims.num_dims; i++) {
     printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
   }
 }
 
+void compareTensors(void *tensor1_ptr, void *tensor2_ptr) {
 
-
-void compareTensors(void* tensor1_ptr, void* tensor2_ptr){
-
-  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
-  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;
+  struct Tensor *tensor1 = (struct Tensor *)tensor1_ptr;
+  struct Tensor *tensor2 = (struct Tensor *)tensor2_ptr;
 
   hpvm_request_tensor(tensor1, 0);
   hpvm_request_tensor(tensor2, 0);
 
-  float* tensor_data1 = (float*) tensor1->host_data;
-  float* tensor_data2 = (float*) tensor2->host_data;
-  
-  for(unsigned int i = 0; i < tensor1->num_elems; i++){
-    if(tensor_data1[i] != tensor_data2[i]){
+  float *tensor_data1 = (float *)tensor1->host_data;
+  float *tensor_data2 = (float *)tensor2->host_data;
+
+  for (unsigned int i = 0; i < tensor1->num_elems; i++) {
+    if (tensor_data1[i] != tensor_data2[i]) {
       printf("Tensor data mismatch at index %d \n", i);
       abort();
     }
   }
 }
 
+void compareValues(void *tensor_ptr, float *data, size_t num_elems) {
 
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-void compareValues(void* tensor_ptr, float* data, size_t num_elems){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
-  float* tensor_data = (float*) tensor->host_data;
-  for(unsigned int i = 0; i < num_elems; i++){
-    if(tensor_data[i] != data[i]){
+
+  float *tensor_data = (float *)tensor->host_data;
+  for (unsigned int i = 0; i < num_elems; i++) {
+    if (tensor_data[i] != data[i]) {
       printf("Tensor data mismatch");
       abort();
     }
   }
 }
 
-
-void* readInputTensor(const char* file_name, int data_type, int dim1_size, int dim2_size,
-		      int dim3_size, int dim4_size){
+void *readInputTensor(const char *file_name, int data_type, int dim1_size,
+                      int dim2_size, int dim3_size, int dim4_size) {
 
   int type_size = 4; // NOTE: Assuming floating point tensors
   int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
   int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  uint8_t* file_data = (uint8_t*) malloc(sizeof(char) * num_elems);
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  uint8_t *file_data = (uint8_t *)malloc(sizeof(char) * num_elems);
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = 16;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
 
- 
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(file_data, 1, sizeof(uint8_t) * num_elems, file);
 
   fclose(file);
-  
-  for (size_t i = 0; i < num_elems; ++i){
-    tensor_data[i] = (float) file_data[i] / 255.0f;
+
+  for (size_t i = 0; i < num_elems; ++i) {
+    tensor_data[i] = (float)file_data[i] / 255.0f;
   }
 
   // NOTE: Using NCHW format
-  struct Tensor* input = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					dim3_size, dim4_size);
-  
+  struct Tensor *input = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(input, tensor_data, size_in_bytes);
   //  compareValues(input, tensor_data, num_elems);
-  
-  return input;  
-}
 
+  return input;
+}
 
 //*** FIXIT: Move this to CPU-only
-struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
+struct Tensor *readTrainedWeightsCPU(const char *file_name, int data_type,
+                                     int dim1_size, int dim2_size,
+                                     int dim3_size, int dim4_size) {
 
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
   printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
 
   fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
+
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   free(tensor_data);
 
   return weights;
 }
 
-
-struct Tensor* readTrainedWeights(const char* file_name, int data_type,
-				  long int dim1_size, long int dim2_size,
-				  long int dim3_size, long int dim4_size){
+struct Tensor *readTrainedWeights(const char *file_name, int data_type,
+                                  long int dim1_size, long int dim2_size,
+                                  long int dim3_size, long int dim4_size) {
 
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   printf("size_in_bytes  = %lu \n", size_in_bytes);
-  
+
   int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
-  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
+  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes,
+  // bytes_read);
 
   fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
+
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   free(tensor_data);
 
   return weights;
 }
 
-
-
-
-struct Tensor* readInputBatch(const char* file_name, int data_type,
-			      int start, int end,
-			      int dim2_size, int dim3_size, int dim4_size){
+struct Tensor *readInputBatch(const char *file_name, long data_type, long start,
+                              long end, long dim2_size, long dim3_size,
+                              long dim4_size) {
 
   long int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
   long int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  long int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  long int file_header_size =
+      type_size * start * dim2_size * dim3_size * dim4_size;
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
   fclose(file);
 
-  //printf ("FIXED input BATCH read \n");
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
+  // printf ("FIXED input BATCH read \n");
+
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(weights, tensor_data, size_in_bytes);
   free(tensor_data);
 
   return weights;
 }
 
+void *copyInputBatch(const char *file_name, long start, long end,
+                     long dim2_size, long dim3_size, long dim4_size,
+                     void *inputTensor_ptr) {
 
+  struct Tensor *inputTensor = (struct Tensor *)inputTensor_ptr;
 
-void* copyInputBatch(const char* file_name, 
-		    int start, int end,
-		    int dim2_size, int dim3_size, int dim4_size,
-		    void* inputTensor_ptr){
-
-  struct Tensor* inputTensor = (struct Tensor*) inputTensor_ptr;
-  
   int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  long int file_header_size =
+      type_size * start * dim2_size * dim3_size * dim4_size;
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
   fclose(file);
-  
-    
+
   initTensorData(inputTensor, tensor_data, size_in_bytes);
   free(tensor_data);
 
   printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims);
-  if(inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
+  if (inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
     printf("ERROR: NULL data pointers \n");
 
-
-  // Chaning Tensor Placement to HOST 
+  // Changing Tensor Placement to HOST
   changeTensorPlacement(inputTensor, HOST);
 
-
   return inputTensor;
 }
 
+uint8_t *readLabels(const char *labels_file, int num_labels) {
 
-
-uint8_t* readLabels(const char* labels_file, int num_labels){
-
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
@@ -406,17 +381,15 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
   size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
+uint32_t *readLabels3(const char *labels_file, int num_labels) {
 
-
-uint32_t* readLabels3(const char* labels_file, int num_labels){
-
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
@@ -424,264 +397,248 @@ uint32_t* readLabels3(const char* labels_file, int num_labels){
   size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
-
-uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+uint8_t *readLabelsBatch(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
   int file_header_size = sizeof(uint8_t) * start;
-  
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
-  
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
   fclose(file);
-  
+
   // printf("--labels bytes_read = %lu \n", bytes_read);
   return labels;
 }
 
-
-uint32_t* readLabelsBatch3(const char* labels_file, int start, int end){
+uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
   int file_header_size = sizeof(uint32_t) * start;
-  
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
-  
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
 
+  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
+void computeAccuracy(const char *labels_file, int num_labels,
+                     void *result_ptr) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  uint8_t* labels = readLabels(labels_file, num_labels);
+  uint8_t *labels = readLabels(labels_file, num_labels);
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
-  
-  for(int i = 0; i < batch_dim; i++){
+
+  for (int i = 0; i < batch_dim; i++) {
     int chosen = 0;
-    for (int id = 1; id < 10; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
+    for (int id = 1; id < 10; ++id) {
+      if (data[i * channels + chosen] < data[i * channels + id])
+        chosen = id;
     }
-    
-    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
-    if(chosen != labels[i])
+
+    // printf("chosen = %d, label = %d \n", chosen, labels[i]);
+    if (chosen != labels[i])
       num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
     fclose(fp);
   }
-  
 }
 
+// NOTE: batch_size and num_classes are Unused arguments
+float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr,
+                       size_t num_classes = 10) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-
-// NOTE: batch_size and num_classes are Unused arguments 
-float computeAccuracy2(uint8_t* labels, int batch_size,
-		       void* result_ptr, size_t num_classes = 10){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes);
-  
-  for(unsigned int i = 0; i < batch_dim; i++){ 
-      
+
+  for (unsigned int i = 0; i < batch_dim; i++) {
+
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
     }
-    
-    if(chosen != labels[i])
-      num_errors++;
 
+    if (chosen != labels[i])
+      num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
+float computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-float computeAccuracy3(uint32_t* labels, void* result_ptr){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
-  
-  for(int i = 0; i < batch_dim; i++){
-  
+
+  for (int i = 0; i < batch_dim; i++) {
+
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
     }
-    
-    if(chosen != labels[i])
+
+    if (chosen != labels[i])
       num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
-
-
-struct ClassProb{
+struct ClassProb {
   float prob;
   int index;
 };
 
-
-bool descendFloatComp(ClassProb obj1, ClassProb obj2){
+bool descendFloatComp(ClassProb obj1, ClassProb obj2) {
   return obj1.prob > obj2.prob;
 }
 
+float computeTop5Accuracy(uint8_t *labels, int num_labels, void *result_ptr,
+                          unsigned num_classes = 10) {
+
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-float computeTop5Accuracy(uint8_t* labels, int num_labels,
-			  void* result_ptr, unsigned num_classes = 10){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  
-  for(int i = 0; i < num_labels; i++){
+
+  for (int i = 0; i < num_labels; i++) {
 
     std::vector<ClassProb> elem_probs;
-    for (int id = 0; id < num_classes; ++id){
+    for (int id = 0; id < num_classes; ++id) {
       ClassProb cProb;
       cProb.prob = data[i * channels + id];
       cProb.index = id;
-      elem_probs.push_back(cProb);   
+      elem_probs.push_back(cProb);
     }
 
-    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
+    std::sort(elem_probs.begin(), elem_probs.end(),
+              descendFloatComp);
     // Check if any of top-5 predictions matches
     bool matched = false;
-    for(int j = 0; j < 5; j++){
+    for (int j = 0; j < 5; j++) {
       ClassProb cProb = elem_probs[j];
-      if(cProb.index == labels[i])
+      if (cProb.index == labels[i])
         matched = true;
     }
 
-    if(!matched)
-      num_errors +=1; 
+    if (!matched)
+      num_errors += 1;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
-
-
-
-void dumpFinalAccuracy(float accuracy){
+void dumpFinalAccuracy(float accuracy) {
 
   printf("\n\n **** Final Accuracy = %f \n", accuracy);
-  
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
@@ -690,44 +647,37 @@ void dumpFinalAccuracy(float accuracy){
   run_accuracies.push_back(accuracy);
 }
 
+void dumpAvgPSNR(float avg_psnr) {
 
-
-void dumpAvgPSNR(float avg_psnr){
-
-  FILE* fp = fopen("avg_psnr", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("avg_psnr", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << avg_psnr;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 }
 
+void dumpPSNRStd(float psnr_std) {
 
-void dumpPSNRStd(float psnr_std){
-
-  FILE* fp = fopen("psnr_std.txt", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("psnr_std.txt", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << psnr_std;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 }
 
+void dumpExecutionAccuracies() {
 
-
-
-
-void dumpExecutionAccuracies(){
-
-  FILE* fp = fopen("run_accuracies.txt", "w+");
-  if(fp != NULL){  
-    for (int i = 0; i < run_accuracies.size(); i++){
+  FILE *fp = fopen("run_accuracies.txt", "w+");
+  if (fp != NULL) {
+    for (int i = 0; i < run_accuracies.size(); i++) {
       float accuracy = run_accuracies[i];
       std::ostringstream ss;
       ss << std::fixed << accuracy;
@@ -735,63 +685,60 @@ void dumpExecutionAccuracies(){
       fwrite(print_str.c_str(), 1, print_str.length(), fp);
       fwrite("\n", 1, 1, fp);
     }
-
   }
 
   fclose(fp);
 }
 
-
-float readPSNRFromFile(const char* file_name){
+float readPSNRFromFile(const char *file_name) {
 
   float psnr;
-  FILE* pFile = fopen(file_name, "r");
-  if(pFile == NULL){
+  FILE *pFile = fopen(file_name, "r");
+  if (pFile == NULL) {
     printf("ERROR: psnr.txt not found! \n");
     abort();
   }
-  
+
   fscanf(pFile, "%f", &psnr);
   printf("**** PSNR read = %f \n\n", psnr);
-  return psnr; 
+  return psnr;
 }
 
+float computePSNRViolation(void *gold_ptr, void *approx_ptr,
+                           float PSNR_threshold) {
 
-float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){
-
-  
   PSNR_threshold = readPSNRFromFile("psnr.txt");
   std::vector<float> psnr_list;
-  
-  struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
-  struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;
 
-  size_t* dim_sizes = gold_tensor->dims.dim_sizes;
+  struct Tensor *gold_tensor = (struct Tensor *)gold_ptr;
+  struct Tensor *approx_tensor = (struct Tensor *)approx_ptr;
+
+  size_t *dim_sizes = gold_tensor->dims.dim_sizes;
   size_t batch_dim = dim_sizes[0];
   size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
-  
+
   printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
-	 
-  float* gold_data = (float*) gold_tensor->host_data;
-  float* approx_data = (float*) approx_tensor->host_data;
 
-  FILE* fp = fopen("img_psnr.txt", "w+");
+  float *gold_data = (float *)gold_tensor->host_data;
+  float *approx_data = (float *)approx_tensor->host_data;
+
+  FILE *fp = fopen("img_psnr.txt", "w+");
 
   float sum_psnr = 0.0;
-  int num_errors = 0;  
-  for(size_t i = 0; i < batch_dim; i++){
+  int num_errors = 0;
+  for (size_t i = 0; i < batch_dim; i++) {
     float mse_sum = 0.0;
-    float max_val = -999999;     
+    float max_val = -999999;
     size_t offset = i * image_size;
-    
-    for(size_t j = 0; j < image_size; j++){
+
+    for (size_t j = 0; j < image_size; j++) {
       float diff = gold_data[offset + j] - approx_data[offset + j];
       float diff_square = diff * diff;
       mse_sum += diff_square;
 
-      if(max_val < gold_data[offset + j]){
-	max_val = gold_data[offset + j];
-      }   
+      if (max_val < gold_data[offset + j]) {
+        max_val = gold_data[offset + j];
+      }
     }
 
     mse_sum = mse_sum / image_size;
@@ -799,7 +746,7 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol
 
     sum_psnr += psnr;
     if (psnr < PSNR_threshold)
-      num_errors += 1;    
+      num_errors += 1;
 
     printf("PSNR value = %f \n", psnr);
     psnr_list.push_back(psnr);
@@ -817,39 +764,35 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol
   float avg_psnr = sum_psnr / batch_dim;
   printf("*** avg_psnr =  %f \n\n", avg_psnr);
   dumpAvgPSNR(avg_psnr);
- 
+
   float success_rate = 100.0 - violation_rate;
   dumpFinalAccuracy(success_rate);
 
   fclose(fp);
 
-
   float var = 0.0;
-  for(size_t i = 0; i < batch_dim; i++){
-    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); 
+  for (size_t i = 0; i < batch_dim; i++) {
+    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr);
   }
 
   var /= batch_dim;
   float std = sqrt(var);
 
   dumpPSNRStd(std);
-  
-  return violation_rate;  
-}
 
+  return violation_rate;
+}
 
-void dumpOutput(void* output_ptr, const char* file_name){
+void dumpOutput(void *output_ptr, const char *file_name) {
 
-  struct Tensor* out_tensor = (struct Tensor*) output_ptr;  
+  struct Tensor *out_tensor = (struct Tensor *)output_ptr;
   size_t size_in_bytes = out_tensor->size_in_bytes;
-  printf ("** Output size = %lu \n", size_in_bytes);
-  
-  float* host_data = (float*) out_tensor->host_data; 
-  FILE* fd = fopen(file_name, "w+");
+  printf("** Output size = %lu \n", size_in_bytes);
+
+  float *host_data = (float *)out_tensor->host_data;
+  FILE *fd = fopen(file_name, "w+");
   fwrite(host_data, 1, size_in_bytes, fd);
   fclose(fd);
 }
 
-
-
 #endif
-- 
GitLab