From caaf0792f1923495da2235905c2fa56e00d9205a Mon Sep 17 00:00:00 2001
From: Yifan Zhao <yifanz16@illinois.edu>
Date: Tue, 2 Feb 2021 02:41:01 -0600
Subject: [PATCH] Formatted all C/C++/CUDA sources in hpvm-tensor-rt

---
 .../dnn_sources/include/op_overheads.h        |   93 +-
 .../dnn_sources/include/types.h               |    2 +-
 .../dnn_sources/include/utils.h               |  760 ++--
 .../dnn_sources/include/utils_cpu.h           |  289 +-
 .../src/fp16/alexnet2_cifar10_half.cc         |  177 +-
 .../src/fp16/alexnet_cifar10_half.cc          |  165 +-
 .../dnn_sources/src/fp16/lenet_mnist_half.cc  |  119 +-
 .../dnn_sources/src/fp16/mobilenet_half.cc    | 1106 +++---
 .../src/fp16/resnet18_cifar10_half.cc         |  431 ++-
 .../src/fp16/vgg16_cifar100_half.cc           |  336 +-
 .../src/fp16/vgg16_cifar10_half.cc            |  306 +-
 .../dnn_sources/src/fp32/alexnet2_cifar10.cc  |  180 +-
 .../dnn_sources/src/fp32/alexnet_cifar10.cc   |  167 +-
 .../dnn_sources/src/fp32/alexnet_imagenet.cc  |  228 +-
 .../dnn_sources/src/fp32/lenet_mnist.cc       |  129 +-
 .../dnn_sources/src/fp32/mobilenet.cc         | 1108 +++---
 .../dnn_sources/src/fp32/resnet18_cifar10.cc  |  431 ++-
 .../dnn_sources/src/fp32/resnet50_imagenet.cc | 2447 ++++++++-----
 .../dnn_sources/src/fp32/vgg16_cifar10.cc     |  304 +-
 .../dnn_sources/src/fp32/vgg16_cifar100.cc    |  336 +-
 .../dnn_sources/src/fp32/vgg16_imagenet.cc    |  360 +-
 .../dnn_sources/src/unit_tests.cc             |  884 ++---
 .../include/approx_techniques2.h              |   56 +-
 .../include/approxhpvm_runtime_utils.h        | 1309 ++++---
 .../tensor_runtime/include/configuration.h    |    9 +-
 .../include/rt-controller-api.h               |    3 +-
 .../include/tensor_cpu_runtime.h              |   95 +-
 .../tensor_runtime/include/tensor_runtime.h   |   27 +-
 .../include/tensor_signatures.cc              |    6 +-
 .../tensor_runtime/src/approx_knobs_utils.cc  |   16 +-
 .../tensor_runtime/src/approx_simulation.cu   | 1307 +++----
 .../tensor_runtime/src/approx_techniques.cu   | 3057 ++++++++--------
 .../src/approx_techniques2_tuned.cu           | 3134 +++++++++--------
 .../tensor_runtime/src/configuration.cpp      |   43 +-
 .../tensor_runtime/src/debug.cc               |    2 +-
 .../tensor_runtime/src/device_math.cu         |   14 +-
 .../tensor_runtime/src/error.cu               |  406 +--
 .../tensor_runtime/src/fp16_gemm.cu           |  361 +-
 .../tensor_runtime/src/global_data.cc         |    1 -
 .../tensor_runtime/src/group_conv.cu          |  598 ++--
 .../tensor_runtime/src/half_precision_api.cu  |  381 +-
 .../tensor_runtime/src/hpvm-rt-controller.cpp |  406 +--
 .../tensor_runtime/src/init_api.cc            |    4 +-
 .../tensor_runtime/src/profiling.cc           |   10 +-
 .../tensor_runtime/src/tensor_cpu_runtime.cc  | 2013 ++++++-----
 .../tensor_runtime/src/tensor_runtime.cu      |  570 ++-
 .../tensor_runtime/src/tensor_utils.cu        |  559 ++-
 .../tensor_runtime/src/wrapper_runtime.cu     | 1094 +++---
 48 files changed, 13260 insertions(+), 12579 deletions(-)

diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h
index 4eaf88e6d6..8a97fbf3d3 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h
@@ -3,22 +3,18 @@
 #ifndef OP_OVERHEADS_HEADER
 #define OP_OVERHEADS_HEADER
 
-
 #include <sstream>
 #include "../../tensor_runtime/include/tensor.h"
 #include "types.h"
 
-
 float scale_down_factor = 10000.0;
 float error_factor = 0.1;
 std::string result_str = "";
 
-
 // TODO: Every routine needs testing
 
-
 // private function
-static float getScaledComps(double total_comps, int error_scale){
+static float getScaledComps(double total_comps, int error_scale) {
 
   total_comps = total_comps / scale_down_factor;
   float comp_scale = 1.0 + (error_factor * error_scale);
@@ -27,122 +23,107 @@ static float getScaledComps(double total_comps, int error_scale){
   return total_comps;
 }
 
-
-static void addNormToResult(float comps){
+static void addNormToResult(float comps) {
 
   std::ostringstream ss;
   ss << std::fixed << comps;
-  
-  result_str.append( std::string(ss.str()) );
+
+  result_str.append(std::string(ss.str()));
   result_str.append("\t");
 }
 
-
-
-static void addCompsToResult(float comps){
+static void addCompsToResult(float comps) {
 
   std::ostringstream ss;
   ss << std::fixed << comps;
-  
-  result_str.append( std::string(ss.str()) );
+
+  result_str.append(std::string(ss.str()));
   result_str.append("\n");
 }
 
+void add_conv_overheads(void *input_ptr, void *filter_ptr, int strideA,
+                        int strideB, int error_scale) {
 
-void add_conv_overheads(void* input_ptr, void* filter_ptr,
-			int strideA, int strideB, int error_scale){
-
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 }
 
+void add_gemm_overheads(void *lhs_ptr, void *rhs_ptr, int error_scale) {
 
-void add_gemm_overheads(void* lhs_ptr, void* rhs_ptr, int error_scale){
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
 
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
-    
   int m = lhs->dims.dim_sizes[0];
   // The rhs last dimension must contain the neurons
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
   int k = 1;
-  
+
   // Flattening the dimensions after the batch dimension
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
     k = k * lhs->dims.dim_sizes[j]; // input neurons
   }
 
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
   // Dimension-note: Check if k is same across the two tensors
   printf("m = %d, n = %d, k = %d \n", m, n, k);
-  
-  if(rhs_k != k){
+
+  if (rhs_k != k) {
     printf("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
     abort();
   }
-  
+
   double total_comps = m * n * rhs_k * 1.0;
   float scaled_comps = getScaledComps(total_comps, error_scale);
-  
+
   printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
-	 error_scale, total_comps, scaled_comps);
+         error_scale, total_comps, scaled_comps);
 
   addCompsToResult(scaled_comps);
-  
 }
 
+void add_bias_overheads(void *input_ptr, int error_scale) {
 
-void add_bias_overheads(void* input_ptr, int error_scale){
+  Tensor *input = (Tensor *)input_ptr;
 
-  Tensor* input = (Tensor*) input_ptr;
-  
   double total_comps = input->num_elems;
   float scaled_comps = getScaledComps(total_comps, error_scale);
 
   printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
-	 error_scale, total_comps, scaled_comps);
+         error_scale, total_comps, scaled_comps);
 
   addCompsToResult(scaled_comps);
-
 }
 
+void add_relu_overheads(void *input_ptr, int error_scale) {
+
+  Tensor *input = (Tensor *)input_ptr;
 
-void add_relu_overheads(void* input_ptr, int error_scale){
-  
-  Tensor* input = (Tensor*) input_ptr;
-  
   double total_comps = input->num_elems;
   float scaled_comps = getScaledComps(total_comps, error_scale);
 
   printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n",
-	 error_scale, total_comps, scaled_comps);				     
+         error_scale, total_comps, scaled_comps);
 
   addCompsToResult(scaled_comps);
-
-}
-
-float add_pool_overheads(void* input_ptr, int kernel_size,
-			 int stride_size, int error_scale){
-
 }
 
+float add_pool_overheads(void *input_ptr, int kernel_size, int stride_size,
+                         int error_scale) {}
 
-void add_norms(void* norms_ptr){
+void add_norms(void *norms_ptr) {
 
-  Norm_t* norms = (Norm_t*) norms_ptr;
+  Norm_t *norms = (Norm_t *)norms_ptr;
 
   addNormToResult(norms->l1_norm);
   addNormToResult(norms->l2_norm);
   addNormToResult(norms->inf_norm);
- 
 }
 
-void dump_result(char* file_name){
+void dump_result(char *file_name) {
 
-  FILE* fp = fopen(file_name, "w+");
+  FILE *fp = fopen(file_name, "w+");
   fwrite(result_str.c_str(), 1, result_str.length(), fp);
-  fclose(fp); 
+  fclose(fp);
 }
 
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h
index 3e4f64610d..cafd37f703 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h
@@ -32,7 +32,7 @@ enum Tensor_type_t{
 // NOTE: Currently only NCHW is supported due to limited cuDNN support
 enum Tensor_format_t{
   nchw,
-  nhwc 
+  nhwc
 };
 */
 
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
index 5d1e0e66ad..178454153b 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
@@ -4,9 +4,9 @@
 #define UTILS_HEADER
 
 #include <stdio.h>
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
 #include <sstream>
 #include <vector>
 #include <bits/stdc++.h>
@@ -15,17 +15,14 @@
 #include <cmath>
 #include <string.h>
 
-
 std::vector<float> run_accuracies;
 std::string model_params_path = "../../../build/model_params/";
 
+void printTensorInfo(void *tensor_ptr) {
 
-void printTensorInfo(void* tensor_ptr){
-
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  if(tensor->gpu_data != NULL){
+  if (tensor->gpu_data != NULL) {
     printf("Successful cudaMalloc \n");
   }
 
@@ -35,388 +32,363 @@ void printTensorInfo(void* tensor_ptr){
   printf("num_elems = %lu \n", tensor->num_elems);
 }
 
-
 // FIXIT: Move this to debug.h and include in all files
-void dumpWeightsToFile(const char* file_name, void* weights_ptr){
+void dumpWeightsToFile(const char *file_name, void *weights_ptr) {
 
-  struct Tensor* weights = (Tensor*) weights_ptr;
+  struct Tensor *weights = (Tensor *)weights_ptr;
   // Move data back to host
   hpvm_request_tensor(weights, 0);
-  
-  FILE* fp = fopen(file_name, "wb");
-  if(fp == NULL){
-    printf("File %s could not be created. Check if directory exists \n", file_name);
+
+  FILE *fp = fopen(file_name, "wb");
+  if (fp == NULL) {
+    printf("File %s could not be created. Check if directory exists \n",
+           file_name);
     abort();
   }
 
-  //printf("size_in_bytes = %lu \n", weights->size_in_bytes);
-  size_t bytes_written = fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
-  //printf("bytes_written = %lu \n", bytes_written);
+  // printf("size_in_bytes = %lu \n", weights->size_in_bytes);
+  size_t bytes_written =
+      fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
+  // printf("bytes_written = %lu \n", bytes_written);
   fclose(fp);
 }
 
+void fillTensorWithOnes(void *tensor_ptr) {
 
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-void fillTensorWithOnes(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = 1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = 1.0;
     }
   }
 }
 
+void fillWithOnesAndTwos(void *tensor_ptr) {
 
-void fillWithOnesAndTwos(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-  
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
 
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
       if (i % 2 == 0)
         data_arr[i] = 1.0;
       else
-	data_arr[i] = 2.0;
+        data_arr[i] = 2.0;
     }
 
     /*for(unsigned int i = 0; i < tensor->num_elems/2; i++){
-      data_arr[i] = 1.0;    
+      data_arr[i] = 1.0;
     }
 
     for(unsigned int i = tensor->num_elems/2; i < tensor->num_elems; i++){
-      data_arr[i] = 2.0;    
+      data_arr[i] = 2.0;
     }*/
- 
   }
 }
 
+void fillTensorWithVal(void *tensor_ptr, float target_value) {
 
-void fillTensorWithVal(void* tensor_ptr, float target_value){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = target_value;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = target_value;
     }
   }
 }
 
+void fillTensorWithNegOnes(void *tensor_ptr) {
 
-void fillTensorWithNegOnes(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
+
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = -1.0;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = -1.0;
     }
   }
 }
 
+void fillTensorVals(void *tensor_ptr) {
 
-void fillTensorVals(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
   // initialization is specific to the floating point type
-  if(tensor->data_type == CUDNN_DATA_FLOAT){
-    float* data_arr = (float*) tensor->host_data;
-    for(unsigned int i = 0; i < tensor->num_elems; i++){
-      data_arr[i] = i + 1;    
+  if (tensor->data_type == CUDNN_DATA_FLOAT) {
+    float *data_arr = (float *)tensor->host_data;
+    for (unsigned int i = 0; i < tensor->num_elems; i++) {
+      data_arr[i] = i + 1;
     }
   }
 }
 
+void printTensorValues(void *tensor_ptr) {
 
-void printTensorValues(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   hpvm_request_tensor(tensor, 0);
-  
+
   // printing is specific to the floating point type
-  if(tensor->data_type != CUDNN_DATA_FLOAT){
-    //printf("\n WARNING: The tensor is non-float type tensor \n\n");
-  }  
+  if (tensor->data_type != CUDNN_DATA_FLOAT) {
+    // printf("\n WARNING: The tensor is non-float type tensor \n\n");
+  }
 
-  float* data_arr = (float*) tensor->host_data;
+  float *data_arr = (float *)tensor->host_data;
 
-  for(unsigned int i = 0; i < tensor->num_elems; i++){
-      printf("%f,", data_arr[i]);    
+  for (unsigned int i = 0; i < tensor->num_elems; i++) {
+    printf("%f,", data_arr[i]);
   }
-   
 
   printf("\n");
 }
 
+void printTensorDims(void *tensor_ptr) {
 
-void printTensorDims(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   printf("Num_elems = %lu \n", tensor->num_elems);
-  for (int i = 0; i < tensor->dims.num_dims; i++){
+  for (int i = 0; i < tensor->dims.num_dims; i++) {
     printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
   }
 }
 
+void compareTensors(void *tensor1_ptr, void *tensor2_ptr) {
 
-
-void compareTensors(void* tensor1_ptr, void* tensor2_ptr){
-
-  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
-  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;
+  struct Tensor *tensor1 = (struct Tensor *)tensor1_ptr;
+  struct Tensor *tensor2 = (struct Tensor *)tensor2_ptr;
 
   hpvm_request_tensor(tensor1, 0);
   hpvm_request_tensor(tensor2, 0);
 
-  float* tensor_data1 = (float*) tensor1->host_data;
-  float* tensor_data2 = (float*) tensor2->host_data;
-  
-  for(unsigned int i = 0; i < tensor1->num_elems; i++){
-    if(tensor_data1[i] != tensor_data2[i]){
+  float *tensor_data1 = (float *)tensor1->host_data;
+  float *tensor_data2 = (float *)tensor2->host_data;
+
+  for (unsigned int i = 0; i < tensor1->num_elems; i++) {
+    if (tensor_data1[i] != tensor_data2[i]) {
       printf("Tensor data mismatch at index %d \n", i);
       abort();
     }
   }
 }
 
+void compareValues(void *tensor_ptr, float *data, size_t num_elems) {
 
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-void compareValues(void* tensor_ptr, float* data, size_t num_elems){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-    
   hpvm_request_tensor(tensor, 0);
-  
-  float* tensor_data = (float*) tensor->host_data;
-  for(unsigned int i = 0; i < num_elems; i++){
-    if(tensor_data[i] != data[i]){
+
+  float *tensor_data = (float *)tensor->host_data;
+  for (unsigned int i = 0; i < num_elems; i++) {
+    if (tensor_data[i] != data[i]) {
       printf("Tensor data mismatch");
       abort();
     }
   }
 }
 
-
-void* readInputTensor(const char* file_name, int data_type, int dim1_size, int dim2_size,
-		      int dim3_size, int dim4_size){
+void *readInputTensor(const char *file_name, int data_type, int dim1_size,
+                      int dim2_size, int dim3_size, int dim4_size) {
 
   int type_size = 4; // NOTE: Assuming floating point tensors
   int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
   int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  uint8_t* file_data = (uint8_t*) malloc(sizeof(char) * num_elems);
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  uint8_t *file_data = (uint8_t *)malloc(sizeof(char) * num_elems);
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = 16;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
 
- 
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(file_data, 1, sizeof(uint8_t) * num_elems, file);
 
   fclose(file);
-  
-  for (size_t i = 0; i < num_elems; ++i){
-    tensor_data[i] = (float) file_data[i] / 255.0f;
+
+  for (size_t i = 0; i < num_elems; ++i) {
+    tensor_data[i] = (float)file_data[i] / 255.0f;
   }
 
   // NOTE: Using NCHW format
-  struct Tensor* input = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					dim3_size, dim4_size);
-  
+  struct Tensor *input = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(input, tensor_data, size_in_bytes);
   //  compareValues(input, tensor_data, num_elems);
-  
-  return input;  
-}
 
+  return input;
+}
 
 //*** FIXIT: Move this to CPU-only
-struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
+struct Tensor *readTrainedWeightsCPU(const char *file_name, int data_type,
+                                     int dim1_size, int dim2_size,
+                                     int dim3_size, int dim4_size) {
 
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
-  //printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
+  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes,
+  // bytes_read);
 
   fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
+
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   free(tensor_data);
 
   return weights;
 }
 
-
-struct Tensor* readTrainedWeights(const char* file_name, int data_type,
-				  long int dim1_size, long int dim2_size,
-				  long int dim3_size, long int dim4_size){
+struct Tensor *readTrainedWeights(const char *file_name, int data_type,
+                                  long int dim1_size, long int dim2_size,
+                                  long int dim3_size, long int dim4_size) {
 
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  //printf("size_in_bytes  = %lu \n", size_in_bytes);
-  
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  // printf("size_in_bytes  = %lu \n", size_in_bytes);
+
   int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
-  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
+  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes,
+  // bytes_read);
 
   fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
+
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   free(tensor_data);
 
   return weights;
 }
 
-
-
-
-struct Tensor* readInputBatch(const char* file_name, int data_type,
-			      long int start, long int end,
-			      long int dim2_size, long int dim3_size, long int dim4_size){
+struct Tensor *readInputBatch(const char *file_name, int data_type,
+                              long int start, long int end, long int dim2_size,
+                              long int dim3_size, long int dim4_size) {
 
   long int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
   long int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
-  long int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
+  long int file_header_size =
+      type_size * start * dim2_size * dim3_size * dim4_size;
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
-
   fclose(file);
-  
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-					                   dim3_size, dim4_size);
-  
+
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(weights, tensor_data, size_in_bytes);
   free(tensor_data);
 
   return weights;
 }
 
+void *copyInputBatch(const char *file_name, int start, int end,
+                     long int dim2_size, long int dim3_size, long int dim4_size,
+                     void *inputTensor_ptr) {
 
+  struct Tensor *inputTensor = (struct Tensor *)inputTensor_ptr;
 
-void* copyInputBatch(const char* file_name, 
-		    int start, int end,
-		    long int dim2_size, long int dim3_size, long int dim4_size,
-		    void* inputTensor_ptr){
-
-  struct Tensor* inputTensor = (struct Tensor*) inputTensor_ptr;
-  
   long int dim1_size = end - start;
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
   fclose(file);
-  
-    
+
   initTensorData(inputTensor, tensor_data, size_in_bytes);
   free(tensor_data);
 
   printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims);
-  if(inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
+  if (inputTensor->host_data == NULL || inputTensor->gpu_data == NULL)
     printf("ERROR: NULL data pointers \n");
 
-
-  // Chaning Tensor Placement to HOST 
+  // Changing Tensor Placement to HOST
   changeTensorPlacement(inputTensor, HOST);
 
-
   return inputTensor;
 }
 
+uint8_t *readLabels(const char *labels_file, int num_labels) {
 
-
-uint8_t* readLabels(const char* labels_file, int num_labels){
-
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
@@ -424,17 +396,15 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
   size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
+uint32_t *readLabels3(const char *labels_file, int num_labels) {
 
-
-uint32_t* readLabels3(const char* labels_file, int num_labels){
-
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
@@ -442,264 +412,248 @@ uint32_t* readLabels3(const char* labels_file, int num_labels){
   size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
-
-uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+uint8_t *readLabelsBatch(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
   int file_header_size = sizeof(uint8_t) * start;
-  
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
-  
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
   fclose(file);
-  
+
   // printf("--labels bytes_read = %lu \n", bytes_read);
   return labels;
 }
 
-
-uint32_t* readLabelsBatch3(const char* labels_file, int start, int end){
+uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
   int file_header_size = sizeof(uint32_t) * start;
-  
-  uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+
+  uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
-  
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
 
+  size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
+void computeAccuracy(const char *labels_file, int num_labels,
+                     void *result_ptr) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  uint8_t* labels = readLabels(labels_file, num_labels);
+  uint8_t *labels = readLabels(labels_file, num_labels);
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
-  
-  for(int i = 0; i < batch_dim; i++){
+
+  for (int i = 0; i < batch_dim; i++) {
     int chosen = 0;
-    for (int id = 1; id < 10; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
+    for (int id = 1; id < 10; ++id) {
+      if (data[i * channels + chosen] < data[i * channels + id])
+        chosen = id;
     }
-    
-    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
-    if(chosen != labels[i])
+
+    // printf("chosen = %d, label = %d \n", chosen, labels[i]);
+    if (chosen != labels[i])
       num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
     fclose(fp);
   }
-  
 }
 
+// NOTE: batch_size and num_classes are Unused arguments
+float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr,
+                       size_t num_classes = 10) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-
-// NOTE: batch_size and num_classes are Unused arguments 
-float computeAccuracy2(uint8_t* labels, int batch_size,
-		       void* result_ptr, size_t num_classes = 10){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes);
-  
-  for(unsigned int i = 0; i < batch_dim; i++){ 
-      
+
+  for (unsigned int i = 0; i < batch_dim; i++) {
+
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
     }
-    
-    if(chosen != labels[i])
-      num_errors++;
 
+    if (chosen != labels[i])
+      num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
+float computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-float computeAccuracy3(uint32_t* labels, void* result_ptr){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes);
-  
-  for(int i = 0; i < batch_dim; i++){
-  
+
+  for (int i = 0; i < batch_dim; i++) {
+
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
     }
-    
-    if(chosen != labels[i])
+
+    if (chosen != labels[i])
       num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
-
-
-struct ClassProb{
+struct ClassProb {
   float prob;
   int index;
 };
 
-
-bool descendFloatComp(ClassProb obj1, ClassProb obj2){
+bool descendFloatComp(ClassProb obj1, ClassProb obj2) {
   return obj1.prob > obj2.prob;
 }
 
+float computeTop5Accuracy(uint8_t *labels, int num_labels, void *result_ptr,
+                          unsigned num_classes = 10) {
+
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-float computeTop5Accuracy(uint8_t* labels, int num_labels,
-			  void* result_ptr, unsigned num_classes = 10){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  
-  for(int i = 0; i < num_labels; i++){
+
+  for (int i = 0; i < num_labels; i++) {
 
     std::vector<ClassProb> elem_probs;
-    for (int id = 0; id < num_classes; ++id){
+    for (int id = 0; id < num_classes; ++id) {
       ClassProb cProb;
       cProb.prob = data[i * channels + id];
       cProb.index = id;
-      elem_probs.push_back(cProb);   
+      elem_probs.push_back(cProb);
     }
 
-    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
+    std::sort(elem_probs.begin(), elem_probs.end(),
+              descendFloatComp);
     // Check if any of top-5 predictions matches
     bool matched = false;
-    for(int j = 0; j < 5; j++){
+    for (int j = 0; j < 5; j++) {
       ClassProb cProb = elem_probs[j];
-      if(cProb.index == labels[i])
+      if (cProb.index == labels[i])
         matched = true;
     }
 
-    if(!matched)
-      num_errors +=1; 
+    if (!matched)
+      num_errors += 1;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
 
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
-
-
-
-void dumpFinalAccuracy(float accuracy){
+void dumpFinalAccuracy(float accuracy) {
 
   printf("\n\n **** Final Accuracy = %f \n", accuracy);
-  
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << accuracy;
     std::string print_str = ss.str();
-  
+
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
@@ -708,44 +662,37 @@ void dumpFinalAccuracy(float accuracy){
   run_accuracies.push_back(accuracy);
 }
 
+void dumpAvgPSNR(float avg_psnr) {
 
-
-void dumpAvgPSNR(float avg_psnr){
-
-  FILE* fp = fopen("avg_psnr", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("avg_psnr", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << avg_psnr;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 }
 
+void dumpPSNRStd(float psnr_std) {
 
-void dumpPSNRStd(float psnr_std){
-
-  FILE* fp = fopen("psnr_std.txt", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("psnr_std.txt", "w+");
+  if (fp != NULL) {
     std::ostringstream ss;
     ss << std::fixed << psnr_std;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 }
 
+void dumpExecutionAccuracies() {
 
-
-
-
-void dumpExecutionAccuracies(){
-
-  FILE* fp = fopen("run_accuracies.txt", "w+");
-  if(fp != NULL){  
-    for (int i = 0; i < run_accuracies.size(); i++){
+  FILE *fp = fopen("run_accuracies.txt", "w+");
+  if (fp != NULL) {
+    for (int i = 0; i < run_accuracies.size(); i++) {
       float accuracy = run_accuracies[i];
       std::ostringstream ss;
       ss << std::fixed << accuracy;
@@ -753,63 +700,60 @@ void dumpExecutionAccuracies(){
       fwrite(print_str.c_str(), 1, print_str.length(), fp);
       fwrite("\n", 1, 1, fp);
     }
-
   }
 
   fclose(fp);
 }
 
-
-float readPSNRFromFile(const char* file_name){
+float readPSNRFromFile(const char *file_name) {
 
   float psnr;
-  FILE* pFile = fopen(file_name, "r");
-  if(pFile == NULL){
+  FILE *pFile = fopen(file_name, "r");
+  if (pFile == NULL) {
     printf("ERROR: psnr.txt not found! \n");
     abort();
   }
-  
+
   fscanf(pFile, "%f", &psnr);
   printf("**** PSNR read = %f \n\n", psnr);
-  return psnr; 
+  return psnr;
 }
 
+float computePSNRViolation(void *gold_ptr, void *approx_ptr,
+                           float PSNR_threshold) {
 
-float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){
-
-  
   PSNR_threshold = readPSNRFromFile("psnr.txt");
   std::vector<float> psnr_list;
-  
-  struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
-  struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;
 
-  size_t* dim_sizes = gold_tensor->dims.dim_sizes;
+  struct Tensor *gold_tensor = (struct Tensor *)gold_ptr;
+  struct Tensor *approx_tensor = (struct Tensor *)approx_ptr;
+
+  size_t *dim_sizes = gold_tensor->dims.dim_sizes;
   size_t batch_dim = dim_sizes[0];
   size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
-  
+
   printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
-	 
-  float* gold_data = (float*) gold_tensor->host_data;
-  float* approx_data = (float*) approx_tensor->host_data;
 
-  FILE* fp = fopen("img_psnr.txt", "w+");
+  float *gold_data = (float *)gold_tensor->host_data;
+  float *approx_data = (float *)approx_tensor->host_data;
+
+  FILE *fp = fopen("img_psnr.txt", "w+");
 
   float sum_psnr = 0.0;
-  int num_errors = 0;  
-  for(size_t i = 0; i < batch_dim; i++){
+  int num_errors = 0;
+  for (size_t i = 0; i < batch_dim; i++) {
     float mse_sum = 0.0;
-    float max_val = -999999;     
+    float max_val = -999999;
     size_t offset = i * image_size;
-    
-    for(size_t j = 0; j < image_size; j++){
+
+    for (size_t j = 0; j < image_size; j++) {
       float diff = gold_data[offset + j] - approx_data[offset + j];
       float diff_square = diff * diff;
       mse_sum += diff_square;
 
-      if(max_val < gold_data[offset + j]){
-	max_val = gold_data[offset + j];
-      }   
+      if (max_val < gold_data[offset + j]) {
+        max_val = gold_data[offset + j];
+      }
     }
 
     mse_sum = mse_sum / image_size;
@@ -817,7 +761,7 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol
 
     sum_psnr += psnr;
     if (psnr < PSNR_threshold)
-      num_errors += 1;    
+      num_errors += 1;
 
     printf("PSNR value = %f \n", psnr);
     psnr_list.push_back(psnr);
@@ -835,126 +779,104 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol
   float avg_psnr = sum_psnr / batch_dim;
   printf("*** avg_psnr =  %f \n\n", avg_psnr);
   dumpAvgPSNR(avg_psnr);
- 
+
   float success_rate = 100.0 - violation_rate;
   dumpFinalAccuracy(success_rate);
 
   fclose(fp);
 
-
   float var = 0.0;
-  for(size_t i = 0; i < batch_dim; i++){
-    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); 
+  for (size_t i = 0; i < batch_dim; i++) {
+    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr);
   }
 
   var /= batch_dim;
   float std = sqrt(var);
 
   dumpPSNRStd(std);
-  
-  return violation_rate;  
-}
 
+  return violation_rate;
+}
 
-void dumpOutput(void* output_ptr, const char* file_name){
+void dumpOutput(void *output_ptr, const char *file_name) {
 
-  struct Tensor* out_tensor = (struct Tensor*) output_ptr;  
+  struct Tensor *out_tensor = (struct Tensor *)output_ptr;
   size_t size_in_bytes = out_tensor->size_in_bytes;
-  printf ("** Output size = %lu \n", size_in_bytes);
-  
-  float* host_data = (float*) out_tensor->host_data; 
-  FILE* fd = fopen(file_name, "w+");
+  printf("** Output size = %lu \n", size_in_bytes);
+
+  float *host_data = (float *)out_tensor->host_data;
+  FILE *fd = fopen(file_name, "w+");
   fwrite(host_data, 1, size_in_bytes, fd);
   fclose(fd);
 }
 
+void copyClassConfsAndLabels(void *result_ptr, float *classConfs,
+                             int *predictedLabels, int start, int end) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-
-
-void copyClassConfsAndLabels(void* result_ptr,
-			     float* classConfs,
-			     int* predictedLabels,
-			     int start, int end){
-
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
- 
   size_t num_classes = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
+
+  int it_count = end - start;
+  for (int i = 0; i < it_count; i++) {
 
-  
-  int it_count = end - start;  
-  for(int i = 0; i < it_count; i++){
-  
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * num_classes + chosen] < data[i * num_classes + id])
+        chosen = id;
     }
 
     predictedLabels[start + i] = chosen;
     classConfs[start + i] = data[i * num_classes + chosen];
   }
-  
-
 }
 
+void dumpClassConfsAndLabels(float *classConfs, int *predictedLabels,
+                             uint32_t *goldLabels, int test_input_size) {
 
-void dumpClassConfsAndLabels(float* classConfs,
-			     int* predictedLabels,
-			     uint32_t* goldLabels, 
-			     int test_input_size){
+  FILE *labels_fp = fopen("predicted_confs_labels.txt", "w+");
 
-  FILE* labels_fp = fopen("predicted_confs_labels.txt", "w+");
-  
-  for (int i = 0; i < test_input_size; i++){
+  for (int i = 0; i < test_input_size; i++) {
 
     int label = predictedLabels[i];
-    int gold_label = (int) goldLabels[i];
+    int gold_label = (int)goldLabels[i];
     float conf = classConfs[i];
-    
+
     std::ostringstream ss;
     ss << std::fixed << conf;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), labels_fp);
     fwrite(" ", 1, 1, labels_fp);
 
-
     std::ostringstream label_ss;
     label_ss << label;
-    std::string label_str = label_ss.str(); 
+    std::string label_str = label_ss.str();
     fwrite(label_str.c_str(), 1, label_str.length(), labels_fp);
     fwrite(" ", 1, 1, labels_fp);
 
-
     std::ostringstream gold_ss;
     gold_ss << gold_label;
-    std::string gold_str = gold_ss.str(); 
+    std::string gold_str = gold_ss.str();
     fwrite(gold_str.c_str(), 1, gold_str.length(), labels_fp);
     fwrite("\n", 1, 1, labels_fp);
- 
- 
   }
 
   fclose(labels_fp);
 }
 
-
-
-
-
 /**** Routines for Handling Piped Execution ***/
-void stallOnOpenTunerSignal(){
+void stallOnOpenTunerSignal() {
 
-  const char* myfifo = "/tmp/opentuner_fifo";
+  const char *myfifo = "/tmp/opentuner_fifo";
   int fd = open(myfifo, O_RDONLY);
-  if (fd == -1){
+  if (fd == -1) {
     printf("OpenTuner pipe could not be opened \n");
     abort();
   }
-    
+
   int ret_val = fcntl(fd, F_GETFD);
-  if(ret_val == -1){
+  if (ret_val == -1) {
     printf("Invalid descriptor \n");
     abort();
   }
@@ -963,32 +885,26 @@ void stallOnOpenTunerSignal(){
   read(fd, str, 100);
   readOpenTunerFlags("promise_flags");
 
-   
-  if(strcmp(str, "stop_run") == 0){
+  if (strcmp(str, "stop_run") == 0) {
     abort();
   }
 
   close(fd);
 }
 
+void signalPipeToOpenTuner() {
 
-
-void signalPipeToOpenTuner(){
-
-  const char* myfifo = "/tmp/opentuner_fifo";
+  const char *myfifo = "/tmp/opentuner_fifo";
   int fd_out = open(myfifo, O_WRONLY);
   int ret_val = fcntl(fd_out, F_GETFD);
-  if(ret_val == -1){
+  if (ret_val == -1) {
     printf("Invalid descriptor \n");
     abort();
   }
-      
-  const char* str = "completed***!\n\0";
+
+  const char *str = "completed***!\n\0";
   write(fd_out, str, 80);
   close(fd_out);
 }
 
-
-
-
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h
index 45ef7211a4..ef4d1afda7 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h
@@ -3,7 +3,6 @@
 #ifndef UTILS_HEADER
 #define UTILS_HEADER
 
-
 #include <sstream>
 #include <vector>
 #include <bits/stdc++.h>
@@ -13,15 +12,13 @@
 #include <cmath>
 #include <stdint.h>
 
-
 std::vector<float> run_accuracies;
 
+void printTensorInfo(void *tensor_ptr) {
 
-void printTensorInfo(void* tensor_ptr){
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
-
-  if(tensor->gpu_data != NULL){
+  if (tensor->gpu_data != NULL) {
     printf("Successful cudaMalloc \n");
   }
 
@@ -31,59 +28,54 @@ void printTensorInfo(void* tensor_ptr){
   printf("num_elems = %lu \n", tensor->num_elems);
 }
 
+void printTensorDims(void *tensor_ptr) {
 
-
-void printTensorDims(void* tensor_ptr){
-
-  struct Tensor* tensor = (struct Tensor*) tensor_ptr;
+  struct Tensor *tensor = (struct Tensor *)tensor_ptr;
 
   printf("Num_elems = %lu \n", tensor->num_elems);
-  for (int i = 0; i < tensor->dims.num_dims; i++){
+  for (int i = 0; i < tensor->dims.num_dims; i++) {
     printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]);
   }
 }
 
+void compareTensors(void *tensor1_ptr, void *tensor2_ptr) {
 
+  struct Tensor *tensor1 = (struct Tensor *)tensor1_ptr;
+  struct Tensor *tensor2 = (struct Tensor *)tensor2_ptr;
 
-void compareTensors(void* tensor1_ptr, void* tensor2_ptr){
-
-  struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr;
-  struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr;
+  // hpvm_request_tensor(tensor1, 0);
+  // hpvm_request_tensor(tensor2, 0);
 
-  //hpvm_request_tensor(tensor1, 0);
-  //hpvm_request_tensor(tensor2, 0);
+  float *tensor_data1 = (float *)tensor1->host_data;
+  float *tensor_data2 = (float *)tensor2->host_data;
 
-  float* tensor_data1 = (float*) tensor1->host_data;
-  float* tensor_data2 = (float*) tensor2->host_data;
-  
-  for(unsigned int i = 0; i < tensor1->num_elems; i++){
-    if(tensor_data1[i] != tensor_data2[i]){
+  for (unsigned int i = 0; i < tensor1->num_elems; i++) {
+    if (tensor_data1[i] != tensor_data2[i]) {
       printf("Tensor data mismatch at index %d \n", i);
       abort();
     }
   }
 }
 
-
-
 //*** FIXIT: Move this to CPU-only
-struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
+struct Tensor *readTrainedWeightsCPU(const char *file_name, int data_type,
+                                     int dim1_size, int dim2_size,
+                                     int dim3_size, int dim4_size) {
 
   // FIXIT: Don't assume floating point types
   int type_size = 4; // NOTE: Assuming floating point tensors
   long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
-  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  long int size_in_bytes =
+      type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
   int file_header_size = 0;
-  
-  FILE* file = fopen(file_name, "rb");
-  if(file == NULL){
+
+  FILE *file = fopen(file_name, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting... \n", file_name);
     abort();
   }
-    
+
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
@@ -91,32 +83,29 @@ struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type,
 
   fclose(file);
 
-  
-  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
-							   dim3_size, dim4_size);
-  
+  struct Tensor *weights = (struct Tensor *)create4DTensor(
+      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+
   initTensorData(weights, tensor_data, size_in_bytes);
-  //compareValues(weights, tensor_data, num_elems);
+  // compareValues(weights, tensor_data, num_elems);
   free(tensor_data);
 
   return weights;
 }
 
+struct Tensor *readTrainedWeights(const char *file_name, int data_type,
+                                  int dim1_size, int dim2_size, int dim3_size,
+                                  int dim4_size) {
 
-struct Tensor* readTrainedWeights(const char* file_name, int data_type,
-				     int dim1_size, int dim2_size,
-				     int dim3_size, int dim4_size){
-
-  return readTrainedWeightsCPU(file_name, data_type, dim1_size, dim2_size, dim3_size, dim4_size);
+  return readTrainedWeightsCPU(file_name, data_type, dim1_size, dim2_size,
+                               dim3_size, dim4_size);
 }
 
+uint8_t *readLabels(const char *labels_file, int num_labels) {
 
-
-uint8_t* readLabels(const char* labels_file, int num_labels){
-
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
@@ -124,176 +113,168 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
   size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
-
-uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+uint8_t *readLabelsBatch(const char *labels_file, int start, int end) {
 
   int num_labels = end - start;
   int file_header_size = sizeof(uint8_t) * start;
-  
-  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
-  FILE* file = fopen(labels_file, "rb");
-  if(file == NULL){
+
+  uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels);
+  FILE *file = fopen(labels_file, "rb");
+  if (file == NULL) {
     printf("Data file %s is not found. Aborting...\n", labels_file);
     abort();
   }
-  
+
   fseek(file, file_header_size, SEEK_SET); // Skipping the file header
-    
-  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
   fclose(file);
-  
+
   return labels;
 }
 
+void computeAccuracy(const char *labels_file, int num_labels,
+                     void *result_ptr) {
 
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){
-
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
-  uint8_t* labels = readLabels(labels_file, num_labels);
+  uint8_t *labels = readLabels(labels_file, num_labels);
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
-  
-  for(int i = 0; i < batch_dim; i++){
+
+  for (int i = 0; i < batch_dim; i++) {
     int chosen = 0;
-    for (int id = 1; id < 10; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
+    for (int id = 1; id < 10; ++id) {
+      if (data[i * channels + chosen] < data[i * channels + id])
+        chosen = id;
     }
-    
-    if(chosen != labels[i])
+
+    if (chosen != labels[i])
       num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     fprintf(fp, "%f", accuracy);
     fclose(fp);
   }
-  
 }
 
+float computeAccuracy2(uint8_t *labels, int num_labels, void *result_ptr,
+                       unsigned num_classes = 10) {
 
+  unsigned num_zeros = 0;
 
-float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-  unsigned num_zeros = 0;
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  
-  for(int i = 0; i < num_labels; i++){  
+
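+  // Per-sample argmax over num_classes scores, compared against the ground-truth label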
+  for (int i = 0; i < num_labels; i++) {
     int chosen = 0;
-    for (int id = 1; id < num_classes; ++id){
-      if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
+    for (int id = 1; id < num_classes; ++id) {
+      if (data[i * channels + chosen] < data[i * channels + id])
+        chosen = id;
     }
-    
-    if(labels[i] == 0)
+
+    if (labels[i] == 0)
       num_zeros++;
-      
-    if(chosen != labels[i])
+
+    if (chosen != labels[i])
       num_errors++;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     fprintf(fp, "%f", accuracy);
+    fclose(fp);
   }
-
-  fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
-
-struct ClassProb{
+struct ClassProb {
   float prob;
   int index;
 };
 
-
-bool descendFloatComp(ClassProb obj1, ClassProb obj2){
+bool descendFloatComp(ClassProb obj1, ClassProb obj2) {
   return obj1.prob > obj2.prob;
 }
 
+float computeTop5Accuracy(uint8_t *labels, int num_labels, void *result_ptr,
+                          unsigned num_classes = 10) {
+
+  struct Tensor *result = (struct Tensor *)result_ptr;
 
-float computeTop5Accuracy(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){
-  
-  struct Tensor* result = (struct Tensor*) result_ptr;
-  
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t channels = result->dims.dim_sizes[1];
-  float* data = (float*) result->host_data;
+  float *data = (float *)result->host_data;
   int num_errors = 0;
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-  
-  for(int i = 0; i < num_labels; i++){
+
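+  // Sort the per-class scores in descending order and check whether the label is among the top 5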
+  for (int i = 0; i < num_labels; i++) {
 
     std::vector<ClassProb> elem_probs;
-    for (int id = 0; id < num_classes; ++id){
+    for (int id = 0; id < num_classes; ++id) {
       ClassProb cProb;
       cProb.prob = data[i * channels + id];
       cProb.index = id;
-      elem_probs.push_back(cProb);   
+      elem_probs.push_back(cProb);
     }
 
-    std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
+    std::sort(elem_probs.begin(), elem_probs.end(), descendFloatComp);
     // Check if any of top-5 predictions matches
     bool matched = false;
-    for(int j = 0; j < 5; j++){
+    for (int j = 0; j < 5; j++) {
       ClassProb cProb = elem_probs[j];
-      if(cProb.index == labels[i])
+      if (cProb.index == labels[i])
         matched = true;
     }
 
-    if(!matched)
-      num_errors +=1; 
+    if (!matched)
+      num_errors += 1;
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     fprintf(fp, "%f", accuracy);
+    fclose(fp);
   }
-
-  fclose(fp);
 
-  return accuracy;    
+  return accuracy;
 }
 
-
-
-
-void dumpFinalAccuracy(float accuracy){
+void dumpFinalAccuracy(float accuracy) {
 
   printf("\n\n **** Final Accuracy = %f \n", accuracy);
-  
-  FILE* fp = fopen("final_accuracy", "w+");
-  if(fp != NULL){
+
+  FILE *fp = fopen("final_accuracy", "w+");
+  if (fp != NULL) {
     fprintf(fp, "%f", accuracy);
   }
 
@@ -302,15 +283,13 @@ void dumpFinalAccuracy(float accuracy){
   run_accuracies.push_back(accuracy);
 }
 
-
-
 /*void dumpAvgPSNR(float avg_psnr){
 
   FILE* fp = fopen("avg_psnr", "w+");
   if(fp != NULL){
     std::ostringstream ss;
     ss << std::fixed << avg_psnr;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
@@ -324,21 +303,18 @@ void dumpFinalAccuracy(float accuracy){
   if(fp != NULL){
     std::ostringstream ss;
     ss << std::fixed << psnr_std;
-    std::string print_str = ss.str(); 
+    std::string print_str = ss.str();
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
   }
 
   fclose(fp);
 }*/
 
-
-
-
 /*
 void dumpExecutionAccuracies(){
 
   FILE* fp = fopen("run_accuracies.txt", "w+");
-  if(fp != NULL){  
+  if(fp != NULL){
     for (int i = 0; i < run_accuracies.size(); i++){
       float accuracy = run_accuracies[i];
       std::ostringstream ss;
@@ -354,56 +330,56 @@ void dumpExecutionAccuracies(){
 }
 */
 
-float readPSNRFromFile(const char* file_name){
+float readPSNRFromFile(const char *file_name) {
 
   float psnr;
-  FILE* pFile = fopen(file_name, "r");
-  if(pFile == NULL){
+  FILE *pFile = fopen(file_name, "r");
+  if (pFile == NULL) {
     printf("ERROR: psnr.txt not found! \n");
     abort();
   }
-  
+
   fscanf(pFile, "%f", &psnr);
   printf("**** PSNR read = %f \n\n", psnr);
-  return psnr; 
+  return psnr;
 }
 
+/*float computePSNRViolation(void* gold_ptr, void* approx_ptr, float
+PSNR_threshold){
 
-/*float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){
 
-  
   PSNR_threshold = readPSNRFromFile("psnr.txt");
   std::vector<float> psnr_list;
-  
+
   struct Tensor* gold_tensor = (struct Tensor*) gold_ptr;
   struct Tensor* approx_tensor = (struct Tensor*) approx_ptr;
 
   size_t* dim_sizes = gold_tensor->dims.dim_sizes;
   size_t batch_dim = dim_sizes[0];
   size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
-  
+
   printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size);
-	 
+
   float* gold_data = (float*) gold_tensor->host_data;
   float* approx_data = (float*) approx_tensor->host_data;
 
   FILE* fp = fopen("img_psnr.txt", "w+");
 
   float sum_psnr = 0.0;
-  int num_errors = 0;  
+  int num_errors = 0;
   for(size_t i = 0; i < batch_dim; i++){
     float mse_sum = 0.0;
-    float max_val = -999999;     
+    float max_val = -999999;
     size_t offset = i * image_size;
-    
+
     for(size_t j = 0; j < image_size; j++){
       float diff = gold_data[offset + j] - approx_data[offset + j];
       float diff_square = diff * diff;
       mse_sum += diff_square;
 
       if(max_val < gold_data[offset + j]){
-	max_val = gold_data[offset + j];
-      }   
+        max_val = gold_data[offset + j];
+      }
     }
 
     mse_sum = mse_sum / image_size;
@@ -411,7 +387,7 @@ float readPSNRFromFile(const char* file_name){
 
     sum_psnr += psnr;
     if (psnr < PSNR_threshold)
-      num_errors += 1;    
+      num_errors += 1;
 
     printf("PSNR value = %f \n", psnr);
     psnr_list.push_back(psnr);
@@ -429,7 +405,7 @@ float readPSNRFromFile(const char* file_name){
   float avg_psnr = sum_psnr / batch_dim;
   printf("*** avg_psnr =  %f \n\n", avg_psnr);
   dumpAvgPSNR(avg_psnr);
- 
+
   float success_rate = 100.0 - violation_rate;
   dumpFinalAccuracy(success_rate);
 
@@ -438,30 +414,27 @@ float readPSNRFromFile(const char* file_name){
 
   float var = 0.0;
   for(size_t i = 0; i < batch_dim; i++){
-    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); 
+    var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr);
   }
 
   var /= batch_dim;
   float std = sqrt(var);
 
   //dumpPSNRStd(std);
-  
-  return violation_rate;  
-}*/
 
+  return violation_rate;
+}*/
 
-void dumpOutput(void* output_ptr, const char* file_name){
+void dumpOutput(void *output_ptr, const char *file_name) {
 
-  struct Tensor* out_tensor = (struct Tensor*) output_ptr;  
+  struct Tensor *out_tensor = (struct Tensor *)output_ptr;
   size_t size_in_bytes = out_tensor->size_in_bytes;
-  printf ("** Output size = %lu \n", size_in_bytes);
-  
-  float* host_data = (float*) out_tensor->host_data; 
-  FILE* fd = fopen(file_name, "w+");
+  printf("** Output size = %lu \n", size_in_bytes);
+
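+  // Write the tensor's host-side float data verbatim to a binary file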
+  float *host_data = (float *)out_tensor->host_data;
+  FILE *fd = fopen(file_name, "w+");
   fwrite(host_data, 1, size_in_bytes, fd);
   fclose(fd);
 }
 
-
-
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
index d93110945b..846500ad35 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc
@@ -11,50 +11,62 @@
 #include "../../../tensor_runtime/include/tensor_runtime.h"
 #include "../../include/utils.h"
 
-
-
 /* NOTE: Reference Architecture to use for profiling */
-void testCifarNet(){
+void testCifarNet() {
 
   printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
- 
-  std::string dir_prefix = model_params_path + std::string("/alexnet2_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-  
-  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
 
+  std::string dir_prefix =
+      model_params_path + std::string("/alexnet2_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+
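+  // Conv filters are loaded as (out_channels, in_channels, kernel_h, kernel_w); biases as 1xCx1x1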
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
+
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision =
+      0; // NOTE: using Float as compute precision. FIXIT: use enum
 
   startMemTracking();
 
@@ -65,61 +77,61 @@ void testCifarNet(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv1out, conv2d_1_b); 
-    void* conv1_tanh = tensorHalfTanh(conv1out);
-    
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv1out, conv2d_1_b);
+    void *conv1_tanh = tensorHalfTanh(conv1out);
+
     // 2nd Layer
-    void* conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv2out, conv2d_2_b); 
-    void* conv2_tanh = tensorHalfTanh(conv2out);
-    void* pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
-     
+    void *conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv2out, conv2d_2_b);
+    void *conv2_tanh = tensorHalfTanh(conv2out);
+    void *pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 3rd Layer
-    void* conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv3out, conv2d_3_b); 
-    void* conv3_tanh = tensorHalfTanh(conv3out);
+    void *conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv3out, conv2d_3_b);
+    void *conv3_tanh = tensorHalfTanh(conv3out);
 
     // 4th Layer
-    void* conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv4out, conv2d_4_b); 
-    void* conv4_tanh = tensorHalfTanh(conv4out);
-    void* pool4out = tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv4out, conv2d_4_b);
+    void *conv4_tanh = tensorHalfTanh(conv4out);
+    void *pool4out = tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 5th Layer
-    void* conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorHalfAdd(conv5out, conv2d_5_b); 
-    void* conv5_tanh = tensorHalfTanh(conv5out);
+    void *conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
+    tensorHalfAdd(conv5out, conv2d_5_b);
+    void *conv5_tanh = tensorHalfTanh(conv5out);
 
     // 6th Layer
-    void* conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
+    void *conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
+                                           conv_mode, conv_precision);
     tensorHalfAdd(conv6out, conv2d_6_b);
-    void* conv6_tanh = tensorHalfTanh(conv6out);
-    void* pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv6_tanh = tensorHalfTanh(conv6out);
+    void *pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // final FC Layer
-    void* gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w);  
-    void* gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b);
-    void* result = tensorSoftmax(gemm1biasout);
+    void *gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w);
+    void *gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b);
+    void *result = tensorSoftmax(gemm1biasout);
 
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
-    float accuracy = computeAccuracy2(labels, batch_size, result); 
+    float accuracy = computeAccuracy2(labels, batch_size, result);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -127,11 +139,9 @@ void testCifarNet(){
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-
 }
 
-
-int main(int argc, char* argv[]){
+int main(int argc, char *argv[]) {
 
   llvm_hpvm_initTensorRt(0);
 
@@ -141,4 +151,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
index b7695bbd7a..2bde9d1eea 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc
@@ -1,49 +1,58 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../../tensor_runtime/include/tensor_runtime.h" 
-#include "../../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-
-  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); 
-
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -54,40 +63,40 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-   for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
-
-    void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
-    void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorHalfTanh(var_1); 
-    void* var_3 = tensorHalfPooling(var_2,0,2,2,0,0,2,2); 
-    void* var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
-    void* var_6 = tensorHalfAdd(var_5, conv2d_2_b); 
-    void* var_7 = tensorHalfTanh(var_6); 
-    void* var_8 = tensorHalfPooling(var_7,0,2,2,0,0,2,2); 
-    void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorHalfTanh(var_11); 
-    void* var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_14 = tensorHalfAdd(var_13, conv2d_4_b); 
-    void* var_15 = tensorHalfTanh(var_14); 
-    void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorHalfTanh(var_17); 
-    void* var_19 = tensorHalfPooling(var_18,0,2,2,0,0,2,2); 
-    void* var_22 = tensorHalfGemmGPU(var_19, dense_1_w); 
-    void* var_23 = tensorHalfAdd(var_22, dense_1_b); 
-    void* var_24 = tensorSoftmax(var_23); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
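+    // AlexNet-style stack: 5 conv+tanh layers (pooling after layers 1, 2, and 5), then one dense layer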
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0);
+    void *var_1 = tensorHalfAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorHalfTanh(var_1);
+    void *var_3 = tensorHalfPooling(var_2, 0, 2, 2, 0, 0, 2, 2);
+    void *var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0);
+    void *var_6 = tensorHalfAdd(var_5, conv2d_2_b);
+    void *var_7 = tensorHalfTanh(var_6);
+    void *var_8 = tensorHalfPooling(var_7, 0, 2, 2, 0, 0, 2, 2);
+    void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorHalfAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorHalfTanh(var_11);
+    void *var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_14 = tensorHalfAdd(var_13, conv2d_4_b);
+    void *var_15 = tensorHalfTanh(var_14);
+    void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorHalfAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorHalfTanh(var_17);
+    void *var_19 = tensorHalfPooling(var_18, 0, 2, 2, 0, 0, 2, 2);
+    void *var_22 = tensorHalfGemmGPU(var_19, dense_1_w);
+    void *var_23 = tensorHalfAdd(var_22, dense_1_b);
+    void *var_24 = tensorSoftmax(var_23);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_24);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -96,9 +105,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
+  llvm_hpvm_cleanupTensorRt();
 
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
-
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
index 29f392c630..037a3d7a3e 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc
@@ -8,108 +8,102 @@
 #include <sys/stat.h>
 #include <string.h>
 
-
 #include "tensor_runtime.h"
 #include "utils.h"
 
-
 /* NOTE: Reference Architecture to use for profiling */
-void testLenetTanh(){
+void testLenetTanh() {
   int total_runs = 1;
   printf("********* Lenet-2 Architecture ********** \n");
   // FIXIT: Extend this to batch of images - currently 5 images
 
   int test_batch_size = 5000;
 
-  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");   
+  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  
   // Loading Input Batch
-  void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); 
-  uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); 
-    
-  void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin",
-					  float_type, 32, 1, 5, 5);    
-  void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin",
-					float_type, 1, 32, 1, 1);  
-  void* conv2_filter = readTrainedWeights("../model_params/lenet_mnist/conv2.bin",
-					  float_type, 64, 32, 5, 5);  
-  void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin",
-					float_type, 1, 64, 1, 1);  
-  void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
-					 float_type, 1, 1, 7*7*64, 1024);  
-  void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin",
-				      float_type, 1, 1024, 1, 1);  
-  void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
-					 float_type, 1, 1, 1024, 10);  
-  void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin",
-				      float_type, 1, 10, 1, 1);  
-
-
-  
+  void *input =
+      readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28);
+  uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size);
+
+  void *conv1_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5);
+  void *conv1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1);
+  void *conv2_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5);
+  void *conv2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1);
+  void *fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
+                                         float_type, 1, 1, 7 * 7 * 64, 1024);
+  void *fc1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1);
+  void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
+                                         float_type, 1, 1, 1024, 10);
+  void *fc2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1);
+
   clearTensorMap();
-  
-  for(int i = 0; i < total_runs; i++){
+
+  for (int i = 0; i < total_runs; i++) {
     readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
 
-    // Start power and performnce profiling 
+    // Start power and performance profiling
     startProfiling();
-  
+
     int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+    int conv_precision =
+        0; // NOTE: using Float as compute precision. FIXIT: use enum
 
     // NOTE: 'SAME' convolution
-    void* conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    void *conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1,
+                                           conv_mode, conv_precision);
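+    // 2-pixel padding with 5x5 filters keeps the 28x28 spatial size ("SAME" convolution)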
 
-    // NOTE: For tensorAdd, the only dimension that MUST match is channels  
+    // NOTE: For tensorAdd, the only dimension that MUST match is channels
     tensorHalfAdd(conv1out, conv1_bias); // NOTE: In place operation
 
-    void* pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
 
-    void* conv1_tanh = tensorHalfTanh(pool1out);
+    void *conv1_tanh = tensorHalfTanh(pool1out);
 
-    // NOTE: input channels have to match between tensor op inputs and outputs 
-    void* conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    // NOTE: input channels have to match between tensor op inputs and outputs
+    void *conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+                                           conv_mode, conv_precision);
     tensorHalfAdd(conv2out, conv2_bias); // NOTE: In place operation
 
-    void* pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+
+    void *conv2_tanh = tensorHalfTanh(pool2out);
+
+    void *gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights);
 
-    void* conv2_tanh = tensorHalfTanh(pool2out);
+    void *gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias);
 
-    void* gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights);  
+    void *tanh1out = tensorHalfTanh(gemm1biasout);
 
-    void* gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias);
+    void *gemm2out = tensorHalfGemm(tanh1out, fc2_weights);
 
-    void* tanh1out = tensorHalfTanh(gemm1biasout);
-  
-    void* gemm2out = tensorHalfGemm(tanh1out, fc2_weights);  
-  
-    void* gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias);
+    void *gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias);
 
-    void* tanh2out = tensorHalfTanh(gemm2_biasout);
-  
-    void* result = tensorSoftmax(tanh2out);
+    void *tanh2out = tensorHalfTanh(gemm2_biasout);
+
+    void *result = tensorSoftmax(tanh2out);
 
     // End profiling and dump output to profile.txt
     stopProfiling();
-  
+
     computeAccuracy2(labels, test_batch_size, result);
-    
+
     dumpAccuracyNorms();
     freeOutputTensors();
   }
-
-
-  
 }
 
-
-int main(int argc, char* argv[]){
+int main(int argc, char *argv[]) {
   llvm_hpvm_initTensorRt(0);
 
   testLenetTanh();
@@ -118,4 +112,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
index d662dc1584..8940aeb3f1 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc
@@ -1,411 +1,731 @@
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
 
 #include "../../../tensor_runtime/include/tensor_runtime.h"
 #include "../../include/utils.h"
 
-int main(){ 
+int main() {
 
-    llvm_hpvm_initTensorRt(0); 
+  llvm_hpvm_initTensorRt(0);
 
+  std::string dir_prefix = model_params_path + std::string("/mobilenet/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string batch_normalization_1_gamma_path =
+      dir_prefix + std::string("batch_normalization_1_gamma.bin");
+  void *batch_normalization_1_gamma = readTrainedWeights(
+      batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_beta_path =
+      dir_prefix + std::string("batch_normalization_1_beta.bin");
+  void *batch_normalization_1_beta = readTrainedWeights(
+      batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_mean_path =
+      dir_prefix + std::string("batch_normalization_1_mean.bin");
+  void *batch_normalization_1_mean = readTrainedWeights(
+      batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_variance_path =
+      dir_prefix + std::string("batch_normalization_1_variance.bin");
+  void *batch_normalization_1_variance = readTrainedWeights(
+      batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1);
+  std::string depthwise_conv2d_1_w_path =
+      dir_prefix + std::string("depthwise_conv2d_1_w.bin");
+  void *depthwise_conv2d_1_w =
+      readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3);
+  std::string batch_normalization_2_gamma_path =
+      dir_prefix + std::string("batch_normalization_2_gamma.bin");
+  void *batch_normalization_2_gamma = readTrainedWeights(
+      batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_beta_path =
+      dir_prefix + std::string("batch_normalization_2_beta.bin");
+  void *batch_normalization_2_beta = readTrainedWeights(
+      batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_mean_path =
+      dir_prefix + std::string("batch_normalization_2_mean.bin");
+  void *batch_normalization_2_mean = readTrainedWeights(
+      batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_variance_path =
+      dir_prefix + std::string("batch_normalization_2_variance.bin");
+  void *batch_normalization_2_variance = readTrainedWeights(
+      batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string batch_normalization_3_gamma_path =
+      dir_prefix + std::string("batch_normalization_3_gamma.bin");
+  void *batch_normalization_3_gamma = readTrainedWeights(
+      batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_beta_path =
+      dir_prefix + std::string("batch_normalization_3_beta.bin");
+  void *batch_normalization_3_beta = readTrainedWeights(
+      batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_mean_path =
+      dir_prefix + std::string("batch_normalization_3_mean.bin");
+  void *batch_normalization_3_mean = readTrainedWeights(
+      batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_variance_path =
+      dir_prefix + std::string("batch_normalization_3_variance.bin");
+  void *batch_normalization_3_variance = readTrainedWeights(
+      batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string depthwise_conv2d_2_w_path =
+      dir_prefix + std::string("depthwise_conv2d_2_w.bin");
+  void *depthwise_conv2d_2_w =
+      readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3);
+  std::string batch_normalization_4_gamma_path =
+      dir_prefix + std::string("batch_normalization_4_gamma.bin");
+  void *batch_normalization_4_gamma = readTrainedWeights(
+      batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_beta_path =
+      dir_prefix + std::string("batch_normalization_4_beta.bin");
+  void *batch_normalization_4_beta = readTrainedWeights(
+      batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_mean_path =
+      dir_prefix + std::string("batch_normalization_4_mean.bin");
+  void *batch_normalization_4_mean = readTrainedWeights(
+      batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_variance_path =
+      dir_prefix + std::string("batch_normalization_4_variance.bin");
+  void *batch_normalization_4_variance = readTrainedWeights(
+      batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1);
+  std::string batch_normalization_5_gamma_path =
+      dir_prefix + std::string("batch_normalization_5_gamma.bin");
+  void *batch_normalization_5_gamma = readTrainedWeights(
+      batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_beta_path =
+      dir_prefix + std::string("batch_normalization_5_beta.bin");
+  void *batch_normalization_5_beta = readTrainedWeights(
+      batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_mean_path =
+      dir_prefix + std::string("batch_normalization_5_mean.bin");
+  void *batch_normalization_5_mean = readTrainedWeights(
+      batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_variance_path =
+      dir_prefix + std::string("batch_normalization_5_variance.bin");
+  void *batch_normalization_5_variance = readTrainedWeights(
+      batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_3_w_path =
+      dir_prefix + std::string("depthwise_conv2d_3_w.bin");
+  void *depthwise_conv2d_3_w =
+      readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_6_gamma_path =
+      dir_prefix + std::string("batch_normalization_6_gamma.bin");
+  void *batch_normalization_6_gamma = readTrainedWeights(
+      batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_beta_path =
+      dir_prefix + std::string("batch_normalization_6_beta.bin");
+  void *batch_normalization_6_beta = readTrainedWeights(
+      batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_mean_path =
+      dir_prefix + std::string("batch_normalization_6_mean.bin");
+  void *batch_normalization_6_mean = readTrainedWeights(
+      batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_variance_path =
+      dir_prefix + std::string("batch_normalization_6_variance.bin");
+  void *batch_normalization_6_variance = readTrainedWeights(
+      batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1);
+  std::string batch_normalization_7_gamma_path =
+      dir_prefix + std::string("batch_normalization_7_gamma.bin");
+  void *batch_normalization_7_gamma = readTrainedWeights(
+      batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_beta_path =
+      dir_prefix + std::string("batch_normalization_7_beta.bin");
+  void *batch_normalization_7_beta = readTrainedWeights(
+      batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_mean_path =
+      dir_prefix + std::string("batch_normalization_7_mean.bin");
+  void *batch_normalization_7_mean = readTrainedWeights(
+      batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_variance_path =
+      dir_prefix + std::string("batch_normalization_7_variance.bin");
+  void *batch_normalization_7_variance = readTrainedWeights(
+      batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_4_w_path =
+      dir_prefix + std::string("depthwise_conv2d_4_w.bin");
+  void *depthwise_conv2d_4_w =
+      readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_8_gamma_path =
+      dir_prefix + std::string("batch_normalization_8_gamma.bin");
+  void *batch_normalization_8_gamma = readTrainedWeights(
+      batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_beta_path =
+      dir_prefix + std::string("batch_normalization_8_beta.bin");
+  void *batch_normalization_8_beta = readTrainedWeights(
+      batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_mean_path =
+      dir_prefix + std::string("batch_normalization_8_mean.bin");
+  void *batch_normalization_8_mean = readTrainedWeights(
+      batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_variance_path =
+      dir_prefix + std::string("batch_normalization_8_variance.bin");
+  void *batch_normalization_8_variance = readTrainedWeights(
+      batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1);
+  std::string batch_normalization_9_gamma_path =
+      dir_prefix + std::string("batch_normalization_9_gamma.bin");
+  void *batch_normalization_9_gamma = readTrainedWeights(
+      batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_beta_path =
+      dir_prefix + std::string("batch_normalization_9_beta.bin");
+  void *batch_normalization_9_beta = readTrainedWeights(
+      batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_mean_path =
+      dir_prefix + std::string("batch_normalization_9_mean.bin");
+  void *batch_normalization_9_mean = readTrainedWeights(
+      batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_variance_path =
+      dir_prefix + std::string("batch_normalization_9_variance.bin");
+  void *batch_normalization_9_variance = readTrainedWeights(
+      batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_5_w_path =
+      dir_prefix + std::string("depthwise_conv2d_5_w.bin");
+  void *depthwise_conv2d_5_w =
+      readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_10_gamma_path =
+      dir_prefix + std::string("batch_normalization_10_gamma.bin");
+  void *batch_normalization_10_gamma = readTrainedWeights(
+      batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_beta_path =
+      dir_prefix + std::string("batch_normalization_10_beta.bin");
+  void *batch_normalization_10_beta = readTrainedWeights(
+      batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_mean_path =
+      dir_prefix + std::string("batch_normalization_10_mean.bin");
+  void *batch_normalization_10_mean = readTrainedWeights(
+      batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_variance_path =
+      dir_prefix + std::string("batch_normalization_10_variance.bin");
+  void *batch_normalization_10_variance = readTrainedWeights(
+      batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 1, 1);
+  std::string batch_normalization_11_gamma_path =
+      dir_prefix + std::string("batch_normalization_11_gamma.bin");
+  void *batch_normalization_11_gamma = readTrainedWeights(
+      batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_beta_path =
+      dir_prefix + std::string("batch_normalization_11_beta.bin");
+  void *batch_normalization_11_beta = readTrainedWeights(
+      batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_mean_path =
+      dir_prefix + std::string("batch_normalization_11_mean.bin");
+  void *batch_normalization_11_mean = readTrainedWeights(
+      batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_variance_path =
+      dir_prefix + std::string("batch_normalization_11_variance.bin");
+  void *batch_normalization_11_variance = readTrainedWeights(
+      batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_6_w_path =
+      dir_prefix + std::string("depthwise_conv2d_6_w.bin");
+  void *depthwise_conv2d_6_w =
+      readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_12_gamma_path =
+      dir_prefix + std::string("batch_normalization_12_gamma.bin");
+  void *batch_normalization_12_gamma = readTrainedWeights(
+      batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_beta_path =
+      dir_prefix + std::string("batch_normalization_12_beta.bin");
+  void *batch_normalization_12_beta = readTrainedWeights(
+      batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_mean_path =
+      dir_prefix + std::string("batch_normalization_12_mean.bin");
+  void *batch_normalization_12_mean = readTrainedWeights(
+      batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_variance_path =
+      dir_prefix + std::string("batch_normalization_12_variance.bin");
+  void *batch_normalization_12_variance = readTrainedWeights(
+      batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1);
+  std::string batch_normalization_13_gamma_path =
+      dir_prefix + std::string("batch_normalization_13_gamma.bin");
+  void *batch_normalization_13_gamma = readTrainedWeights(
+      batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_beta_path =
+      dir_prefix + std::string("batch_normalization_13_beta.bin");
+  void *batch_normalization_13_beta = readTrainedWeights(
+      batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_mean_path =
+      dir_prefix + std::string("batch_normalization_13_mean.bin");
+  void *batch_normalization_13_mean = readTrainedWeights(
+      batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_variance_path =
+      dir_prefix + std::string("batch_normalization_13_variance.bin");
+  void *batch_normalization_13_variance = readTrainedWeights(
+      batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_7_w_path =
+      dir_prefix + std::string("depthwise_conv2d_7_w.bin");
+  void *depthwise_conv2d_7_w =
+      readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_14_gamma_path =
+      dir_prefix + std::string("batch_normalization_14_gamma.bin");
+  void *batch_normalization_14_gamma = readTrainedWeights(
+      batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_beta_path =
+      dir_prefix + std::string("batch_normalization_14_beta.bin");
+  void *batch_normalization_14_beta = readTrainedWeights(
+      batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_mean_path =
+      dir_prefix + std::string("batch_normalization_14_mean.bin");
+  void *batch_normalization_14_mean = readTrainedWeights(
+      batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_variance_path =
+      dir_prefix + std::string("batch_normalization_14_variance.bin");
+  void *batch_normalization_14_variance = readTrainedWeights(
+      batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_15_gamma_path =
+      dir_prefix + std::string("batch_normalization_15_gamma.bin");
+  void *batch_normalization_15_gamma = readTrainedWeights(
+      batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_beta_path =
+      dir_prefix + std::string("batch_normalization_15_beta.bin");
+  void *batch_normalization_15_beta = readTrainedWeights(
+      batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_mean_path =
+      dir_prefix + std::string("batch_normalization_15_mean.bin");
+  void *batch_normalization_15_mean = readTrainedWeights(
+      batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_variance_path =
+      dir_prefix + std::string("batch_normalization_15_variance.bin");
+  void *batch_normalization_15_variance = readTrainedWeights(
+      batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_8_w_path =
+      dir_prefix + std::string("depthwise_conv2d_8_w.bin");
+  void *depthwise_conv2d_8_w =
+      readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_16_gamma_path =
+      dir_prefix + std::string("batch_normalization_16_gamma.bin");
+  void *batch_normalization_16_gamma = readTrainedWeights(
+      batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_beta_path =
+      dir_prefix + std::string("batch_normalization_16_beta.bin");
+  void *batch_normalization_16_beta = readTrainedWeights(
+      batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_mean_path =
+      dir_prefix + std::string("batch_normalization_16_mean.bin");
+  void *batch_normalization_16_mean = readTrainedWeights(
+      batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_variance_path =
+      dir_prefix + std::string("batch_normalization_16_variance.bin");
+  void *batch_normalization_16_variance = readTrainedWeights(
+      batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_17_gamma_path =
+      dir_prefix + std::string("batch_normalization_17_gamma.bin");
+  void *batch_normalization_17_gamma = readTrainedWeights(
+      batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_beta_path =
+      dir_prefix + std::string("batch_normalization_17_beta.bin");
+  void *batch_normalization_17_beta = readTrainedWeights(
+      batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_mean_path =
+      dir_prefix + std::string("batch_normalization_17_mean.bin");
+  void *batch_normalization_17_mean = readTrainedWeights(
+      batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_variance_path =
+      dir_prefix + std::string("batch_normalization_17_variance.bin");
+  void *batch_normalization_17_variance = readTrainedWeights(
+      batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_9_w_path =
+      dir_prefix + std::string("depthwise_conv2d_9_w.bin");
+  void *depthwise_conv2d_9_w =
+      readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_18_gamma_path =
+      dir_prefix + std::string("batch_normalization_18_gamma.bin");
+  void *batch_normalization_18_gamma = readTrainedWeights(
+      batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_beta_path =
+      dir_prefix + std::string("batch_normalization_18_beta.bin");
+  void *batch_normalization_18_beta = readTrainedWeights(
+      batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_mean_path =
+      dir_prefix + std::string("batch_normalization_18_mean.bin");
+  void *batch_normalization_18_mean = readTrainedWeights(
+      batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_variance_path =
+      dir_prefix + std::string("batch_normalization_18_variance.bin");
+  void *batch_normalization_18_variance = readTrainedWeights(
+      batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_19_gamma_path =
+      dir_prefix + std::string("batch_normalization_19_gamma.bin");
+  void *batch_normalization_19_gamma = readTrainedWeights(
+      batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_beta_path =
+      dir_prefix + std::string("batch_normalization_19_beta.bin");
+  void *batch_normalization_19_beta = readTrainedWeights(
+      batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_mean_path =
+      dir_prefix + std::string("batch_normalization_19_mean.bin");
+  void *batch_normalization_19_mean = readTrainedWeights(
+      batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_variance_path =
+      dir_prefix + std::string("batch_normalization_19_variance.bin");
+  void *batch_normalization_19_variance = readTrainedWeights(
+      batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_10_w_path =
+      dir_prefix + std::string("depthwise_conv2d_10_w.bin");
+  void *depthwise_conv2d_10_w =
+      readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_20_gamma_path =
+      dir_prefix + std::string("batch_normalization_20_gamma.bin");
+  void *batch_normalization_20_gamma = readTrainedWeights(
+      batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_beta_path =
+      dir_prefix + std::string("batch_normalization_20_beta.bin");
+  void *batch_normalization_20_beta = readTrainedWeights(
+      batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_mean_path =
+      dir_prefix + std::string("batch_normalization_20_mean.bin");
+  void *batch_normalization_20_mean = readTrainedWeights(
+      batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_variance_path =
+      dir_prefix + std::string("batch_normalization_20_variance.bin");
+  void *batch_normalization_20_variance = readTrainedWeights(
+      batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_21_gamma_path =
+      dir_prefix + std::string("batch_normalization_21_gamma.bin");
+  void *batch_normalization_21_gamma = readTrainedWeights(
+      batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_beta_path =
+      dir_prefix + std::string("batch_normalization_21_beta.bin");
+  void *batch_normalization_21_beta = readTrainedWeights(
+      batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_mean_path =
+      dir_prefix + std::string("batch_normalization_21_mean.bin");
+  void *batch_normalization_21_mean = readTrainedWeights(
+      batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_variance_path =
+      dir_prefix + std::string("batch_normalization_21_variance.bin");
+  void *batch_normalization_21_variance = readTrainedWeights(
+      batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_11_w_path =
+      dir_prefix + std::string("depthwise_conv2d_11_w.bin");
+  void *depthwise_conv2d_11_w =
+      readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_22_gamma_path =
+      dir_prefix + std::string("batch_normalization_22_gamma.bin");
+  void *batch_normalization_22_gamma = readTrainedWeights(
+      batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_beta_path =
+      dir_prefix + std::string("batch_normalization_22_beta.bin");
+  void *batch_normalization_22_beta = readTrainedWeights(
+      batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_mean_path =
+      dir_prefix + std::string("batch_normalization_22_mean.bin");
+  void *batch_normalization_22_mean = readTrainedWeights(
+      batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_variance_path =
+      dir_prefix + std::string("batch_normalization_22_variance.bin");
+  void *batch_normalization_22_variance = readTrainedWeights(
+      batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_23_gamma_path =
+      dir_prefix + std::string("batch_normalization_23_gamma.bin");
+  void *batch_normalization_23_gamma = readTrainedWeights(
+      batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_beta_path =
+      dir_prefix + std::string("batch_normalization_23_beta.bin");
+  void *batch_normalization_23_beta = readTrainedWeights(
+      batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_mean_path =
+      dir_prefix + std::string("batch_normalization_23_mean.bin");
+  void *batch_normalization_23_mean = readTrainedWeights(
+      batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_variance_path =
+      dir_prefix + std::string("batch_normalization_23_variance.bin");
+  void *batch_normalization_23_variance = readTrainedWeights(
+      batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_12_w_path =
+      dir_prefix + std::string("depthwise_conv2d_12_w.bin");
+  void *depthwise_conv2d_12_w =
+      readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_24_gamma_path =
+      dir_prefix + std::string("batch_normalization_24_gamma.bin");
+  void *batch_normalization_24_gamma = readTrainedWeights(
+      batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_beta_path =
+      dir_prefix + std::string("batch_normalization_24_beta.bin");
+  void *batch_normalization_24_beta = readTrainedWeights(
+      batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_mean_path =
+      dir_prefix + std::string("batch_normalization_24_mean.bin");
+  void *batch_normalization_24_mean = readTrainedWeights(
+      batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_variance_path =
+      dir_prefix + std::string("batch_normalization_24_variance.bin");
+  void *batch_normalization_24_variance = readTrainedWeights(
+      batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1);
+  std::string batch_normalization_25_gamma_path =
+      dir_prefix + std::string("batch_normalization_25_gamma.bin");
+  void *batch_normalization_25_gamma = readTrainedWeights(
+      batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_beta_path =
+      dir_prefix + std::string("batch_normalization_25_beta.bin");
+  void *batch_normalization_25_beta = readTrainedWeights(
+      batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_mean_path =
+      dir_prefix + std::string("batch_normalization_25_mean.bin");
+  void *batch_normalization_25_mean = readTrainedWeights(
+      batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_variance_path =
+      dir_prefix + std::string("batch_normalization_25_variance.bin");
+  void *batch_normalization_25_variance = readTrainedWeights(
+      batch_normalization_25_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string depthwise_conv2d_13_w_path =
+      dir_prefix + std::string("depthwise_conv2d_13_w.bin");
+  void *depthwise_conv2d_13_w =
+      readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3);
+  std::string batch_normalization_26_gamma_path =
+      dir_prefix + std::string("batch_normalization_26_gamma.bin");
+  void *batch_normalization_26_gamma = readTrainedWeights(
+      batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_beta_path =
+      dir_prefix + std::string("batch_normalization_26_beta.bin");
+  void *batch_normalization_26_beta = readTrainedWeights(
+      batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_mean_path =
+      dir_prefix + std::string("batch_normalization_26_mean.bin");
+  void *batch_normalization_26_mean = readTrainedWeights(
+      batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_variance_path =
+      dir_prefix + std::string("batch_normalization_26_variance.bin");
+  void *batch_normalization_26_variance = readTrainedWeights(
+      batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1);
+  std::string batch_normalization_27_gamma_path =
+      dir_prefix + std::string("batch_normalization_27_gamma.bin");
+  void *batch_normalization_27_gamma = readTrainedWeights(
+      batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_beta_path =
+      dir_prefix + std::string("batch_normalization_27_beta.bin");
+  void *batch_normalization_27_beta = readTrainedWeights(
+      batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_mean_path =
+      dir_prefix + std::string("batch_normalization_27_mean.bin");
+  void *batch_normalization_27_mean = readTrainedWeights(
+      batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_variance_path =
+      dir_prefix + std::string("batch_normalization_27_variance.bin");
+  void *batch_normalization_27_variance = readTrainedWeights(
+      batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
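+  // Note on the loads above: readTrainedWeights(path, 0, d0, d1, d2, d3)
+  // takes four dimension arguments. For the convolution filters these read as
+  // (output channels, input channels, kernel height, kernel width), and the
+  // batch-norm parameters are loaded as (1, channels, 1, 1) vectors; this
+  // reading is inferred from the call pattern in this file.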
 
-    std::string dir_prefix = model_params_path + std::string("/mobilenet/"); 
-    std::string input_path =  dir_prefix + std::string("input.bin"); 
-    std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-    std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-    void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-    std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
-    void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
-    void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
-    void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
-    void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); 
-    std::string depthwise_conv2d_1_w_path =  dir_prefix + std::string("depthwise_conv2d_1_w.bin"); 
-    void* depthwise_conv2d_1_w =  readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); 
-    std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
-    void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
-    void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
-    void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); 
-    std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
-    void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); 
-    std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-    void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); 
-    std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
-    void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
-    void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
-    void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
-    void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
-    std::string depthwise_conv2d_2_w_path =  dir_prefix + std::string("depthwise_conv2d_2_w.bin"); 
-    void* depthwise_conv2d_2_w =  readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); 
-    std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
-    void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
-    void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
-    void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); 
-    std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
-    void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
-    std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-    void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); 
-    std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
-    void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
-    void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
-    void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
-    void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); 
-    std::string depthwise_conv2d_3_w_path =  dir_prefix + std::string("depthwise_conv2d_3_w.bin"); 
-    void* depthwise_conv2d_3_w =  readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); 
-    std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
-    void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
-    void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
-    void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
-    void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-    void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); 
-    std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
-    void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
-    void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
-    void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
-    void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); 
-    std::string depthwise_conv2d_4_w_path =  dir_prefix + std::string("depthwise_conv2d_4_w.bin"); 
-    void* depthwise_conv2d_4_w =  readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); 
-    std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
-    void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
-    void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
-    void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); 
-    std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
-    void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-    void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); 
-    std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
-    void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
-    void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
-    void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
-    void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); 
-    std::string depthwise_conv2d_5_w_path =  dir_prefix + std::string("depthwise_conv2d_5_w.bin"); 
-    void* depthwise_conv2d_5_w =  readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); 
-    std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
-    void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
-    void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
-    void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
-    void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-    void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); 
-    std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
-    void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
-    void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
-    void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
-    void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
-    std::string depthwise_conv2d_6_w_path =  dir_prefix + std::string("depthwise_conv2d_6_w.bin"); 
-    void* depthwise_conv2d_6_w =  readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); 
-    std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
-    void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
-    void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
-    void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); 
-    std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
-    void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-    void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); 
-    std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
-    void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
-    void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
-    void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
-    void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_7_w_path =  dir_prefix + std::string("depthwise_conv2d_7_w.bin"); 
-    void* depthwise_conv2d_7_w =  readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_14_gamma_path =  dir_prefix + std::string("batch_normalization_14_gamma.bin"); 
-    void* batch_normalization_14_gamma =  readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_14_beta_path =  dir_prefix + std::string("batch_normalization_14_beta.bin"); 
-    void* batch_normalization_14_beta =  readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_14_mean_path =  dir_prefix + std::string("batch_normalization_14_mean.bin"); 
-    void* batch_normalization_14_mean =  readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_14_variance_path =  dir_prefix + std::string("batch_normalization_14_variance.bin"); 
-    void* batch_normalization_14_variance =  readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-    void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_15_gamma_path =  dir_prefix + std::string("batch_normalization_15_gamma.bin"); 
-    void* batch_normalization_15_gamma =  readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_15_beta_path =  dir_prefix + std::string("batch_normalization_15_beta.bin"); 
-    void* batch_normalization_15_beta =  readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_15_mean_path =  dir_prefix + std::string("batch_normalization_15_mean.bin"); 
-    void* batch_normalization_15_mean =  readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_15_variance_path =  dir_prefix + std::string("batch_normalization_15_variance.bin"); 
-    void* batch_normalization_15_variance =  readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_8_w_path =  dir_prefix + std::string("depthwise_conv2d_8_w.bin"); 
-    void* depthwise_conv2d_8_w =  readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_16_gamma_path =  dir_prefix + std::string("batch_normalization_16_gamma.bin"); 
-    void* batch_normalization_16_gamma =  readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_16_beta_path =  dir_prefix + std::string("batch_normalization_16_beta.bin"); 
-    void* batch_normalization_16_beta =  readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_16_mean_path =  dir_prefix + std::string("batch_normalization_16_mean.bin"); 
-    void* batch_normalization_16_mean =  readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_16_variance_path =  dir_prefix + std::string("batch_normalization_16_variance.bin"); 
-    void* batch_normalization_16_variance =  readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-    void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_17_gamma_path =  dir_prefix + std::string("batch_normalization_17_gamma.bin"); 
-    void* batch_normalization_17_gamma =  readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_17_beta_path =  dir_prefix + std::string("batch_normalization_17_beta.bin"); 
-    void* batch_normalization_17_beta =  readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_17_mean_path =  dir_prefix + std::string("batch_normalization_17_mean.bin"); 
-    void* batch_normalization_17_mean =  readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_17_variance_path =  dir_prefix + std::string("batch_normalization_17_variance.bin"); 
-    void* batch_normalization_17_variance =  readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_9_w_path =  dir_prefix + std::string("depthwise_conv2d_9_w.bin"); 
-    void* depthwise_conv2d_9_w =  readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_18_gamma_path =  dir_prefix + std::string("batch_normalization_18_gamma.bin"); 
-    void* batch_normalization_18_gamma =  readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_18_beta_path =  dir_prefix + std::string("batch_normalization_18_beta.bin"); 
-    void* batch_normalization_18_beta =  readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_18_mean_path =  dir_prefix + std::string("batch_normalization_18_mean.bin"); 
-    void* batch_normalization_18_mean =  readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_18_variance_path =  dir_prefix + std::string("batch_normalization_18_variance.bin"); 
-    void* batch_normalization_18_variance =  readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-    void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_19_gamma_path =  dir_prefix + std::string("batch_normalization_19_gamma.bin"); 
-    void* batch_normalization_19_gamma =  readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_19_beta_path =  dir_prefix + std::string("batch_normalization_19_beta.bin"); 
-    void* batch_normalization_19_beta =  readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_19_mean_path =  dir_prefix + std::string("batch_normalization_19_mean.bin"); 
-    void* batch_normalization_19_mean =  readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_19_variance_path =  dir_prefix + std::string("batch_normalization_19_variance.bin"); 
-    void* batch_normalization_19_variance =  readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_10_w_path =  dir_prefix + std::string("depthwise_conv2d_10_w.bin"); 
-    void* depthwise_conv2d_10_w =  readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_20_gamma_path =  dir_prefix + std::string("batch_normalization_20_gamma.bin"); 
-    void* batch_normalization_20_gamma =  readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_20_beta_path =  dir_prefix + std::string("batch_normalization_20_beta.bin"); 
-    void* batch_normalization_20_beta =  readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_20_mean_path =  dir_prefix + std::string("batch_normalization_20_mean.bin"); 
-    void* batch_normalization_20_mean =  readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_20_variance_path =  dir_prefix + std::string("batch_normalization_20_variance.bin"); 
-    void* batch_normalization_20_variance =  readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-    void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_21_gamma_path =  dir_prefix + std::string("batch_normalization_21_gamma.bin"); 
-    void* batch_normalization_21_gamma =  readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_21_beta_path =  dir_prefix + std::string("batch_normalization_21_beta.bin"); 
-    void* batch_normalization_21_beta =  readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_21_mean_path =  dir_prefix + std::string("batch_normalization_21_mean.bin"); 
-    void* batch_normalization_21_mean =  readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_21_variance_path =  dir_prefix + std::string("batch_normalization_21_variance.bin"); 
-    void* batch_normalization_21_variance =  readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_11_w_path =  dir_prefix + std::string("depthwise_conv2d_11_w.bin"); 
-    void* depthwise_conv2d_11_w =  readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_22_gamma_path =  dir_prefix + std::string("batch_normalization_22_gamma.bin"); 
-    void* batch_normalization_22_gamma =  readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_22_beta_path =  dir_prefix + std::string("batch_normalization_22_beta.bin"); 
-    void* batch_normalization_22_beta =  readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_22_mean_path =  dir_prefix + std::string("batch_normalization_22_mean.bin"); 
-    void* batch_normalization_22_mean =  readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_22_variance_path =  dir_prefix + std::string("batch_normalization_22_variance.bin"); 
-    void* batch_normalization_22_variance =  readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-    void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); 
-    std::string batch_normalization_23_gamma_path =  dir_prefix + std::string("batch_normalization_23_gamma.bin"); 
-    void* batch_normalization_23_gamma =  readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_23_beta_path =  dir_prefix + std::string("batch_normalization_23_beta.bin"); 
-    void* batch_normalization_23_beta =  readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_23_mean_path =  dir_prefix + std::string("batch_normalization_23_mean.bin"); 
-    void* batch_normalization_23_mean =  readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_23_variance_path =  dir_prefix + std::string("batch_normalization_23_variance.bin"); 
-    void* batch_normalization_23_variance =  readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); 
-    std::string depthwise_conv2d_12_w_path =  dir_prefix + std::string("depthwise_conv2d_12_w.bin"); 
-    void* depthwise_conv2d_12_w =  readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); 
-    std::string batch_normalization_24_gamma_path =  dir_prefix + std::string("batch_normalization_24_gamma.bin"); 
-    void* batch_normalization_24_gamma =  readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_24_beta_path =  dir_prefix + std::string("batch_normalization_24_beta.bin"); 
-    void* batch_normalization_24_beta =  readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_24_mean_path =  dir_prefix + std::string("batch_normalization_24_mean.bin"); 
-    void* batch_normalization_24_mean =  readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); 
-    std::string batch_normalization_24_variance_path =  dir_prefix + std::string("batch_normalization_24_variance.bin"); 
-    void* batch_normalization_24_variance =  readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-    void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); 
-    std::string batch_normalization_25_gamma_path =  dir_prefix + std::string("batch_normalization_25_gamma.bin"); 
-    void* batch_normalization_25_gamma =  readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_25_beta_path =  dir_prefix + std::string("batch_normalization_25_beta.bin"); 
-    void* batch_normalization_25_beta =  readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_25_mean_path =  dir_prefix + std::string("batch_normalization_25_mean.bin"); 
-    void* batch_normalization_25_mean =  readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_25_variance_path =  dir_prefix + std::string("batch_normalization_25_variance.bin"); 
-    void* batch_normalization_25_variance =  readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); 
-    std::string depthwise_conv2d_13_w_path =  dir_prefix + std::string("depthwise_conv2d_13_w.bin"); 
-    void* depthwise_conv2d_13_w =  readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); 
-    std::string batch_normalization_26_gamma_path =  dir_prefix + std::string("batch_normalization_26_gamma.bin"); 
-    void* batch_normalization_26_gamma =  readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_26_beta_path =  dir_prefix + std::string("batch_normalization_26_beta.bin"); 
-    void* batch_normalization_26_beta =  readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_26_mean_path =  dir_prefix + std::string("batch_normalization_26_mean.bin"); 
-    void* batch_normalization_26_mean =  readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_26_variance_path =  dir_prefix + std::string("batch_normalization_26_variance.bin"); 
-    void* batch_normalization_26_variance =  readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); 
-    std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-    void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); 
-    std::string batch_normalization_27_gamma_path =  dir_prefix + std::string("batch_normalization_27_gamma.bin"); 
-    void* batch_normalization_27_gamma =  readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_27_beta_path =  dir_prefix + std::string("batch_normalization_27_beta.bin"); 
-    void* batch_normalization_27_beta =  readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_27_mean_path =  dir_prefix + std::string("batch_normalization_27_mean.bin"); 
-    void* batch_normalization_27_mean =  readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); 
-    std::string batch_normalization_27_variance_path =  dir_prefix + std::string("batch_normalization_27_variance.bin"); 
-    void* batch_normalization_27_variance =  readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); 
-    std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-    void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); 
-    std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-    void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+  startMemTracking();
 
+  int test_input_size = 2000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
 
+  float final_accuracy = 0.0;
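+  // The test inputs are evaluated in batch_count batches of batch_size; each
+  // iteration below reads one input/label slice, runs the network on it, and
+  // releases the per-batch tensors with freeBatchMemory().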
 
-    startMemTracking(); 
+  for (int i = 0; i < batch_count; i++) {
 
-    int test_input_size = 2000; 
-    int batch_size = 1000;  
-    int batch_count = test_input_size / batch_size; 
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
-    float final_accuracy = 0.0; 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
 
-    for(int i = 0; i < batch_count; i++){ 
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1);
+    void *var_1 = tensorHalfBatchNorm(
+        var_0, batch_normalization_1_gamma, batch_normalization_1_beta,
+        batch_normalization_1_mean, batch_normalization_1_variance, 0.001);
+    void *var_2 = tensorHalfRelu(var_1);
+    void *var_4 =
+        tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32);
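+    // The tensorHalfConvCutlass calls handle the depthwise stages; the
+    // trailing argument mirrors the filter's channel count, which suggests it
+    // is the group count (one group per channel). This is inferred from the
+    // call pattern rather than stated in this file.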
+    void *var_5 = tensorHalfBatchNorm(
+        var_4, batch_normalization_2_gamma, batch_normalization_2_beta,
+        batch_normalization_2_mean, batch_normalization_2_variance, 0.001);
+    void *var_6 = tensorHalfRelu(var_5);
+    void *var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1);
+    void *var_8 = tensorHalfBatchNorm(
+        var_7, batch_normalization_3_gamma, batch_normalization_3_beta,
+        batch_normalization_3_mean, batch_normalization_3_variance, 0.001);
+    void *var_9 = tensorHalfRelu(var_8);
+    void *var_11 =
+        tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64);
+    void *var_12 = tensorHalfBatchNorm(
+        var_11, batch_normalization_4_gamma, batch_normalization_4_beta,
+        batch_normalization_4_mean, batch_normalization_4_variance, 0.001);
+    void *var_13 = tensorHalfRelu(var_12);
+    void *var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1);
+    void *var_15 = tensorHalfBatchNorm(
+        var_14, batch_normalization_5_gamma, batch_normalization_5_beta,
+        batch_normalization_5_mean, batch_normalization_5_variance, 0.001);
+    void *var_16 = tensorHalfRelu(var_15);
+    void *var_18 =
+        tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128);
+    void *var_19 = tensorHalfBatchNorm(
+        var_18, batch_normalization_6_gamma, batch_normalization_6_beta,
+        batch_normalization_6_mean, batch_normalization_6_variance, 0.001);
+    void *var_20 = tensorHalfRelu(var_19);
+    void *var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1);
+    void *var_22 = tensorHalfBatchNorm(
+        var_21, batch_normalization_7_gamma, batch_normalization_7_beta,
+        batch_normalization_7_mean, batch_normalization_7_variance, 0.001);
+    void *var_23 = tensorHalfRelu(var_22);
+    void *var_26 =
+        tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128);
+    void *var_27 = tensorHalfBatchNorm(
+        var_26, batch_normalization_8_gamma, batch_normalization_8_beta,
+        batch_normalization_8_mean, batch_normalization_8_variance, 0.001);
+    void *var_28 = tensorHalfRelu(var_27);
+    void *var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1);
+    void *var_30 = tensorHalfBatchNorm(
+        var_29, batch_normalization_9_gamma, batch_normalization_9_beta,
+        batch_normalization_9_mean, batch_normalization_9_variance, 0.001);
+    void *var_31 = tensorHalfRelu(var_30);
+    void *var_33 =
+        tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256);
+    void *var_34 = tensorHalfBatchNorm(
+        var_33, batch_normalization_10_gamma, batch_normalization_10_beta,
+        batch_normalization_10_mean, batch_normalization_10_variance, 0.001);
+    void *var_35 = tensorHalfRelu(var_34);
+    void *var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1);
+    void *var_37 = tensorHalfBatchNorm(
+        var_36, batch_normalization_11_gamma, batch_normalization_11_beta,
+        batch_normalization_11_mean, batch_normalization_11_variance, 0.001);
+    void *var_38 = tensorHalfRelu(var_37);
+    void *var_41 =
+        tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256);
+    void *var_42 = tensorHalfBatchNorm(
+        var_41, batch_normalization_12_gamma, batch_normalization_12_beta,
+        batch_normalization_12_mean, batch_normalization_12_variance, 0.001);
+    void *var_43 = tensorHalfRelu(var_42);
+    void *var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1);
+    void *var_45 = tensorHalfBatchNorm(
+        var_44, batch_normalization_13_gamma, batch_normalization_13_beta,
+        batch_normalization_13_mean, batch_normalization_13_variance, 0.001);
+    void *var_46 = tensorHalfRelu(var_45);
+    void *var_48 =
+        tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512);
+    void *var_49 = tensorHalfBatchNorm(
+        var_48, batch_normalization_14_gamma, batch_normalization_14_beta,
+        batch_normalization_14_mean, batch_normalization_14_variance, 0.001);
+    void *var_50 = tensorHalfRelu(var_49);
+    void *var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1);
+    void *var_52 = tensorHalfBatchNorm(
+        var_51, batch_normalization_15_gamma, batch_normalization_15_beta,
+        batch_normalization_15_mean, batch_normalization_15_variance, 0.001);
+    void *var_53 = tensorHalfRelu(var_52);
+    void *var_55 =
+        tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512);
+    void *var_56 = tensorHalfBatchNorm(
+        var_55, batch_normalization_16_gamma, batch_normalization_16_beta,
+        batch_normalization_16_mean, batch_normalization_16_variance, 0.001);
+    void *var_57 = tensorHalfRelu(var_56);
+    void *var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1);
+    void *var_59 = tensorHalfBatchNorm(
+        var_58, batch_normalization_17_gamma, batch_normalization_17_beta,
+        batch_normalization_17_mean, batch_normalization_17_variance, 0.001);
+    void *var_60 = tensorHalfRelu(var_59);
+    void *var_63 =
+        tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512);
+    void *var_64 = tensorHalfBatchNorm(
+        var_63, batch_normalization_18_gamma, batch_normalization_18_beta,
+        batch_normalization_18_mean, batch_normalization_18_variance, 0.001);
+    void *var_65 = tensorHalfRelu(var_64);
+    void *var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1);
+    void *var_67 = tensorHalfBatchNorm(
+        var_66, batch_normalization_19_gamma, batch_normalization_19_beta,
+        batch_normalization_19_mean, batch_normalization_19_variance, 0.001);
+    void *var_68 = tensorHalfRelu(var_67);
+    void *var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1,
+                                         1, 1, 512);
+    void *var_71 = tensorHalfBatchNorm(
+        var_70, batch_normalization_20_gamma, batch_normalization_20_beta,
+        batch_normalization_20_mean, batch_normalization_20_variance, 0.001);
+    void *var_72 = tensorHalfRelu(var_71);
+    void *var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1);
+    void *var_74 = tensorHalfBatchNorm(
+        var_73, batch_normalization_21_gamma, batch_normalization_21_beta,
+        batch_normalization_21_mean, batch_normalization_21_variance, 0.001);
+    void *var_75 = tensorHalfRelu(var_74);
+    void *var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1,
+                                         1, 1, 512);
+    void *var_78 = tensorHalfBatchNorm(
+        var_77, batch_normalization_22_gamma, batch_normalization_22_beta,
+        batch_normalization_22_mean, batch_normalization_22_variance, 0.001);
+    void *var_79 = tensorHalfRelu(var_78);
+    void *var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1);
+    void *var_81 = tensorHalfBatchNorm(
+        var_80, batch_normalization_23_gamma, batch_normalization_23_beta,
+        batch_normalization_23_mean, batch_normalization_23_variance, 0.001);
+    void *var_82 = tensorHalfRelu(var_81);
+    void *var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2,
+                                         2, 1, 512);
+    void *var_86 = tensorHalfBatchNorm(
+        var_85, batch_normalization_24_gamma, batch_normalization_24_beta,
+        batch_normalization_24_mean, batch_normalization_24_variance, 0.001);
+    void *var_87 = tensorHalfRelu(var_86);
+    void *var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1);
+    void *var_89 = tensorHalfBatchNorm(
+        var_88, batch_normalization_25_gamma, batch_normalization_25_beta,
+        batch_normalization_25_mean, batch_normalization_25_variance, 0.001);
+    void *var_90 = tensorHalfRelu(var_89);
+    void *var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1,
+                                         1, 1, 1024);
+    void *var_93 = tensorHalfBatchNorm(
+        var_92, batch_normalization_26_gamma, batch_normalization_26_beta,
+        batch_normalization_26_mean, batch_normalization_26_variance, 0.001);
+    void *var_94 = tensorHalfRelu(var_93);
+    void *var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1);
+    void *var_96 = tensorHalfBatchNorm(
+        var_95, batch_normalization_27_gamma, batch_normalization_27_beta,
+        batch_normalization_27_mean, batch_normalization_27_variance, 0.001);
+    void *var_97 = tensorHalfRelu(var_96);
+    void *var_99 = tensorHalfPooling(var_97, 1, 2, 2, 0, 0, 2, 2);
+    void *var_101 = tensorHalfGemmGPU(var_99, dense_1_w);
+    void *var_102 = tensorHalfAdd(var_101, dense_1_b);
+    void *var_103 = tensorSoftmax(var_102);
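+    // var_103 now holds the per-class probabilities for this batch: the chain
+    // above is the MobileNet forward pass (an initial 3x3 convolution, 13
+    // depthwise-separable blocks, pooling, and a dense layer), run through the
+    // tensorHalf* kernels except for the final tensorSoftmax call.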
 
-        int start = i * batch_size; 
-        int end = (i + 1) * batch_size; 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
-        void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
+    float accuracy = computeAccuracy2(labels, batch_size, var_103);
+    final_accuracy += accuracy;
+    freeBatchMemory();
+  }
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
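+  // The reported accuracy is the mean of the per-batch accuracies computed in
+  // the loop above.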
 
-        void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
-        void* var_1 = tensorHalfBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
-        void* var_2 = tensorHalfRelu(var_1); 
-        void* var_4 = tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-        void* var_5 = tensorHalfBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-        void* var_6 = tensorHalfRelu(var_5); 
-        void* var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-        void* var_8 = tensorHalfBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-        void* var_9 = tensorHalfRelu(var_8); 
-        void* var_11 = tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-        void* var_12 = tensorHalfBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-        void* var_13 = tensorHalfRelu(var_12); 
-        void* var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
-        void* var_15 = tensorHalfBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-        void* var_16 = tensorHalfRelu(var_15); 
-        void* var_18 = tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-        void* var_19 = tensorHalfBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
-        void* var_20 = tensorHalfRelu(var_19); 
-        void* var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-        void* var_22 = tensorHalfBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
-        void* var_23 = tensorHalfRelu(var_22); 
-        void* var_26 = tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-        void* var_27 = tensorHalfBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-        void* var_28 = tensorHalfRelu(var_27); 
-        void* var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-        void* var_30 = tensorHalfBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-        void* var_31 = tensorHalfRelu(var_30); 
-        void* var_33 = tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-        void* var_34 = tensorHalfBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-        void* var_35 = tensorHalfRelu(var_34); 
-        void* var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-        void* var_37 = tensorHalfBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-        void* var_38 = tensorHalfRelu(var_37); 
-        void* var_41 = tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-        void* var_42 = tensorHalfBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
-        void* var_43 = tensorHalfRelu(var_42); 
-        void* var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
-        void* var_45 = tensorHalfBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
-        void* var_46 = tensorHalfRelu(var_45); 
-        void* var_48 = tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-        void* var_49 = tensorHalfBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
-        void* var_50 = tensorHalfRelu(var_49); 
-        void* var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-        void* var_52 = tensorHalfBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
-        void* var_53 = tensorHalfRelu(var_52); 
-        void* var_55 = tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-        void* var_56 = tensorHalfBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-        void* var_57 = tensorHalfRelu(var_56); 
-        void* var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-        void* var_59 = tensorHalfBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-        void* var_60 = tensorHalfRelu(var_59); 
-        void* var_63 = tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-        void* var_64 = tensorHalfBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
-        void* var_65 = tensorHalfRelu(var_64); 
-        void* var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
-        void* var_67 = tensorHalfBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
-        void* var_68 = tensorHalfRelu(var_67); 
-        void* var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-        void* var_71 = tensorHalfBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-        void* var_72 = tensorHalfRelu(var_71); 
-        void* var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-        void* var_74 = tensorHalfBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
-        void* var_75 = tensorHalfRelu(var_74); 
-        void* var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-        void* var_78 = tensorHalfBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
-        void* var_79 = tensorHalfRelu(var_78); 
-        void* var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
-        void* var_81 = tensorHalfBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-        void* var_82 = tensorHalfRelu(var_81); 
-        void* var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-        void* var_86 = tensorHalfBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-        void* var_87 = tensorHalfRelu(var_86); 
-        void* var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
-        void* var_89 = tensorHalfBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-        void* var_90 = tensorHalfRelu(var_89); 
-        void* var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-        void* var_93 = tensorHalfBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-        void* var_94 = tensorHalfRelu(var_93); 
-        void* var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-        void* var_96 = tensorHalfBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-        void* var_97 = tensorHalfRelu(var_96); 
-        void* var_99 = tensorHalfPooling(var_97,1,2,2,0,0,2,2); 
-        void* var_101 = tensorHalfGemmGPU(var_99, dense_1_w); 
-        void* var_102 = tensorHalfAdd(var_101, dense_1_b); 
-        void* var_103 = tensorSoftmax(var_102); 
-
-        uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
-
-        float accuracy = computeAccuracy2(labels, batch_size, var_103); 
-        final_accuracy += accuracy; 
-        freeBatchMemory(); 
-    }
-    final_accuracy = final_accuracy / batch_count; 
-    dumpFinalAccuracy(final_accuracy); 
-
-    llvm_hpvm_cleanupTensorRt(); 
-
-    return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
index 741c4a443c..d674591027 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc
@@ -1,112 +1,155 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../../tensor_runtime/include/tensor_runtime.h" 
-#include "../../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-  
-  std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
-  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
-  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
-  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
-  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); 
-  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
-  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
-  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
-  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
-  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
-  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
-  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
-  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
-  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
-  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
-  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
-  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "../../../tensor_runtime/include/tensor_runtime.h"
+#include "../../include/utils.h"
 
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix =
+      model_params_path + std::string("/resnet18_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  // void* input = readTrainedWeights(input_path.c_str(), 0,
+  // batch_size,3,32,32);
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  // uint8_t* labels = readLabels(labels_path.c_str(), batch_size);
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin");
+  void *conv2d_14_b =
+      readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin");
+  void *conv2d_15_w =
+      readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin");
+  void *conv2d_15_b =
+      readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin");
+  void *conv2d_17_w =
+      readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin");
+  void *conv2d_17_b =
+      readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin");
+  void *conv2d_16_w =
+      readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin");
+  void *conv2d_16_b =
+      readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin");
+  void *conv2d_18_w =
+      readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin");
+  void *conv2d_18_b =
+      readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin");
+  void *conv2d_19_w =
+      readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin");
+  void *conv2d_19_b =
+      readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin");
+  void *conv2d_20_w =
+      readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin");
+  void *conv2d_20_b =
+      readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin");
+  void *conv2d_21_w =
+      readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin");
+  void *conv2d_21_b =
+      readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -117,94 +160,94 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_3 = tensorHalfAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorHalfRelu(var_3); 
-    void* var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_7 = tensorHalfAdd(var_6, conv2d_2_b); 
-    void* var_8 = tensorHalfRelu(var_7); 
-    void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorHalfAdd(var_4, var_11); 
-    void* var_13 = tensorHalfRelu(var_12); 
-    void* var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_16 = tensorHalfAdd(var_15, conv2d_4_b); 
-    void* var_17 = tensorHalfRelu(var_16); 
-    void* var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_20 = tensorHalfAdd(var_19, conv2d_5_b); 
-    void* var_21 = tensorHalfAdd(var_13, var_20); 
-    void* var_22 = tensorHalfRelu(var_21); 
-    void* var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorHalfAdd(var_24, conv2d_6_b); 
-    void* var_26 = tensorHalfRelu(var_25); 
-    void* var_28 = tensorHalfConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorHalfAdd(var_28, conv2d_7_b); 
-    void* var_30 = tensorHalfAdd(var_22, var_29); 
-    void* var_31 = tensorHalfRelu(var_30); 
-    void* var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); 
-    void* var_34 = tensorHalfAdd(var_33, conv2d_8_b); 
-    void* var_35 = tensorHalfRelu(var_34); 
-    void* var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_38 = tensorHalfAdd(var_37, conv2d_9_b); 
-    void* var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); 
-    void* var_41 = tensorHalfAdd(var_40, conv2d_10_b); 
-    void* var_42 = tensorHalfAdd(var_41, var_38); 
-    void* var_43 = tensorHalfRelu(var_42); 
-    void* var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_46 = tensorHalfAdd(var_45, conv2d_11_b); 
-    void* var_47 = tensorHalfRelu(var_46); 
-    void* var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_50 = tensorHalfAdd(var_49, conv2d_12_b); 
-    void* var_51 = tensorHalfAdd(var_43, var_50); 
-    void* var_52 = tensorHalfRelu(var_51); 
-    void* var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_55 = tensorHalfAdd(var_54, conv2d_13_b); 
-    void* var_56 = tensorHalfRelu(var_55); 
-    void* var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); 
-    void* var_59 = tensorHalfAdd(var_58, conv2d_14_b); 
-    void* var_60 = tensorHalfAdd(var_52, var_59); 
-    void* var_61 = tensorHalfRelu(var_60); 
-    void* var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); 
-    void* var_64 = tensorHalfAdd(var_63, conv2d_15_b); 
-    void* var_65 = tensorHalfRelu(var_64); 
-    void* var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); 
-    void* var_68 = tensorHalfAdd(var_67, conv2d_16_b); 
-    void* var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); 
-    void* var_71 = tensorHalfAdd(var_70, conv2d_17_b); 
-    void* var_72 = tensorHalfAdd(var_71, var_68); 
-    void* var_73 = tensorHalfRelu(var_72); 
-    void* var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); 
-    void* var_76 = tensorHalfAdd(var_75, conv2d_18_b); 
-    void* var_77 = tensorHalfRelu(var_76); 
-    void* var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); 
-    void* var_80 = tensorHalfAdd(var_79, conv2d_19_b); 
-    void* var_81 = tensorHalfAdd(var_73, var_80); 
-    void* var_82 = tensorHalfRelu(var_81); 
-    void* var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); 
-    void* var_85 = tensorHalfAdd(var_84, conv2d_20_b); 
-    void* var_86 = tensorHalfRelu(var_85); 
-    void* var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); 
-    void* var_89 = tensorHalfAdd(var_88, conv2d_21_b); 
-    void* var_90 = tensorHalfAdd(var_82, var_89); 
-    void* var_91 = tensorHalfRelu(var_90); 
-    void* var_92 = tensorHalfPooling(var_91,1,8,8,0,0,8,8); 
-    void* var_94 = tensorHalfGemmGPU(var_92, dense_1_w); 
-    void* var_95 = tensorHalfAdd(var_94, dense_1_b); 
-    void* var_96 = tensorSoftmax(var_95); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_96); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_3 = tensorHalfAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorHalfRelu(var_3);
+    void *var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_7 = tensorHalfAdd(var_6, conv2d_2_b);
+    void *var_8 = tensorHalfRelu(var_7);
+    void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorHalfAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorHalfAdd(var_4, var_11);
+    void *var_13 = tensorHalfRelu(var_12);
+    void *var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_16 = tensorHalfAdd(var_15, conv2d_4_b);
+    void *var_17 = tensorHalfRelu(var_16);
+    void *var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_20 = tensorHalfAdd(var_19, conv2d_5_b);
+    void *var_21 = tensorHalfAdd(var_13, var_20);
+    void *var_22 = tensorHalfRelu(var_21);
+    void *var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorHalfAdd(var_24, conv2d_6_b);
+    void *var_26 = tensorHalfRelu(var_25);
+    void *var_28 = tensorHalfConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorHalfAdd(var_28, conv2d_7_b);
+    void *var_30 = tensorHalfAdd(var_22, var_29);
+    void *var_31 = tensorHalfRelu(var_30);
+    void *var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0);
+    void *var_34 = tensorHalfAdd(var_33, conv2d_8_b);
+    void *var_35 = tensorHalfRelu(var_34);
+    void *var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_38 = tensorHalfAdd(var_37, conv2d_9_b);
+    void *var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0);
+    void *var_41 = tensorHalfAdd(var_40, conv2d_10_b);
+    void *var_42 = tensorHalfAdd(var_41, var_38);
+    void *var_43 = tensorHalfRelu(var_42);
+    void *var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_46 = tensorHalfAdd(var_45, conv2d_11_b);
+    void *var_47 = tensorHalfRelu(var_46);
+    void *var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_50 = tensorHalfAdd(var_49, conv2d_12_b);
+    void *var_51 = tensorHalfAdd(var_43, var_50);
+    void *var_52 = tensorHalfRelu(var_51);
+    void *var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_55 = tensorHalfAdd(var_54, conv2d_13_b);
+    void *var_56 = tensorHalfRelu(var_55);
+    void *var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0);
+    void *var_59 = tensorHalfAdd(var_58, conv2d_14_b);
+    void *var_60 = tensorHalfAdd(var_52, var_59);
+    void *var_61 = tensorHalfRelu(var_60);
+    void *var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0);
+    void *var_64 = tensorHalfAdd(var_63, conv2d_15_b);
+    void *var_65 = tensorHalfRelu(var_64);
+    void *var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0);
+    void *var_68 = tensorHalfAdd(var_67, conv2d_16_b);
+    void *var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0);
+    void *var_71 = tensorHalfAdd(var_70, conv2d_17_b);
+    void *var_72 = tensorHalfAdd(var_71, var_68);
+    void *var_73 = tensorHalfRelu(var_72);
+    void *var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0);
+    void *var_76 = tensorHalfAdd(var_75, conv2d_18_b);
+    void *var_77 = tensorHalfRelu(var_76);
+    void *var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0);
+    void *var_80 = tensorHalfAdd(var_79, conv2d_19_b);
+    void *var_81 = tensorHalfAdd(var_73, var_80);
+    void *var_82 = tensorHalfRelu(var_81);
+    void *var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0);
+    void *var_85 = tensorHalfAdd(var_84, conv2d_20_b);
+    void *var_86 = tensorHalfRelu(var_85);
+    void *var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0);
+    void *var_89 = tensorHalfAdd(var_88, conv2d_21_b);
+    void *var_90 = tensorHalfAdd(var_82, var_89);
+    void *var_91 = tensorHalfRelu(var_90);
+    void *var_92 = tensorHalfPooling(var_91, 1, 8, 8, 0, 0, 8, 8);
+    void *var_94 = tensorHalfGemmGPU(var_92, dense_1_w);
+    void *var_95 = tensorHalfAdd(var_94, dense_1_b);
+    void *var_96 = tensorSoftmax(var_95);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_96);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -213,9 +256,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
-  
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
index 9ac1deea68..fff901a330 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc
@@ -1,160 +1,186 @@
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
 
 #include "../../../tensor_runtime/include/tensor_runtime.h"
 #include "../../include/utils.h"
 
-int main(){ 
-
-    llvm_hpvm_initTensorRt(0); 
-
-    std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); 
-    std::string input_path =  dir_prefix + std::string("input.bin"); 
-    std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-    std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-    void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-    std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-    void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-    std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-    void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-    std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-    void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-    std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-    void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-    std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-    void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-    void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-    std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-    void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-    std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-    void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-    std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-    void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-    void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-    std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-    void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-    void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-    std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-    void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-    std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-    void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-    std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-    void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-    void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-    void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-    void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-    void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-    void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-    void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-    void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-    void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-    std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-    void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-    std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-    void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-    std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-    void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-    std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-    void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-    std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-    void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); 
-    std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-    void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); 
-
-
-    startMemTracking(); 
-
-    int test_input_size = 2000; 
-    int batch_size = 1000; 
-    int batch_count = test_input_size / batch_size; 
-    float final_accuracy = 0.0; 
-
-    for(int i = 0; i < batch_count; i++){ 
-
-        int start = i * batch_size; 
-        int end = (i + 1) * batch_size; 
-
-        void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
-
-        void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-        void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); 
-        void* var_2 = tensorHalfRelu(var_1); 
-        void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-        void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); 
-        void* var_6 = tensorHalfRelu(var_5); 
-        void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); 
-        void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-        void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); 
-        void* var_10 = tensorHalfRelu(var_9); 
-        void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-        void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); 
-        void* var_14 = tensorHalfRelu(var_13); 
-        void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); 
-        void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-        void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); 
-        void* var_18 = tensorHalfRelu(var_17); 
-        void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-        void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); 
-        void* var_22 = tensorHalfRelu(var_21); 
-        void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-        void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); 
-        void* var_26 = tensorHalfRelu(var_25); 
-        void* var_27 = tensorHalfPooling(var_26,0,2,2,0,0,2,2); 
-        void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-        void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); 
-        void* var_30 = tensorHalfRelu(var_29); 
-        void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-        void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); 
-        void* var_34 = tensorHalfRelu(var_33); 
-        void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-        void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); 
-        void* var_38 = tensorHalfRelu(var_37); 
-        void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); 
-        void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-        void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); 
-        void* var_42 = tensorHalfRelu(var_41); 
-        void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-        void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); 
-        void* var_46 = tensorHalfRelu(var_45); 
-        void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-        void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); 
-        void* var_50 = tensorHalfRelu(var_49); 
-        void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); 
-        void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); 
-        void* var_55 = tensorHalfAdd(var_54, dense_1_b); 
-        void* var_56 = tensorHalfRelu(var_55); 
-        void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); 
-        void* var_59 = tensorHalfAdd(var_58, dense_2_b); 
-        void* var_60 = tensorSoftmax(var_59); 
-
-        uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
-
-        float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); 
-        final_accuracy += accuracy; 
-        freeBatchMemory(); 
-
-    }
-
-    final_accuracy = final_accuracy / batch_count;
-    dumpFinalAccuracy(final_accuracy); 
-
-    llvm_hpvm_cleanupTensorRt(); 
-
-    return 0; 
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 2000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorHalfAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorHalfRelu(var_1);
+    void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorHalfAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorHalfRelu(var_5);
+    void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorHalfAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorHalfRelu(var_9);
+    void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorHalfAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorHalfRelu(var_13);
+    void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorHalfAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorHalfRelu(var_17);
+    void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorHalfAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorHalfRelu(var_21);
+    void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorHalfAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorHalfRelu(var_25);
+    void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorHalfAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorHalfRelu(var_29);
+    void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorHalfAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorHalfRelu(var_33);
+    void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorHalfAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorHalfRelu(var_37);
+    void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorHalfAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorHalfRelu(var_41);
+    void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorHalfAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorHalfRelu(var_45);
+    void *var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorHalfAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorHalfRelu(var_49);
+    void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorHalfAdd(var_54, dense_1_b);
+    void *var_56 = tensorHalfRelu(var_55);
+    void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorHalfAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60, 100);
+    final_accuracy += accuracy;
+    freeBatchMemory();
+  }
+
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
index f92bac10e2..3d6f0f3566 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc
@@ -1,82 +1,109 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <string.h>
 #include "../../../tensor_runtime/include/tensor_runtime.h"
-#include "../../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); 
-
+#include "../../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -85,83 +112,82 @@ int main(){
   int batch_count = test_input_size / batch_size;
   float final_accuracy = 0.0;
 
-  // Start power and performance profiling 
+  // Start power and performance profiling
   startProfiling();
 
-  for(int i = 0; i < batch_count; i++){
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); 
- 
-    void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorHalfRelu(var_1); 
-    void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorHalfRelu(var_5); 
-    void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorHalfRelu(var_9); 
-    void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); 
-    void* var_14 = tensorHalfRelu(var_13); 
-    void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); 
-    void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorHalfRelu(var_17); 
-    void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); 
-    void* var_22 = tensorHalfRelu(var_21); 
-    void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); 
-    void* var_26 = tensorHalfRelu(var_25); 
-    void* var_27 = tensorHalfPooling(var_26,0,2,2,0,0,2,2); 
-    void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); 
-    void* var_30 = tensorHalfRelu(var_29); 
-    void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); 
-    void* var_34 = tensorHalfRelu(var_33); 
-    void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-    void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); 
-    void* var_38 = tensorHalfRelu(var_37); 
-    void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); 
-    void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); 
-    void* var_42 = tensorHalfRelu(var_41); 
-    void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); 
-    void* var_46 = tensorHalfRelu(var_45); 
-    void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); 
-    void* var_50 = tensorHalfRelu(var_49); 
-    void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); 
-    void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); 
-    void* var_55 = tensorHalfAdd(var_54, dense_1_b); 
-    void* var_56 = tensorHalfRelu(var_55); 
-    void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); 
-    void* var_59 = tensorHalfAdd(var_58, dense_2_b); 
-    void* var_60 = tensorSoftmax(var_59); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_60); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorHalfAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorHalfRelu(var_1);
+    void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorHalfAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorHalfRelu(var_5);
+    void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorHalfAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorHalfRelu(var_9);
+    void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorHalfAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorHalfRelu(var_13);
+    void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorHalfAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorHalfRelu(var_17);
+    void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorHalfAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorHalfRelu(var_21);
+    void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorHalfAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorHalfRelu(var_25);
+    void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorHalfAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorHalfRelu(var_29);
+    void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorHalfAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorHalfRelu(var_33);
+    void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorHalfAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorHalfRelu(var_37);
+    void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorHalfAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorHalfRelu(var_41);
+    void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorHalfAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorHalfRelu(var_45);
+    void *var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorHalfAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorHalfRelu(var_49);
+    void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorHalfAdd(var_54, dense_1_b);
+    void *var_56 = tensorHalfRelu(var_55);
+    void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorHalfAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
-  // Start power and performance profiling 
+  // Stop power and performance profiling
   stopProfiling();
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-  
-  llvm_hpvm_cleanupTensorRt(); 
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
index 50d9747f99..20484a3a0b 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc
@@ -11,52 +11,62 @@
 #include "../../tensor_runtime/include/tensor_runtime.h"
 #include "../include/utils.h"
 
-
-
 /* NOTE: Reference Architecture to use for profiling */
-void testCifarNet(){
+void testCifarNet() {
 
   printf("********* Alexnet2 CIFAR-10 DNN ********** \n");
- 
-
-  std::string dir_prefix = model_params_path +  std::string("/alexnet2_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
-  
-  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
 
+  std::string dir_prefix =
+      model_params_path + std::string("/alexnet2_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
+
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision =
+      0; // NOTE: using Float as compute precision. FIXIT: use enum
 
   startMemTracking();
 
@@ -67,62 +77,61 @@ void testCifarNet(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv1out, conv2d_1_b); 
-    void* conv1_tanh = tensorTanh(conv1out);
-    
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, conv_mode,
+                                       conv_precision);
+    tensorAdd(conv1out, conv2d_1_b);
+    void *conv1_tanh = tensorTanh(conv1out);
+
     // 2nd Layer
-    void* conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv2out, conv2d_2_b); 
-    void* conv2_tanh = tensorTanh(conv2out);
-    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
-     
+    void *conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv2out, conv2d_2_b);
+    void *conv2_tanh = tensorTanh(conv2out);
+    void *pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 3rd Layer
-    void* conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv3out, conv2d_3_b); 
-    void* conv3_tanh = tensorTanh(conv3out);
+    void *conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv3out, conv2d_3_b);
+    void *conv3_tanh = tensorTanh(conv3out);
 
     // 4th Layer
-    void* conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv4out, conv2d_4_b); 
-    void* conv4_tanh = tensorTanh(conv4out);
-    void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv4out, conv2d_4_b);
+    void *conv4_tanh = tensorTanh(conv4out);
+    void *pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // 5th Layer
-    void* conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv5out, conv2d_5_b); 
-    void* conv5_tanh = tensorTanh(conv5out);
+    void *conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv5out, conv2d_5_b);
+    void *conv5_tanh = tensorTanh(conv5out);
 
     // 6th Layer
-    void* conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv6out, conv2d_6_b); 
-    void* conv6_tanh = tensorTanh(conv6out);
-    void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
-    
+    void *conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv6out, conv2d_6_b);
+    void *conv6_tanh = tensorTanh(conv6out);
+    void *pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+
     // final FC Layer
-    void* gemm1out = tensorGemmGPU(pool6out, dense_1_w);  
-    void* gemm1biasout = tensorAdd(gemm1out, dense_1_b);
-    void* result = tensorSoftmax(gemm1biasout);
+    void *gemm1out = tensorGemmGPU(pool6out, dense_1_w);
+    void *gemm1biasout = tensorAdd(gemm1out, dense_1_b);
+    void *result = tensorSoftmax(gemm1biasout);
 
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
-    float accuracy = computeAccuracy2(labels, batch_size, result); 
+    float accuracy = computeAccuracy2(labels, batch_size, result);
     final_accuracy += accuracy;
 
-    
     freeBatchMemory();
   }
 
@@ -130,11 +139,9 @@ void testCifarNet(){
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-
 }
 
-
-int main(int argc, char* argv[]){
+int main(int argc, char *argv[]) {
 
   llvm_hpvm_initTensorRt(0);
 
@@ -144,4 +151,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
index 1a76f1ae8b..70d582d11c 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc
@@ -1,50 +1,59 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); 
-
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
-
-  
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
+
   startMemTracking();
 
   int test_input_size = 5000;
@@ -54,40 +63,40 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorTanh(var_1); 
-    void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
-    void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
-    void* var_6 = tensorAdd(var_5, conv2d_2_b); 
-    void* var_7 = tensorTanh(var_6); 
-    void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
-    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorTanh(var_11); 
-    void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_14 = tensorAdd(var_13, conv2d_4_b); 
-    void* var_15 = tensorTanh(var_14); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorTanh(var_17); 
-    void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
-    void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
-    void* var_23 = tensorAdd(var_22, dense_1_b); 
-    void* var_24 = tensorSoftmax(var_23); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0);
+    void *var_1 = tensorAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorTanh(var_1);
+    void *var_3 = tensorPooling(var_2, 0, 2, 2, 0, 0, 2, 2);
+    void *var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0);
+    void *var_6 = tensorAdd(var_5, conv2d_2_b);
+    void *var_7 = tensorTanh(var_6);
+    void *var_8 = tensorPooling(var_7, 0, 2, 2, 0, 0, 2, 2);
+    void *var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorTanh(var_11);
+    void *var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_14 = tensorAdd(var_13, conv2d_4_b);
+    void *var_15 = tensorTanh(var_14);
+    void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorTanh(var_17);
+    void *var_19 = tensorPooling(var_18, 0, 2, 2, 0, 0, 2, 2);
+    void *var_22 = tensorGemmGPU(var_19, dense_1_w);
+    void *var_23 = tensorAdd(var_22, dense_1_b);
+    void *var_24 = tensorSoftmax(var_23);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_24);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -96,9 +105,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
+  llvm_hpvm_cleanupTensorRt();
 
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
-
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
index aa518d77a1..9d7e8fe2a2 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc
@@ -1,116 +1,126 @@
 
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "tensor_runtime.h" 
-#include "utils.h" 
-
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-
-  std::string dir_prefix = std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,9216,4096); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_3_w_path =  dir_prefix + std::string("dense_3_w.bin"); 
-  void* dense_3_w =  readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); 
-  std::string dense_3_b_path =  dir_prefix + std::string("dense_3_b.bin"); 
-  void* dense_3_b =  readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); 
-
-
-
-  startMemTracking(); 
-
-  int test_input_size = 1000; 
-  int batch_size = 100; 
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
-
-  for(int i = 0; i < batch_count; i++){ 
-
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); 
-
-    void* var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1); 
-    void* var_3 = tensorAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorRelu(var_3); 
-    void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); 
-    void* var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1); 
-    void* var_8 = tensorAdd(var_7, conv2d_2_b); 
-    void* var_9 = tensorRelu(var_8); 
-    void* var_10 = tensorPooling(var_9,0,3,3,0,0,2,2); 
-    void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); 
-    void* var_12 = tensorAdd(var_11, conv2d_3_b); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1); 
-    void* var_15 = tensorAdd(var_14, conv2d_4_b); 
-    void* var_16 = tensorRelu(var_15); 
-    void* var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1); 
-    void* var_18 = tensorAdd(var_17, conv2d_5_b); 
-    void* var_19 = tensorRelu(var_18); 
-    void* var_20 = tensorPooling(var_19,0,3,3,0,0,2,2); 
-    void* var_23 = tensorGemmGPU(var_20, dense_1_w); 
-    void* var_24 = tensorAdd(var_23, dense_1_b); 
-    void* var_25 = tensorRelu(var_24); 
-    void* var_27 = tensorGemmGPU(var_25, dense_2_w); 
-    void* var_28 = tensorAdd(var_27, dense_2_b); 
-    void* var_29 = tensorRelu(var_28); 
-    void* var_30 = tensorGemmGPU(var_29, dense_3_w); 
-    void* var_31 = tensorAdd(var_30, dense_3_b); 
-    void* var_32 = tensorSoftmax(var_31); 
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy3(labels, var_32); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "tensor_runtime.h"
+#include "utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix =
+      std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 9216, 4096);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b =
+      readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b =
+      readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin");
+  void *dense_3_w =
+      readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000);
+  std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
+  void *dense_3_b =
+      readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 1000;
+  int batch_size = 100;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input =
+        readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
+
+    void *var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1);
+    void *var_3 = tensorAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorRelu(var_3);
+    void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2);
+    void *var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1);
+    void *var_8 = tensorAdd(var_7, conv2d_2_b);
+    void *var_9 = tensorRelu(var_8);
+    void *var_10 = tensorPooling(var_9, 0, 3, 3, 0, 0, 2, 2);
+    void *var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1);
+    void *var_12 = tensorAdd(var_11, conv2d_3_b);
+    void *var_13 = tensorRelu(var_12);
+    void *var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1);
+    void *var_15 = tensorAdd(var_14, conv2d_4_b);
+    void *var_16 = tensorRelu(var_15);
+    void *var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1);
+    void *var_18 = tensorAdd(var_17, conv2d_5_b);
+    void *var_19 = tensorRelu(var_18);
+    void *var_20 = tensorPooling(var_19, 0, 3, 3, 0, 0, 2, 2);
+    void *var_23 = tensorGemmGPU(var_20, dense_1_w);
+    void *var_24 = tensorAdd(var_23, dense_1_b);
+    void *var_25 = tensorRelu(var_24);
+    void *var_27 = tensorGemmGPU(var_25, dense_2_w);
+    void *var_28 = tensorAdd(var_27, dense_2_b);
+    void *var_29 = tensorRelu(var_28);
+    void *var_30 = tensorGemmGPU(var_29, dense_3_w);
+    void *var_31 = tensorAdd(var_30, dense_3_b);
+    void *var_32 = tensorSoftmax(var_31);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy3(labels, var_32);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
+  llvm_hpvm_cleanupTensorRt();
 
-  llvm_hpvm_cleanupTensorRt(); 
-
-
-  return 0; 
-
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
index 7508f3119e..c32efad92f 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc
@@ -8,117 +8,109 @@
 #include <sys/stat.h>
 #include <string.h>
 
-
 #include "tensor_runtime.h"
 #include "utils.h"
 
 int total_runs = 1;
 
-
 /* NOTE: Reference Architecture to use for profiling */
-void testLenetTanh(){
+void testLenetTanh() {
   printf("********* Lenet-2 Architecture ********** \n");
   // FIXIT: Extend this to batch of images - currently 5 images
 
   int test_batch_size = 5000;
 
-  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");   
+  std::string dir_prefix = model_params_path + std::string("/lenet_mnist/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels32_path = dir_prefix + std::string("labels32.bin");
 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string labels32_path =  dir_prefix + std::string("labels32.bin");
-  
   // Loading Input Batch
-  void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); 
-  uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); 
-    
-
-  void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin",
-					  float_type, 32, 1, 5, 5);    
-  void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin",
-					float_type, 1, 32, 1, 1);  
-  void* conv2_filter = readTrainedWeights("../model_params/lenet_mnist/conv2.bin",
-					  float_type, 64, 32, 5, 5);  
-  void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin",
-					float_type, 1, 64, 1, 1);  
-  void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
-					 float_type, 1, 1, 7*7*64, 1024);  
-  void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin",
-				      float_type, 1, 1024, 1, 1);  
-  void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
-					 float_type, 1, 1, 1024, 10);  
-  void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin",
-				      float_type, 1, 10, 1, 1);  
-
-
-  
+  void *input =
+      readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28);
+  uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size);
+
+  void *conv1_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5);
+  void *conv1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1);
+  void *conv2_filter = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5);
+  void *conv2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1);
+  void *fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin",
+                                         float_type, 1, 1, 7 * 7 * 64, 1024);
+  void *fc1_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1);
+  void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin",
+                                         float_type, 1, 1, 1024, 10);
+  void *fc2_bias = readTrainedWeights(
+      "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1);
+
   clearTensorMap();
-  
-  for(int i = 0; i < total_runs; i++){
+
+  for (int i = 0; i < total_runs; i++) {
     readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
 
-    // Start power and performnce profiling 
+    // Start power and performance profiling
     startProfiling();
-  
+
     int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+    int conv_precision =
+        0; // NOTE: using Float as compute precision. FIXIT: use enum
 
     // NOTE: 'SAME' convolution
-    void* conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    void *conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1,
+                                       conv_mode, conv_precision);
 
-    // NOTE: For tensorAdd, the only dimension that MUST match is channels  
+    // NOTE: For tensorAdd, the only dimension that MUST match is channels
     tensorAdd(conv1out, conv1_bias); // NOTE: In place operation
 
-    void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2);
 
-    void* conv1_tanh = tensorTanh(pool1out);
+    void *conv1_tanh = tensorTanh(pool1out);
 
-    // NOTE: input channels have to match between tensor op inputs and outputs 
-    void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
+    // NOTE: input channels have to match between tensor op inputs and outputs
+    void *conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+                                       conv_mode, conv_precision);
     tensorAdd(conv2out, conv2_bias); // NOTE: In place operation
 
-    void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+    void *pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
+
+    void *conv2_tanh = tensorTanh(pool2out);
 
-    void* conv2_tanh = tensorTanh(pool2out);
+    void *gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights);
 
-    void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights);  
+    void *gemm1biasout = tensorAdd(gemm1out, fc1_bias);
 
-    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+    void *tanh1out = tensorTanh(gemm1biasout);
 
-    void* tanh1out = tensorTanh(gemm1biasout);
-  
-    void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights);  
-  
-    void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias);
+    void *gemm2out = tensorGemmGPU(tanh1out, fc2_weights);
 
-    void* tanh2out = tensorTanh(gemm2_biasout);
-  
-    void* result = tensorSoftmax(tanh2out);
+    void *gemm2_biasout = tensorAdd(gemm2out, fc2_bias);
+
+    void *tanh2out = tensorTanh(gemm2_biasout);
+
+    void *result = tensorSoftmax(tanh2out);
 
     // End profiling and dump output to profile.txt
     stopProfiling();
-  
+
     float accuracy = computeAccuracy2(labels, test_batch_size, result);
-    dumpFinalAccuracy(accuracy); 
+    dumpFinalAccuracy(accuracy);
 
-    
-    //FIXME: remove the comment below to use piped autotuner
-    //dumpAccuracyNorms();
-    freeOutputTensors();  
+    // FIXME: uncomment the line below to use the piped autotuner
+    // dumpAccuracyNorms();
+    freeOutputTensors();
   }
 
   dumpExecutionAccuracies();
-
-  
 }
 
+int main(int argc, char *argv[]) {
 
-
-int main(int argc, char* argv[]){
-
-  if (argc > 1){
+  if (argc > 1) {
     total_runs = atoi(argv[1]);
   }
 
@@ -130,4 +122,3 @@ int main(int argc, char* argv[]){
 
   return 0;
 }
-
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
index 7c311a5686..0820d4467a 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc
@@ -1,414 +1,732 @@
 
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
 #include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h" 
+#include "../include/utils.h"
 
-int main(){ 
+int main() {
 
-  llvm_hpvm_initTensorRt(0); 
+  llvm_hpvm_initTensorRt(0);
 
+  std::string dir_prefix = model_params_path + std::string("/mobilenet/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
+  std::string batch_normalization_1_gamma_path =
+      dir_prefix + std::string("batch_normalization_1_gamma.bin");
+  void *batch_normalization_1_gamma = readTrainedWeights(
+      batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_beta_path =
+      dir_prefix + std::string("batch_normalization_1_beta.bin");
+  void *batch_normalization_1_beta = readTrainedWeights(
+      batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_mean_path =
+      dir_prefix + std::string("batch_normalization_1_mean.bin");
+  void *batch_normalization_1_mean = readTrainedWeights(
+      batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_1_variance_path =
+      dir_prefix + std::string("batch_normalization_1_variance.bin");
+  void *batch_normalization_1_variance = readTrainedWeights(
+      batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1);
+  std::string depthwise_conv2d_1_w_path =
+      dir_prefix + std::string("depthwise_conv2d_1_w.bin");
+  void *depthwise_conv2d_1_w =
+      readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3);
+  std::string batch_normalization_2_gamma_path =
+      dir_prefix + std::string("batch_normalization_2_gamma.bin");
+  void *batch_normalization_2_gamma = readTrainedWeights(
+      batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_beta_path =
+      dir_prefix + std::string("batch_normalization_2_beta.bin");
+  void *batch_normalization_2_beta = readTrainedWeights(
+      batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_mean_path =
+      dir_prefix + std::string("batch_normalization_2_mean.bin");
+  void *batch_normalization_2_mean = readTrainedWeights(
+      batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1);
+  std::string batch_normalization_2_variance_path =
+      dir_prefix + std::string("batch_normalization_2_variance.bin");
+  void *batch_normalization_2_variance = readTrainedWeights(
+      batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string batch_normalization_3_gamma_path =
+      dir_prefix + std::string("batch_normalization_3_gamma.bin");
+  void *batch_normalization_3_gamma = readTrainedWeights(
+      batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_beta_path =
+      dir_prefix + std::string("batch_normalization_3_beta.bin");
+  void *batch_normalization_3_beta = readTrainedWeights(
+      batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_mean_path =
+      dir_prefix + std::string("batch_normalization_3_mean.bin");
+  void *batch_normalization_3_mean = readTrainedWeights(
+      batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_variance_path =
+      dir_prefix + std::string("batch_normalization_3_variance.bin");
+  void *batch_normalization_3_variance = readTrainedWeights(
+      batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string depthwise_conv2d_2_w_path =
+      dir_prefix + std::string("depthwise_conv2d_2_w.bin");
+  void *depthwise_conv2d_2_w =
+      readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3);
+  std::string batch_normalization_4_gamma_path =
+      dir_prefix + std::string("batch_normalization_4_gamma.bin");
+  void *batch_normalization_4_gamma = readTrainedWeights(
+      batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_beta_path =
+      dir_prefix + std::string("batch_normalization_4_beta.bin");
+  void *batch_normalization_4_beta = readTrainedWeights(
+      batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_mean_path =
+      dir_prefix + std::string("batch_normalization_4_mean.bin");
+  void *batch_normalization_4_mean = readTrainedWeights(
+      batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_4_variance_path =
+      dir_prefix + std::string("batch_normalization_4_variance.bin");
+  void *batch_normalization_4_variance = readTrainedWeights(
+      batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1);
+  std::string batch_normalization_5_gamma_path =
+      dir_prefix + std::string("batch_normalization_5_gamma.bin");
+  void *batch_normalization_5_gamma = readTrainedWeights(
+      batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_beta_path =
+      dir_prefix + std::string("batch_normalization_5_beta.bin");
+  void *batch_normalization_5_beta = readTrainedWeights(
+      batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_mean_path =
+      dir_prefix + std::string("batch_normalization_5_mean.bin");
+  void *batch_normalization_5_mean = readTrainedWeights(
+      batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_5_variance_path =
+      dir_prefix + std::string("batch_normalization_5_variance.bin");
+  void *batch_normalization_5_variance = readTrainedWeights(
+      batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_3_w_path =
+      dir_prefix + std::string("depthwise_conv2d_3_w.bin");
+  void *depthwise_conv2d_3_w =
+      readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_6_gamma_path =
+      dir_prefix + std::string("batch_normalization_6_gamma.bin");
+  void *batch_normalization_6_gamma = readTrainedWeights(
+      batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_beta_path =
+      dir_prefix + std::string("batch_normalization_6_beta.bin");
+  void *batch_normalization_6_beta = readTrainedWeights(
+      batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_mean_path =
+      dir_prefix + std::string("batch_normalization_6_mean.bin");
+  void *batch_normalization_6_mean = readTrainedWeights(
+      batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_6_variance_path =
+      dir_prefix + std::string("batch_normalization_6_variance.bin");
+  void *batch_normalization_6_variance = readTrainedWeights(
+      batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1);
+  std::string batch_normalization_7_gamma_path =
+      dir_prefix + std::string("batch_normalization_7_gamma.bin");
+  void *batch_normalization_7_gamma = readTrainedWeights(
+      batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_beta_path =
+      dir_prefix + std::string("batch_normalization_7_beta.bin");
+  void *batch_normalization_7_beta = readTrainedWeights(
+      batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_mean_path =
+      dir_prefix + std::string("batch_normalization_7_mean.bin");
+  void *batch_normalization_7_mean = readTrainedWeights(
+      batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_7_variance_path =
+      dir_prefix + std::string("batch_normalization_7_variance.bin");
+  void *batch_normalization_7_variance = readTrainedWeights(
+      batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string depthwise_conv2d_4_w_path =
+      dir_prefix + std::string("depthwise_conv2d_4_w.bin");
+  void *depthwise_conv2d_4_w =
+      readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3);
+  std::string batch_normalization_8_gamma_path =
+      dir_prefix + std::string("batch_normalization_8_gamma.bin");
+  void *batch_normalization_8_gamma = readTrainedWeights(
+      batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_beta_path =
+      dir_prefix + std::string("batch_normalization_8_beta.bin");
+  void *batch_normalization_8_beta = readTrainedWeights(
+      batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_mean_path =
+      dir_prefix + std::string("batch_normalization_8_mean.bin");
+  void *batch_normalization_8_mean = readTrainedWeights(
+      batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_8_variance_path =
+      dir_prefix + std::string("batch_normalization_8_variance.bin");
+  void *batch_normalization_8_variance = readTrainedWeights(
+      batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1);
+  std::string batch_normalization_9_gamma_path =
+      dir_prefix + std::string("batch_normalization_9_gamma.bin");
+  void *batch_normalization_9_gamma = readTrainedWeights(
+      batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_beta_path =
+      dir_prefix + std::string("batch_normalization_9_beta.bin");
+  void *batch_normalization_9_beta = readTrainedWeights(
+      batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_mean_path =
+      dir_prefix + std::string("batch_normalization_9_mean.bin");
+  void *batch_normalization_9_mean = readTrainedWeights(
+      batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_9_variance_path =
+      dir_prefix + std::string("batch_normalization_9_variance.bin");
+  void *batch_normalization_9_variance = readTrainedWeights(
+      batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_5_w_path =
+      dir_prefix + std::string("depthwise_conv2d_5_w.bin");
+  void *depthwise_conv2d_5_w =
+      readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_10_gamma_path =
+      dir_prefix + std::string("batch_normalization_10_gamma.bin");
+  void *batch_normalization_10_gamma = readTrainedWeights(
+      batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_beta_path =
+      dir_prefix + std::string("batch_normalization_10_beta.bin");
+  void *batch_normalization_10_beta = readTrainedWeights(
+      batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_mean_path =
+      dir_prefix + std::string("batch_normalization_10_mean.bin");
+  void *batch_normalization_10_mean = readTrainedWeights(
+      batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_10_variance_path =
+      dir_prefix + std::string("batch_normalization_10_variance.bin");
+  void *batch_normalization_10_variance = readTrainedWeights(
+      batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 1, 1);
+  std::string batch_normalization_11_gamma_path =
+      dir_prefix + std::string("batch_normalization_11_gamma.bin");
+  void *batch_normalization_11_gamma = readTrainedWeights(
+      batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_beta_path =
+      dir_prefix + std::string("batch_normalization_11_beta.bin");
+  void *batch_normalization_11_beta = readTrainedWeights(
+      batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_mean_path =
+      dir_prefix + std::string("batch_normalization_11_mean.bin");
+  void *batch_normalization_11_mean = readTrainedWeights(
+      batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_variance_path =
+      dir_prefix + std::string("batch_normalization_11_variance.bin");
+  void *batch_normalization_11_variance = readTrainedWeights(
+      batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string depthwise_conv2d_6_w_path =
+      dir_prefix + std::string("depthwise_conv2d_6_w.bin");
+  void *depthwise_conv2d_6_w =
+      readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3);
+  std::string batch_normalization_12_gamma_path =
+      dir_prefix + std::string("batch_normalization_12_gamma.bin");
+  void *batch_normalization_12_gamma = readTrainedWeights(
+      batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_beta_path =
+      dir_prefix + std::string("batch_normalization_12_beta.bin");
+  void *batch_normalization_12_beta = readTrainedWeights(
+      batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_mean_path =
+      dir_prefix + std::string("batch_normalization_12_mean.bin");
+  void *batch_normalization_12_mean = readTrainedWeights(
+      batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_12_variance_path =
+      dir_prefix + std::string("batch_normalization_12_variance.bin");
+  void *batch_normalization_12_variance = readTrainedWeights(
+      batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1);
+  std::string batch_normalization_13_gamma_path =
+      dir_prefix + std::string("batch_normalization_13_gamma.bin");
+  void *batch_normalization_13_gamma = readTrainedWeights(
+      batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_beta_path =
+      dir_prefix + std::string("batch_normalization_13_beta.bin");
+  void *batch_normalization_13_beta = readTrainedWeights(
+      batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_mean_path =
+      dir_prefix + std::string("batch_normalization_13_mean.bin");
+  void *batch_normalization_13_mean = readTrainedWeights(
+      batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_13_variance_path =
+      dir_prefix + std::string("batch_normalization_13_variance.bin");
+  void *batch_normalization_13_variance = readTrainedWeights(
+      batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_7_w_path =
+      dir_prefix + std::string("depthwise_conv2d_7_w.bin");
+  void *depthwise_conv2d_7_w =
+      readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_14_gamma_path =
+      dir_prefix + std::string("batch_normalization_14_gamma.bin");
+  void *batch_normalization_14_gamma = readTrainedWeights(
+      batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_beta_path =
+      dir_prefix + std::string("batch_normalization_14_beta.bin");
+  void *batch_normalization_14_beta = readTrainedWeights(
+      batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_mean_path =
+      dir_prefix + std::string("batch_normalization_14_mean.bin");
+  void *batch_normalization_14_mean = readTrainedWeights(
+      batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_variance_path =
+      dir_prefix + std::string("batch_normalization_14_variance.bin");
+  void *batch_normalization_14_variance = readTrainedWeights(
+      batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_15_gamma_path =
+      dir_prefix + std::string("batch_normalization_15_gamma.bin");
+  void *batch_normalization_15_gamma = readTrainedWeights(
+      batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_beta_path =
+      dir_prefix + std::string("batch_normalization_15_beta.bin");
+  void *batch_normalization_15_beta = readTrainedWeights(
+      batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_mean_path =
+      dir_prefix + std::string("batch_normalization_15_mean.bin");
+  void *batch_normalization_15_mean = readTrainedWeights(
+      batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_variance_path =
+      dir_prefix + std::string("batch_normalization_15_variance.bin");
+  void *batch_normalization_15_variance = readTrainedWeights(
+      batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_8_w_path =
+      dir_prefix + std::string("depthwise_conv2d_8_w.bin");
+  void *depthwise_conv2d_8_w =
+      readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_16_gamma_path =
+      dir_prefix + std::string("batch_normalization_16_gamma.bin");
+  void *batch_normalization_16_gamma = readTrainedWeights(
+      batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_beta_path =
+      dir_prefix + std::string("batch_normalization_16_beta.bin");
+  void *batch_normalization_16_beta = readTrainedWeights(
+      batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_mean_path =
+      dir_prefix + std::string("batch_normalization_16_mean.bin");
+  void *batch_normalization_16_mean = readTrainedWeights(
+      batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_16_variance_path =
+      dir_prefix + std::string("batch_normalization_16_variance.bin");
+  void *batch_normalization_16_variance = readTrainedWeights(
+      batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_17_gamma_path =
+      dir_prefix + std::string("batch_normalization_17_gamma.bin");
+  void *batch_normalization_17_gamma = readTrainedWeights(
+      batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_beta_path =
+      dir_prefix + std::string("batch_normalization_17_beta.bin");
+  void *batch_normalization_17_beta = readTrainedWeights(
+      batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_mean_path =
+      dir_prefix + std::string("batch_normalization_17_mean.bin");
+  void *batch_normalization_17_mean = readTrainedWeights(
+      batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_17_variance_path =
+      dir_prefix + std::string("batch_normalization_17_variance.bin");
+  void *batch_normalization_17_variance = readTrainedWeights(
+      batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_9_w_path =
+      dir_prefix + std::string("depthwise_conv2d_9_w.bin");
+  void *depthwise_conv2d_9_w =
+      readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_18_gamma_path =
+      dir_prefix + std::string("batch_normalization_18_gamma.bin");
+  void *batch_normalization_18_gamma = readTrainedWeights(
+      batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_beta_path =
+      dir_prefix + std::string("batch_normalization_18_beta.bin");
+  void *batch_normalization_18_beta = readTrainedWeights(
+      batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_mean_path =
+      dir_prefix + std::string("batch_normalization_18_mean.bin");
+  void *batch_normalization_18_mean = readTrainedWeights(
+      batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_variance_path =
+      dir_prefix + std::string("batch_normalization_18_variance.bin");
+  void *batch_normalization_18_variance = readTrainedWeights(
+      batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_19_gamma_path =
+      dir_prefix + std::string("batch_normalization_19_gamma.bin");
+  void *batch_normalization_19_gamma = readTrainedWeights(
+      batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_beta_path =
+      dir_prefix + std::string("batch_normalization_19_beta.bin");
+  void *batch_normalization_19_beta = readTrainedWeights(
+      batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_mean_path =
+      dir_prefix + std::string("batch_normalization_19_mean.bin");
+  void *batch_normalization_19_mean = readTrainedWeights(
+      batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_19_variance_path =
+      dir_prefix + std::string("batch_normalization_19_variance.bin");
+  void *batch_normalization_19_variance = readTrainedWeights(
+      batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_10_w_path =
+      dir_prefix + std::string("depthwise_conv2d_10_w.bin");
+  void *depthwise_conv2d_10_w =
+      readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_20_gamma_path =
+      dir_prefix + std::string("batch_normalization_20_gamma.bin");
+  void *batch_normalization_20_gamma = readTrainedWeights(
+      batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_beta_path =
+      dir_prefix + std::string("batch_normalization_20_beta.bin");
+  void *batch_normalization_20_beta = readTrainedWeights(
+      batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_mean_path =
+      dir_prefix + std::string("batch_normalization_20_mean.bin");
+  void *batch_normalization_20_mean = readTrainedWeights(
+      batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_20_variance_path =
+      dir_prefix + std::string("batch_normalization_20_variance.bin");
+  void *batch_normalization_20_variance = readTrainedWeights(
+      batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_21_gamma_path =
+      dir_prefix + std::string("batch_normalization_21_gamma.bin");
+  void *batch_normalization_21_gamma = readTrainedWeights(
+      batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_beta_path =
+      dir_prefix + std::string("batch_normalization_21_beta.bin");
+  void *batch_normalization_21_beta = readTrainedWeights(
+      batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_mean_path =
+      dir_prefix + std::string("batch_normalization_21_mean.bin");
+  void *batch_normalization_21_mean = readTrainedWeights(
+      batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_variance_path =
+      dir_prefix + std::string("batch_normalization_21_variance.bin");
+  void *batch_normalization_21_variance = readTrainedWeights(
+      batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_11_w_path =
+      dir_prefix + std::string("depthwise_conv2d_11_w.bin");
+  void *depthwise_conv2d_11_w =
+      readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_22_gamma_path =
+      dir_prefix + std::string("batch_normalization_22_gamma.bin");
+  void *batch_normalization_22_gamma = readTrainedWeights(
+      batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_beta_path =
+      dir_prefix + std::string("batch_normalization_22_beta.bin");
+  void *batch_normalization_22_beta = readTrainedWeights(
+      batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_mean_path =
+      dir_prefix + std::string("batch_normalization_22_mean.bin");
+  void *batch_normalization_22_mean = readTrainedWeights(
+      batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_22_variance_path =
+      dir_prefix + std::string("batch_normalization_22_variance.bin");
+  void *batch_normalization_22_variance = readTrainedWeights(
+      batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1);
+  std::string batch_normalization_23_gamma_path =
+      dir_prefix + std::string("batch_normalization_23_gamma.bin");
+  void *batch_normalization_23_gamma = readTrainedWeights(
+      batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_beta_path =
+      dir_prefix + std::string("batch_normalization_23_beta.bin");
+  void *batch_normalization_23_beta = readTrainedWeights(
+      batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_mean_path =
+      dir_prefix + std::string("batch_normalization_23_mean.bin");
+  void *batch_normalization_23_mean = readTrainedWeights(
+      batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_23_variance_path =
+      dir_prefix + std::string("batch_normalization_23_variance.bin");
+  void *batch_normalization_23_variance = readTrainedWeights(
+      batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string depthwise_conv2d_12_w_path =
+      dir_prefix + std::string("depthwise_conv2d_12_w.bin");
+  void *depthwise_conv2d_12_w =
+      readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3);
+  std::string batch_normalization_24_gamma_path =
+      dir_prefix + std::string("batch_normalization_24_gamma.bin");
+  void *batch_normalization_24_gamma = readTrainedWeights(
+      batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_beta_path =
+      dir_prefix + std::string("batch_normalization_24_beta.bin");
+  void *batch_normalization_24_beta = readTrainedWeights(
+      batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_mean_path =
+      dir_prefix + std::string("batch_normalization_24_mean.bin");
+  void *batch_normalization_24_mean = readTrainedWeights(
+      batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_variance_path =
+      dir_prefix + std::string("batch_normalization_24_variance.bin");
+  void *batch_normalization_24_variance = readTrainedWeights(
+      batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1);
+  std::string batch_normalization_25_gamma_path =
+      dir_prefix + std::string("batch_normalization_25_gamma.bin");
+  void *batch_normalization_25_gamma = readTrainedWeights(
+      batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_beta_path =
+      dir_prefix + std::string("batch_normalization_25_beta.bin");
+  void *batch_normalization_25_beta = readTrainedWeights(
+      batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_mean_path =
+      dir_prefix + std::string("batch_normalization_25_mean.bin");
+  void *batch_normalization_25_mean = readTrainedWeights(
+      batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_25_variance_path =
+      dir_prefix + std::string("batch_normalization_25_variance.bin");
+  void *batch_normalization_25_variance = readTrainedWeights(
+      batch_normalization_25_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string depthwise_conv2d_13_w_path =
+      dir_prefix + std::string("depthwise_conv2d_13_w.bin");
+  void *depthwise_conv2d_13_w =
+      readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3);
+  std::string batch_normalization_26_gamma_path =
+      dir_prefix + std::string("batch_normalization_26_gamma.bin");
+  void *batch_normalization_26_gamma = readTrainedWeights(
+      batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_beta_path =
+      dir_prefix + std::string("batch_normalization_26_beta.bin");
+  void *batch_normalization_26_beta = readTrainedWeights(
+      batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_mean_path =
+      dir_prefix + std::string("batch_normalization_26_mean.bin");
+  void *batch_normalization_26_mean = readTrainedWeights(
+      batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_26_variance_path =
+      dir_prefix + std::string("batch_normalization_26_variance.bin");
+  void *batch_normalization_26_variance = readTrainedWeights(
+      batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1);
+  std::string batch_normalization_27_gamma_path =
+      dir_prefix + std::string("batch_normalization_27_gamma.bin");
+  void *batch_normalization_27_gamma = readTrainedWeights(
+      batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_beta_path =
+      dir_prefix + std::string("batch_normalization_27_beta.bin");
+  void *batch_normalization_27_beta = readTrainedWeights(
+      batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_mean_path =
+      dir_prefix + std::string("batch_normalization_27_mean.bin");
+  void *batch_normalization_27_mean = readTrainedWeights(
+      batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_variance_path =
+      dir_prefix + std::string("batch_normalization_27_variance.bin");
+  void *batch_normalization_27_variance = readTrainedWeights(
+      batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
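  // Every parameter above is loaded with the same call shape:
  // readTrainedWeights(path, 0, N, C, H, W), with the last four arguments
  // giving the tensor dimensions (1x1 pointwise filters as out x in x 1 x 1,
  // depthwise filters as channels x 1 x 3 x 3, batch-norm parameters as
  // 1 x C x 1 x 1). A minimal sketch of how the repetition could be folded
  // into a helper; `loadParam` is hypothetical, and the second argument is
  // simply forwarded as 0, matching every call in this file:
  auto loadParam = [&](const char *file, int n, int c, int h, int w) {
    std::string path = dir_prefix + std::string(file);
    return readTrainedWeights(path.c_str(), 0, n, c, h, w);
  };
  // Equivalent to the conv2d_3_w load above:
  //   void *conv2d_3_w = loadParam("conv2d_3_w.bin", 128, 64, 1, 1);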
 
-  std::string dir_prefix = model_params_path + std::string("/mobilenet/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); 
-  std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
-  void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
-  void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
-  void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
-  void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); 
-  std::string depthwise_conv2d_1_w_path =  dir_prefix + std::string("depthwise_conv2d_1_w.bin"); 
-  void* depthwise_conv2d_1_w =  readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); 
-  std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
-  void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
-  void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
-  void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); 
-  std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
-  void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); 
-  std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
-  void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
-  void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
-  void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
-  void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
-  std::string depthwise_conv2d_2_w_path =  dir_prefix + std::string("depthwise_conv2d_2_w.bin"); 
-  void* depthwise_conv2d_2_w =  readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); 
-  std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
-  void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
-  void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
-  void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
-  void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); 
-  std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
-  void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
-  void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
-  void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
-  void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); 
-  std::string depthwise_conv2d_3_w_path =  dir_prefix + std::string("depthwise_conv2d_3_w.bin"); 
-  void* depthwise_conv2d_3_w =  readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); 
-  std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
-  void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
-  void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
-  void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
-  void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); 
-  std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
-  void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
-  void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
-  void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
-  void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); 
-  std::string depthwise_conv2d_4_w_path =  dir_prefix + std::string("depthwise_conv2d_4_w.bin"); 
-  void* depthwise_conv2d_4_w =  readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); 
-  std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
-  void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
-  void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
-  void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
-  void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); 
-  std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
-  void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
-  void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
-  void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
-  void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); 
-  std::string depthwise_conv2d_5_w_path =  dir_prefix + std::string("depthwise_conv2d_5_w.bin"); 
-  void* depthwise_conv2d_5_w =  readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); 
-  std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
-  void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
-  void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
-  void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
-  void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); 
-  std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
-  void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
-  void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
-  void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
-  void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
-  std::string depthwise_conv2d_6_w_path =  dir_prefix + std::string("depthwise_conv2d_6_w.bin"); 
-  void* depthwise_conv2d_6_w =  readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); 
-  std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
-  void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
-  void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
-  void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
-  void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); 
-  std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
-  void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
-  void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
-  void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
-  void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_7_w_path =  dir_prefix + std::string("depthwise_conv2d_7_w.bin"); 
-  void* depthwise_conv2d_7_w =  readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_14_gamma_path =  dir_prefix + std::string("batch_normalization_14_gamma.bin"); 
-  void* batch_normalization_14_gamma =  readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_beta_path =  dir_prefix + std::string("batch_normalization_14_beta.bin"); 
-  void* batch_normalization_14_beta =  readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_mean_path =  dir_prefix + std::string("batch_normalization_14_mean.bin"); 
-  void* batch_normalization_14_mean =  readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_variance_path =  dir_prefix + std::string("batch_normalization_14_variance.bin"); 
-  void* batch_normalization_14_variance =  readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_15_gamma_path =  dir_prefix + std::string("batch_normalization_15_gamma.bin"); 
-  void* batch_normalization_15_gamma =  readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_beta_path =  dir_prefix + std::string("batch_normalization_15_beta.bin"); 
-  void* batch_normalization_15_beta =  readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_mean_path =  dir_prefix + std::string("batch_normalization_15_mean.bin"); 
-  void* batch_normalization_15_mean =  readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_variance_path =  dir_prefix + std::string("batch_normalization_15_variance.bin"); 
-  void* batch_normalization_15_variance =  readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_8_w_path =  dir_prefix + std::string("depthwise_conv2d_8_w.bin"); 
-  void* depthwise_conv2d_8_w =  readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_16_gamma_path =  dir_prefix + std::string("batch_normalization_16_gamma.bin"); 
-  void* batch_normalization_16_gamma =  readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_16_beta_path =  dir_prefix + std::string("batch_normalization_16_beta.bin"); 
-  void* batch_normalization_16_beta =  readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_16_mean_path =  dir_prefix + std::string("batch_normalization_16_mean.bin"); 
-  void* batch_normalization_16_mean =  readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_16_variance_path =  dir_prefix + std::string("batch_normalization_16_variance.bin"); 
-  void* batch_normalization_16_variance =  readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_17_gamma_path =  dir_prefix + std::string("batch_normalization_17_gamma.bin"); 
-  void* batch_normalization_17_gamma =  readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_17_beta_path =  dir_prefix + std::string("batch_normalization_17_beta.bin"); 
-  void* batch_normalization_17_beta =  readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_17_mean_path =  dir_prefix + std::string("batch_normalization_17_mean.bin"); 
-  void* batch_normalization_17_mean =  readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_17_variance_path =  dir_prefix + std::string("batch_normalization_17_variance.bin"); 
-  void* batch_normalization_17_variance =  readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_9_w_path =  dir_prefix + std::string("depthwise_conv2d_9_w.bin"); 
-  void* depthwise_conv2d_9_w =  readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_18_gamma_path =  dir_prefix + std::string("batch_normalization_18_gamma.bin"); 
-  void* batch_normalization_18_gamma =  readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_beta_path =  dir_prefix + std::string("batch_normalization_18_beta.bin"); 
-  void* batch_normalization_18_beta =  readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_mean_path =  dir_prefix + std::string("batch_normalization_18_mean.bin"); 
-  void* batch_normalization_18_mean =  readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_variance_path =  dir_prefix + std::string("batch_normalization_18_variance.bin"); 
-  void* batch_normalization_18_variance =  readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_19_gamma_path =  dir_prefix + std::string("batch_normalization_19_gamma.bin"); 
-  void* batch_normalization_19_gamma =  readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_19_beta_path =  dir_prefix + std::string("batch_normalization_19_beta.bin"); 
-  void* batch_normalization_19_beta =  readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_19_mean_path =  dir_prefix + std::string("batch_normalization_19_mean.bin"); 
-  void* batch_normalization_19_mean =  readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_19_variance_path =  dir_prefix + std::string("batch_normalization_19_variance.bin"); 
-  void* batch_normalization_19_variance =  readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_10_w_path =  dir_prefix + std::string("depthwise_conv2d_10_w.bin"); 
-  void* depthwise_conv2d_10_w =  readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_20_gamma_path =  dir_prefix + std::string("batch_normalization_20_gamma.bin"); 
-  void* batch_normalization_20_gamma =  readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_20_beta_path =  dir_prefix + std::string("batch_normalization_20_beta.bin"); 
-  void* batch_normalization_20_beta =  readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_20_mean_path =  dir_prefix + std::string("batch_normalization_20_mean.bin"); 
-  void* batch_normalization_20_mean =  readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_20_variance_path =  dir_prefix + std::string("batch_normalization_20_variance.bin"); 
-  void* batch_normalization_20_variance =  readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_21_gamma_path =  dir_prefix + std::string("batch_normalization_21_gamma.bin"); 
-  void* batch_normalization_21_gamma =  readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_beta_path =  dir_prefix + std::string("batch_normalization_21_beta.bin"); 
-  void* batch_normalization_21_beta =  readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_mean_path =  dir_prefix + std::string("batch_normalization_21_mean.bin"); 
-  void* batch_normalization_21_mean =  readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_variance_path =  dir_prefix + std::string("batch_normalization_21_variance.bin"); 
-  void* batch_normalization_21_variance =  readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_11_w_path =  dir_prefix + std::string("depthwise_conv2d_11_w.bin"); 
-  void* depthwise_conv2d_11_w =  readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_22_gamma_path =  dir_prefix + std::string("batch_normalization_22_gamma.bin"); 
-  void* batch_normalization_22_gamma =  readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_22_beta_path =  dir_prefix + std::string("batch_normalization_22_beta.bin"); 
-  void* batch_normalization_22_beta =  readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_22_mean_path =  dir_prefix + std::string("batch_normalization_22_mean.bin"); 
-  void* batch_normalization_22_mean =  readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_22_variance_path =  dir_prefix + std::string("batch_normalization_22_variance.bin"); 
-  void* batch_normalization_22_variance =  readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); 
-  std::string batch_normalization_23_gamma_path =  dir_prefix + std::string("batch_normalization_23_gamma.bin"); 
-  void* batch_normalization_23_gamma =  readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_23_beta_path =  dir_prefix + std::string("batch_normalization_23_beta.bin"); 
-  void* batch_normalization_23_beta =  readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_23_mean_path =  dir_prefix + std::string("batch_normalization_23_mean.bin"); 
-  void* batch_normalization_23_mean =  readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_23_variance_path =  dir_prefix + std::string("batch_normalization_23_variance.bin"); 
-  void* batch_normalization_23_variance =  readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); 
-  std::string depthwise_conv2d_12_w_path =  dir_prefix + std::string("depthwise_conv2d_12_w.bin"); 
-  void* depthwise_conv2d_12_w =  readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); 
-  std::string batch_normalization_24_gamma_path =  dir_prefix + std::string("batch_normalization_24_gamma.bin"); 
-  void* batch_normalization_24_gamma =  readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_beta_path =  dir_prefix + std::string("batch_normalization_24_beta.bin"); 
-  void* batch_normalization_24_beta =  readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_mean_path =  dir_prefix + std::string("batch_normalization_24_mean.bin"); 
-  void* batch_normalization_24_mean =  readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_variance_path =  dir_prefix + std::string("batch_normalization_24_variance.bin"); 
-  void* batch_normalization_24_variance =  readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); 
-  std::string batch_normalization_25_gamma_path =  dir_prefix + std::string("batch_normalization_25_gamma.bin"); 
-  void* batch_normalization_25_gamma =  readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_25_beta_path =  dir_prefix + std::string("batch_normalization_25_beta.bin"); 
-  void* batch_normalization_25_beta =  readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_25_mean_path =  dir_prefix + std::string("batch_normalization_25_mean.bin"); 
-  void* batch_normalization_25_mean =  readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_25_variance_path =  dir_prefix + std::string("batch_normalization_25_variance.bin"); 
-  void* batch_normalization_25_variance =  readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string depthwise_conv2d_13_w_path =  dir_prefix + std::string("depthwise_conv2d_13_w.bin"); 
-  void* depthwise_conv2d_13_w =  readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); 
-  std::string batch_normalization_26_gamma_path =  dir_prefix + std::string("batch_normalization_26_gamma.bin"); 
-  void* batch_normalization_26_gamma =  readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_26_beta_path =  dir_prefix + std::string("batch_normalization_26_beta.bin"); 
-  void* batch_normalization_26_beta =  readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_26_mean_path =  dir_prefix + std::string("batch_normalization_26_mean.bin"); 
-  void* batch_normalization_26_mean =  readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_26_variance_path =  dir_prefix + std::string("batch_normalization_26_variance.bin"); 
-  void* batch_normalization_26_variance =  readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); 
-  std::string batch_normalization_27_gamma_path =  dir_prefix + std::string("batch_normalization_27_gamma.bin"); 
-  void* batch_normalization_27_gamma =  readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_beta_path =  dir_prefix + std::string("batch_normalization_27_beta.bin"); 
-  void* batch_normalization_27_beta =  readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_mean_path =  dir_prefix + std::string("batch_normalization_27_mean.bin"); 
-  void* batch_normalization_27_mean =  readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_variance_path =  dir_prefix + std::string("batch_normalization_27_variance.bin"); 
-  void* batch_normalization_27_variance =  readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+  startMemTracking();
 
+  int test_input_size = 2000;
+  int batch_size = 2000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
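  // With test_input_size and batch_size both set to 2000, batch_count is
  // 2000 / 2000 = 1, so the loop below makes a single pass over input images
  // [0, 2000); readInputBatch returns them as 3 x 32 x 32 tensors
  // (CIFAR-10-sized inputs). Lowering batch_size would split the run into
  // several smaller batches, which startMemTracking() above presumably
  // exists to keep within GPU memory.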
 
+  for (int i = 0; i < batch_count; i++) {
 
-  startMemTracking(); 
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
-  int test_input_size = 2000; 
-  int batch_size = 2000;  
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
 
-  for(int i = 0; i < batch_count; i++){ 
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1);
+    void *var_1 = tensorBatchNorm(
+        var_0, batch_normalization_1_gamma, batch_normalization_1_beta,
+        batch_normalization_1_mean, batch_normalization_1_variance, 0.001);
+    void *var_2 = tensorRelu(var_1);
+    void *var_4 =
+        tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32);
+    void *var_5 = tensorBatchNorm(
+        var_4, batch_normalization_2_gamma, batch_normalization_2_beta,
+        batch_normalization_2_mean, batch_normalization_2_variance, 0.001);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1);
+    void *var_8 = tensorBatchNorm(
+        var_7, batch_normalization_3_gamma, batch_normalization_3_beta,
+        batch_normalization_3_mean, batch_normalization_3_variance, 0.001);
+    void *var_9 = tensorRelu(var_8);
+    void *var_11 =
+        tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64);
+    void *var_12 = tensorBatchNorm(
+        var_11, batch_normalization_4_gamma, batch_normalization_4_beta,
+        batch_normalization_4_mean, batch_normalization_4_variance, 0.001);
+    void *var_13 = tensorRelu(var_12);
+    void *var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1);
+    void *var_15 = tensorBatchNorm(
+        var_14, batch_normalization_5_gamma, batch_normalization_5_beta,
+        batch_normalization_5_mean, batch_normalization_5_variance, 0.001);
+    void *var_16 = tensorRelu(var_15);
+    void *var_18 =
+        tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128);
+    void *var_19 = tensorBatchNorm(
+        var_18, batch_normalization_6_gamma, batch_normalization_6_beta,
+        batch_normalization_6_mean, batch_normalization_6_variance, 0.001);
+    void *var_20 = tensorRelu(var_19);
+    void *var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1);
+    void *var_22 = tensorBatchNorm(
+        var_21, batch_normalization_7_gamma, batch_normalization_7_beta,
+        batch_normalization_7_mean, batch_normalization_7_variance, 0.001);
+    void *var_23 = tensorRelu(var_22);
+    void *var_26 =
+        tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128);
+    void *var_27 = tensorBatchNorm(
+        var_26, batch_normalization_8_gamma, batch_normalization_8_beta,
+        batch_normalization_8_mean, batch_normalization_8_variance, 0.001);
+    void *var_28 = tensorRelu(var_27);
+    void *var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1);
+    void *var_30 = tensorBatchNorm(
+        var_29, batch_normalization_9_gamma, batch_normalization_9_beta,
+        batch_normalization_9_mean, batch_normalization_9_variance, 0.001);
+    void *var_31 = tensorRelu(var_30);
+    void *var_33 =
+        tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256);
+    void *var_34 = tensorBatchNorm(
+        var_33, batch_normalization_10_gamma, batch_normalization_10_beta,
+        batch_normalization_10_mean, batch_normalization_10_variance, 0.001);
+    void *var_35 = tensorRelu(var_34);
+    void *var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1);
+    void *var_37 = tensorBatchNorm(
+        var_36, batch_normalization_11_gamma, batch_normalization_11_beta,
+        batch_normalization_11_mean, batch_normalization_11_variance, 0.001);
+    void *var_38 = tensorRelu(var_37);
+    void *var_41 =
+        tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256);
+    void *var_42 = tensorBatchNorm(
+        var_41, batch_normalization_12_gamma, batch_normalization_12_beta,
+        batch_normalization_12_mean, batch_normalization_12_variance, 0.001);
+    void *var_43 = tensorRelu(var_42);
+    void *var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1);
+    void *var_45 = tensorBatchNorm(
+        var_44, batch_normalization_13_gamma, batch_normalization_13_beta,
+        batch_normalization_13_mean, batch_normalization_13_variance, 0.001);
+    void *var_46 = tensorRelu(var_45);
+    void *var_48 =
+        tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512);
+    void *var_49 = tensorBatchNorm(
+        var_48, batch_normalization_14_gamma, batch_normalization_14_beta,
+        batch_normalization_14_mean, batch_normalization_14_variance, 0.001);
+    void *var_50 = tensorRelu(var_49);
+    void *var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1);
+    void *var_52 = tensorBatchNorm(
+        var_51, batch_normalization_15_gamma, batch_normalization_15_beta,
+        batch_normalization_15_mean, batch_normalization_15_variance, 0.001);
+    void *var_53 = tensorRelu(var_52);
+    void *var_55 =
+        tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512);
+    void *var_56 = tensorBatchNorm(
+        var_55, batch_normalization_16_gamma, batch_normalization_16_beta,
+        batch_normalization_16_mean, batch_normalization_16_variance, 0.001);
+    void *var_57 = tensorRelu(var_56);
+    void *var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1);
+    void *var_59 = tensorBatchNorm(
+        var_58, batch_normalization_17_gamma, batch_normalization_17_beta,
+        batch_normalization_17_mean, batch_normalization_17_variance, 0.001);
+    void *var_60 = tensorRelu(var_59);
+    void *var_63 =
+        tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512);
+    void *var_64 = tensorBatchNorm(
+        var_63, batch_normalization_18_gamma, batch_normalization_18_beta,
+        batch_normalization_18_mean, batch_normalization_18_variance, 0.001);
+    void *var_65 = tensorRelu(var_64);
+    void *var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1);
+    void *var_67 = tensorBatchNorm(
+        var_66, batch_normalization_19_gamma, batch_normalization_19_beta,
+        batch_normalization_19_mean, batch_normalization_19_variance, 0.001);
+    void *var_68 = tensorRelu(var_67);
+    void *var_70 =
+        tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512);
+    void *var_71 = tensorBatchNorm(
+        var_70, batch_normalization_20_gamma, batch_normalization_20_beta,
+        batch_normalization_20_mean, batch_normalization_20_variance, 0.001);
+    void *var_72 = tensorRelu(var_71);
+    void *var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1);
+    void *var_74 = tensorBatchNorm(
+        var_73, batch_normalization_21_gamma, batch_normalization_21_beta,
+        batch_normalization_21_mean, batch_normalization_21_variance, 0.001);
+    void *var_75 = tensorRelu(var_74);
+    void *var_77 =
+        tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512);
+    void *var_78 = tensorBatchNorm(
+        var_77, batch_normalization_22_gamma, batch_normalization_22_beta,
+        batch_normalization_22_mean, batch_normalization_22_variance, 0.001);
+    void *var_79 = tensorRelu(var_78);
+    void *var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1);
+    void *var_81 = tensorBatchNorm(
+        var_80, batch_normalization_23_gamma, batch_normalization_23_beta,
+        batch_normalization_23_mean, batch_normalization_23_variance, 0.001);
+    void *var_82 = tensorRelu(var_81);
+    void *var_85 =
+        tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512);
+    void *var_86 = tensorBatchNorm(
+        var_85, batch_normalization_24_gamma, batch_normalization_24_beta,
+        batch_normalization_24_mean, batch_normalization_24_variance, 0.001);
+    void *var_87 = tensorRelu(var_86);
+    void *var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1);
+    void *var_89 = tensorBatchNorm(
+        var_88, batch_normalization_25_gamma, batch_normalization_25_beta,
+        batch_normalization_25_mean, batch_normalization_25_variance, 0.001);
+    void *var_90 = tensorRelu(var_89);
+    void *var_92 =
+        tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024);
+    void *var_93 = tensorBatchNorm(
+        var_92, batch_normalization_26_gamma, batch_normalization_26_beta,
+        batch_normalization_26_mean, batch_normalization_26_variance, 0.001);
+    void *var_94 = tensorRelu(var_93);
+    void *var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1);
+    void *var_96 = tensorBatchNorm(
+        var_95, batch_normalization_27_gamma, batch_normalization_27_beta,
+        batch_normalization_27_mean, batch_normalization_27_variance, 0.001);
+    void *var_97 = tensorRelu(var_96);
+    void *var_99 = tensorPooling(var_97, 1, 2, 2, 0, 0, 2, 2);
+    void *var_101 = tensorGemmGPU(var_99, dense_1_w);
+    void *var_102 = tensorAdd(var_101, dense_1_b);
+    void *var_103 = tensorSoftmax(var_102);
 
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
-    void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
-    void* var_2 = tensorRelu(var_1); 
-    void* var_4 = tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-    void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-    void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-    void* var_9 = tensorRelu(var_8); 
-    void* var_11 = tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-    void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
-    void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-    void* var_16 = tensorRelu(var_15); 
-    void* var_18 = tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-    void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
-    void* var_20 = tensorRelu(var_19); 
-    void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-    void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
-    void* var_23 = tensorRelu(var_22); 
-    void* var_26 = tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-    void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-    void* var_28 = tensorRelu(var_27); 
-    void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-    void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-    void* var_31 = tensorRelu(var_30); 
-    void* var_33 = tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-    void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-    void* var_35 = tensorRelu(var_34); 
-    void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-    void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-    void* var_38 = tensorRelu(var_37); 
-    void* var_41 = tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-    void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
-    void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
-    void* var_46 = tensorRelu(var_45); 
-    void* var_48 = tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-    void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
-    void* var_50 = tensorRelu(var_49); 
-    void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-    void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
-    void* var_53 = tensorRelu(var_52); 
-    void* var_55 = tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-    void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-    void* var_57 = tensorRelu(var_56); 
-    void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-    void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-    void* var_60 = tensorRelu(var_59); 
-    void* var_63 = tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-    void* var_64 = tensorBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
-    void* var_65 = tensorRelu(var_64); 
-    void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
-    void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
-    void* var_68 = tensorRelu(var_67); 
-    void* var_70 = tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-    void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-    void* var_72 = tensorRelu(var_71); 
-    void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-    void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
-    void* var_75 = tensorRelu(var_74); 
-    void* var_77 = tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-    void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
-    void* var_79 = tensorRelu(var_78); 
-    void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
-    void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-    void* var_82 = tensorRelu(var_81); 
-    void* var_85 = tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-    void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-    void* var_87 = tensorRelu(var_86); 
-    void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
-    void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-    void* var_90 = tensorRelu(var_89); 
-    void* var_92 = tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-    void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-    void* var_94 = tensorRelu(var_93); 
-    void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-    void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-    void* var_97 = tensorRelu(var_96); 
-    void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); 
-    void* var_101 = tensorGemmGPU(var_99, dense_1_w); 
-    void* var_102 = tensorAdd(var_101, dense_1_b); 
-    void* var_103 = tensorSoftmax(var_102); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy2(labels, batch_size, var_103); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
 
+    float accuracy = computeAccuracy2(labels, batch_size, var_103);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
-
-
-  llvm_hpvm_cleanupTensorRt(); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
index 87b8cd4156..dc462f3943 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
@@ -1,112 +1,155 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(1); 
-  
-  std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); 
-  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
-  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); 
-  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
-  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); 
-  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
-  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
-  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); 
-  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
-  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
-  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
-  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
-  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
-  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
-  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
-  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
-  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
-  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
-  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
-  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
 
+int main() {
+
+  llvm_hpvm_initTensorRt(1);
+
+  std::string dir_prefix =
+      model_params_path + std::string("/resnet18_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  // void* input = readTrainedWeights(input_path.c_str(), 0,
+  // batch_size,3,32,32);
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  // uint8_t* labels = readLabels(labels_path.c_str(), batch_size);
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3);
+  std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin");
+  void *conv2d_14_b =
+      readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1);
+  std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin");
+  void *conv2d_15_w =
+      readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3);
+  std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin");
+  void *conv2d_15_b =
+      readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin");
+  void *conv2d_17_w =
+      readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1);
+  std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin");
+  void *conv2d_17_b =
+      readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin");
+  void *conv2d_16_w =
+      readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin");
+  void *conv2d_16_b =
+      readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin");
+  void *conv2d_18_w =
+      readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin");
+  void *conv2d_18_b =
+      readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin");
+  void *conv2d_19_w =
+      readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin");
+  void *conv2d_19_b =
+      readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin");
+  void *conv2d_20_w =
+      readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin");
+  void *conv2d_20_b =
+      readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin");
+  void *conv2d_21_w =
+      readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin");
+  void *conv2d_21_b =
+      readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -117,94 +160,94 @@ int main(){
 
   // NOTE: Starting time profiling
   startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
+
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
-    
-    void* var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_3 = tensorAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorRelu(var_3); 
-    void* var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_7 = tensorAdd(var_6, conv2d_2_b); 
-    void* var_8 = tensorRelu(var_7); 
-    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorAdd(var_4, var_11); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_16 = tensorAdd(var_15, conv2d_4_b); 
-    void* var_17 = tensorRelu(var_16); 
-    void* var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_20 = tensorAdd(var_19, conv2d_5_b); 
-    void* var_21 = tensorAdd(var_13, var_20); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorAdd(var_24, conv2d_6_b); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorAdd(var_28, conv2d_7_b); 
-    void* var_30 = tensorAdd(var_22, var_29); 
-    void* var_31 = tensorRelu(var_30); 
-    void* var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); 
-    void* var_34 = tensorAdd(var_33, conv2d_8_b); 
-    void* var_35 = tensorRelu(var_34); 
-    void* var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_38 = tensorAdd(var_37, conv2d_9_b); 
-    void* var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); 
-    void* var_41 = tensorAdd(var_40, conv2d_10_b); 
-    void* var_42 = tensorAdd(var_41, var_38); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_46 = tensorAdd(var_45, conv2d_11_b); 
-    void* var_47 = tensorRelu(var_46); 
-    void* var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_50 = tensorAdd(var_49, conv2d_12_b); 
-    void* var_51 = tensorAdd(var_43, var_50); 
-    void* var_52 = tensorRelu(var_51); 
-    void* var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_55 = tensorAdd(var_54, conv2d_13_b); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); 
-    void* var_59 = tensorAdd(var_58, conv2d_14_b); 
-    void* var_60 = tensorAdd(var_52, var_59); 
-    void* var_61 = tensorRelu(var_60); 
-    void* var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); 
-    void* var_64 = tensorAdd(var_63, conv2d_15_b); 
-    void* var_65 = tensorRelu(var_64); 
-    void* var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); 
-    void* var_68 = tensorAdd(var_67, conv2d_16_b); 
-    void* var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); 
-    void* var_71 = tensorAdd(var_70, conv2d_17_b); 
-    void* var_72 = tensorAdd(var_71, var_68); 
-    void* var_73 = tensorRelu(var_72); 
-    void* var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); 
-    void* var_76 = tensorAdd(var_75, conv2d_18_b); 
-    void* var_77 = tensorRelu(var_76); 
-    void* var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); 
-    void* var_80 = tensorAdd(var_79, conv2d_19_b); 
-    void* var_81 = tensorAdd(var_73, var_80); 
-    void* var_82 = tensorRelu(var_81); 
-    void* var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); 
-    void* var_85 = tensorAdd(var_84, conv2d_20_b); 
-    void* var_86 = tensorRelu(var_85); 
-    void* var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); 
-    void* var_89 = tensorAdd(var_88, conv2d_21_b); 
-    void* var_90 = tensorAdd(var_82, var_89); 
-    void* var_91 = tensorRelu(var_90); 
-    void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); 
-    void* var_94 = tensorGemmGPU(var_92, dense_1_w); 
-    void* var_95 = tensorAdd(var_94, dense_1_b); 
-    void* var_96 = tensorSoftmax(var_95); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_96); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_3 = tensorAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorRelu(var_3);
+    void *var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_7 = tensorAdd(var_6, conv2d_2_b);
+    void *var_8 = tensorRelu(var_7);
+    void *var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_11 = tensorAdd(var_10, conv2d_3_b);
+    void *var_12 = tensorAdd(var_4, var_11);
+    void *var_13 = tensorRelu(var_12);
+    void *var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_16 = tensorAdd(var_15, conv2d_4_b);
+    void *var_17 = tensorRelu(var_16);
+    void *var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_20 = tensorAdd(var_19, conv2d_5_b);
+    void *var_21 = tensorAdd(var_13, var_20);
+    void *var_22 = tensorRelu(var_21);
+    void *var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorAdd(var_24, conv2d_6_b);
+    void *var_26 = tensorRelu(var_25);
+    void *var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorAdd(var_28, conv2d_7_b);
+    void *var_30 = tensorAdd(var_22, var_29);
+    void *var_31 = tensorRelu(var_30);
+    void *var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0);
+    void *var_34 = tensorAdd(var_33, conv2d_8_b);
+    void *var_35 = tensorRelu(var_34);
+    void *var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_38 = tensorAdd(var_37, conv2d_9_b);
+    void *var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0);
+    void *var_41 = tensorAdd(var_40, conv2d_10_b);
+    void *var_42 = tensorAdd(var_41, var_38);
+    void *var_43 = tensorRelu(var_42);
+    void *var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_46 = tensorAdd(var_45, conv2d_11_b);
+    void *var_47 = tensorRelu(var_46);
+    void *var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_50 = tensorAdd(var_49, conv2d_12_b);
+    void *var_51 = tensorAdd(var_43, var_50);
+    void *var_52 = tensorRelu(var_51);
+    void *var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_55 = tensorAdd(var_54, conv2d_13_b);
+    void *var_56 = tensorRelu(var_55);
+    void *var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0);
+    void *var_59 = tensorAdd(var_58, conv2d_14_b);
+    void *var_60 = tensorAdd(var_52, var_59);
+    void *var_61 = tensorRelu(var_60);
+    void *var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0);
+    void *var_64 = tensorAdd(var_63, conv2d_15_b);
+    void *var_65 = tensorRelu(var_64);
+    void *var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0);
+    void *var_68 = tensorAdd(var_67, conv2d_16_b);
+    void *var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0);
+    void *var_71 = tensorAdd(var_70, conv2d_17_b);
+    void *var_72 = tensorAdd(var_71, var_68);
+    void *var_73 = tensorRelu(var_72);
+    void *var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0);
+    void *var_76 = tensorAdd(var_75, conv2d_18_b);
+    void *var_77 = tensorRelu(var_76);
+    void *var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0);
+    void *var_80 = tensorAdd(var_79, conv2d_19_b);
+    void *var_81 = tensorAdd(var_73, var_80);
+    void *var_82 = tensorRelu(var_81);
+    void *var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0);
+    void *var_85 = tensorAdd(var_84, conv2d_20_b);
+    void *var_86 = tensorRelu(var_85);
+    void *var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0);
+    void *var_89 = tensorAdd(var_88, conv2d_21_b);
+    void *var_90 = tensorAdd(var_82, var_89);
+    void *var_91 = tensorRelu(var_90);
+    void *var_92 = tensorPooling(var_91, 1, 8, 8, 0, 0, 8, 8);
+    void *var_94 = tensorGemmGPU(var_92, dense_1_w);
+    void *var_95 = tensorAdd(var_94, dense_1_b);
+    void *var_96 = tensorSoftmax(var_95);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_96);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
@@ -213,9 +256,7 @@ int main(){
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
 
-  
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
index 0914b3f70c..1329c0b9b8 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc
@@ -1,924 +1,1557 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
 
+int main() {
 
-int main(){ 
+  llvm_hpvm_initTensorRt(0);
 
-  llvm_hpvm_initTensorRt(0); 
+  std::string dir_prefix =
+      model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 7, 7);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_gamma_path =
+      dir_prefix + std::string("batch_normalization_1_gamma.bin");
+  void *batch_normalization_1_gamma = readTrainedWeights(
+      batch_normalization_1_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_beta_path =
+      dir_prefix + std::string("batch_normalization_1_beta.bin");
+  void *batch_normalization_1_beta = readTrainedWeights(
+      batch_normalization_1_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_mean_path =
+      dir_prefix + std::string("batch_normalization_1_mean.bin");
+  void *batch_normalization_1_mean = readTrainedWeights(
+      batch_normalization_1_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_1_variance_path =
+      dir_prefix + std::string("batch_normalization_1_variance.bin");
+  void *batch_normalization_1_variance = readTrainedWeights(
+      batch_normalization_1_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 1, 1);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_gamma_path =
+      dir_prefix + std::string("batch_normalization_2_gamma.bin");
+  void *batch_normalization_2_gamma = readTrainedWeights(
+      batch_normalization_2_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_beta_path =
+      dir_prefix + std::string("batch_normalization_2_beta.bin");
+  void *batch_normalization_2_beta = readTrainedWeights(
+      batch_normalization_2_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_mean_path =
+      dir_prefix + std::string("batch_normalization_2_mean.bin");
+  void *batch_normalization_2_mean = readTrainedWeights(
+      batch_normalization_2_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_2_variance_path =
+      dir_prefix + std::string("batch_normalization_2_variance.bin");
+  void *batch_normalization_2_variance = readTrainedWeights(
+      batch_normalization_2_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_gamma_path =
+      dir_prefix + std::string("batch_normalization_3_gamma.bin");
+  void *batch_normalization_3_gamma = readTrainedWeights(
+      batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_beta_path =
+      dir_prefix + std::string("batch_normalization_3_beta.bin");
+  void *batch_normalization_3_beta = readTrainedWeights(
+      batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_mean_path =
+      dir_prefix + std::string("batch_normalization_3_mean.bin");
+  void *batch_normalization_3_mean = readTrainedWeights(
+      batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_3_variance_path =
+      dir_prefix + std::string("batch_normalization_3_variance.bin");
+  void *batch_normalization_3_variance = readTrainedWeights(
+      batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_gamma_path =
+      dir_prefix + std::string("batch_normalization_4_gamma.bin");
+  void *batch_normalization_4_gamma = readTrainedWeights(
+      batch_normalization_4_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_beta_path =
+      dir_prefix + std::string("batch_normalization_4_beta.bin");
+  void *batch_normalization_4_beta = readTrainedWeights(
+      batch_normalization_4_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_mean_path =
+      dir_prefix + std::string("batch_normalization_4_mean.bin");
+  void *batch_normalization_4_mean = readTrainedWeights(
+      batch_normalization_4_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_4_variance_path =
+      dir_prefix + std::string("batch_normalization_4_variance.bin");
+  void *batch_normalization_4_variance = readTrainedWeights(
+      batch_normalization_4_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_gamma_path =
+      dir_prefix + std::string("batch_normalization_5_gamma.bin");
+  void *batch_normalization_5_gamma = readTrainedWeights(
+      batch_normalization_5_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_beta_path =
+      dir_prefix + std::string("batch_normalization_5_beta.bin");
+  void *batch_normalization_5_beta = readTrainedWeights(
+      batch_normalization_5_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_mean_path =
+      dir_prefix + std::string("batch_normalization_5_mean.bin");
+  void *batch_normalization_5_mean = readTrainedWeights(
+      batch_normalization_5_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_5_variance_path =
+      dir_prefix + std::string("batch_normalization_5_variance.bin");
+  void *batch_normalization_5_variance = readTrainedWeights(
+      batch_normalization_5_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 64, 256, 1, 1);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_gamma_path =
+      dir_prefix + std::string("batch_normalization_6_gamma.bin");
+  void *batch_normalization_6_gamma = readTrainedWeights(
+      batch_normalization_6_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_beta_path =
+      dir_prefix + std::string("batch_normalization_6_beta.bin");
+  void *batch_normalization_6_beta = readTrainedWeights(
+      batch_normalization_6_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_mean_path =
+      dir_prefix + std::string("batch_normalization_6_mean.bin");
+  void *batch_normalization_6_mean = readTrainedWeights(
+      batch_normalization_6_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_6_variance_path =
+      dir_prefix + std::string("batch_normalization_6_variance.bin");
+  void *batch_normalization_6_variance = readTrainedWeights(
+      batch_normalization_6_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_gamma_path =
+      dir_prefix + std::string("batch_normalization_7_gamma.bin");
+  void *batch_normalization_7_gamma = readTrainedWeights(
+      batch_normalization_7_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_beta_path =
+      dir_prefix + std::string("batch_normalization_7_beta.bin");
+  void *batch_normalization_7_beta = readTrainedWeights(
+      batch_normalization_7_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_mean_path =
+      dir_prefix + std::string("batch_normalization_7_mean.bin");
+  void *batch_normalization_7_mean = readTrainedWeights(
+      batch_normalization_7_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_7_variance_path =
+      dir_prefix + std::string("batch_normalization_7_variance.bin");
+  void *batch_normalization_7_variance = readTrainedWeights(
+      batch_normalization_7_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_gamma_path =
+      dir_prefix + std::string("batch_normalization_8_gamma.bin");
+  void *batch_normalization_8_gamma = readTrainedWeights(
+      batch_normalization_8_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_beta_path =
+      dir_prefix + std::string("batch_normalization_8_beta.bin");
+  void *batch_normalization_8_beta = readTrainedWeights(
+      batch_normalization_8_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_mean_path =
+      dir_prefix + std::string("batch_normalization_8_mean.bin");
+  void *batch_normalization_8_mean = readTrainedWeights(
+      batch_normalization_8_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_8_variance_path =
+      dir_prefix + std::string("batch_normalization_8_variance.bin");
+  void *batch_normalization_8_variance = readTrainedWeights(
+      batch_normalization_8_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 64, 256, 1, 1);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_gamma_path =
+      dir_prefix + std::string("batch_normalization_9_gamma.bin");
+  void *batch_normalization_9_gamma = readTrainedWeights(
+      batch_normalization_9_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_beta_path =
+      dir_prefix + std::string("batch_normalization_9_beta.bin");
+  void *batch_normalization_9_beta = readTrainedWeights(
+      batch_normalization_9_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_mean_path =
+      dir_prefix + std::string("batch_normalization_9_mean.bin");
+  void *batch_normalization_9_mean = readTrainedWeights(
+      batch_normalization_9_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_9_variance_path =
+      dir_prefix + std::string("batch_normalization_9_variance.bin");
+  void *batch_normalization_9_variance = readTrainedWeights(
+      batch_normalization_9_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_gamma_path =
+      dir_prefix + std::string("batch_normalization_10_gamma.bin");
+  void *batch_normalization_10_gamma = readTrainedWeights(
+      batch_normalization_10_gamma_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_beta_path =
+      dir_prefix + std::string("batch_normalization_10_beta.bin");
+  void *batch_normalization_10_beta = readTrainedWeights(
+      batch_normalization_10_beta_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_mean_path =
+      dir_prefix + std::string("batch_normalization_10_mean.bin");
+  void *batch_normalization_10_mean = readTrainedWeights(
+      batch_normalization_10_mean_path.c_str(), 0, 1, 64, 1, 1);
+  std::string batch_normalization_10_variance_path =
+      dir_prefix + std::string("batch_normalization_10_variance.bin");
+  void *batch_normalization_10_variance = readTrainedWeights(
+      batch_normalization_10_variance_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 256, 64, 1, 1);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_gamma_path =
+      dir_prefix + std::string("batch_normalization_11_gamma.bin");
+  void *batch_normalization_11_gamma = readTrainedWeights(
+      batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_beta_path =
+      dir_prefix + std::string("batch_normalization_11_beta.bin");
+  void *batch_normalization_11_beta = readTrainedWeights(
+      batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_mean_path =
+      dir_prefix + std::string("batch_normalization_11_mean.bin");
+  void *batch_normalization_11_mean = readTrainedWeights(
+      batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_11_variance_path =
+      dir_prefix + std::string("batch_normalization_11_variance.bin");
+  void *batch_normalization_11_variance = readTrainedWeights(
+      batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 128, 256, 1, 1);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_gamma_path =
+      dir_prefix + std::string("batch_normalization_12_gamma.bin");
+  void *batch_normalization_12_gamma = readTrainedWeights(
+      batch_normalization_12_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_beta_path =
+      dir_prefix + std::string("batch_normalization_12_beta.bin");
+  void *batch_normalization_12_beta = readTrainedWeights(
+      batch_normalization_12_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_mean_path =
+      dir_prefix + std::string("batch_normalization_12_mean.bin");
+  void *batch_normalization_12_mean = readTrainedWeights(
+      batch_normalization_12_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_12_variance_path =
+      dir_prefix + std::string("batch_normalization_12_variance.bin");
+  void *batch_normalization_12_variance = readTrainedWeights(
+      batch_normalization_12_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_gamma_path =
+      dir_prefix + std::string("batch_normalization_13_gamma.bin");
+  void *batch_normalization_13_gamma = readTrainedWeights(
+      batch_normalization_13_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_beta_path =
+      dir_prefix + std::string("batch_normalization_13_beta.bin");
+  void *batch_normalization_13_beta = readTrainedWeights(
+      batch_normalization_13_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_mean_path =
+      dir_prefix + std::string("batch_normalization_13_mean.bin");
+  void *batch_normalization_13_mean = readTrainedWeights(
+      batch_normalization_13_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_13_variance_path =
+      dir_prefix + std::string("batch_normalization_13_variance.bin");
+  void *batch_normalization_13_variance = readTrainedWeights(
+      batch_normalization_13_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin");
+  void *conv2d_14_w =
+      readTrainedWeights(conv2d_14_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin");
+  void *conv2d_14_b =
+      readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin");
+  void *conv2d_15_w =
+      readTrainedWeights(conv2d_15_w_path.c_str(), 0, 512, 256, 1, 1);
+  std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin");
+  void *conv2d_15_b =
+      readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_gamma_path =
+      dir_prefix + std::string("batch_normalization_14_gamma.bin");
+  void *batch_normalization_14_gamma = readTrainedWeights(
+      batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_beta_path =
+      dir_prefix + std::string("batch_normalization_14_beta.bin");
+  void *batch_normalization_14_beta = readTrainedWeights(
+      batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_mean_path =
+      dir_prefix + std::string("batch_normalization_14_mean.bin");
+  void *batch_normalization_14_mean = readTrainedWeights(
+      batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_14_variance_path =
+      dir_prefix + std::string("batch_normalization_14_variance.bin");
+  void *batch_normalization_14_variance = readTrainedWeights(
+      batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_gamma_path =
+      dir_prefix + std::string("batch_normalization_15_gamma.bin");
+  void *batch_normalization_15_gamma = readTrainedWeights(
+      batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_beta_path =
+      dir_prefix + std::string("batch_normalization_15_beta.bin");
+  void *batch_normalization_15_beta = readTrainedWeights(
+      batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_mean_path =
+      dir_prefix + std::string("batch_normalization_15_mean.bin");
+  void *batch_normalization_15_mean = readTrainedWeights(
+      batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_15_variance_path =
+      dir_prefix + std::string("batch_normalization_15_variance.bin");
+  void *batch_normalization_15_variance = readTrainedWeights(
+      batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin");
+  void *conv2d_16_w =
+      readTrainedWeights(conv2d_16_w_path.c_str(), 0, 128, 512, 1, 1);
+  std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin");
+  void *conv2d_16_b =
+      readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_gamma_path =
+      dir_prefix + std::string("batch_normalization_16_gamma.bin");
+  void *batch_normalization_16_gamma = readTrainedWeights(
+      batch_normalization_16_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_beta_path =
+      dir_prefix + std::string("batch_normalization_16_beta.bin");
+  void *batch_normalization_16_beta = readTrainedWeights(
+      batch_normalization_16_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_mean_path =
+      dir_prefix + std::string("batch_normalization_16_mean.bin");
+  void *batch_normalization_16_mean = readTrainedWeights(
+      batch_normalization_16_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_16_variance_path =
+      dir_prefix + std::string("batch_normalization_16_variance.bin");
+  void *batch_normalization_16_variance = readTrainedWeights(
+      batch_normalization_16_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin");
+  void *conv2d_17_w =
+      readTrainedWeights(conv2d_17_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin");
+  void *conv2d_17_b =
+      readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_gamma_path =
+      dir_prefix + std::string("batch_normalization_17_gamma.bin");
+  void *batch_normalization_17_gamma = readTrainedWeights(
+      batch_normalization_17_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_beta_path =
+      dir_prefix + std::string("batch_normalization_17_beta.bin");
+  void *batch_normalization_17_beta = readTrainedWeights(
+      batch_normalization_17_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_mean_path =
+      dir_prefix + std::string("batch_normalization_17_mean.bin");
+  void *batch_normalization_17_mean = readTrainedWeights(
+      batch_normalization_17_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_17_variance_path =
+      dir_prefix + std::string("batch_normalization_17_variance.bin");
+  void *batch_normalization_17_variance = readTrainedWeights(
+      batch_normalization_17_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin");
+  void *conv2d_18_w =
+      readTrainedWeights(conv2d_18_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin");
+  void *conv2d_18_b =
+      readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_gamma_path =
+      dir_prefix + std::string("batch_normalization_18_gamma.bin");
+  void *batch_normalization_18_gamma = readTrainedWeights(
+      batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_beta_path =
+      dir_prefix + std::string("batch_normalization_18_beta.bin");
+  void *batch_normalization_18_beta = readTrainedWeights(
+      batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_mean_path =
+      dir_prefix + std::string("batch_normalization_18_mean.bin");
+  void *batch_normalization_18_mean = readTrainedWeights(
+      batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_18_variance_path =
+      dir_prefix + std::string("batch_normalization_18_variance.bin");
+  void *batch_normalization_18_variance = readTrainedWeights(
+      batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin");
+  void *conv2d_19_w =
+      readTrainedWeights(conv2d_19_w_path.c_str(), 0, 128, 512, 1, 1);
+  std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin");
+  void *conv2d_19_b =
+      readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_gamma_path =
+      dir_prefix + std::string("batch_normalization_19_gamma.bin");
+  void *batch_normalization_19_gamma = readTrainedWeights(
+      batch_normalization_19_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_beta_path =
+      dir_prefix + std::string("batch_normalization_19_beta.bin");
+  void *batch_normalization_19_beta = readTrainedWeights(
+      batch_normalization_19_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_mean_path =
+      dir_prefix + std::string("batch_normalization_19_mean.bin");
+  void *batch_normalization_19_mean = readTrainedWeights(
+      batch_normalization_19_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_19_variance_path =
+      dir_prefix + std::string("batch_normalization_19_variance.bin");
+  void *batch_normalization_19_variance = readTrainedWeights(
+      batch_normalization_19_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin");
+  void *conv2d_20_w =
+      readTrainedWeights(conv2d_20_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin");
+  void *conv2d_20_b =
+      readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_gamma_path =
+      dir_prefix + std::string("batch_normalization_20_gamma.bin");
+  void *batch_normalization_20_gamma = readTrainedWeights(
+      batch_normalization_20_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_beta_path =
+      dir_prefix + std::string("batch_normalization_20_beta.bin");
+  void *batch_normalization_20_beta = readTrainedWeights(
+      batch_normalization_20_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_mean_path =
+      dir_prefix + std::string("batch_normalization_20_mean.bin");
+  void *batch_normalization_20_mean = readTrainedWeights(
+      batch_normalization_20_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_20_variance_path =
+      dir_prefix + std::string("batch_normalization_20_variance.bin");
+  void *batch_normalization_20_variance = readTrainedWeights(
+      batch_normalization_20_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin");
+  void *conv2d_21_w =
+      readTrainedWeights(conv2d_21_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin");
+  void *conv2d_21_b =
+      readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_gamma_path =
+      dir_prefix + std::string("batch_normalization_21_gamma.bin");
+  void *batch_normalization_21_gamma = readTrainedWeights(
+      batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_beta_path =
+      dir_prefix + std::string("batch_normalization_21_beta.bin");
+  void *batch_normalization_21_beta = readTrainedWeights(
+      batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_mean_path =
+      dir_prefix + std::string("batch_normalization_21_mean.bin");
+  void *batch_normalization_21_mean = readTrainedWeights(
+      batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_21_variance_path =
+      dir_prefix + std::string("batch_normalization_21_variance.bin");
+  void *batch_normalization_21_variance = readTrainedWeights(
+      batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_22_w_path = dir_prefix + std::string("conv2d_22_w.bin");
+  void *conv2d_22_w =
+      readTrainedWeights(conv2d_22_w_path.c_str(), 0, 128, 512, 1, 1);
+  std::string conv2d_22_b_path = dir_prefix + std::string("conv2d_22_b.bin");
+  void *conv2d_22_b =
+      readTrainedWeights(conv2d_22_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_gamma_path =
+      dir_prefix + std::string("batch_normalization_22_gamma.bin");
+  void *batch_normalization_22_gamma = readTrainedWeights(
+      batch_normalization_22_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_beta_path =
+      dir_prefix + std::string("batch_normalization_22_beta.bin");
+  void *batch_normalization_22_beta = readTrainedWeights(
+      batch_normalization_22_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_mean_path =
+      dir_prefix + std::string("batch_normalization_22_mean.bin");
+  void *batch_normalization_22_mean = readTrainedWeights(
+      batch_normalization_22_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_22_variance_path =
+      dir_prefix + std::string("batch_normalization_22_variance.bin");
+  void *batch_normalization_22_variance = readTrainedWeights(
+      batch_normalization_22_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_23_w_path = dir_prefix + std::string("conv2d_23_w.bin");
+  void *conv2d_23_w =
+      readTrainedWeights(conv2d_23_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_23_b_path = dir_prefix + std::string("conv2d_23_b.bin");
+  void *conv2d_23_b =
+      readTrainedWeights(conv2d_23_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_gamma_path =
+      dir_prefix + std::string("batch_normalization_23_gamma.bin");
+  void *batch_normalization_23_gamma = readTrainedWeights(
+      batch_normalization_23_gamma_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_beta_path =
+      dir_prefix + std::string("batch_normalization_23_beta.bin");
+  void *batch_normalization_23_beta = readTrainedWeights(
+      batch_normalization_23_beta_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_mean_path =
+      dir_prefix + std::string("batch_normalization_23_mean.bin");
+  void *batch_normalization_23_mean = readTrainedWeights(
+      batch_normalization_23_mean_path.c_str(), 0, 1, 128, 1, 1);
+  std::string batch_normalization_23_variance_path =
+      dir_prefix + std::string("batch_normalization_23_variance.bin");
+  void *batch_normalization_23_variance = readTrainedWeights(
+      batch_normalization_23_variance_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_24_w_path = dir_prefix + std::string("conv2d_24_w.bin");
+  void *conv2d_24_w =
+      readTrainedWeights(conv2d_24_w_path.c_str(), 0, 512, 128, 1, 1);
+  std::string conv2d_24_b_path = dir_prefix + std::string("conv2d_24_b.bin");
+  void *conv2d_24_b =
+      readTrainedWeights(conv2d_24_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_gamma_path =
+      dir_prefix + std::string("batch_normalization_24_gamma.bin");
+  void *batch_normalization_24_gamma = readTrainedWeights(
+      batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_beta_path =
+      dir_prefix + std::string("batch_normalization_24_beta.bin");
+  void *batch_normalization_24_beta = readTrainedWeights(
+      batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_mean_path =
+      dir_prefix + std::string("batch_normalization_24_mean.bin");
+  void *batch_normalization_24_mean = readTrainedWeights(
+      batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_24_variance_path =
+      dir_prefix + std::string("batch_normalization_24_variance.bin");
+  void *batch_normalization_24_variance = readTrainedWeights(
+      batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_25_w_path = dir_prefix + std::string("conv2d_25_w.bin");
+  void *conv2d_25_w =
+      readTrainedWeights(conv2d_25_w_path.c_str(), 0, 256, 512, 1, 1);
+  std::string conv2d_25_b_path = dir_prefix + std::string("conv2d_25_b.bin");
+  void *conv2d_25_b =
+      readTrainedWeights(conv2d_25_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_gamma_path =
+      dir_prefix + std::string("batch_normalization_25_gamma.bin");
+  void *batch_normalization_25_gamma = readTrainedWeights(
+      batch_normalization_25_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_beta_path =
+      dir_prefix + std::string("batch_normalization_25_beta.bin");
+  void *batch_normalization_25_beta = readTrainedWeights(
+      batch_normalization_25_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_mean_path =
+      dir_prefix + std::string("batch_normalization_25_mean.bin");
+  void *batch_normalization_25_mean = readTrainedWeights(
+      batch_normalization_25_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_25_variance_path =
+      dir_prefix + std::string("batch_normalization_25_variance.bin");
+  void *batch_normalization_25_variance = readTrainedWeights(
+      batch_normalization_25_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_26_w_path = dir_prefix + std::string("conv2d_26_w.bin");
+  void *conv2d_26_w =
+      readTrainedWeights(conv2d_26_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_26_b_path = dir_prefix + std::string("conv2d_26_b.bin");
+  void *conv2d_26_b =
+      readTrainedWeights(conv2d_26_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_gamma_path =
+      dir_prefix + std::string("batch_normalization_26_gamma.bin");
+  void *batch_normalization_26_gamma = readTrainedWeights(
+      batch_normalization_26_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_beta_path =
+      dir_prefix + std::string("batch_normalization_26_beta.bin");
+  void *batch_normalization_26_beta = readTrainedWeights(
+      batch_normalization_26_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_mean_path =
+      dir_prefix + std::string("batch_normalization_26_mean.bin");
+  void *batch_normalization_26_mean = readTrainedWeights(
+      batch_normalization_26_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_26_variance_path =
+      dir_prefix + std::string("batch_normalization_26_variance.bin");
+  void *batch_normalization_26_variance = readTrainedWeights(
+      batch_normalization_26_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_27_w_path = dir_prefix + std::string("conv2d_27_w.bin");
+  void *conv2d_27_w =
+      readTrainedWeights(conv2d_27_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_27_b_path = dir_prefix + std::string("conv2d_27_b.bin");
+  void *conv2d_27_b =
+      readTrainedWeights(conv2d_27_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_28_w_path = dir_prefix + std::string("conv2d_28_w.bin");
+  void *conv2d_28_w =
+      readTrainedWeights(conv2d_28_w_path.c_str(), 0, 1024, 512, 1, 1);
+  std::string conv2d_28_b_path = dir_prefix + std::string("conv2d_28_b.bin");
+  void *conv2d_28_b =
+      readTrainedWeights(conv2d_28_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_gamma_path =
+      dir_prefix + std::string("batch_normalization_27_gamma.bin");
+  void *batch_normalization_27_gamma = readTrainedWeights(
+      batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_beta_path =
+      dir_prefix + std::string("batch_normalization_27_beta.bin");
+  void *batch_normalization_27_beta = readTrainedWeights(
+      batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_mean_path =
+      dir_prefix + std::string("batch_normalization_27_mean.bin");
+  void *batch_normalization_27_mean = readTrainedWeights(
+      batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_27_variance_path =
+      dir_prefix + std::string("batch_normalization_27_variance.bin");
+  void *batch_normalization_27_variance = readTrainedWeights(
+      batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_gamma_path =
+      dir_prefix + std::string("batch_normalization_28_gamma.bin");
+  void *batch_normalization_28_gamma = readTrainedWeights(
+      batch_normalization_28_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_beta_path =
+      dir_prefix + std::string("batch_normalization_28_beta.bin");
+  void *batch_normalization_28_beta = readTrainedWeights(
+      batch_normalization_28_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_mean_path =
+      dir_prefix + std::string("batch_normalization_28_mean.bin");
+  void *batch_normalization_28_mean = readTrainedWeights(
+      batch_normalization_28_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_28_variance_path =
+      dir_prefix + std::string("batch_normalization_28_variance.bin");
+  void *batch_normalization_28_variance = readTrainedWeights(
+      batch_normalization_28_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_29_w_path = dir_prefix + std::string("conv2d_29_w.bin");
+  void *conv2d_29_w =
+      readTrainedWeights(conv2d_29_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_29_b_path = dir_prefix + std::string("conv2d_29_b.bin");
+  void *conv2d_29_b =
+      readTrainedWeights(conv2d_29_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_gamma_path =
+      dir_prefix + std::string("batch_normalization_29_gamma.bin");
+  void *batch_normalization_29_gamma = readTrainedWeights(
+      batch_normalization_29_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_beta_path =
+      dir_prefix + std::string("batch_normalization_29_beta.bin");
+  void *batch_normalization_29_beta = readTrainedWeights(
+      batch_normalization_29_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_mean_path =
+      dir_prefix + std::string("batch_normalization_29_mean.bin");
+  void *batch_normalization_29_mean = readTrainedWeights(
+      batch_normalization_29_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_29_variance_path =
+      dir_prefix + std::string("batch_normalization_29_variance.bin");
+  void *batch_normalization_29_variance = readTrainedWeights(
+      batch_normalization_29_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_30_w_path = dir_prefix + std::string("conv2d_30_w.bin");
+  void *conv2d_30_w =
+      readTrainedWeights(conv2d_30_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_30_b_path = dir_prefix + std::string("conv2d_30_b.bin");
+  void *conv2d_30_b =
+      readTrainedWeights(conv2d_30_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_gamma_path =
+      dir_prefix + std::string("batch_normalization_30_gamma.bin");
+  void *batch_normalization_30_gamma = readTrainedWeights(
+      batch_normalization_30_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_beta_path =
+      dir_prefix + std::string("batch_normalization_30_beta.bin");
+  void *batch_normalization_30_beta = readTrainedWeights(
+      batch_normalization_30_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_mean_path =
+      dir_prefix + std::string("batch_normalization_30_mean.bin");
+  void *batch_normalization_30_mean = readTrainedWeights(
+      batch_normalization_30_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_30_variance_path =
+      dir_prefix + std::string("batch_normalization_30_variance.bin");
+  void *batch_normalization_30_variance = readTrainedWeights(
+      batch_normalization_30_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_31_w_path = dir_prefix + std::string("conv2d_31_w.bin");
+  void *conv2d_31_w =
+      readTrainedWeights(conv2d_31_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_31_b_path = dir_prefix + std::string("conv2d_31_b.bin");
+  void *conv2d_31_b =
+      readTrainedWeights(conv2d_31_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_gamma_path =
+      dir_prefix + std::string("batch_normalization_31_gamma.bin");
+  void *batch_normalization_31_gamma = readTrainedWeights(
+      batch_normalization_31_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_beta_path =
+      dir_prefix + std::string("batch_normalization_31_beta.bin");
+  void *batch_normalization_31_beta = readTrainedWeights(
+      batch_normalization_31_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_mean_path =
+      dir_prefix + std::string("batch_normalization_31_mean.bin");
+  void *batch_normalization_31_mean = readTrainedWeights(
+      batch_normalization_31_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_31_variance_path =
+      dir_prefix + std::string("batch_normalization_31_variance.bin");
+  void *batch_normalization_31_variance = readTrainedWeights(
+      batch_normalization_31_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_32_w_path = dir_prefix + std::string("conv2d_32_w.bin");
+  void *conv2d_32_w =
+      readTrainedWeights(conv2d_32_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_32_b_path = dir_prefix + std::string("conv2d_32_b.bin");
+  void *conv2d_32_b =
+      readTrainedWeights(conv2d_32_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_gamma_path =
+      dir_prefix + std::string("batch_normalization_32_gamma.bin");
+  void *batch_normalization_32_gamma = readTrainedWeights(
+      batch_normalization_32_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_beta_path =
+      dir_prefix + std::string("batch_normalization_32_beta.bin");
+  void *batch_normalization_32_beta = readTrainedWeights(
+      batch_normalization_32_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_mean_path =
+      dir_prefix + std::string("batch_normalization_32_mean.bin");
+  void *batch_normalization_32_mean = readTrainedWeights(
+      batch_normalization_32_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_32_variance_path =
+      dir_prefix + std::string("batch_normalization_32_variance.bin");
+  void *batch_normalization_32_variance = readTrainedWeights(
+      batch_normalization_32_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_33_w_path = dir_prefix + std::string("conv2d_33_w.bin");
+  void *conv2d_33_w =
+      readTrainedWeights(conv2d_33_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_33_b_path = dir_prefix + std::string("conv2d_33_b.bin");
+  void *conv2d_33_b =
+      readTrainedWeights(conv2d_33_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_gamma_path =
+      dir_prefix + std::string("batch_normalization_33_gamma.bin");
+  void *batch_normalization_33_gamma = readTrainedWeights(
+      batch_normalization_33_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_beta_path =
+      dir_prefix + std::string("batch_normalization_33_beta.bin");
+  void *batch_normalization_33_beta = readTrainedWeights(
+      batch_normalization_33_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_mean_path =
+      dir_prefix + std::string("batch_normalization_33_mean.bin");
+  void *batch_normalization_33_mean = readTrainedWeights(
+      batch_normalization_33_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_33_variance_path =
+      dir_prefix + std::string("batch_normalization_33_variance.bin");
+  void *batch_normalization_33_variance = readTrainedWeights(
+      batch_normalization_33_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_34_w_path = dir_prefix + std::string("conv2d_34_w.bin");
+  void *conv2d_34_w =
+      readTrainedWeights(conv2d_34_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_34_b_path = dir_prefix + std::string("conv2d_34_b.bin");
+  void *conv2d_34_b =
+      readTrainedWeights(conv2d_34_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_gamma_path =
+      dir_prefix + std::string("batch_normalization_34_gamma.bin");
+  void *batch_normalization_34_gamma = readTrainedWeights(
+      batch_normalization_34_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_beta_path =
+      dir_prefix + std::string("batch_normalization_34_beta.bin");
+  void *batch_normalization_34_beta = readTrainedWeights(
+      batch_normalization_34_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_mean_path =
+      dir_prefix + std::string("batch_normalization_34_mean.bin");
+  void *batch_normalization_34_mean = readTrainedWeights(
+      batch_normalization_34_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_34_variance_path =
+      dir_prefix + std::string("batch_normalization_34_variance.bin");
+  void *batch_normalization_34_variance = readTrainedWeights(
+      batch_normalization_34_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_35_w_path = dir_prefix + std::string("conv2d_35_w.bin");
+  void *conv2d_35_w =
+      readTrainedWeights(conv2d_35_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_35_b_path = dir_prefix + std::string("conv2d_35_b.bin");
+  void *conv2d_35_b =
+      readTrainedWeights(conv2d_35_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_gamma_path =
+      dir_prefix + std::string("batch_normalization_35_gamma.bin");
+  void *batch_normalization_35_gamma = readTrainedWeights(
+      batch_normalization_35_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_beta_path =
+      dir_prefix + std::string("batch_normalization_35_beta.bin");
+  void *batch_normalization_35_beta = readTrainedWeights(
+      batch_normalization_35_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_mean_path =
+      dir_prefix + std::string("batch_normalization_35_mean.bin");
+  void *batch_normalization_35_mean = readTrainedWeights(
+      batch_normalization_35_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_35_variance_path =
+      dir_prefix + std::string("batch_normalization_35_variance.bin");
+  void *batch_normalization_35_variance = readTrainedWeights(
+      batch_normalization_35_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_36_w_path = dir_prefix + std::string("conv2d_36_w.bin");
+  void *conv2d_36_w =
+      readTrainedWeights(conv2d_36_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_36_b_path = dir_prefix + std::string("conv2d_36_b.bin");
+  void *conv2d_36_b =
+      readTrainedWeights(conv2d_36_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_gamma_path =
+      dir_prefix + std::string("batch_normalization_36_gamma.bin");
+  void *batch_normalization_36_gamma = readTrainedWeights(
+      batch_normalization_36_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_beta_path =
+      dir_prefix + std::string("batch_normalization_36_beta.bin");
+  void *batch_normalization_36_beta = readTrainedWeights(
+      batch_normalization_36_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_mean_path =
+      dir_prefix + std::string("batch_normalization_36_mean.bin");
+  void *batch_normalization_36_mean = readTrainedWeights(
+      batch_normalization_36_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_36_variance_path =
+      dir_prefix + std::string("batch_normalization_36_variance.bin");
+  void *batch_normalization_36_variance = readTrainedWeights(
+      batch_normalization_36_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_37_w_path = dir_prefix + std::string("conv2d_37_w.bin");
+  void *conv2d_37_w =
+      readTrainedWeights(conv2d_37_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_37_b_path = dir_prefix + std::string("conv2d_37_b.bin");
+  void *conv2d_37_b =
+      readTrainedWeights(conv2d_37_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_gamma_path =
+      dir_prefix + std::string("batch_normalization_37_gamma.bin");
+  void *batch_normalization_37_gamma = readTrainedWeights(
+      batch_normalization_37_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_beta_path =
+      dir_prefix + std::string("batch_normalization_37_beta.bin");
+  void *batch_normalization_37_beta = readTrainedWeights(
+      batch_normalization_37_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_mean_path =
+      dir_prefix + std::string("batch_normalization_37_mean.bin");
+  void *batch_normalization_37_mean = readTrainedWeights(
+      batch_normalization_37_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_37_variance_path =
+      dir_prefix + std::string("batch_normalization_37_variance.bin");
+  void *batch_normalization_37_variance = readTrainedWeights(
+      batch_normalization_37_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_38_w_path = dir_prefix + std::string("conv2d_38_w.bin");
+  void *conv2d_38_w =
+      readTrainedWeights(conv2d_38_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_38_b_path = dir_prefix + std::string("conv2d_38_b.bin");
+  void *conv2d_38_b =
+      readTrainedWeights(conv2d_38_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_gamma_path =
+      dir_prefix + std::string("batch_normalization_38_gamma.bin");
+  void *batch_normalization_38_gamma = readTrainedWeights(
+      batch_normalization_38_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_beta_path =
+      dir_prefix + std::string("batch_normalization_38_beta.bin");
+  void *batch_normalization_38_beta = readTrainedWeights(
+      batch_normalization_38_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_mean_path =
+      dir_prefix + std::string("batch_normalization_38_mean.bin");
+  void *batch_normalization_38_mean = readTrainedWeights(
+      batch_normalization_38_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_38_variance_path =
+      dir_prefix + std::string("batch_normalization_38_variance.bin");
+  void *batch_normalization_38_variance = readTrainedWeights(
+      batch_normalization_38_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_39_w_path = dir_prefix + std::string("conv2d_39_w.bin");
+  void *conv2d_39_w =
+      readTrainedWeights(conv2d_39_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_39_b_path = dir_prefix + std::string("conv2d_39_b.bin");
+  void *conv2d_39_b =
+      readTrainedWeights(conv2d_39_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_gamma_path =
+      dir_prefix + std::string("batch_normalization_39_gamma.bin");
+  void *batch_normalization_39_gamma = readTrainedWeights(
+      batch_normalization_39_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_beta_path =
+      dir_prefix + std::string("batch_normalization_39_beta.bin");
+  void *batch_normalization_39_beta = readTrainedWeights(
+      batch_normalization_39_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_mean_path =
+      dir_prefix + std::string("batch_normalization_39_mean.bin");
+  void *batch_normalization_39_mean = readTrainedWeights(
+      batch_normalization_39_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_39_variance_path =
+      dir_prefix + std::string("batch_normalization_39_variance.bin");
+  void *batch_normalization_39_variance = readTrainedWeights(
+      batch_normalization_39_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_40_w_path = dir_prefix + std::string("conv2d_40_w.bin");
+  void *conv2d_40_w =
+      readTrainedWeights(conv2d_40_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_40_b_path = dir_prefix + std::string("conv2d_40_b.bin");
+  void *conv2d_40_b =
+      readTrainedWeights(conv2d_40_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_gamma_path =
+      dir_prefix + std::string("batch_normalization_40_gamma.bin");
+  void *batch_normalization_40_gamma = readTrainedWeights(
+      batch_normalization_40_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_beta_path =
+      dir_prefix + std::string("batch_normalization_40_beta.bin");
+  void *batch_normalization_40_beta = readTrainedWeights(
+      batch_normalization_40_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_mean_path =
+      dir_prefix + std::string("batch_normalization_40_mean.bin");
+  void *batch_normalization_40_mean = readTrainedWeights(
+      batch_normalization_40_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_40_variance_path =
+      dir_prefix + std::string("batch_normalization_40_variance.bin");
+  void *batch_normalization_40_variance = readTrainedWeights(
+      batch_normalization_40_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_41_w_path = dir_prefix + std::string("conv2d_41_w.bin");
+  void *conv2d_41_w =
+      readTrainedWeights(conv2d_41_w_path.c_str(), 0, 256, 1024, 1, 1);
+  std::string conv2d_41_b_path = dir_prefix + std::string("conv2d_41_b.bin");
+  void *conv2d_41_b =
+      readTrainedWeights(conv2d_41_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_gamma_path =
+      dir_prefix + std::string("batch_normalization_41_gamma.bin");
+  void *batch_normalization_41_gamma = readTrainedWeights(
+      batch_normalization_41_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_beta_path =
+      dir_prefix + std::string("batch_normalization_41_beta.bin");
+  void *batch_normalization_41_beta = readTrainedWeights(
+      batch_normalization_41_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_mean_path =
+      dir_prefix + std::string("batch_normalization_41_mean.bin");
+  void *batch_normalization_41_mean = readTrainedWeights(
+      batch_normalization_41_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_41_variance_path =
+      dir_prefix + std::string("batch_normalization_41_variance.bin");
+  void *batch_normalization_41_variance = readTrainedWeights(
+      batch_normalization_41_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_42_w_path = dir_prefix + std::string("conv2d_42_w.bin");
+  void *conv2d_42_w =
+      readTrainedWeights(conv2d_42_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_42_b_path = dir_prefix + std::string("conv2d_42_b.bin");
+  void *conv2d_42_b =
+      readTrainedWeights(conv2d_42_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_gamma_path =
+      dir_prefix + std::string("batch_normalization_42_gamma.bin");
+  void *batch_normalization_42_gamma = readTrainedWeights(
+      batch_normalization_42_gamma_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_beta_path =
+      dir_prefix + std::string("batch_normalization_42_beta.bin");
+  void *batch_normalization_42_beta = readTrainedWeights(
+      batch_normalization_42_beta_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_mean_path =
+      dir_prefix + std::string("batch_normalization_42_mean.bin");
+  void *batch_normalization_42_mean = readTrainedWeights(
+      batch_normalization_42_mean_path.c_str(), 0, 1, 256, 1, 1);
+  std::string batch_normalization_42_variance_path =
+      dir_prefix + std::string("batch_normalization_42_variance.bin");
+  void *batch_normalization_42_variance = readTrainedWeights(
+      batch_normalization_42_variance_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_43_w_path = dir_prefix + std::string("conv2d_43_w.bin");
+  void *conv2d_43_w =
+      readTrainedWeights(conv2d_43_w_path.c_str(), 0, 1024, 256, 1, 1);
+  std::string conv2d_43_b_path = dir_prefix + std::string("conv2d_43_b.bin");
+  void *conv2d_43_b =
+      readTrainedWeights(conv2d_43_b_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_gamma_path =
+      dir_prefix + std::string("batch_normalization_43_gamma.bin");
+  void *batch_normalization_43_gamma = readTrainedWeights(
+      batch_normalization_43_gamma_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_beta_path =
+      dir_prefix + std::string("batch_normalization_43_beta.bin");
+  void *batch_normalization_43_beta = readTrainedWeights(
+      batch_normalization_43_beta_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_mean_path =
+      dir_prefix + std::string("batch_normalization_43_mean.bin");
+  void *batch_normalization_43_mean = readTrainedWeights(
+      batch_normalization_43_mean_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string batch_normalization_43_variance_path =
+      dir_prefix + std::string("batch_normalization_43_variance.bin");
+  void *batch_normalization_43_variance = readTrainedWeights(
+      batch_normalization_43_variance_path.c_str(), 0, 1, 1024, 1, 1);
+  std::string conv2d_44_w_path = dir_prefix + std::string("conv2d_44_w.bin");
+  void *conv2d_44_w =
+      readTrainedWeights(conv2d_44_w_path.c_str(), 0, 512, 1024, 1, 1);
+  std::string conv2d_44_b_path = dir_prefix + std::string("conv2d_44_b.bin");
+  void *conv2d_44_b =
+      readTrainedWeights(conv2d_44_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_gamma_path =
+      dir_prefix + std::string("batch_normalization_44_gamma.bin");
+  void *batch_normalization_44_gamma = readTrainedWeights(
+      batch_normalization_44_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_beta_path =
+      dir_prefix + std::string("batch_normalization_44_beta.bin");
+  void *batch_normalization_44_beta = readTrainedWeights(
+      batch_normalization_44_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_mean_path =
+      dir_prefix + std::string("batch_normalization_44_mean.bin");
+  void *batch_normalization_44_mean = readTrainedWeights(
+      batch_normalization_44_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_44_variance_path =
+      dir_prefix + std::string("batch_normalization_44_variance.bin");
+  void *batch_normalization_44_variance = readTrainedWeights(
+      batch_normalization_44_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_45_w_path = dir_prefix + std::string("conv2d_45_w.bin");
+  void *conv2d_45_w =
+      readTrainedWeights(conv2d_45_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_45_b_path = dir_prefix + std::string("conv2d_45_b.bin");
+  void *conv2d_45_b =
+      readTrainedWeights(conv2d_45_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_gamma_path =
+      dir_prefix + std::string("batch_normalization_45_gamma.bin");
+  void *batch_normalization_45_gamma = readTrainedWeights(
+      batch_normalization_45_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_beta_path =
+      dir_prefix + std::string("batch_normalization_45_beta.bin");
+  void *batch_normalization_45_beta = readTrainedWeights(
+      batch_normalization_45_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_mean_path =
+      dir_prefix + std::string("batch_normalization_45_mean.bin");
+  void *batch_normalization_45_mean = readTrainedWeights(
+      batch_normalization_45_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_45_variance_path =
+      dir_prefix + std::string("batch_normalization_45_variance.bin");
+  void *batch_normalization_45_variance = readTrainedWeights(
+      batch_normalization_45_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_46_w_path = dir_prefix + std::string("conv2d_46_w.bin");
+  void *conv2d_46_w =
+      readTrainedWeights(conv2d_46_w_path.c_str(), 0, 2048, 512, 1, 1);
+  std::string conv2d_46_b_path = dir_prefix + std::string("conv2d_46_b.bin");
+  void *conv2d_46_b =
+      readTrainedWeights(conv2d_46_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string conv2d_47_w_path = dir_prefix + std::string("conv2d_47_w.bin");
+  void *conv2d_47_w =
+      readTrainedWeights(conv2d_47_w_path.c_str(), 0, 2048, 1024, 1, 1);
+  std::string conv2d_47_b_path = dir_prefix + std::string("conv2d_47_b.bin");
+  void *conv2d_47_b =
+      readTrainedWeights(conv2d_47_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_gamma_path =
+      dir_prefix + std::string("batch_normalization_46_gamma.bin");
+  void *batch_normalization_46_gamma = readTrainedWeights(
+      batch_normalization_46_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_beta_path =
+      dir_prefix + std::string("batch_normalization_46_beta.bin");
+  void *batch_normalization_46_beta = readTrainedWeights(
+      batch_normalization_46_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_mean_path =
+      dir_prefix + std::string("batch_normalization_46_mean.bin");
+  void *batch_normalization_46_mean = readTrainedWeights(
+      batch_normalization_46_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_46_variance_path =
+      dir_prefix + std::string("batch_normalization_46_variance.bin");
+  void *batch_normalization_46_variance = readTrainedWeights(
+      batch_normalization_46_variance_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_gamma_path =
+      dir_prefix + std::string("batch_normalization_47_gamma.bin");
+  void *batch_normalization_47_gamma = readTrainedWeights(
+      batch_normalization_47_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_beta_path =
+      dir_prefix + std::string("batch_normalization_47_beta.bin");
+  void *batch_normalization_47_beta = readTrainedWeights(
+      batch_normalization_47_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_mean_path =
+      dir_prefix + std::string("batch_normalization_47_mean.bin");
+  void *batch_normalization_47_mean = readTrainedWeights(
+      batch_normalization_47_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_47_variance_path =
+      dir_prefix + std::string("batch_normalization_47_variance.bin");
+  void *batch_normalization_47_variance = readTrainedWeights(
+      batch_normalization_47_variance_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string conv2d_48_w_path = dir_prefix + std::string("conv2d_48_w.bin");
+  void *conv2d_48_w =
+      readTrainedWeights(conv2d_48_w_path.c_str(), 0, 512, 2048, 1, 1);
+  std::string conv2d_48_b_path = dir_prefix + std::string("conv2d_48_b.bin");
+  void *conv2d_48_b =
+      readTrainedWeights(conv2d_48_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_gamma_path =
+      dir_prefix + std::string("batch_normalization_48_gamma.bin");
+  void *batch_normalization_48_gamma = readTrainedWeights(
+      batch_normalization_48_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_beta_path =
+      dir_prefix + std::string("batch_normalization_48_beta.bin");
+  void *batch_normalization_48_beta = readTrainedWeights(
+      batch_normalization_48_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_mean_path =
+      dir_prefix + std::string("batch_normalization_48_mean.bin");
+  void *batch_normalization_48_mean = readTrainedWeights(
+      batch_normalization_48_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_48_variance_path =
+      dir_prefix + std::string("batch_normalization_48_variance.bin");
+  void *batch_normalization_48_variance = readTrainedWeights(
+      batch_normalization_48_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_49_w_path = dir_prefix + std::string("conv2d_49_w.bin");
+  void *conv2d_49_w =
+      readTrainedWeights(conv2d_49_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_49_b_path = dir_prefix + std::string("conv2d_49_b.bin");
+  void *conv2d_49_b =
+      readTrainedWeights(conv2d_49_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_gamma_path =
+      dir_prefix + std::string("batch_normalization_49_gamma.bin");
+  void *batch_normalization_49_gamma = readTrainedWeights(
+      batch_normalization_49_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_beta_path =
+      dir_prefix + std::string("batch_normalization_49_beta.bin");
+  void *batch_normalization_49_beta = readTrainedWeights(
+      batch_normalization_49_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_mean_path =
+      dir_prefix + std::string("batch_normalization_49_mean.bin");
+  void *batch_normalization_49_mean = readTrainedWeights(
+      batch_normalization_49_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_49_variance_path =
+      dir_prefix + std::string("batch_normalization_49_variance.bin");
+  void *batch_normalization_49_variance = readTrainedWeights(
+      batch_normalization_49_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_50_w_path = dir_prefix + std::string("conv2d_50_w.bin");
+  void *conv2d_50_w =
+      readTrainedWeights(conv2d_50_w_path.c_str(), 0, 2048, 512, 1, 1);
+  std::string conv2d_50_b_path = dir_prefix + std::string("conv2d_50_b.bin");
+  void *conv2d_50_b =
+      readTrainedWeights(conv2d_50_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_gamma_path =
+      dir_prefix + std::string("batch_normalization_50_gamma.bin");
+  void *batch_normalization_50_gamma = readTrainedWeights(
+      batch_normalization_50_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_beta_path =
+      dir_prefix + std::string("batch_normalization_50_beta.bin");
+  void *batch_normalization_50_beta = readTrainedWeights(
+      batch_normalization_50_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_mean_path =
+      dir_prefix + std::string("batch_normalization_50_mean.bin");
+  void *batch_normalization_50_mean = readTrainedWeights(
+      batch_normalization_50_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_50_variance_path =
+      dir_prefix + std::string("batch_normalization_50_variance.bin");
+  void *batch_normalization_50_variance = readTrainedWeights(
+      batch_normalization_50_variance_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string conv2d_51_w_path = dir_prefix + std::string("conv2d_51_w.bin");
+  void *conv2d_51_w =
+      readTrainedWeights(conv2d_51_w_path.c_str(), 0, 512, 2048, 1, 1);
+  std::string conv2d_51_b_path = dir_prefix + std::string("conv2d_51_b.bin");
+  void *conv2d_51_b =
+      readTrainedWeights(conv2d_51_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_gamma_path =
+      dir_prefix + std::string("batch_normalization_51_gamma.bin");
+  void *batch_normalization_51_gamma = readTrainedWeights(
+      batch_normalization_51_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_beta_path =
+      dir_prefix + std::string("batch_normalization_51_beta.bin");
+  void *batch_normalization_51_beta = readTrainedWeights(
+      batch_normalization_51_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_mean_path =
+      dir_prefix + std::string("batch_normalization_51_mean.bin");
+  void *batch_normalization_51_mean = readTrainedWeights(
+      batch_normalization_51_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_51_variance_path =
+      dir_prefix + std::string("batch_normalization_51_variance.bin");
+  void *batch_normalization_51_variance = readTrainedWeights(
+      batch_normalization_51_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_52_w_path = dir_prefix + std::string("conv2d_52_w.bin");
+  void *conv2d_52_w =
+      readTrainedWeights(conv2d_52_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_52_b_path = dir_prefix + std::string("conv2d_52_b.bin");
+  void *conv2d_52_b =
+      readTrainedWeights(conv2d_52_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_gamma_path =
+      dir_prefix + std::string("batch_normalization_52_gamma.bin");
+  void *batch_normalization_52_gamma = readTrainedWeights(
+      batch_normalization_52_gamma_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_beta_path =
+      dir_prefix + std::string("batch_normalization_52_beta.bin");
+  void *batch_normalization_52_beta = readTrainedWeights(
+      batch_normalization_52_beta_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_mean_path =
+      dir_prefix + std::string("batch_normalization_52_mean.bin");
+  void *batch_normalization_52_mean = readTrainedWeights(
+      batch_normalization_52_mean_path.c_str(), 0, 1, 512, 1, 1);
+  std::string batch_normalization_52_variance_path =
+      dir_prefix + std::string("batch_normalization_52_variance.bin");
+  void *batch_normalization_52_variance = readTrainedWeights(
+      batch_normalization_52_variance_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_53_w_path = dir_prefix + std::string("conv2d_53_w.bin");
+  void *conv2d_53_w =
+      readTrainedWeights(conv2d_53_w_path.c_str(), 0, 2048, 512, 1, 1);
+  std::string conv2d_53_b_path = dir_prefix + std::string("conv2d_53_b.bin");
+  void *conv2d_53_b =
+      readTrainedWeights(conv2d_53_b_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_gamma_path =
+      dir_prefix + std::string("batch_normalization_53_gamma.bin");
+  void *batch_normalization_53_gamma = readTrainedWeights(
+      batch_normalization_53_gamma_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_beta_path =
+      dir_prefix + std::string("batch_normalization_53_beta.bin");
+  void *batch_normalization_53_beta = readTrainedWeights(
+      batch_normalization_53_beta_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_mean_path =
+      dir_prefix + std::string("batch_normalization_53_mean.bin");
+  void *batch_normalization_53_mean = readTrainedWeights(
+      batch_normalization_53_mean_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string batch_normalization_53_variance_path =
+      dir_prefix + std::string("batch_normalization_53_variance.bin");
+  void *batch_normalization_53_variance = readTrainedWeights(
+      batch_normalization_53_variance_path.c_str(), 0, 1, 2048, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 1000);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b =
+      readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1);
 
+  // Begin tracking tensor allocations made during the inference loop below
+  startMemTracking();
 
-  std::string dir_prefix = model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,7,7); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_gamma_path =  dir_prefix + std::string("batch_normalization_1_gamma.bin"); 
-  void* batch_normalization_1_gamma =  readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_beta_path =  dir_prefix + std::string("batch_normalization_1_beta.bin"); 
-  void* batch_normalization_1_beta =  readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_mean_path =  dir_prefix + std::string("batch_normalization_1_mean.bin"); 
-  void* batch_normalization_1_mean =  readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_1_variance_path =  dir_prefix + std::string("batch_normalization_1_variance.bin"); 
-  void* batch_normalization_1_variance =  readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,1,1); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_gamma_path =  dir_prefix + std::string("batch_normalization_2_gamma.bin"); 
-  void* batch_normalization_2_gamma =  readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_beta_path =  dir_prefix + std::string("batch_normalization_2_beta.bin"); 
-  void* batch_normalization_2_beta =  readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_mean_path =  dir_prefix + std::string("batch_normalization_2_mean.bin"); 
-  void* batch_normalization_2_mean =  readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_2_variance_path =  dir_prefix + std::string("batch_normalization_2_variance.bin"); 
-  void* batch_normalization_2_variance =  readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_gamma_path =  dir_prefix + std::string("batch_normalization_3_gamma.bin"); 
-  void* batch_normalization_3_gamma =  readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_beta_path =  dir_prefix + std::string("batch_normalization_3_beta.bin"); 
-  void* batch_normalization_3_beta =  readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_mean_path =  dir_prefix + std::string("batch_normalization_3_mean.bin"); 
-  void* batch_normalization_3_mean =  readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_3_variance_path =  dir_prefix + std::string("batch_normalization_3_variance.bin"); 
-  void* batch_normalization_3_variance =  readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_gamma_path =  dir_prefix + std::string("batch_normalization_4_gamma.bin"); 
-  void* batch_normalization_4_gamma =  readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_beta_path =  dir_prefix + std::string("batch_normalization_4_beta.bin"); 
-  void* batch_normalization_4_beta =  readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_mean_path =  dir_prefix + std::string("batch_normalization_4_mean.bin"); 
-  void* batch_normalization_4_mean =  readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_4_variance_path =  dir_prefix + std::string("batch_normalization_4_variance.bin"); 
-  void* batch_normalization_4_variance =  readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_gamma_path =  dir_prefix + std::string("batch_normalization_5_gamma.bin"); 
-  void* batch_normalization_5_gamma =  readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_beta_path =  dir_prefix + std::string("batch_normalization_5_beta.bin"); 
-  void* batch_normalization_5_beta =  readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_mean_path =  dir_prefix + std::string("batch_normalization_5_mean.bin"); 
-  void* batch_normalization_5_mean =  readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_5_variance_path =  dir_prefix + std::string("batch_normalization_5_variance.bin"); 
-  void* batch_normalization_5_variance =  readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,64,256,1,1); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_gamma_path =  dir_prefix + std::string("batch_normalization_6_gamma.bin"); 
-  void* batch_normalization_6_gamma =  readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_beta_path =  dir_prefix + std::string("batch_normalization_6_beta.bin"); 
-  void* batch_normalization_6_beta =  readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_mean_path =  dir_prefix + std::string("batch_normalization_6_mean.bin"); 
-  void* batch_normalization_6_mean =  readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_6_variance_path =  dir_prefix + std::string("batch_normalization_6_variance.bin"); 
-  void* batch_normalization_6_variance =  readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_gamma_path =  dir_prefix + std::string("batch_normalization_7_gamma.bin"); 
-  void* batch_normalization_7_gamma =  readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_beta_path =  dir_prefix + std::string("batch_normalization_7_beta.bin"); 
-  void* batch_normalization_7_beta =  readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_mean_path =  dir_prefix + std::string("batch_normalization_7_mean.bin"); 
-  void* batch_normalization_7_mean =  readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_7_variance_path =  dir_prefix + std::string("batch_normalization_7_variance.bin"); 
-  void* batch_normalization_7_variance =  readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_gamma_path =  dir_prefix + std::string("batch_normalization_8_gamma.bin"); 
-  void* batch_normalization_8_gamma =  readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_beta_path =  dir_prefix + std::string("batch_normalization_8_beta.bin"); 
-  void* batch_normalization_8_beta =  readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_mean_path =  dir_prefix + std::string("batch_normalization_8_mean.bin"); 
-  void* batch_normalization_8_mean =  readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_8_variance_path =  dir_prefix + std::string("batch_normalization_8_variance.bin"); 
-  void* batch_normalization_8_variance =  readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,64,256,1,1); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_gamma_path =  dir_prefix + std::string("batch_normalization_9_gamma.bin"); 
-  void* batch_normalization_9_gamma =  readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_beta_path =  dir_prefix + std::string("batch_normalization_9_beta.bin"); 
-  void* batch_normalization_9_beta =  readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_mean_path =  dir_prefix + std::string("batch_normalization_9_mean.bin"); 
-  void* batch_normalization_9_mean =  readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_9_variance_path =  dir_prefix + std::string("batch_normalization_9_variance.bin"); 
-  void* batch_normalization_9_variance =  readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_gamma_path =  dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
-  void* batch_normalization_10_gamma =  readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_beta_path =  dir_prefix + std::string("batch_normalization_10_beta.bin"); 
-  void* batch_normalization_10_beta =  readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_mean_path =  dir_prefix + std::string("batch_normalization_10_mean.bin"); 
-  void* batch_normalization_10_mean =  readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,64,1,1); 
-  std::string batch_normalization_10_variance_path =  dir_prefix + std::string("batch_normalization_10_variance.bin"); 
-  void* batch_normalization_10_variance =  readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,256,64,1,1); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_gamma_path =  dir_prefix + std::string("batch_normalization_11_gamma.bin"); 
-  void* batch_normalization_11_gamma =  readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_beta_path =  dir_prefix + std::string("batch_normalization_11_beta.bin"); 
-  void* batch_normalization_11_beta =  readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_mean_path =  dir_prefix + std::string("batch_normalization_11_mean.bin"); 
-  void* batch_normalization_11_mean =  readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_11_variance_path =  dir_prefix + std::string("batch_normalization_11_variance.bin"); 
-  void* batch_normalization_11_variance =  readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,128,256,1,1); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_gamma_path =  dir_prefix + std::string("batch_normalization_12_gamma.bin"); 
-  void* batch_normalization_12_gamma =  readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_beta_path =  dir_prefix + std::string("batch_normalization_12_beta.bin"); 
-  void* batch_normalization_12_beta =  readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_mean_path =  dir_prefix + std::string("batch_normalization_12_mean.bin"); 
-  void* batch_normalization_12_mean =  readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_12_variance_path =  dir_prefix + std::string("batch_normalization_12_variance.bin"); 
-  void* batch_normalization_12_variance =  readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_gamma_path =  dir_prefix + std::string("batch_normalization_13_gamma.bin"); 
-  void* batch_normalization_13_gamma =  readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_beta_path =  dir_prefix + std::string("batch_normalization_13_beta.bin"); 
-  void* batch_normalization_13_beta =  readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_mean_path =  dir_prefix + std::string("batch_normalization_13_mean.bin"); 
-  void* batch_normalization_13_mean =  readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_13_variance_path =  dir_prefix + std::string("batch_normalization_13_variance.bin"); 
-  void* batch_normalization_13_variance =  readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_14_w_path =  dir_prefix + std::string("conv2d_14_w.bin"); 
-  void* conv2d_14_w =  readTrainedWeights(conv2d_14_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_14_b_path =  dir_prefix + std::string("conv2d_14_b.bin"); 
-  void* conv2d_14_b =  readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_15_w_path =  dir_prefix + std::string("conv2d_15_w.bin"); 
-  void* conv2d_15_w =  readTrainedWeights(conv2d_15_w_path.c_str(), 0,512,256,1,1); 
-  std::string conv2d_15_b_path =  dir_prefix + std::string("conv2d_15_b.bin"); 
-  void* conv2d_15_b =  readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_gamma_path =  dir_prefix + std::string("batch_normalization_14_gamma.bin"); 
-  void* batch_normalization_14_gamma =  readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_beta_path =  dir_prefix + std::string("batch_normalization_14_beta.bin"); 
-  void* batch_normalization_14_beta =  readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_mean_path =  dir_prefix + std::string("batch_normalization_14_mean.bin"); 
-  void* batch_normalization_14_mean =  readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_14_variance_path =  dir_prefix + std::string("batch_normalization_14_variance.bin"); 
-  void* batch_normalization_14_variance =  readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_gamma_path =  dir_prefix + std::string("batch_normalization_15_gamma.bin"); 
-  void* batch_normalization_15_gamma =  readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_beta_path =  dir_prefix + std::string("batch_normalization_15_beta.bin"); 
-  void* batch_normalization_15_beta =  readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_mean_path =  dir_prefix + std::string("batch_normalization_15_mean.bin"); 
-  void* batch_normalization_15_mean =  readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_15_variance_path =  dir_prefix + std::string("batch_normalization_15_variance.bin"); 
-  void* batch_normalization_15_variance =  readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_16_w_path =  dir_prefix + std::string("conv2d_16_w.bin"); 
-  void* conv2d_16_w =  readTrainedWeights(conv2d_16_w_path.c_str(), 0,128,512,1,1); 
-  std::string conv2d_16_b_path =  dir_prefix + std::string("conv2d_16_b.bin"); 
-  void* conv2d_16_b =  readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_gamma_path =  dir_prefix + std::string("batch_normalization_16_gamma.bin"); 
-  void* batch_normalization_16_gamma =  readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_beta_path =  dir_prefix + std::string("batch_normalization_16_beta.bin"); 
-  void* batch_normalization_16_beta =  readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_mean_path =  dir_prefix + std::string("batch_normalization_16_mean.bin"); 
-  void* batch_normalization_16_mean =  readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_16_variance_path =  dir_prefix + std::string("batch_normalization_16_variance.bin"); 
-  void* batch_normalization_16_variance =  readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_17_w_path =  dir_prefix + std::string("conv2d_17_w.bin"); 
-  void* conv2d_17_w =  readTrainedWeights(conv2d_17_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_17_b_path =  dir_prefix + std::string("conv2d_17_b.bin"); 
-  void* conv2d_17_b =  readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_gamma_path =  dir_prefix + std::string("batch_normalization_17_gamma.bin"); 
-  void* batch_normalization_17_gamma =  readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_beta_path =  dir_prefix + std::string("batch_normalization_17_beta.bin"); 
-  void* batch_normalization_17_beta =  readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_mean_path =  dir_prefix + std::string("batch_normalization_17_mean.bin"); 
-  void* batch_normalization_17_mean =  readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_17_variance_path =  dir_prefix + std::string("batch_normalization_17_variance.bin"); 
-  void* batch_normalization_17_variance =  readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_18_w_path =  dir_prefix + std::string("conv2d_18_w.bin"); 
-  void* conv2d_18_w =  readTrainedWeights(conv2d_18_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_18_b_path =  dir_prefix + std::string("conv2d_18_b.bin"); 
-  void* conv2d_18_b =  readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_gamma_path =  dir_prefix + std::string("batch_normalization_18_gamma.bin"); 
-  void* batch_normalization_18_gamma =  readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_beta_path =  dir_prefix + std::string("batch_normalization_18_beta.bin"); 
-  void* batch_normalization_18_beta =  readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_mean_path =  dir_prefix + std::string("batch_normalization_18_mean.bin"); 
-  void* batch_normalization_18_mean =  readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_18_variance_path =  dir_prefix + std::string("batch_normalization_18_variance.bin"); 
-  void* batch_normalization_18_variance =  readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_19_w_path =  dir_prefix + std::string("conv2d_19_w.bin"); 
-  void* conv2d_19_w =  readTrainedWeights(conv2d_19_w_path.c_str(), 0,128,512,1,1); 
-  std::string conv2d_19_b_path =  dir_prefix + std::string("conv2d_19_b.bin"); 
-  void* conv2d_19_b =  readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_gamma_path =  dir_prefix + std::string("batch_normalization_19_gamma.bin"); 
-  void* batch_normalization_19_gamma =  readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_beta_path =  dir_prefix + std::string("batch_normalization_19_beta.bin"); 
-  void* batch_normalization_19_beta =  readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_mean_path =  dir_prefix + std::string("batch_normalization_19_mean.bin"); 
-  void* batch_normalization_19_mean =  readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_19_variance_path =  dir_prefix + std::string("batch_normalization_19_variance.bin"); 
-  void* batch_normalization_19_variance =  readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_20_w_path =  dir_prefix + std::string("conv2d_20_w.bin"); 
-  void* conv2d_20_w =  readTrainedWeights(conv2d_20_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_20_b_path =  dir_prefix + std::string("conv2d_20_b.bin"); 
-  void* conv2d_20_b =  readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_gamma_path =  dir_prefix + std::string("batch_normalization_20_gamma.bin"); 
-  void* batch_normalization_20_gamma =  readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_beta_path =  dir_prefix + std::string("batch_normalization_20_beta.bin"); 
-  void* batch_normalization_20_beta =  readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_mean_path =  dir_prefix + std::string("batch_normalization_20_mean.bin"); 
-  void* batch_normalization_20_mean =  readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_20_variance_path =  dir_prefix + std::string("batch_normalization_20_variance.bin"); 
-  void* batch_normalization_20_variance =  readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_21_w_path =  dir_prefix + std::string("conv2d_21_w.bin"); 
-  void* conv2d_21_w =  readTrainedWeights(conv2d_21_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_21_b_path =  dir_prefix + std::string("conv2d_21_b.bin"); 
-  void* conv2d_21_b =  readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_gamma_path =  dir_prefix + std::string("batch_normalization_21_gamma.bin"); 
-  void* batch_normalization_21_gamma =  readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_beta_path =  dir_prefix + std::string("batch_normalization_21_beta.bin"); 
-  void* batch_normalization_21_beta =  readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_mean_path =  dir_prefix + std::string("batch_normalization_21_mean.bin"); 
-  void* batch_normalization_21_mean =  readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_21_variance_path =  dir_prefix + std::string("batch_normalization_21_variance.bin"); 
-  void* batch_normalization_21_variance =  readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_22_w_path =  dir_prefix + std::string("conv2d_22_w.bin"); 
-  void* conv2d_22_w =  readTrainedWeights(conv2d_22_w_path.c_str(), 0,128,512,1,1); 
-  std::string conv2d_22_b_path =  dir_prefix + std::string("conv2d_22_b.bin"); 
-  void* conv2d_22_b =  readTrainedWeights(conv2d_22_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_gamma_path =  dir_prefix + std::string("batch_normalization_22_gamma.bin"); 
-  void* batch_normalization_22_gamma =  readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_beta_path =  dir_prefix + std::string("batch_normalization_22_beta.bin"); 
-  void* batch_normalization_22_beta =  readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_mean_path =  dir_prefix + std::string("batch_normalization_22_mean.bin"); 
-  void* batch_normalization_22_mean =  readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_22_variance_path =  dir_prefix + std::string("batch_normalization_22_variance.bin"); 
-  void* batch_normalization_22_variance =  readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_23_w_path =  dir_prefix + std::string("conv2d_23_w.bin"); 
-  void* conv2d_23_w =  readTrainedWeights(conv2d_23_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_23_b_path =  dir_prefix + std::string("conv2d_23_b.bin"); 
-  void* conv2d_23_b =  readTrainedWeights(conv2d_23_b_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_gamma_path =  dir_prefix + std::string("batch_normalization_23_gamma.bin"); 
-  void* batch_normalization_23_gamma =  readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_beta_path =  dir_prefix + std::string("batch_normalization_23_beta.bin"); 
-  void* batch_normalization_23_beta =  readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_mean_path =  dir_prefix + std::string("batch_normalization_23_mean.bin"); 
-  void* batch_normalization_23_mean =  readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,128,1,1); 
-  std::string batch_normalization_23_variance_path =  dir_prefix + std::string("batch_normalization_23_variance.bin"); 
-  void* batch_normalization_23_variance =  readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_24_w_path =  dir_prefix + std::string("conv2d_24_w.bin"); 
-  void* conv2d_24_w =  readTrainedWeights(conv2d_24_w_path.c_str(), 0,512,128,1,1); 
-  std::string conv2d_24_b_path =  dir_prefix + std::string("conv2d_24_b.bin"); 
-  void* conv2d_24_b =  readTrainedWeights(conv2d_24_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_gamma_path =  dir_prefix + std::string("batch_normalization_24_gamma.bin"); 
-  void* batch_normalization_24_gamma =  readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_beta_path =  dir_prefix + std::string("batch_normalization_24_beta.bin"); 
-  void* batch_normalization_24_beta =  readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_mean_path =  dir_prefix + std::string("batch_normalization_24_mean.bin"); 
-  void* batch_normalization_24_mean =  readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_24_variance_path =  dir_prefix + std::string("batch_normalization_24_variance.bin"); 
-  void* batch_normalization_24_variance =  readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_25_w_path =  dir_prefix + std::string("conv2d_25_w.bin"); 
-  void* conv2d_25_w =  readTrainedWeights(conv2d_25_w_path.c_str(), 0,256,512,1,1); 
-  std::string conv2d_25_b_path =  dir_prefix + std::string("conv2d_25_b.bin"); 
-  void* conv2d_25_b =  readTrainedWeights(conv2d_25_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_gamma_path =  dir_prefix + std::string("batch_normalization_25_gamma.bin"); 
-  void* batch_normalization_25_gamma =  readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_beta_path =  dir_prefix + std::string("batch_normalization_25_beta.bin"); 
-  void* batch_normalization_25_beta =  readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_mean_path =  dir_prefix + std::string("batch_normalization_25_mean.bin"); 
-  void* batch_normalization_25_mean =  readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_25_variance_path =  dir_prefix + std::string("batch_normalization_25_variance.bin"); 
-  void* batch_normalization_25_variance =  readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_26_w_path =  dir_prefix + std::string("conv2d_26_w.bin"); 
-  void* conv2d_26_w =  readTrainedWeights(conv2d_26_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_26_b_path =  dir_prefix + std::string("conv2d_26_b.bin"); 
-  void* conv2d_26_b =  readTrainedWeights(conv2d_26_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_gamma_path =  dir_prefix + std::string("batch_normalization_26_gamma.bin"); 
-  void* batch_normalization_26_gamma =  readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_beta_path =  dir_prefix + std::string("batch_normalization_26_beta.bin"); 
-  void* batch_normalization_26_beta =  readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_mean_path =  dir_prefix + std::string("batch_normalization_26_mean.bin"); 
-  void* batch_normalization_26_mean =  readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_26_variance_path =  dir_prefix + std::string("batch_normalization_26_variance.bin"); 
-  void* batch_normalization_26_variance =  readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_27_w_path =  dir_prefix + std::string("conv2d_27_w.bin"); 
-  void* conv2d_27_w =  readTrainedWeights(conv2d_27_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_27_b_path =  dir_prefix + std::string("conv2d_27_b.bin"); 
-  void* conv2d_27_b =  readTrainedWeights(conv2d_27_b_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_28_w_path =  dir_prefix + std::string("conv2d_28_w.bin"); 
-  void* conv2d_28_w =  readTrainedWeights(conv2d_28_w_path.c_str(), 0,1024,512,1,1); 
-  std::string conv2d_28_b_path =  dir_prefix + std::string("conv2d_28_b.bin"); 
-  void* conv2d_28_b =  readTrainedWeights(conv2d_28_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_gamma_path =  dir_prefix + std::string("batch_normalization_27_gamma.bin"); 
-  void* batch_normalization_27_gamma =  readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_beta_path =  dir_prefix + std::string("batch_normalization_27_beta.bin"); 
-  void* batch_normalization_27_beta =  readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_mean_path =  dir_prefix + std::string("batch_normalization_27_mean.bin"); 
-  void* batch_normalization_27_mean =  readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_27_variance_path =  dir_prefix + std::string("batch_normalization_27_variance.bin"); 
-  void* batch_normalization_27_variance =  readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_gamma_path =  dir_prefix + std::string("batch_normalization_28_gamma.bin"); 
-  void* batch_normalization_28_gamma =  readTrainedWeights(batch_normalization_28_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_beta_path =  dir_prefix + std::string("batch_normalization_28_beta.bin"); 
-  void* batch_normalization_28_beta =  readTrainedWeights(batch_normalization_28_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_mean_path =  dir_prefix + std::string("batch_normalization_28_mean.bin"); 
-  void* batch_normalization_28_mean =  readTrainedWeights(batch_normalization_28_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_28_variance_path =  dir_prefix + std::string("batch_normalization_28_variance.bin"); 
-  void* batch_normalization_28_variance =  readTrainedWeights(batch_normalization_28_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_29_w_path =  dir_prefix + std::string("conv2d_29_w.bin"); 
-  void* conv2d_29_w =  readTrainedWeights(conv2d_29_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_29_b_path =  dir_prefix + std::string("conv2d_29_b.bin"); 
-  void* conv2d_29_b =  readTrainedWeights(conv2d_29_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_gamma_path =  dir_prefix + std::string("batch_normalization_29_gamma.bin"); 
-  void* batch_normalization_29_gamma =  readTrainedWeights(batch_normalization_29_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_beta_path =  dir_prefix + std::string("batch_normalization_29_beta.bin"); 
-  void* batch_normalization_29_beta =  readTrainedWeights(batch_normalization_29_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_mean_path =  dir_prefix + std::string("batch_normalization_29_mean.bin"); 
-  void* batch_normalization_29_mean =  readTrainedWeights(batch_normalization_29_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_29_variance_path =  dir_prefix + std::string("batch_normalization_29_variance.bin"); 
-  void* batch_normalization_29_variance =  readTrainedWeights(batch_normalization_29_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_30_w_path =  dir_prefix + std::string("conv2d_30_w.bin"); 
-  void* conv2d_30_w =  readTrainedWeights(conv2d_30_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_30_b_path =  dir_prefix + std::string("conv2d_30_b.bin"); 
-  void* conv2d_30_b =  readTrainedWeights(conv2d_30_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_gamma_path =  dir_prefix + std::string("batch_normalization_30_gamma.bin"); 
-  void* batch_normalization_30_gamma =  readTrainedWeights(batch_normalization_30_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_beta_path =  dir_prefix + std::string("batch_normalization_30_beta.bin"); 
-  void* batch_normalization_30_beta =  readTrainedWeights(batch_normalization_30_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_mean_path =  dir_prefix + std::string("batch_normalization_30_mean.bin"); 
-  void* batch_normalization_30_mean =  readTrainedWeights(batch_normalization_30_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_30_variance_path =  dir_prefix + std::string("batch_normalization_30_variance.bin"); 
-  void* batch_normalization_30_variance =  readTrainedWeights(batch_normalization_30_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_31_w_path =  dir_prefix + std::string("conv2d_31_w.bin"); 
-  void* conv2d_31_w =  readTrainedWeights(conv2d_31_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_31_b_path =  dir_prefix + std::string("conv2d_31_b.bin"); 
-  void* conv2d_31_b =  readTrainedWeights(conv2d_31_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_gamma_path =  dir_prefix + std::string("batch_normalization_31_gamma.bin"); 
-  void* batch_normalization_31_gamma =  readTrainedWeights(batch_normalization_31_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_beta_path =  dir_prefix + std::string("batch_normalization_31_beta.bin"); 
-  void* batch_normalization_31_beta =  readTrainedWeights(batch_normalization_31_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_mean_path =  dir_prefix + std::string("batch_normalization_31_mean.bin"); 
-  void* batch_normalization_31_mean =  readTrainedWeights(batch_normalization_31_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_31_variance_path =  dir_prefix + std::string("batch_normalization_31_variance.bin"); 
-  void* batch_normalization_31_variance =  readTrainedWeights(batch_normalization_31_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_32_w_path =  dir_prefix + std::string("conv2d_32_w.bin"); 
-  void* conv2d_32_w =  readTrainedWeights(conv2d_32_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_32_b_path =  dir_prefix + std::string("conv2d_32_b.bin"); 
-  void* conv2d_32_b =  readTrainedWeights(conv2d_32_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_gamma_path =  dir_prefix + std::string("batch_normalization_32_gamma.bin"); 
-  void* batch_normalization_32_gamma =  readTrainedWeights(batch_normalization_32_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_beta_path =  dir_prefix + std::string("batch_normalization_32_beta.bin"); 
-  void* batch_normalization_32_beta =  readTrainedWeights(batch_normalization_32_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_mean_path =  dir_prefix + std::string("batch_normalization_32_mean.bin"); 
-  void* batch_normalization_32_mean =  readTrainedWeights(batch_normalization_32_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_32_variance_path =  dir_prefix + std::string("batch_normalization_32_variance.bin"); 
-  void* batch_normalization_32_variance =  readTrainedWeights(batch_normalization_32_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_33_w_path =  dir_prefix + std::string("conv2d_33_w.bin"); 
-  void* conv2d_33_w =  readTrainedWeights(conv2d_33_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_33_b_path =  dir_prefix + std::string("conv2d_33_b.bin"); 
-  void* conv2d_33_b =  readTrainedWeights(conv2d_33_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_gamma_path =  dir_prefix + std::string("batch_normalization_33_gamma.bin"); 
-  void* batch_normalization_33_gamma =  readTrainedWeights(batch_normalization_33_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_beta_path =  dir_prefix + std::string("batch_normalization_33_beta.bin"); 
-  void* batch_normalization_33_beta =  readTrainedWeights(batch_normalization_33_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_mean_path =  dir_prefix + std::string("batch_normalization_33_mean.bin"); 
-  void* batch_normalization_33_mean =  readTrainedWeights(batch_normalization_33_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_33_variance_path =  dir_prefix + std::string("batch_normalization_33_variance.bin"); 
-  void* batch_normalization_33_variance =  readTrainedWeights(batch_normalization_33_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_34_w_path =  dir_prefix + std::string("conv2d_34_w.bin"); 
-  void* conv2d_34_w =  readTrainedWeights(conv2d_34_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_34_b_path =  dir_prefix + std::string("conv2d_34_b.bin"); 
-  void* conv2d_34_b =  readTrainedWeights(conv2d_34_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_gamma_path =  dir_prefix + std::string("batch_normalization_34_gamma.bin"); 
-  void* batch_normalization_34_gamma =  readTrainedWeights(batch_normalization_34_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_beta_path =  dir_prefix + std::string("batch_normalization_34_beta.bin"); 
-  void* batch_normalization_34_beta =  readTrainedWeights(batch_normalization_34_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_mean_path =  dir_prefix + std::string("batch_normalization_34_mean.bin"); 
-  void* batch_normalization_34_mean =  readTrainedWeights(batch_normalization_34_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_34_variance_path =  dir_prefix + std::string("batch_normalization_34_variance.bin"); 
-  void* batch_normalization_34_variance =  readTrainedWeights(batch_normalization_34_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_35_w_path =  dir_prefix + std::string("conv2d_35_w.bin"); 
-  void* conv2d_35_w =  readTrainedWeights(conv2d_35_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_35_b_path =  dir_prefix + std::string("conv2d_35_b.bin"); 
-  void* conv2d_35_b =  readTrainedWeights(conv2d_35_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_gamma_path =  dir_prefix + std::string("batch_normalization_35_gamma.bin"); 
-  void* batch_normalization_35_gamma =  readTrainedWeights(batch_normalization_35_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_beta_path =  dir_prefix + std::string("batch_normalization_35_beta.bin"); 
-  void* batch_normalization_35_beta =  readTrainedWeights(batch_normalization_35_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_mean_path =  dir_prefix + std::string("batch_normalization_35_mean.bin"); 
-  void* batch_normalization_35_mean =  readTrainedWeights(batch_normalization_35_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_35_variance_path =  dir_prefix + std::string("batch_normalization_35_variance.bin"); 
-  void* batch_normalization_35_variance =  readTrainedWeights(batch_normalization_35_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_36_w_path =  dir_prefix + std::string("conv2d_36_w.bin"); 
-  void* conv2d_36_w =  readTrainedWeights(conv2d_36_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_36_b_path =  dir_prefix + std::string("conv2d_36_b.bin"); 
-  void* conv2d_36_b =  readTrainedWeights(conv2d_36_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_gamma_path =  dir_prefix + std::string("batch_normalization_36_gamma.bin"); 
-  void* batch_normalization_36_gamma =  readTrainedWeights(batch_normalization_36_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_beta_path =  dir_prefix + std::string("batch_normalization_36_beta.bin"); 
-  void* batch_normalization_36_beta =  readTrainedWeights(batch_normalization_36_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_mean_path =  dir_prefix + std::string("batch_normalization_36_mean.bin"); 
-  void* batch_normalization_36_mean =  readTrainedWeights(batch_normalization_36_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_36_variance_path =  dir_prefix + std::string("batch_normalization_36_variance.bin"); 
-  void* batch_normalization_36_variance =  readTrainedWeights(batch_normalization_36_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_37_w_path =  dir_prefix + std::string("conv2d_37_w.bin"); 
-  void* conv2d_37_w =  readTrainedWeights(conv2d_37_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_37_b_path =  dir_prefix + std::string("conv2d_37_b.bin"); 
-  void* conv2d_37_b =  readTrainedWeights(conv2d_37_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_gamma_path =  dir_prefix + std::string("batch_normalization_37_gamma.bin"); 
-  void* batch_normalization_37_gamma =  readTrainedWeights(batch_normalization_37_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_beta_path =  dir_prefix + std::string("batch_normalization_37_beta.bin"); 
-  void* batch_normalization_37_beta =  readTrainedWeights(batch_normalization_37_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_mean_path =  dir_prefix + std::string("batch_normalization_37_mean.bin"); 
-  void* batch_normalization_37_mean =  readTrainedWeights(batch_normalization_37_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_37_variance_path =  dir_prefix + std::string("batch_normalization_37_variance.bin"); 
-  void* batch_normalization_37_variance =  readTrainedWeights(batch_normalization_37_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_38_w_path =  dir_prefix + std::string("conv2d_38_w.bin"); 
-  void* conv2d_38_w =  readTrainedWeights(conv2d_38_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_38_b_path =  dir_prefix + std::string("conv2d_38_b.bin"); 
-  void* conv2d_38_b =  readTrainedWeights(conv2d_38_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_gamma_path =  dir_prefix + std::string("batch_normalization_38_gamma.bin"); 
-  void* batch_normalization_38_gamma =  readTrainedWeights(batch_normalization_38_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_beta_path =  dir_prefix + std::string("batch_normalization_38_beta.bin"); 
-  void* batch_normalization_38_beta =  readTrainedWeights(batch_normalization_38_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_mean_path =  dir_prefix + std::string("batch_normalization_38_mean.bin"); 
-  void* batch_normalization_38_mean =  readTrainedWeights(batch_normalization_38_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_38_variance_path =  dir_prefix + std::string("batch_normalization_38_variance.bin"); 
-  void* batch_normalization_38_variance =  readTrainedWeights(batch_normalization_38_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_39_w_path =  dir_prefix + std::string("conv2d_39_w.bin"); 
-  void* conv2d_39_w =  readTrainedWeights(conv2d_39_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_39_b_path =  dir_prefix + std::string("conv2d_39_b.bin"); 
-  void* conv2d_39_b =  readTrainedWeights(conv2d_39_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_gamma_path =  dir_prefix + std::string("batch_normalization_39_gamma.bin"); 
-  void* batch_normalization_39_gamma =  readTrainedWeights(batch_normalization_39_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_beta_path =  dir_prefix + std::string("batch_normalization_39_beta.bin"); 
-  void* batch_normalization_39_beta =  readTrainedWeights(batch_normalization_39_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_mean_path =  dir_prefix + std::string("batch_normalization_39_mean.bin"); 
-  void* batch_normalization_39_mean =  readTrainedWeights(batch_normalization_39_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_39_variance_path =  dir_prefix + std::string("batch_normalization_39_variance.bin"); 
-  void* batch_normalization_39_variance =  readTrainedWeights(batch_normalization_39_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_40_w_path =  dir_prefix + std::string("conv2d_40_w.bin"); 
-  void* conv2d_40_w =  readTrainedWeights(conv2d_40_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_40_b_path =  dir_prefix + std::string("conv2d_40_b.bin"); 
-  void* conv2d_40_b =  readTrainedWeights(conv2d_40_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_gamma_path =  dir_prefix + std::string("batch_normalization_40_gamma.bin"); 
-  void* batch_normalization_40_gamma =  readTrainedWeights(batch_normalization_40_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_beta_path =  dir_prefix + std::string("batch_normalization_40_beta.bin"); 
-  void* batch_normalization_40_beta =  readTrainedWeights(batch_normalization_40_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_mean_path =  dir_prefix + std::string("batch_normalization_40_mean.bin"); 
-  void* batch_normalization_40_mean =  readTrainedWeights(batch_normalization_40_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_40_variance_path =  dir_prefix + std::string("batch_normalization_40_variance.bin"); 
-  void* batch_normalization_40_variance =  readTrainedWeights(batch_normalization_40_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_41_w_path =  dir_prefix + std::string("conv2d_41_w.bin"); 
-  void* conv2d_41_w =  readTrainedWeights(conv2d_41_w_path.c_str(), 0,256,1024,1,1); 
-  std::string conv2d_41_b_path =  dir_prefix + std::string("conv2d_41_b.bin"); 
-  void* conv2d_41_b =  readTrainedWeights(conv2d_41_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_gamma_path =  dir_prefix + std::string("batch_normalization_41_gamma.bin"); 
-  void* batch_normalization_41_gamma =  readTrainedWeights(batch_normalization_41_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_beta_path =  dir_prefix + std::string("batch_normalization_41_beta.bin"); 
-  void* batch_normalization_41_beta =  readTrainedWeights(batch_normalization_41_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_mean_path =  dir_prefix + std::string("batch_normalization_41_mean.bin"); 
-  void* batch_normalization_41_mean =  readTrainedWeights(batch_normalization_41_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_41_variance_path =  dir_prefix + std::string("batch_normalization_41_variance.bin"); 
-  void* batch_normalization_41_variance =  readTrainedWeights(batch_normalization_41_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_42_w_path =  dir_prefix + std::string("conv2d_42_w.bin"); 
-  void* conv2d_42_w =  readTrainedWeights(conv2d_42_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_42_b_path =  dir_prefix + std::string("conv2d_42_b.bin"); 
-  void* conv2d_42_b =  readTrainedWeights(conv2d_42_b_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_gamma_path =  dir_prefix + std::string("batch_normalization_42_gamma.bin"); 
-  void* batch_normalization_42_gamma =  readTrainedWeights(batch_normalization_42_gamma_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_beta_path =  dir_prefix + std::string("batch_normalization_42_beta.bin"); 
-  void* batch_normalization_42_beta =  readTrainedWeights(batch_normalization_42_beta_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_mean_path =  dir_prefix + std::string("batch_normalization_42_mean.bin"); 
-  void* batch_normalization_42_mean =  readTrainedWeights(batch_normalization_42_mean_path.c_str(), 0,1,256,1,1); 
-  std::string batch_normalization_42_variance_path =  dir_prefix + std::string("batch_normalization_42_variance.bin"); 
-  void* batch_normalization_42_variance =  readTrainedWeights(batch_normalization_42_variance_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_43_w_path =  dir_prefix + std::string("conv2d_43_w.bin"); 
-  void* conv2d_43_w =  readTrainedWeights(conv2d_43_w_path.c_str(), 0,1024,256,1,1); 
-  std::string conv2d_43_b_path =  dir_prefix + std::string("conv2d_43_b.bin"); 
-  void* conv2d_43_b =  readTrainedWeights(conv2d_43_b_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_gamma_path =  dir_prefix + std::string("batch_normalization_43_gamma.bin"); 
-  void* batch_normalization_43_gamma =  readTrainedWeights(batch_normalization_43_gamma_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_beta_path =  dir_prefix + std::string("batch_normalization_43_beta.bin"); 
-  void* batch_normalization_43_beta =  readTrainedWeights(batch_normalization_43_beta_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_mean_path =  dir_prefix + std::string("batch_normalization_43_mean.bin"); 
-  void* batch_normalization_43_mean =  readTrainedWeights(batch_normalization_43_mean_path.c_str(), 0,1,1024,1,1); 
-  std::string batch_normalization_43_variance_path =  dir_prefix + std::string("batch_normalization_43_variance.bin"); 
-  void* batch_normalization_43_variance =  readTrainedWeights(batch_normalization_43_variance_path.c_str(), 0,1,1024,1,1); 
-  std::string conv2d_44_w_path =  dir_prefix + std::string("conv2d_44_w.bin"); 
-  void* conv2d_44_w =  readTrainedWeights(conv2d_44_w_path.c_str(), 0,512,1024,1,1); 
-  std::string conv2d_44_b_path =  dir_prefix + std::string("conv2d_44_b.bin"); 
-  void* conv2d_44_b =  readTrainedWeights(conv2d_44_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_gamma_path =  dir_prefix + std::string("batch_normalization_44_gamma.bin"); 
-  void* batch_normalization_44_gamma =  readTrainedWeights(batch_normalization_44_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_beta_path =  dir_prefix + std::string("batch_normalization_44_beta.bin"); 
-  void* batch_normalization_44_beta =  readTrainedWeights(batch_normalization_44_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_mean_path =  dir_prefix + std::string("batch_normalization_44_mean.bin"); 
-  void* batch_normalization_44_mean =  readTrainedWeights(batch_normalization_44_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_44_variance_path =  dir_prefix + std::string("batch_normalization_44_variance.bin"); 
-  void* batch_normalization_44_variance =  readTrainedWeights(batch_normalization_44_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_45_w_path =  dir_prefix + std::string("conv2d_45_w.bin"); 
-  void* conv2d_45_w =  readTrainedWeights(conv2d_45_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_45_b_path =  dir_prefix + std::string("conv2d_45_b.bin"); 
-  void* conv2d_45_b =  readTrainedWeights(conv2d_45_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_gamma_path =  dir_prefix + std::string("batch_normalization_45_gamma.bin"); 
-  void* batch_normalization_45_gamma =  readTrainedWeights(batch_normalization_45_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_beta_path =  dir_prefix + std::string("batch_normalization_45_beta.bin"); 
-  void* batch_normalization_45_beta =  readTrainedWeights(batch_normalization_45_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_mean_path =  dir_prefix + std::string("batch_normalization_45_mean.bin"); 
-  void* batch_normalization_45_mean =  readTrainedWeights(batch_normalization_45_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_45_variance_path =  dir_prefix + std::string("batch_normalization_45_variance.bin"); 
-  void* batch_normalization_45_variance =  readTrainedWeights(batch_normalization_45_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_46_w_path =  dir_prefix + std::string("conv2d_46_w.bin"); 
-  void* conv2d_46_w =  readTrainedWeights(conv2d_46_w_path.c_str(), 0,2048,512,1,1); 
-  std::string conv2d_46_b_path =  dir_prefix + std::string("conv2d_46_b.bin"); 
-  void* conv2d_46_b =  readTrainedWeights(conv2d_46_b_path.c_str(), 0,1,2048,1,1); 
-  std::string conv2d_47_w_path =  dir_prefix + std::string("conv2d_47_w.bin"); 
-  void* conv2d_47_w =  readTrainedWeights(conv2d_47_w_path.c_str(), 0,2048,1024,1,1); 
-  std::string conv2d_47_b_path =  dir_prefix + std::string("conv2d_47_b.bin"); 
-  void* conv2d_47_b =  readTrainedWeights(conv2d_47_b_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_gamma_path =  dir_prefix + std::string("batch_normalization_46_gamma.bin"); 
-  void* batch_normalization_46_gamma =  readTrainedWeights(batch_normalization_46_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_beta_path =  dir_prefix + std::string("batch_normalization_46_beta.bin"); 
-  void* batch_normalization_46_beta =  readTrainedWeights(batch_normalization_46_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_mean_path =  dir_prefix + std::string("batch_normalization_46_mean.bin"); 
-  void* batch_normalization_46_mean =  readTrainedWeights(batch_normalization_46_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_46_variance_path =  dir_prefix + std::string("batch_normalization_46_variance.bin"); 
-  void* batch_normalization_46_variance =  readTrainedWeights(batch_normalization_46_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_gamma_path =  dir_prefix + std::string("batch_normalization_47_gamma.bin"); 
-  void* batch_normalization_47_gamma =  readTrainedWeights(batch_normalization_47_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_beta_path =  dir_prefix + std::string("batch_normalization_47_beta.bin"); 
-  void* batch_normalization_47_beta =  readTrainedWeights(batch_normalization_47_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_mean_path =  dir_prefix + std::string("batch_normalization_47_mean.bin"); 
-  void* batch_normalization_47_mean =  readTrainedWeights(batch_normalization_47_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_47_variance_path =  dir_prefix + std::string("batch_normalization_47_variance.bin"); 
-  void* batch_normalization_47_variance =  readTrainedWeights(batch_normalization_47_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string conv2d_48_w_path =  dir_prefix + std::string("conv2d_48_w.bin"); 
-  void* conv2d_48_w =  readTrainedWeights(conv2d_48_w_path.c_str(), 0,512,2048,1,1); 
-  std::string conv2d_48_b_path =  dir_prefix + std::string("conv2d_48_b.bin"); 
-  void* conv2d_48_b =  readTrainedWeights(conv2d_48_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_gamma_path =  dir_prefix + std::string("batch_normalization_48_gamma.bin"); 
-  void* batch_normalization_48_gamma =  readTrainedWeights(batch_normalization_48_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_beta_path =  dir_prefix + std::string("batch_normalization_48_beta.bin"); 
-  void* batch_normalization_48_beta =  readTrainedWeights(batch_normalization_48_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_mean_path =  dir_prefix + std::string("batch_normalization_48_mean.bin"); 
-  void* batch_normalization_48_mean =  readTrainedWeights(batch_normalization_48_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_48_variance_path =  dir_prefix + std::string("batch_normalization_48_variance.bin"); 
-  void* batch_normalization_48_variance =  readTrainedWeights(batch_normalization_48_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_49_w_path =  dir_prefix + std::string("conv2d_49_w.bin"); 
-  void* conv2d_49_w =  readTrainedWeights(conv2d_49_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_49_b_path =  dir_prefix + std::string("conv2d_49_b.bin"); 
-  void* conv2d_49_b =  readTrainedWeights(conv2d_49_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_gamma_path =  dir_prefix + std::string("batch_normalization_49_gamma.bin"); 
-  void* batch_normalization_49_gamma =  readTrainedWeights(batch_normalization_49_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_beta_path =  dir_prefix + std::string("batch_normalization_49_beta.bin"); 
-  void* batch_normalization_49_beta =  readTrainedWeights(batch_normalization_49_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_mean_path =  dir_prefix + std::string("batch_normalization_49_mean.bin"); 
-  void* batch_normalization_49_mean =  readTrainedWeights(batch_normalization_49_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_49_variance_path =  dir_prefix + std::string("batch_normalization_49_variance.bin"); 
-  void* batch_normalization_49_variance =  readTrainedWeights(batch_normalization_49_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_50_w_path =  dir_prefix + std::string("conv2d_50_w.bin"); 
-  void* conv2d_50_w =  readTrainedWeights(conv2d_50_w_path.c_str(), 0,2048,512,1,1); 
-  std::string conv2d_50_b_path =  dir_prefix + std::string("conv2d_50_b.bin"); 
-  void* conv2d_50_b =  readTrainedWeights(conv2d_50_b_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_gamma_path =  dir_prefix + std::string("batch_normalization_50_gamma.bin"); 
-  void* batch_normalization_50_gamma =  readTrainedWeights(batch_normalization_50_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_beta_path =  dir_prefix + std::string("batch_normalization_50_beta.bin"); 
-  void* batch_normalization_50_beta =  readTrainedWeights(batch_normalization_50_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_mean_path =  dir_prefix + std::string("batch_normalization_50_mean.bin"); 
-  void* batch_normalization_50_mean =  readTrainedWeights(batch_normalization_50_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_50_variance_path =  dir_prefix + std::string("batch_normalization_50_variance.bin"); 
-  void* batch_normalization_50_variance =  readTrainedWeights(batch_normalization_50_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string conv2d_51_w_path =  dir_prefix + std::string("conv2d_51_w.bin"); 
-  void* conv2d_51_w =  readTrainedWeights(conv2d_51_w_path.c_str(), 0,512,2048,1,1); 
-  std::string conv2d_51_b_path =  dir_prefix + std::string("conv2d_51_b.bin"); 
-  void* conv2d_51_b =  readTrainedWeights(conv2d_51_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_gamma_path =  dir_prefix + std::string("batch_normalization_51_gamma.bin"); 
-  void* batch_normalization_51_gamma =  readTrainedWeights(batch_normalization_51_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_beta_path =  dir_prefix + std::string("batch_normalization_51_beta.bin"); 
-  void* batch_normalization_51_beta =  readTrainedWeights(batch_normalization_51_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_mean_path =  dir_prefix + std::string("batch_normalization_51_mean.bin"); 
-  void* batch_normalization_51_mean =  readTrainedWeights(batch_normalization_51_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_51_variance_path =  dir_prefix + std::string("batch_normalization_51_variance.bin"); 
-  void* batch_normalization_51_variance =  readTrainedWeights(batch_normalization_51_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_52_w_path =  dir_prefix + std::string("conv2d_52_w.bin"); 
-  void* conv2d_52_w =  readTrainedWeights(conv2d_52_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_52_b_path =  dir_prefix + std::string("conv2d_52_b.bin"); 
-  void* conv2d_52_b =  readTrainedWeights(conv2d_52_b_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_gamma_path =  dir_prefix + std::string("batch_normalization_52_gamma.bin"); 
-  void* batch_normalization_52_gamma =  readTrainedWeights(batch_normalization_52_gamma_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_beta_path =  dir_prefix + std::string("batch_normalization_52_beta.bin"); 
-  void* batch_normalization_52_beta =  readTrainedWeights(batch_normalization_52_beta_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_mean_path =  dir_prefix + std::string("batch_normalization_52_mean.bin"); 
-  void* batch_normalization_52_mean =  readTrainedWeights(batch_normalization_52_mean_path.c_str(), 0,1,512,1,1); 
-  std::string batch_normalization_52_variance_path =  dir_prefix + std::string("batch_normalization_52_variance.bin"); 
-  void* batch_normalization_52_variance =  readTrainedWeights(batch_normalization_52_variance_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_53_w_path =  dir_prefix + std::string("conv2d_53_w.bin"); 
-  void* conv2d_53_w =  readTrainedWeights(conv2d_53_w_path.c_str(), 0,2048,512,1,1); 
-  std::string conv2d_53_b_path =  dir_prefix + std::string("conv2d_53_b.bin"); 
-  void* conv2d_53_b =  readTrainedWeights(conv2d_53_b_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_gamma_path =  dir_prefix + std::string("batch_normalization_53_gamma.bin"); 
-  void* batch_normalization_53_gamma =  readTrainedWeights(batch_normalization_53_gamma_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_beta_path =  dir_prefix + std::string("batch_normalization_53_beta.bin"); 
-  void* batch_normalization_53_beta =  readTrainedWeights(batch_normalization_53_beta_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_mean_path =  dir_prefix + std::string("batch_normalization_53_mean.bin"); 
-  void* batch_normalization_53_mean =  readTrainedWeights(batch_normalization_53_mean_path.c_str(), 0,1,2048,1,1); 
-  std::string batch_normalization_53_variance_path =  dir_prefix + std::string("batch_normalization_53_variance.bin"); 
-  void* batch_normalization_53_variance =  readTrainedWeights(batch_normalization_53_variance_path.c_str(), 0,1,2048,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,1000); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,1000,1,1); 
+  int test_input_size = 500;
+  int batch_size = 100;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
 
+  for (int i = 0; i < batch_count; i++) {
 
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
-  startMemTracking(); 
+    void *input =
+        readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
 
-  int test_input_size = 500; 
-  int batch_size = 100; 
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
+    void *var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1);
+    void *var_3 = tensorAdd(var_2, conv2d_1_b);
+    void *var_4 = tensorRelu(var_3);
+    void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2);
+    void *var_6 = tensorBatchNorm(
+        var_5, batch_normalization_1_gamma, batch_normalization_1_beta,
+        batch_normalization_1_mean, batch_normalization_1_variance, 0.001);
+    void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1);
+    void *var_8 = tensorAdd(var_7, conv2d_2_b);
+    void *var_9 = tensorBatchNorm(
+        var_8, batch_normalization_2_gamma, batch_normalization_2_beta,
+        batch_normalization_2_mean, batch_normalization_2_variance, 0.001);
+    void *var_10 = tensorRelu(var_9);
+    void *var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1);
+    void *var_12 = tensorAdd(var_11, conv2d_3_b);
+    void *var_13 = tensorBatchNorm(
+        var_12, batch_normalization_3_gamma, batch_normalization_3_beta,
+        batch_normalization_3_mean, batch_normalization_3_variance, 0.001);
+    void *var_14 = tensorRelu(var_13);
+    void *var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1);
+    void *var_16 = tensorAdd(var_15, conv2d_4_b);
+    void *var_17 = tensorBatchNorm(
+        var_16, batch_normalization_4_gamma, batch_normalization_4_beta,
+        batch_normalization_4_mean, batch_normalization_4_variance, 0.001);
+    void *var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1);
+    void *var_19 = tensorAdd(var_18, conv2d_5_b);
+    void *var_20 = tensorBatchNorm(
+        var_19, batch_normalization_5_gamma, batch_normalization_5_beta,
+        batch_normalization_5_mean, batch_normalization_5_variance, 0.001);
+    void *var_21 = tensorAdd(var_17, var_20);
+    void *var_22 = tensorRelu(var_21);
+    void *var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1);
+    void *var_24 = tensorAdd(var_23, conv2d_6_b);
+    void *var_25 = tensorBatchNorm(
+        var_24, batch_normalization_6_gamma, batch_normalization_6_beta,
+        batch_normalization_6_mean, batch_normalization_6_variance, 0.001);
+    void *var_26 = tensorRelu(var_25);
+    void *var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1);
+    void *var_28 = tensorAdd(var_27, conv2d_7_b);
+    void *var_29 = tensorBatchNorm(
+        var_28, batch_normalization_7_gamma, batch_normalization_7_beta,
+        batch_normalization_7_mean, batch_normalization_7_variance, 0.001);
+    void *var_30 = tensorRelu(var_29);
+    void *var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1);
+    void *var_32 = tensorAdd(var_31, conv2d_8_b);
+    void *var_33 = tensorBatchNorm(
+        var_32, batch_normalization_8_gamma, batch_normalization_8_beta,
+        batch_normalization_8_mean, batch_normalization_8_variance, 0.001);
+    void *var_34 = tensorAdd(var_33, var_22);
+    void *var_35 = tensorRelu(var_34);
+    void *var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1);
+    void *var_37 = tensorAdd(var_36, conv2d_9_b);
+    void *var_38 = tensorBatchNorm(
+        var_37, batch_normalization_9_gamma, batch_normalization_9_beta,
+        batch_normalization_9_mean, batch_normalization_9_variance, 0.001);
+    void *var_39 = tensorRelu(var_38);
+    void *var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1);
+    void *var_41 = tensorAdd(var_40, conv2d_10_b);
+    void *var_42 = tensorBatchNorm(
+        var_41, batch_normalization_10_gamma, batch_normalization_10_beta,
+        batch_normalization_10_mean, batch_normalization_10_variance, 0.001);
+    void *var_43 = tensorRelu(var_42);
+    void *var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1);
+    void *var_45 = tensorAdd(var_44, conv2d_11_b);
+    void *var_46 = tensorBatchNorm(
+        var_45, batch_normalization_11_gamma, batch_normalization_11_beta,
+        batch_normalization_11_mean, batch_normalization_11_variance, 0.001);
+    void *var_47 = tensorAdd(var_46, var_35);
+    void *var_48 = tensorRelu(var_47);
+    void *var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1);
+    void *var_50 = tensorAdd(var_49, conv2d_12_b);
+    void *var_51 = tensorBatchNorm(
+        var_50, batch_normalization_12_gamma, batch_normalization_12_beta,
+        batch_normalization_12_mean, batch_normalization_12_variance, 0.001);
+    void *var_52 = tensorRelu(var_51);
+    void *var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1);
+    void *var_54 = tensorAdd(var_53, conv2d_13_b);
+    void *var_55 = tensorBatchNorm(
+        var_54, batch_normalization_13_gamma, batch_normalization_13_beta,
+        batch_normalization_13_mean, batch_normalization_13_variance, 0.001);
+    void *var_56 = tensorRelu(var_55);
+    void *var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1);
+    void *var_58 = tensorAdd(var_57, conv2d_14_b);
+    void *var_59 = tensorBatchNorm(
+        var_58, batch_normalization_14_gamma, batch_normalization_14_beta,
+        batch_normalization_14_mean, batch_normalization_14_variance, 0.001);
+    void *var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1);
+    void *var_61 = tensorAdd(var_60, conv2d_15_b);
+    void *var_62 = tensorBatchNorm(
+        var_61, batch_normalization_15_gamma, batch_normalization_15_beta,
+        batch_normalization_15_mean, batch_normalization_15_variance, 0.001);
+    void *var_63 = tensorAdd(var_59, var_62);
+    void *var_64 = tensorRelu(var_63);
+    void *var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1);
+    void *var_66 = tensorAdd(var_65, conv2d_16_b);
+    void *var_67 = tensorBatchNorm(
+        var_66, batch_normalization_16_gamma, batch_normalization_16_beta,
+        batch_normalization_16_mean, batch_normalization_16_variance, 0.001);
+    void *var_68 = tensorRelu(var_67);
+    void *var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1);
+    void *var_70 = tensorAdd(var_69, conv2d_17_b);
+    void *var_71 = tensorBatchNorm(
+        var_70, batch_normalization_17_gamma, batch_normalization_17_beta,
+        batch_normalization_17_mean, batch_normalization_17_variance, 0.001);
+    void *var_72 = tensorRelu(var_71);
+    void *var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1);
+    void *var_74 = tensorAdd(var_73, conv2d_18_b);
+    void *var_75 = tensorBatchNorm(
+        var_74, batch_normalization_18_gamma, batch_normalization_18_beta,
+        batch_normalization_18_mean, batch_normalization_18_variance, 0.001);
+    void *var_76 = tensorAdd(var_75, var_64);
+    void *var_77 = tensorRelu(var_76);
+    void *var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1);
+    void *var_79 = tensorAdd(var_78, conv2d_19_b);
+    void *var_80 = tensorBatchNorm(
+        var_79, batch_normalization_19_gamma, batch_normalization_19_beta,
+        batch_normalization_19_mean, batch_normalization_19_variance, 0.001);
+    void *var_81 = tensorRelu(var_80);
+    void *var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1);
+    void *var_83 = tensorAdd(var_82, conv2d_20_b);
+    void *var_84 = tensorBatchNorm(
+        var_83, batch_normalization_20_gamma, batch_normalization_20_beta,
+        batch_normalization_20_mean, batch_normalization_20_variance, 0.001);
+    void *var_85 = tensorRelu(var_84);
+    void *var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1);
+    void *var_87 = tensorAdd(var_86, conv2d_21_b);
+    void *var_88 = tensorBatchNorm(
+        var_87, batch_normalization_21_gamma, batch_normalization_21_beta,
+        batch_normalization_21_mean, batch_normalization_21_variance, 0.001);
+    void *var_89 = tensorAdd(var_88, var_77);
+    void *var_90 = tensorRelu(var_89);
+    void *var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1);
+    void *var_92 = tensorAdd(var_91, conv2d_22_b);
+    void *var_93 = tensorBatchNorm(
+        var_92, batch_normalization_22_gamma, batch_normalization_22_beta,
+        batch_normalization_22_mean, batch_normalization_22_variance, 0.001);
+    void *var_94 = tensorRelu(var_93);
+    void *var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1);
+    void *var_96 = tensorAdd(var_95, conv2d_23_b);
+    void *var_97 = tensorBatchNorm(
+        var_96, batch_normalization_23_gamma, batch_normalization_23_beta,
+        batch_normalization_23_mean, batch_normalization_23_variance, 0.001);
+    void *var_98 = tensorRelu(var_97);
+    void *var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1);
+    void *var_100 = tensorAdd(var_99, conv2d_24_b);
+    void *var_101 = tensorBatchNorm(
+        var_100, batch_normalization_24_gamma, batch_normalization_24_beta,
+        batch_normalization_24_mean, batch_normalization_24_variance, 0.001);
+    void *var_102 = tensorAdd(var_101, var_90);
+    void *var_103 = tensorRelu(var_102);
+    void *var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1);
+    void *var_105 = tensorAdd(var_104, conv2d_25_b);
+    void *var_106 = tensorBatchNorm(
+        var_105, batch_normalization_25_gamma, batch_normalization_25_beta,
+        batch_normalization_25_mean, batch_normalization_25_variance, 0.001);
+    void *var_107 = tensorRelu(var_106);
+    void *var_108 = tensorConvolution(var_107, conv2d_26_w, 1, 1, 1, 1, 1, 1);
+    void *var_109 = tensorAdd(var_108, conv2d_26_b);
+    void *var_110 = tensorBatchNorm(
+        var_109, batch_normalization_26_gamma, batch_normalization_26_beta,
+        batch_normalization_26_mean, batch_normalization_26_variance, 0.001);
+    void *var_111 = tensorRelu(var_110);
+    void *var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1);
+    void *var_113 = tensorAdd(var_112, conv2d_27_b);
+    void *var_114 = tensorBatchNorm(
+        var_113, batch_normalization_27_gamma, batch_normalization_27_beta,
+        batch_normalization_27_mean, batch_normalization_27_variance, 0.001);
+    void *var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1);
+    void *var_116 = tensorAdd(var_115, conv2d_28_b);
+    void *var_117 = tensorBatchNorm(
+        var_116, batch_normalization_28_gamma, batch_normalization_28_beta,
+        batch_normalization_28_mean, batch_normalization_28_variance, 0.001);
+    void *var_118 = tensorAdd(var_114, var_117);
+    void *var_119 = tensorRelu(var_118);
+    void *var_120 = tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1);
+    void *var_121 = tensorAdd(var_120, conv2d_29_b);
+    void *var_122 = tensorBatchNorm(
+        var_121, batch_normalization_29_gamma, batch_normalization_29_beta,
+        batch_normalization_29_mean, batch_normalization_29_variance, 0.001);
+    void *var_123 = tensorRelu(var_122);
+    void *var_124 = tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1);
+    void *var_125 = tensorAdd(var_124, conv2d_30_b);
+    void *var_126 = tensorBatchNorm(
+        var_125, batch_normalization_30_gamma, batch_normalization_30_beta,
+        batch_normalization_30_mean, batch_normalization_30_variance, 0.001);
+    void *var_127 = tensorRelu(var_126);
+    void *var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1);
+    void *var_129 = tensorAdd(var_128, conv2d_31_b);
+    void *var_130 = tensorBatchNorm(
+        var_129, batch_normalization_31_gamma, batch_normalization_31_beta,
+        batch_normalization_31_mean, batch_normalization_31_variance, 0.001);
+    void *var_131 = tensorAdd(var_130, var_119);
+    void *var_132 = tensorRelu(var_131);
+    void *var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1);
+    void *var_134 = tensorAdd(var_133, conv2d_32_b);
+    void *var_135 = tensorBatchNorm(
+        var_134, batch_normalization_32_gamma, batch_normalization_32_beta,
+        batch_normalization_32_mean, batch_normalization_32_variance, 0.001);
+    void *var_136 = tensorRelu(var_135);
+    void *var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1);
+    void *var_138 = tensorAdd(var_137, conv2d_33_b);
+    void *var_139 = tensorBatchNorm(
+        var_138, batch_normalization_33_gamma, batch_normalization_33_beta,
+        batch_normalization_33_mean, batch_normalization_33_variance, 0.001);
+    void *var_140 = tensorRelu(var_139);
+    void *var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1);
+    void *var_142 = tensorAdd(var_141, conv2d_34_b);
+    void *var_143 = tensorBatchNorm(
+        var_142, batch_normalization_34_gamma, batch_normalization_34_beta,
+        batch_normalization_34_mean, batch_normalization_34_variance, 0.001);
+    void *var_144 = tensorAdd(var_143, var_132);
+    void *var_145 = tensorRelu(var_144);
+    void *var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1);
+    void *var_147 = tensorAdd(var_146, conv2d_35_b);
+    void *var_148 = tensorBatchNorm(
+        var_147, batch_normalization_35_gamma, batch_normalization_35_beta,
+        batch_normalization_35_mean, batch_normalization_35_variance, 0.001);
+    void *var_149 = tensorRelu(var_148);
+    void *var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1);
+    void *var_151 = tensorAdd(var_150, conv2d_36_b);
+    void *var_152 = tensorBatchNorm(
+        var_151, batch_normalization_36_gamma, batch_normalization_36_beta,
+        batch_normalization_36_mean, batch_normalization_36_variance, 0.001);
+    void *var_153 = tensorRelu(var_152);
+    void *var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1);
+    void *var_155 = tensorAdd(var_154, conv2d_37_b);
+    void *var_156 = tensorBatchNorm(
+        var_155, batch_normalization_37_gamma, batch_normalization_37_beta,
+        batch_normalization_37_mean, batch_normalization_37_variance, 0.001);
+    void *var_157 = tensorAdd(var_156, var_145);
+    void *var_158 = tensorRelu(var_157);
+    void *var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1);
+    void *var_160 = tensorAdd(var_159, conv2d_38_b);
+    void *var_161 = tensorBatchNorm(
+        var_160, batch_normalization_38_gamma, batch_normalization_38_beta,
+        batch_normalization_38_mean, batch_normalization_38_variance, 0.001);
+    void *var_162 = tensorRelu(var_161);
+    void *var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1);
+    void *var_164 = tensorAdd(var_163, conv2d_39_b);
+    void *var_165 = tensorBatchNorm(
+        var_164, batch_normalization_39_gamma, batch_normalization_39_beta,
+        batch_normalization_39_mean, batch_normalization_39_variance, 0.001);
+    void *var_166 = tensorRelu(var_165);
+    void *var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1);
+    void *var_168 = tensorAdd(var_167, conv2d_40_b);
+    void *var_169 = tensorBatchNorm(
+        var_168, batch_normalization_40_gamma, batch_normalization_40_beta,
+        batch_normalization_40_mean, batch_normalization_40_variance, 0.001);
+    void *var_170 = tensorAdd(var_169, var_158);
+    void *var_171 = tensorRelu(var_170);
+    void *var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1);
+    void *var_173 = tensorAdd(var_172, conv2d_41_b);
+    void *var_174 = tensorBatchNorm(
+        var_173, batch_normalization_41_gamma, batch_normalization_41_beta,
+        batch_normalization_41_mean, batch_normalization_41_variance, 0.001);
+    void *var_175 = tensorRelu(var_174);
+    void *var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1);
+    void *var_177 = tensorAdd(var_176, conv2d_42_b);
+    void *var_178 = tensorBatchNorm(
+        var_177, batch_normalization_42_gamma, batch_normalization_42_beta,
+        batch_normalization_42_mean, batch_normalization_42_variance, 0.001);
+    void *var_179 = tensorRelu(var_178);
+    void *var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1);
+    void *var_181 = tensorAdd(var_180, conv2d_43_b);
+    void *var_182 = tensorBatchNorm(
+        var_181, batch_normalization_43_gamma, batch_normalization_43_beta,
+        batch_normalization_43_mean, batch_normalization_43_variance, 0.001);
+    void *var_183 = tensorAdd(var_182, var_171);
+    void *var_184 = tensorRelu(var_183);
+    void *var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1);
+    void *var_186 = tensorAdd(var_185, conv2d_44_b);
+    void *var_187 = tensorBatchNorm(
+        var_186, batch_normalization_44_gamma, batch_normalization_44_beta,
+        batch_normalization_44_mean, batch_normalization_44_variance, 0.001);
+    void *var_188 = tensorRelu(var_187);
+    void *var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1);
+    void *var_190 = tensorAdd(var_189, conv2d_45_b);
+    void *var_191 = tensorBatchNorm(
+        var_190, batch_normalization_45_gamma, batch_normalization_45_beta,
+        batch_normalization_45_mean, batch_normalization_45_variance, 0.001);
+    void *var_192 = tensorRelu(var_191);
+    void *var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1);
+    void *var_194 = tensorAdd(var_193, conv2d_46_b);
+    void *var_195 = tensorBatchNorm(
+        var_194, batch_normalization_46_gamma, batch_normalization_46_beta,
+        batch_normalization_46_mean, batch_normalization_46_variance, 0.001);
+    void *var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1);
+    void *var_197 = tensorAdd(var_196, conv2d_47_b);
+    void *var_198 = tensorBatchNorm(
+        var_197, batch_normalization_47_gamma, batch_normalization_47_beta,
+        batch_normalization_47_mean, batch_normalization_47_variance, 0.001);
+    void *var_199 = tensorAdd(var_195, var_198);
+    void *var_200 = tensorRelu(var_199);
+    void *var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1);
+    void *var_202 = tensorAdd(var_201, conv2d_48_b);
+    void *var_203 = tensorBatchNorm(
+        var_202, batch_normalization_48_gamma, batch_normalization_48_beta,
+        batch_normalization_48_mean, batch_normalization_48_variance, 0.001);
+    void *var_204 = tensorRelu(var_203);
+    void *var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1);
+    void *var_206 = tensorAdd(var_205, conv2d_49_b);
+    void *var_207 = tensorBatchNorm(
+        var_206, batch_normalization_49_gamma, batch_normalization_49_beta,
+        batch_normalization_49_mean, batch_normalization_49_variance, 0.001);
+    void *var_208 = tensorRelu(var_207);
+    void *var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1);
+    void *var_210 = tensorAdd(var_209, conv2d_50_b);
+    void *var_211 = tensorBatchNorm(
+        var_210, batch_normalization_50_gamma, batch_normalization_50_beta,
+        batch_normalization_50_mean, batch_normalization_50_variance, 0.001);
+    void *var_212 = tensorAdd(var_211, var_200);
+    void *var_213 = tensorRelu(var_212);
+    void *var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1);
+    void *var_215 = tensorAdd(var_214, conv2d_51_b);
+    void *var_216 = tensorBatchNorm(
+        var_215, batch_normalization_51_gamma, batch_normalization_51_beta,
+        batch_normalization_51_mean, batch_normalization_51_variance, 0.001);
+    void *var_217 = tensorRelu(var_216);
+    void *var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1);
+    void *var_219 = tensorAdd(var_218, conv2d_52_b);
+    void *var_220 = tensorBatchNorm(
+        var_219, batch_normalization_52_gamma, batch_normalization_52_beta,
+        batch_normalization_52_mean, batch_normalization_52_variance, 0.001);
+    void *var_221 = tensorRelu(var_220);
+    void *var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1);
+    void *var_223 = tensorAdd(var_222, conv2d_53_b);
+    void *var_224 = tensorBatchNorm(
+        var_223, batch_normalization_53_gamma, batch_normalization_53_beta,
+        batch_normalization_53_mean, batch_normalization_53_variance, 0.001);
+    void *var_225 = tensorAdd(var_224, var_213);
+    void *var_226 = tensorRelu(var_225);
+    void *var_227 = tensorPooling(var_226, 1, 7, 7, 0, 0, 7, 7);
+    void *var_229 = tensorGemmGPU(var_227, dense_1_w);
+    void *var_230 = tensorAdd(var_229, dense_1_b);
+    void *var_231 = tensorSoftmax(var_230);
 
-  for(int i = 0; i < batch_count; i++){ 
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
 
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); 
-
-    void* var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1); 
-    void* var_3 = tensorAdd(var_2, conv2d_1_b); 
-    void* var_4 = tensorRelu(var_3); 
-    void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); 
-    void* var_6 = tensorBatchNorm(var_5, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
-    void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-    void* var_8 = tensorAdd(var_7, conv2d_2_b); 
-    void* var_9 = tensorBatchNorm(var_8, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); 
-    void* var_12 = tensorAdd(var_11, conv2d_3_b); 
-    void* var_13 = tensorBatchNorm(var_12, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-    void* var_14 = tensorRelu(var_13); 
-    void* var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-    void* var_16 = tensorAdd(var_15, conv2d_4_b); 
-    void* var_17 = tensorBatchNorm(var_16, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-    void* var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-    void* var_19 = tensorAdd(var_18, conv2d_5_b); 
-    void* var_20 = tensorBatchNorm(var_19, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-    void* var_21 = tensorAdd(var_17, var_20); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-    void* var_24 = tensorAdd(var_23, conv2d_6_b); 
-    void* var_25 = tensorBatchNorm(var_24, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1); 
-    void* var_28 = tensorAdd(var_27, conv2d_7_b); 
-    void* var_29 = tensorBatchNorm(var_28, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-    void* var_32 = tensorAdd(var_31, conv2d_8_b); 
-    void* var_33 = tensorBatchNorm(var_32, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-    void* var_34 = tensorAdd(var_33, var_22); 
-    void* var_35 = tensorRelu(var_34); 
-    void* var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-    void* var_37 = tensorAdd(var_36, conv2d_9_b); 
-    void* var_38 = tensorBatchNorm(var_37, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-    void* var_39 = tensorRelu(var_38); 
-    void* var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1); 
-    void* var_41 = tensorAdd(var_40, conv2d_10_b); 
-    void* var_42 = tensorBatchNorm(var_41, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-    void* var_45 = tensorAdd(var_44, conv2d_11_b); 
-    void* var_46 = tensorBatchNorm(var_45, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-    void* var_47 = tensorAdd(var_46, var_35); 
-    void* var_48 = tensorRelu(var_47); 
-    void* var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1); 
-    void* var_50 = tensorAdd(var_49, conv2d_12_b); 
-    void* var_51 = tensorBatchNorm(var_50, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
-    void* var_52 = tensorRelu(var_51); 
-    void* var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1); 
-    void* var_54 = tensorAdd(var_53, conv2d_13_b); 
-    void* var_55 = tensorBatchNorm(var_54, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-    void* var_58 = tensorAdd(var_57, conv2d_14_b); 
-    void* var_59 = tensorBatchNorm(var_58, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
-    void* var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1); 
-    void* var_61 = tensorAdd(var_60, conv2d_15_b); 
-    void* var_62 = tensorBatchNorm(var_61, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
-    void* var_63 = tensorAdd(var_59, var_62); 
-    void* var_64 = tensorRelu(var_63); 
-    void* var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1); 
-    void* var_66 = tensorAdd(var_65, conv2d_16_b); 
-    void* var_67 = tensorBatchNorm(var_66, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-    void* var_68 = tensorRelu(var_67); 
-    void* var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1); 
-    void* var_70 = tensorAdd(var_69, conv2d_17_b); 
-    void* var_71 = tensorBatchNorm(var_70, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-    void* var_72 = tensorRelu(var_71); 
-    void* var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1); 
-    void* var_74 = tensorAdd(var_73, conv2d_18_b); 
-    void* var_75 = tensorBatchNorm(var_74, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
-    void* var_76 = tensorAdd(var_75, var_64); 
-    void* var_77 = tensorRelu(var_76); 
-    void* var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1); 
-    void* var_79 = tensorAdd(var_78, conv2d_19_b); 
-    void* var_80 = tensorBatchNorm(var_79, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
-    void* var_81 = tensorRelu(var_80); 
-    void* var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1); 
-    void* var_83 = tensorAdd(var_82, conv2d_20_b); 
-    void* var_84 = tensorBatchNorm(var_83, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-    void* var_85 = tensorRelu(var_84); 
-    void* var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1); 
-    void* var_87 = tensorAdd(var_86, conv2d_21_b); 
-    void* var_88 = tensorBatchNorm(var_87, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
-    void* var_89 = tensorAdd(var_88, var_77); 
-    void* var_90 = tensorRelu(var_89); 
-    void* var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1); 
-    void* var_92 = tensorAdd(var_91, conv2d_22_b); 
-    void* var_93 = tensorBatchNorm(var_92, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
-    void* var_94 = tensorRelu(var_93); 
-    void* var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1); 
-    void* var_96 = tensorAdd(var_95, conv2d_23_b); 
-    void* var_97 = tensorBatchNorm(var_96, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-    void* var_98 = tensorRelu(var_97); 
-    void* var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1); 
-    void* var_100 = tensorAdd(var_99, conv2d_24_b); 
-    void* var_101 = tensorBatchNorm(var_100, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-    void* var_102 = tensorAdd(var_101, var_90); 
-    void* var_103 = tensorRelu(var_102); 
-    void* var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1); 
-    void* var_105 = tensorAdd(var_104, conv2d_25_b); 
-    void* var_106 = tensorBatchNorm(var_105, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-    void* var_107 = tensorRelu(var_106); 
-    void* var_108 = tensorConvolution(var_107, conv2d_26_w, 1, 1, 1, 1, 1, 1); 
-    void* var_109 = tensorAdd(var_108, conv2d_26_b); 
-    void* var_110 = tensorBatchNorm(var_109, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-    void* var_111 = tensorRelu(var_110); 
-    void* var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1); 
-    void* var_113 = tensorAdd(var_112, conv2d_27_b); 
-    void* var_114 = tensorBatchNorm(var_113, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-    void* var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1); 
-    void* var_116 = tensorAdd(var_115, conv2d_28_b); 
-    void* var_117 = tensorBatchNorm(var_116, batch_normalization_28_gamma, batch_normalization_28_beta, batch_normalization_28_mean, batch_normalization_28_variance, 0.001); 
-    void* var_118 = tensorAdd(var_114, var_117); 
-    void* var_119 = tensorRelu(var_118); 
-    void* var_120 = tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1); 
-    void* var_121 = tensorAdd(var_120, conv2d_29_b); 
-    void* var_122 = tensorBatchNorm(var_121, batch_normalization_29_gamma, batch_normalization_29_beta, batch_normalization_29_mean, batch_normalization_29_variance, 0.001); 
-    void* var_123 = tensorRelu(var_122); 
-    void* var_124 = tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1); 
-    void* var_125 = tensorAdd(var_124, conv2d_30_b); 
-    void* var_126 = tensorBatchNorm(var_125, batch_normalization_30_gamma, batch_normalization_30_beta, batch_normalization_30_mean, batch_normalization_30_variance, 0.001); 
-    void* var_127 = tensorRelu(var_126); 
-    void* var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1); 
-    void* var_129 = tensorAdd(var_128, conv2d_31_b); 
-    void* var_130 = tensorBatchNorm(var_129, batch_normalization_31_gamma, batch_normalization_31_beta, batch_normalization_31_mean, batch_normalization_31_variance, 0.001); 
-    void* var_131 = tensorAdd(var_130, var_119); 
-    void* var_132 = tensorRelu(var_131); 
-    void* var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1); 
-    void* var_134 = tensorAdd(var_133, conv2d_32_b); 
-    void* var_135 = tensorBatchNorm(var_134, batch_normalization_32_gamma, batch_normalization_32_beta, batch_normalization_32_mean, batch_normalization_32_variance, 0.001); 
-    void* var_136 = tensorRelu(var_135); 
-    void* var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1); 
-    void* var_138 = tensorAdd(var_137, conv2d_33_b); 
-    void* var_139 = tensorBatchNorm(var_138, batch_normalization_33_gamma, batch_normalization_33_beta, batch_normalization_33_mean, batch_normalization_33_variance, 0.001); 
-    void* var_140 = tensorRelu(var_139); 
-    void* var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1); 
-    void* var_142 = tensorAdd(var_141, conv2d_34_b); 
-    void* var_143 = tensorBatchNorm(var_142, batch_normalization_34_gamma, batch_normalization_34_beta, batch_normalization_34_mean, batch_normalization_34_variance, 0.001); 
-    void* var_144 = tensorAdd(var_143, var_132); 
-    void* var_145 = tensorRelu(var_144); 
-    void* var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1); 
-    void* var_147 = tensorAdd(var_146, conv2d_35_b); 
-    void* var_148 = tensorBatchNorm(var_147, batch_normalization_35_gamma, batch_normalization_35_beta, batch_normalization_35_mean, batch_normalization_35_variance, 0.001); 
-    void* var_149 = tensorRelu(var_148); 
-    void* var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1); 
-    void* var_151 = tensorAdd(var_150, conv2d_36_b); 
-    void* var_152 = tensorBatchNorm(var_151, batch_normalization_36_gamma, batch_normalization_36_beta, batch_normalization_36_mean, batch_normalization_36_variance, 0.001); 
-    void* var_153 = tensorRelu(var_152); 
-    void* var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1); 
-    void* var_155 = tensorAdd(var_154, conv2d_37_b); 
-    void* var_156 = tensorBatchNorm(var_155, batch_normalization_37_gamma, batch_normalization_37_beta, batch_normalization_37_mean, batch_normalization_37_variance, 0.001); 
-    void* var_157 = tensorAdd(var_156, var_145); 
-    void* var_158 = tensorRelu(var_157); 
-    void* var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1); 
-    void* var_160 = tensorAdd(var_159, conv2d_38_b); 
-    void* var_161 = tensorBatchNorm(var_160, batch_normalization_38_gamma, batch_normalization_38_beta, batch_normalization_38_mean, batch_normalization_38_variance, 0.001); 
-    void* var_162 = tensorRelu(var_161); 
-    void* var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1); 
-    void* var_164 = tensorAdd(var_163, conv2d_39_b); 
-    void* var_165 = tensorBatchNorm(var_164, batch_normalization_39_gamma, batch_normalization_39_beta, batch_normalization_39_mean, batch_normalization_39_variance, 0.001); 
-    void* var_166 = tensorRelu(var_165); 
-    void* var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1); 
-    void* var_168 = tensorAdd(var_167, conv2d_40_b); 
-    void* var_169 = tensorBatchNorm(var_168, batch_normalization_40_gamma, batch_normalization_40_beta, batch_normalization_40_mean, batch_normalization_40_variance, 0.001); 
-    void* var_170 = tensorAdd(var_169, var_158); 
-    void* var_171 = tensorRelu(var_170); 
-    void* var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1); 
-    void* var_173 = tensorAdd(var_172, conv2d_41_b); 
-    void* var_174 = tensorBatchNorm(var_173, batch_normalization_41_gamma, batch_normalization_41_beta, batch_normalization_41_mean, batch_normalization_41_variance, 0.001); 
-    void* var_175 = tensorRelu(var_174); 
-    void* var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1); 
-    void* var_177 = tensorAdd(var_176, conv2d_42_b); 
-    void* var_178 = tensorBatchNorm(var_177, batch_normalization_42_gamma, batch_normalization_42_beta, batch_normalization_42_mean, batch_normalization_42_variance, 0.001); 
-    void* var_179 = tensorRelu(var_178); 
-    void* var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1); 
-    void* var_181 = tensorAdd(var_180, conv2d_43_b); 
-    void* var_182 = tensorBatchNorm(var_181, batch_normalization_43_gamma, batch_normalization_43_beta, batch_normalization_43_mean, batch_normalization_43_variance, 0.001); 
-    void* var_183 = tensorAdd(var_182, var_171); 
-    void* var_184 = tensorRelu(var_183); 
-    void* var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1); 
-    void* var_186 = tensorAdd(var_185, conv2d_44_b); 
-    void* var_187 = tensorBatchNorm(var_186, batch_normalization_44_gamma, batch_normalization_44_beta, batch_normalization_44_mean, batch_normalization_44_variance, 0.001); 
-    void* var_188 = tensorRelu(var_187); 
-    void* var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1); 
-    void* var_190 = tensorAdd(var_189, conv2d_45_b); 
-    void* var_191 = tensorBatchNorm(var_190, batch_normalization_45_gamma, batch_normalization_45_beta, batch_normalization_45_mean, batch_normalization_45_variance, 0.001); 
-    void* var_192 = tensorRelu(var_191); 
-    void* var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1); 
-    void* var_194 = tensorAdd(var_193, conv2d_46_b); 
-    void* var_195 = tensorBatchNorm(var_194, batch_normalization_46_gamma, batch_normalization_46_beta, batch_normalization_46_mean, batch_normalization_46_variance, 0.001); 
-    void* var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1); 
-    void* var_197 = tensorAdd(var_196, conv2d_47_b); 
-    void* var_198 = tensorBatchNorm(var_197, batch_normalization_47_gamma, batch_normalization_47_beta, batch_normalization_47_mean, batch_normalization_47_variance, 0.001); 
-    void* var_199 = tensorAdd(var_195, var_198); 
-    void* var_200 = tensorRelu(var_199); 
-    void* var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1); 
-    void* var_202 = tensorAdd(var_201, conv2d_48_b); 
-    void* var_203 = tensorBatchNorm(var_202, batch_normalization_48_gamma, batch_normalization_48_beta, batch_normalization_48_mean, batch_normalization_48_variance, 0.001); 
-    void* var_204 = tensorRelu(var_203); 
-    void* var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1); 
-    void* var_206 = tensorAdd(var_205, conv2d_49_b); 
-    void* var_207 = tensorBatchNorm(var_206, batch_normalization_49_gamma, batch_normalization_49_beta, batch_normalization_49_mean, batch_normalization_49_variance, 0.001); 
-    void* var_208 = tensorRelu(var_207); 
-    void* var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1); 
-    void* var_210 = tensorAdd(var_209, conv2d_50_b); 
-    void* var_211 = tensorBatchNorm(var_210, batch_normalization_50_gamma, batch_normalization_50_beta, batch_normalization_50_mean, batch_normalization_50_variance, 0.001); 
-    void* var_212 = tensorAdd(var_211, var_200); 
-    void* var_213 = tensorRelu(var_212); 
-    void* var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1); 
-    void* var_215 = tensorAdd(var_214, conv2d_51_b); 
-    void* var_216 = tensorBatchNorm(var_215, batch_normalization_51_gamma, batch_normalization_51_beta, batch_normalization_51_mean, batch_normalization_51_variance, 0.001); 
-    void* var_217 = tensorRelu(var_216); 
-    void* var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1); 
-    void* var_219 = tensorAdd(var_218, conv2d_52_b); 
-    void* var_220 = tensorBatchNorm(var_219, batch_normalization_52_gamma, batch_normalization_52_beta, batch_normalization_52_mean, batch_normalization_52_variance, 0.001); 
-    void* var_221 = tensorRelu(var_220); 
-    void* var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1); 
-    void* var_223 = tensorAdd(var_222, conv2d_53_b); 
-    void* var_224 = tensorBatchNorm(var_223, batch_normalization_53_gamma, batch_normalization_53_beta, batch_normalization_53_mean, batch_normalization_53_variance, 0.001); 
-    void* var_225 = tensorAdd(var_224, var_213); 
-    void* var_226 = tensorRelu(var_225); 
-    void* var_227 = tensorPooling(var_226,1,7,7,0,0,7,7); 
-    void* var_229 = tensorGemmGPU(var_227, dense_1_w); 
-    void* var_230 = tensorAdd(var_229, dense_1_b); 
-    void* var_231 = tensorSoftmax(var_230); 
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy3(labels, var_231); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
+    float accuracy = computeAccuracy3(labels, var_231);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
-
-
-  llvm_hpvm_cleanupTensorRt(); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
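
The reformatted resnet50_imagenet.cc above repeats one driver pattern: read a batch of inputs, run the generated chain of tensor_runtime calls, score the softmax output against the labels, free the batch tensors, and average the per-batch accuracies. The stand-alone sketch below condenses that pattern; it is not part of this patch, `evaluateInBatches` and the `forward` callback are hypothetical names introduced only for illustration, while `readInputBatch`, `readLabelsBatch3`, `computeAccuracy3`, and `freeBatchMemory` are the same runtime calls used in the generated code above.

    #include <cstdint>
    #include <functional>
    #include <string>

    #include "../../tensor_runtime/include/tensor_runtime.h"
    #include "../include/utils.h"

    // Hypothetical helper (not in the patch): evaluates any forward pass over
    // the 500-image test subset in batches of 100 and returns the averaged
    // accuracy, mirroring the loop body generated in resnet50_imagenet.cc.
    static float evaluateInBatches(const std::string &input_path,
                                   const std::string &labels_path,
                                   const std::function<void *(void *)> &forward) {
      int test_input_size = 500;
      int batch_size = 100;
      int batch_count = test_input_size / batch_size;
      float final_accuracy = 0.0;

      for (int i = 0; i < batch_count; i++) {
        int start = i * batch_size;
        int end = (i + 1) * batch_size;

        // NCHW ImageNet batch: 3 channels, 224 x 224 pixels.
        void *input =
            readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);

        // e.g. the 53-convolution ResNet-50 chain generated above,
        // ending in tensorSoftmax().
        void *probs = forward(input);

        uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
        final_accuracy += computeAccuracy3(labels, probs);

        // Releases the per-batch tensors recorded since startMemTracking().
        freeBatchMemory();
      }

      return final_accuracy / batch_count;
    }

A caller would pass a lambda chaining tensorConvolution, tensorAdd, tensorBatchNorm, and tensorRelu exactly as the generated body does, adding each block's input back in (as in tensorAdd(var_224, var_213)) before the final tensorRelu of the residual unit.
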
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
index a6dc7cbc11..7807cdced2 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc
@@ -1,82 +1,109 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
 
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1);
 
   startMemTracking();
 
@@ -85,77 +112,76 @@ int main(){
   int batch_count = test_input_size / batch_size;
   float final_accuracy = 0.0;
 
-  for(int i = 0; i < batch_count; i++){
+  for (int i = 0; i < batch_count; i++) {
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); 
- 
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorRelu(var_1); 
-    void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_5 = tensorAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_9 = tensorAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_13 = tensorAdd(var_12, conv2d_4_b); 
-    void* var_14 = tensorRelu(var_13); 
-    void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorRelu(var_17); 
-    void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_21 = tensorAdd(var_20, conv2d_6_b); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorAdd(var_24, conv2d_7_b); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); 
-    void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorAdd(var_28, conv2d_8_b); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_33 = tensorAdd(var_32, conv2d_9_b); 
-    void* var_34 = tensorRelu(var_33); 
-    void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-    void* var_37 = tensorAdd(var_36, conv2d_10_b); 
-    void* var_38 = tensorRelu(var_37); 
-    void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); 
-    void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_41 = tensorAdd(var_40, conv2d_11_b); 
-    void* var_42 = tensorRelu(var_41); 
-    void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_45 = tensorAdd(var_44, conv2d_12_b); 
-    void* var_46 = tensorRelu(var_45); 
-    void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_49 = tensorAdd(var_48, conv2d_13_b); 
-    void* var_50 = tensorRelu(var_49); 
-    void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); 
-    void* var_54 = tensorGemmGPU(var_51, dense_1_w); 
-    void* var_55 = tensorAdd(var_54, dense_1_b); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_58 = tensorGemmGPU(var_56, dense_2_w); 
-    void* var_59 = tensorAdd(var_58, dense_2_b); 
-    void* var_60 = tensorSoftmax(var_59); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_60); 
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorRelu(var_1);
+    void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorRelu(var_9);
+    void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorRelu(var_13);
+    void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorRelu(var_17);
+    void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorRelu(var_21);
+    void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorRelu(var_25);
+    void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorRelu(var_29);
+    void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorRelu(var_33);
+    void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorRelu(var_37);
+    void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorRelu(var_41);
+    void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorRelu(var_45);
+    void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorRelu(var_49);
+    void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorAdd(var_54, dense_1_b);
+    void *var_56 = tensorRelu(var_55);
+    void *var_58 = tensorGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60);
     final_accuracy += accuracy;
-    
+
     freeBatchMemory();
   }
 
   final_accuracy = final_accuracy / batch_count;
   dumpFinalAccuracy(final_accuracy);
-  
-  llvm_hpvm_cleanupTensorRt(); 
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
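
Every benchmark driver reformatted in this patch follows the control flow just shown for vgg16_cifar10.cc: initialize the tensor runtime, read the trained weights once, then loop over input batches, run the layer chain, score the batch, and report the average accuracy. The condensed skeleton below is illustrative only and is not part of the patch; it reuses the runtime calls exactly as they appear in the hunk above, the batch sizes are example values, and runNetwork() is a hypothetical stand-in (stubbed here) for the model-specific tensorConvolution/tensorAdd/tensorRelu/.../tensorSoftmax chain.

    #include "../../tensor_runtime/include/tensor_runtime.h"
    #include "../include/utils.h"

    // Hypothetical placeholder for the per-model layer chain; the real drivers
    // inline the full conv/relu/pool/gemm/softmax sequence here and return the
    // softmax output tensor.
    static void *runNetwork(void *input) { return input; }

    int main() {
      llvm_hpvm_initTensorRt(0); // bind to GPU 0, as in the drivers above

      std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/");
      std::string input_path = dir_prefix + std::string("input.bin");
      std::string labels_path = dir_prefix + std::string("labels.bin");

      startMemTracking(); // enables per-batch cleanup via freeBatchMemory()

      int test_input_size = 5000; // example values; each driver hard-codes its own
      int batch_size = 500;
      int batch_count = test_input_size / batch_size;
      float final_accuracy = 0.0;

      for (int i = 0; i < batch_count; i++) {
        int start = i * batch_size;
        int end = (i + 1) * batch_size;

        // NCHW batch of 3x32x32 CIFAR images covering rows [start, end)
        void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
        void *output = runNetwork(input);

        uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
        final_accuracy += computeAccuracy2(labels, batch_size, output);

        freeBatchMemory(); // release tensors allocated for this batch
      }

      final_accuracy = final_accuracy / batch_count;
      dumpFinalAccuracy(final_accuracy);

      llvm_hpvm_cleanupTensorRt();
      return 0;
    }
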
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
index 2539f8d872..22afc20687 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc
@@ -1,161 +1,187 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin");
-  
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); 
-
-
-  startMemTracking(); 
-
-  int test_input_size = 5000; 
-  int batch_size = 5000;  
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
-
-  for(int i = 0; i < batch_count; i++){ 
-
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); 
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorRelu(var_1); 
-    void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); 
-    void* var_5 = tensorAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_9 = tensorAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_13 = tensorAdd(var_12, conv2d_4_b); 
-    void* var_14 = tensorRelu(var_13); 
-    void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorRelu(var_17); 
-    void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); 
-    void* var_21 = tensorAdd(var_20, conv2d_6_b); 
-    void* var_22 = tensorRelu(var_21); 
-    void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); 
-    void* var_25 = tensorAdd(var_24, conv2d_7_b); 
-    void* var_26 = tensorRelu(var_25); 
-    void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); 
-    void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); 
-    void* var_29 = tensorAdd(var_28, conv2d_8_b); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); 
-    void* var_33 = tensorAdd(var_32, conv2d_9_b); 
-    void* var_34 = tensorRelu(var_33); 
-    void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); 
-    void* var_37 = tensorAdd(var_36, conv2d_10_b); 
-    void* var_38 = tensorRelu(var_37); 
-    void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); 
-    void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); 
-    void* var_41 = tensorAdd(var_40, conv2d_11_b); 
-    void* var_42 = tensorRelu(var_41); 
-    void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); 
-    void* var_45 = tensorAdd(var_44, conv2d_12_b); 
-    void* var_46 = tensorRelu(var_45); 
-    void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); 
-    void* var_49 = tensorAdd(var_48, conv2d_13_b); 
-    void* var_50 = tensorRelu(var_49); 
-    void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); 
-    void* var_54 = tensorGemmGPU(var_51, dense_1_w); 
-    void* var_55 = tensorAdd(var_54, dense_1_b); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_58 = tensorGemmGPU(var_56, dense_2_w); 
-    void* var_59 = tensorAdd(var_58, dense_2_b); 
-    void* var_60 = tensorSoftmax(var_59); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 5000;
+  int batch_size = 5000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
+
+    void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
+    void *var_1 = tensorAdd(var_0, conv2d_1_b);
+    void *var_2 = tensorRelu(var_1);
+    void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0);
+    void *var_5 = tensorAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0);
+    void *var_9 = tensorAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorRelu(var_9);
+    void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0);
+    void *var_13 = tensorAdd(var_12, conv2d_4_b);
+    void *var_14 = tensorRelu(var_13);
+    void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2);
+    void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0);
+    void *var_17 = tensorAdd(var_16, conv2d_5_b);
+    void *var_18 = tensorRelu(var_17);
+    void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0);
+    void *var_21 = tensorAdd(var_20, conv2d_6_b);
+    void *var_22 = tensorRelu(var_21);
+    void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0);
+    void *var_25 = tensorAdd(var_24, conv2d_7_b);
+    void *var_26 = tensorRelu(var_25);
+    void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2);
+    void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0);
+    void *var_29 = tensorAdd(var_28, conv2d_8_b);
+    void *var_30 = tensorRelu(var_29);
+    void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0);
+    void *var_33 = tensorAdd(var_32, conv2d_9_b);
+    void *var_34 = tensorRelu(var_33);
+    void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0);
+    void *var_37 = tensorAdd(var_36, conv2d_10_b);
+    void *var_38 = tensorRelu(var_37);
+    void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2);
+    void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0);
+    void *var_41 = tensorAdd(var_40, conv2d_11_b);
+    void *var_42 = tensorRelu(var_41);
+    void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0);
+    void *var_45 = tensorAdd(var_44, conv2d_12_b);
+    void *var_46 = tensorRelu(var_45);
+    void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0);
+    void *var_49 = tensorAdd(var_48, conv2d_13_b);
+    void *var_50 = tensorRelu(var_49);
+    void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2);
+    void *var_54 = tensorGemmGPU(var_51, dense_1_w);
+    void *var_55 = tensorAdd(var_54, dense_1_b);
+    void *var_56 = tensorRelu(var_55);
+    void *var_58 = tensorGemmGPU(var_56, dense_2_w);
+    void *var_59 = tensorAdd(var_58, dense_2_b);
+    void *var_60 = tensorSoftmax(var_59);
+
+    uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy2(labels, batch_size, var_60, 100);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  llvm_hpvm_cleanupTensorRt(); 
+  llvm_hpvm_cleanupTensorRt();
 
-  return 0; 
+  return 0;
 }
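
Aside from the weight files, the only functional differences between this driver and the CIFAR-10 one above are the classifier width (dense_2 maps 512 to 100 instead of 10) and the extra class-count argument passed to computeAccuracy2. For reference, the sketch below shows the arithmetic a top-1 accuracy check over a softmax output conceptually performs; it assumes a row-major (batch_size x num_classes) float layout and is an illustration, not the tensor runtime's actual computeAccuracy2 implementation.

    #include <cstddef>
    #include <cstdint>

    // Illustrative top-1 accuracy over a row-major (batch_size x num_classes)
    // softmax output; not the runtime's computeAccuracy2.
    static float top1Accuracy(const uint8_t *labels, const float *probs,
                              size_t batch_size, size_t num_classes) {
      size_t correct = 0;
      for (size_t n = 0; n < batch_size; n++) {
        const float *row = probs + n * num_classes;
        size_t argmax = 0;
        for (size_t c = 1; c < num_classes; c++)
          if (row[c] > row[argmax])
            argmax = c;
        if (argmax == labels[n])
          correct++;
      }
      return (float)correct / (float)batch_size; // fraction of correct predictions
    }
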
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
index 1d78065c57..0e0a1dfbbc 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc
@@ -1,173 +1,199 @@
 
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "tensor_runtime.h" 
-#include "utils.h" 
-
-
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-
-  std::string dir_prefix = std::string("/home/nvidia/sd_card/vgg16_imagenet_new/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_6_w_path =  dir_prefix + std::string("conv2d_6_w.bin"); 
-  void* conv2d_6_w =  readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_6_b_path =  dir_prefix + std::string("conv2d_6_b.bin"); 
-  void* conv2d_6_b =  readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_7_w_path =  dir_prefix + std::string("conv2d_7_w.bin"); 
-  void* conv2d_7_w =  readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_7_b_path =  dir_prefix + std::string("conv2d_7_b.bin"); 
-  void* conv2d_7_b =  readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_8_w_path =  dir_prefix + std::string("conv2d_8_w.bin"); 
-  void* conv2d_8_w =  readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); 
-  std::string conv2d_8_b_path =  dir_prefix + std::string("conv2d_8_b.bin"); 
-  void* conv2d_8_b =  readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_9_w_path =  dir_prefix + std::string("conv2d_9_w.bin"); 
-  void* conv2d_9_w =  readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_9_b_path =  dir_prefix + std::string("conv2d_9_b.bin"); 
-  void* conv2d_9_b =  readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_10_w_path =  dir_prefix + std::string("conv2d_10_w.bin"); 
-  void* conv2d_10_w =  readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_10_b_path =  dir_prefix + std::string("conv2d_10_b.bin"); 
-  void* conv2d_10_b =  readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_11_w_path =  dir_prefix + std::string("conv2d_11_w.bin"); 
-  void* conv2d_11_w =  readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_11_b_path =  dir_prefix + std::string("conv2d_11_b.bin"); 
-  void* conv2d_11_b =  readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_12_w_path =  dir_prefix + std::string("conv2d_12_w.bin"); 
-  void* conv2d_12_w =  readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_12_b_path =  dir_prefix + std::string("conv2d_12_b.bin"); 
-  void* conv2d_12_b =  readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); 
-  std::string conv2d_13_w_path =  dir_prefix + std::string("conv2d_13_w.bin"); 
-  void* conv2d_13_w =  readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); 
-  std::string conv2d_13_b_path =  dir_prefix + std::string("conv2d_13_b.bin"); 
-  void* conv2d_13_b =  readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,25088,4096); 
-  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
-  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); 
-  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
-  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); 
-  std::string dense_3_w_path =  dir_prefix + std::string("dense_3_w.bin"); 
-  void* dense_3_w =  readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); 
-  std::string dense_3_b_path =  dir_prefix + std::string("dense_3_b.bin"); 
-  void* dense_3_b =  readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); 
-
-
-
-  startMemTracking(); 
-
-  int test_input_size = 500; 
-  int batch_size = 100; 
-  int batch_count = test_input_size / batch_size; 
-  float final_accuracy = 0.0; 
-
-  for(int i = 0; i < batch_count; i++){ 
-
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
-
-    void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); 
-
-    void* var_1 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
-    void* var_2 = tensorAdd(var_1, conv2d_1_b); 
-    void* var_3 = tensorRelu(var_2); 
-    void* var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1); 
-    void* var_5 = tensorAdd(var_4, conv2d_2_b); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); 
-    void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1); 
-    void* var_9 = tensorAdd(var_8, conv2d_3_b); 
-    void* var_10 = tensorRelu(var_9); 
-    void* var_11 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 1); 
-    void* var_12 = tensorAdd(var_11, conv2d_4_b); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_14 = tensorPooling(var_13,0,2,2,0,0,2,2); 
-    void* var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1); 
-    void* var_16 = tensorAdd(var_15, conv2d_5_b); 
-    void* var_17 = tensorRelu(var_16); 
-    void* var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1); 
-    void* var_19 = tensorAdd(var_18, conv2d_6_b); 
-    void* var_20 = tensorRelu(var_19); 
-    void* var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1); 
-    void* var_22 = tensorAdd(var_21, conv2d_7_b); 
-    void* var_23 = tensorRelu(var_22); 
-    void* var_24 = tensorPooling(var_23,0,2,2,0,0,2,2); 
-    void* var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1); 
-    void* var_26 = tensorAdd(var_25, conv2d_8_b); 
-    void* var_27 = tensorRelu(var_26); 
-    void* var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 1, 1, 1); 
-    void* var_29 = tensorAdd(var_28, conv2d_9_b); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1); 
-    void* var_32 = tensorAdd(var_31, conv2d_10_b); 
-    void* var_33 = tensorRelu(var_32); 
-    void* var_34 = tensorPooling(var_33,0,2,2,0,0,2,2); 
-    void* var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 1, 1, 1, 1); 
-    void* var_36 = tensorAdd(var_35, conv2d_11_b); 
-    void* var_37 = tensorRelu(var_36); 
-    void* var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1); 
-    void* var_39 = tensorAdd(var_38, conv2d_12_b); 
-    void* var_40 = tensorRelu(var_39); 
-    void* var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1); 
-    void* var_42 = tensorAdd(var_41, conv2d_13_b); 
-    void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorPooling(var_43,0,2,2,0,0,2,2); 
-    void* var_46 = tensorGemmGPU(var_44, dense_1_w); 
-    void* var_47 = tensorAdd(var_46, dense_1_b); 
-    void* var_48 = tensorRelu(var_47); 
-    void* var_49 = tensorGemmGPU(var_48, dense_2_w); 
-    void* var_50 = tensorAdd(var_49, dense_2_b); 
-    void* var_51 = tensorRelu(var_50); 
-    void* var_52 = tensorGemmGPU(var_51, dense_3_w); 
-    void* var_53 = tensorAdd(var_52, dense_3_b); 
-    void* var_54 = tensorSoftmax(var_53); 
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    float accuracy = computeAccuracy3(labels, var_54); 
-    final_accuracy += accuracy; 
-    freeBatchMemory(); 
- 
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include "tensor_runtime.h"
+#include "utils.h"
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0);
+
+  std::string dir_prefix =
+      std::string("/home/nvidia/sd_card/vgg16_imagenet_new/");
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void *conv2d_1_w =
+      readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3);
+  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
+  void *conv2d_1_b =
+      readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void *conv2d_2_w =
+      readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3);
+  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
+  void *conv2d_2_b =
+      readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void *conv2d_3_w =
+      readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3);
+  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
+  void *conv2d_3_b =
+      readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void *conv2d_4_w =
+      readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3);
+  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
+  void *conv2d_4_b =
+      readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1);
+  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
+  void *conv2d_5_w =
+      readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3);
+  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
+  void *conv2d_5_b =
+      readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void *conv2d_6_w =
+      readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
+  void *conv2d_6_b =
+      readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void *conv2d_7_w =
+      readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3);
+  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
+  void *conv2d_7_b =
+      readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1);
+  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
+  void *conv2d_8_w =
+      readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3);
+  std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin");
+  void *conv2d_8_b =
+      readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin");
+  void *conv2d_9_w =
+      readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin");
+  void *conv2d_9_b =
+      readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin");
+  void *conv2d_10_w =
+      readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin");
+  void *conv2d_10_b =
+      readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin");
+  void *conv2d_11_w =
+      readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin");
+  void *conv2d_11_b =
+      readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin");
+  void *conv2d_12_w =
+      readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin");
+  void *conv2d_12_b =
+      readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin");
+  void *conv2d_13_w =
+      readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3);
+  std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin");
+  void *conv2d_13_b =
+      readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void *dense_1_w =
+      readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 25088, 4096);
+  std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
+  void *dense_1_b =
+      readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin");
+  void *dense_2_w =
+      readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096);
+  std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin");
+  void *dense_2_b =
+      readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1);
+  std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin");
+  void *dense_3_w =
+      readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000);
+  std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin");
+  void *dense_3_b =
+      readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1);
+
+  startMemTracking();
+
+  int test_input_size = 500;
+  int batch_size = 100;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  for (int i = 0; i < batch_count; i++) {
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+
+    void *input =
+        readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
+
+    void *var_1 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1);
+    void *var_2 = tensorAdd(var_1, conv2d_1_b);
+    void *var_3 = tensorRelu(var_2);
+    void *var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1);
+    void *var_5 = tensorAdd(var_4, conv2d_2_b);
+    void *var_6 = tensorRelu(var_5);
+    void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2);
+    void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1);
+    void *var_9 = tensorAdd(var_8, conv2d_3_b);
+    void *var_10 = tensorRelu(var_9);
+    void *var_11 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 1);
+    void *var_12 = tensorAdd(var_11, conv2d_4_b);
+    void *var_13 = tensorRelu(var_12);
+    void *var_14 = tensorPooling(var_13, 0, 2, 2, 0, 0, 2, 2);
+    void *var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1);
+    void *var_16 = tensorAdd(var_15, conv2d_5_b);
+    void *var_17 = tensorRelu(var_16);
+    void *var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1);
+    void *var_19 = tensorAdd(var_18, conv2d_6_b);
+    void *var_20 = tensorRelu(var_19);
+    void *var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1);
+    void *var_22 = tensorAdd(var_21, conv2d_7_b);
+    void *var_23 = tensorRelu(var_22);
+    void *var_24 = tensorPooling(var_23, 0, 2, 2, 0, 0, 2, 2);
+    void *var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1);
+    void *var_26 = tensorAdd(var_25, conv2d_8_b);
+    void *var_27 = tensorRelu(var_26);
+    void *var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 1, 1, 1);
+    void *var_29 = tensorAdd(var_28, conv2d_9_b);
+    void *var_30 = tensorRelu(var_29);
+    void *var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1);
+    void *var_32 = tensorAdd(var_31, conv2d_10_b);
+    void *var_33 = tensorRelu(var_32);
+    void *var_34 = tensorPooling(var_33, 0, 2, 2, 0, 0, 2, 2);
+    void *var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 1, 1, 1, 1);
+    void *var_36 = tensorAdd(var_35, conv2d_11_b);
+    void *var_37 = tensorRelu(var_36);
+    void *var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1);
+    void *var_39 = tensorAdd(var_38, conv2d_12_b);
+    void *var_40 = tensorRelu(var_39);
+    void *var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1);
+    void *var_42 = tensorAdd(var_41, conv2d_13_b);
+    void *var_43 = tensorRelu(var_42);
+    void *var_44 = tensorPooling(var_43, 0, 2, 2, 0, 0, 2, 2);
+    void *var_46 = tensorGemmGPU(var_44, dense_1_w);
+    void *var_47 = tensorAdd(var_46, dense_1_b);
+    void *var_48 = tensorRelu(var_47);
+    void *var_49 = tensorGemmGPU(var_48, dense_2_w);
+    void *var_50 = tensorAdd(var_49, dense_2_b);
+    void *var_51 = tensorRelu(var_50);
+    void *var_52 = tensorGemmGPU(var_51, dense_3_w);
+    void *var_53 = tensorAdd(var_52, dense_3_b);
+    void *var_54 = tensorSoftmax(var_53);
+
+    uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end);
+
+    float accuracy = computeAccuracy3(labels, var_54);
+    final_accuracy += accuracy;
+    freeBatchMemory();
   }
 
-  final_accuracy = final_accuracy / batch_count; 
-  dumpFinalAccuracy(final_accuracy); 
-
-
-  llvm_hpvm_cleanupTensorRt(); 
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  return 0; 
+  llvm_hpvm_cleanupTensorRt();
 
+  return 0;
 }
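
All three VGG-16 drivers above call readTrainedWeights with the same dimension convention: convolution filters as (out_channels, in_channels, kernel_h, kernel_w), biases as (1, channels, 1, 1), and fully connected weights as (1, 1, in_features, out_features); only the classifier shapes differ (512x10 for CIFAR-10, 512x100 for CIFAR-100, and 25088x4096 / 4096x4096 / 4096x1000 for ImageNet). The small wrapper below restates that convention; the ConvLayer struct and loadConvLayer helper are hypothetical conveniences, not part of the patch or the runtime API.

    #include <string>
    #include "tensor_runtime.h" // declares readTrainedWeights(), used exactly as above

    // Hypothetical helper wrapping the repeated pattern of building the
    // "conv2d_N_{w,b}.bin" paths and loading the filter and bias tensors.
    struct ConvLayer {
      void *w; // filter: (out_ch, in_ch, kh, kw)
      void *b; // bias:   (1, out_ch, 1, 1)
    };

    static ConvLayer loadConvLayer(const std::string &dir_prefix, int index,
                                   int out_ch, int in_ch, int kh, int kw) {
      std::string w_path =
          dir_prefix + "conv2d_" + std::to_string(index) + "_w.bin";
      std::string b_path =
          dir_prefix + "conv2d_" + std::to_string(index) + "_b.bin";
      ConvLayer layer;
      layer.w = readTrainedWeights(w_path.c_str(), 0, out_ch, in_ch, kh, kw);
      layer.b = readTrainedWeights(b_path.c_str(), 0, 1, out_ch, 1, 1);
      return layer;
    }

    // Usage corresponding to the first two layers loaded in the drivers above:
    //   ConvLayer c1 = loadConvLayer(dir_prefix, 1, 64, 3, 3, 3);
    //   ConvLayer c2 = loadConvLayer(dir_prefix, 2, 64, 64, 3, 3);
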
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
index 6793cd79f1..ea959342a4 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
@@ -10,10 +10,7 @@
 
 using namespace std;
 
-
-
-
-class UnitTestResults{
+class UnitTestResults {
 
 private:
   unsigned int total_tests;
@@ -22,48 +19,46 @@ private:
   std::vector<string> failed_test_ids;
 
 public:
-
-  UnitTestResults(){
+  UnitTestResults() {
     total_tests = 0;
     failed_tests = 0;
     passed_tests = 0;
   }
 
-  void evalTestResult(Tensor* res, const float* expected_result, size_t num_elems,
-		      float epsilon, string test_name){
+  void evalTestResult(Tensor *res, const float *expected_result,
+                      size_t num_elems, float epsilon, string test_name) {
 
-    total_tests += 1;      
-    if(res->num_elems != num_elems){
+    total_tests += 1;
+    if (res->num_elems != num_elems) {
       failed_tests += 1;
       failed_test_ids.push_back(test_name);
       return;
     }
 
-    float* data_ptr = (float*) res->host_data;
-    for (unsigned int i = 0; i < res->num_elems; i++){
-      //printf("**diff value = %f ", std::abs(data_ptr[i] - expected_result[i]));
-      if (std::abs(data_ptr[i] - expected_result[i]) > epsilon){
-	failed_tests += 1;
-	failed_test_ids.push_back(test_name);
+    float *data_ptr = (float *)res->host_data;
+    for (unsigned int i = 0; i < res->num_elems; i++) {
+      // printf("**diff value = %f ", std::abs(data_ptr[i] -
+      // expected_result[i]));
+      if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) {
+        failed_tests += 1;
+        failed_test_ids.push_back(test_name);
         return;
       }
     }
-    
-    passed_tests += 1;    
+
+    passed_tests += 1;
   }
 
-  void compareTensors(Tensor* res, Tensor* gold_res,
-		      float epsilon, string test_name){
+  void compareTensors(Tensor *res, Tensor *gold_res, float epsilon,
+                      string test_name) {
 
-    const float* expected_result = (float*) gold_res->host_data;
+    const float *expected_result = (float *)gold_res->host_data;
     unsigned int num_elems = res->num_elems;
 
     evalTestResult(res, expected_result, num_elems, epsilon, test_name);
-    
   }
 
-
-  void printSummary(){
+  void printSummary() {
 
     printf("\n\n\n ************* Printing Results Summary ********** \n\n");
     printf("-- Total tests :=  %d \n", total_tests);
@@ -71,147 +66,136 @@ public:
     printf("-- Tests Failed := %d \n", failed_tests);
 
     printf("\n\n Tests that failed : \n\n");
-    for (int i = 0; i < failed_test_ids.size(); i++){
+    for (int i = 0; i < failed_test_ids.size(); i++) {
       printf("*** Test = %s \n", failed_test_ids[i].c_str());
     }
   }
-  
 };
 
-
-
-
-void testTensorHgemm(UnitTestResults& unitTestResults){
+void testTensorHgemm(UnitTestResults &unitTestResults) {
 
   printf("***** TensorHgemm ***** \n\n");
-  void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
-  struct Tensor* lhs = (struct Tensor*) lhs_ptr;
+  void *lhs_ptr =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
+  struct Tensor *lhs = (struct Tensor *)lhs_ptr;
   fillTensorWithOnes(lhs);
-  
-  float* data_arr = (float*) lhs->host_data;
-  for(int i = 0; i < lhs->num_elems; i++){
+
+  float *data_arr = (float *)lhs->host_data;
+  for (int i = 0; i < lhs->num_elems; i++) {
     data_arr[i] = (i / 4) + 1;
   }
-  
-  void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
+
+  void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
   fillTensorWithOnes(rhs);
-  
-  void* output = tensorHalfGemm(lhs, rhs);
-  convertToFP32((struct Tensor*) output);
+
+  void *output = tensorHalfGemm(lhs, rhs);
+  convertToFP32((struct Tensor *)output);
 
   printTensorValues(output);
 
-  const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20};
+  const float expected_result[15] = {4,  4,  4,  8,  8,  8,  12, 12,
+                                     12, 16, 16, 16, 20, 20, 20};
 
-  unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Hgemm");
+  unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01,
+                                 "Hgemm");
 }
 
-
-
-void testTensorSgemm(UnitTestResults& unitTestResults){
+void testTensorSgemm(UnitTestResults &unitTestResults) {
 
   printf("***** TensorSgemm ***** \n\n");
-  void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
-  struct Tensor* lhs = (struct Tensor*) lhs_ptr;
+  void *lhs_ptr =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
+  struct Tensor *lhs = (struct Tensor *)lhs_ptr;
   fillTensorWithOnes(lhs);
- 
-  float* data_arr = (float*) lhs->host_data;
-  for(int i = 0; i < lhs->num_elems; i++){
+
+  float *data_arr = (float *)lhs->host_data;
+  for (int i = 0; i < lhs->num_elems; i++) {
     data_arr[i] = (i / 4) + 1;
   }
 
-  void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);  
+  void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
   fillTensorWithOnes(rhs);
-  
-  void* output = tensorGemmGPU(lhs, rhs);
-  printTensorValues(output);
 
-  const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20};
+  void *output = tensorGemmGPU(lhs, rhs);
+  printTensorValues(output);
 
-  unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Sgemm");
+  const float expected_result[15] = {4,  4,  4,  8,  8,  8,  12, 12,
+                                     12, 16, 16, 16, 20, 20, 20};
 
+  unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01,
+                                 "Sgemm");
 }
 
+void testTensorConcatAndSplit() {
 
+  int conv_mode = 1;         // CROSS_CORRELATION mode
+  int compute_precision = 0; // floating point precision
 
-
-
-void testTensorConcatAndSplit(){
-
-  int conv_mode = 1;  // CROSS_CORRELATION mode
-  int compute_precision = 0; // floating point precision 
-  
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
   fillWithOnesAndTwos(input);
-  void** splits = tensorSplit(input, 2, 1);
+  void **splits = tensorSplit(input, 2, 1);
 
-  void* conv2W = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
+  void *conv2W =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
   fillTensorWithOnes(conv2W);
-		     
-  void** conv2fils = tensorSplit(conv2W, 2, 0);
 
-  void* conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0,
-				       1, 1, conv_mode, compute_precision);
+  void **conv2fils = tensorSplit(conv2W, 2, 0);
+
+  void *conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, 1, 1,
+                                       conv_mode, compute_precision);
   printTensorDims(conv2a_out);
 
-  void* conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0,
-				       1, 1, conv_mode, compute_precision);
+  void *conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, 1, 1,
+                                       conv_mode, compute_precision);
   printTensorDims(conv2b_out);
- 
-  void* conv2_outs[2];
+
+  void *conv2_outs[2];
   conv2_outs[0] = conv2a_out;
   conv2_outs[1] = conv2b_out;
 
-  void* conv2_concat_out = tensorConcat(conv2_outs, 2, 1);
+  void *conv2_concat_out = tensorConcat(conv2_outs, 2, 1);
   printTensorDims(conv2_concat_out);
   printTensorValues(conv2_concat_out);
-  
 }
 
+void testLRN() {
 
-
-
-
-
-void testLRN(){
-
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20);
+  void *input =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20);
   fillTensorWithOnes(input);
 
   unsigned LRN_window = 5;
   double LRN_alpha = 2e-05;
   printf("LRN_alpha = %f \n", LRN_alpha);
-  
+
   double LRN_beta = 0.75;
   double LRN_k = 1.0;
 
   // TEST-point - Compare TF vs CUDNN
-  void* lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k);
+  void *lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k);
   printTensorDims(lrn1out);
   dumpWeightsToFile("tensors_out/lrn1_test.out", lrn1out);
 
-  void* input2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7);
+  void *input2 =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7);
   fillTensorWithOnes(input2);
 
   LRN_window = 5;
   LRN_alpha = 0.5 * LRN_window;
-  
+
   LRN_beta = 0.75;
   LRN_k = 1.0;
 
-  void* lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k);
+  void *lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k);
   printTensorDims(lrn2out);
-  dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out); 
+  dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out);
 }
 
-
-
-
-void testTensorAdd(){
+void testTensorAdd() {
 
   // Tensor add with equal dimensions
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
-  void* bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
+  void *bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
   fillTensorWithOnes(x);
   fillTensorWithOnes(bias);
 
@@ -222,8 +206,8 @@ void testTensorAdd(){
   printTensorValues(x);
 
   // Tensor add with matching channel dimension
-  void* x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2);
-  void* bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1);
+  void *x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2);
+  void *bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1);
   fillTensorWithOnes(x2);
   fillTensorWithOnes(bias2);
 
@@ -231,191 +215,181 @@ void testTensorAdd(){
   printTensorValues(x2);
 }
 
-void testTensorConv(){
+void testTensorConv() {
 
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
 
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
-  int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
+  int conv_mode = 1;         // NOTE: uses CROSS_CORRELATION
   int compute_precision = 0; // floating point precision for conv
-  
-  void* conv_out = tensorConvolution(input, filter, 0, 0,
-				  1, 1, conv_mode, compute_precision);
-  printTensorValues(conv_out);
 
+  void *conv_out = tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode,
+                                     compute_precision);
+  printTensorValues(conv_out);
 }
 
+void testTensorHalfConv() {
 
-void testTensorHalfConv(){
-
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
 
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
-  int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
+  int conv_mode = 1;         // NOTE: uses CROSS_CORRELATION
   int compute_precision = 0; // floating point precision for conv
-  
-  void* conv_out = tensorHalfConvolution(input, filter, 0, 0,
-					 1, 1, conv_mode, compute_precision);
-  printTensorValues(conv_out);
 
+  void *conv_out = tensorHalfConvolution(input, filter, 0, 0, 1, 1, conv_mode,
+                                         compute_precision);
+  printTensorValues(conv_out);
 }
 
+void testTensorGroupConv() {
 
+  // NOTE: The input channel count value (param2 to Tensor and Filter) must be
+  // the same
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
 
-
-void testTensorGroupConv(){
-
-  // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
-
-  // FIXIT: fillTensor* calls should be replaced with initTensorValue(tenosor, val)
+  // FIXIT: fillTensor* calls should be replaced with initTensorValue(tensor,
+  // val)
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
   int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
   int conv_groups = 2;
-  
-  void* conv_out = tensorConvolution(input, filter,
-	                             0, 0,
-				     1, 1,
-				     conv_mode, conv_groups);
+
+  void *conv_out =
+      tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups);
   printTensorValues(conv_out);
-  
 }
 
+void testTensorHalfGroupConv() {
 
-void testTensorHalfGroupConv(){
-
-  // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
-  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
+  // NOTE: The input channel count value (param2 to Tensor and Filter) must be
+  // the same
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
+  void *filter =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3);
 
   fillTensorWithOnes(input);
   fillTensorWithOnes(filter);
 
   int conv_mode = 1; // NOTE: uses CROSS_CORRELATION
   int conv_groups = 2;
-  
-  void* conv_out = tensorConvolution(input, filter,
-	                             0, 0,
-				     1, 1,
-				     conv_mode, conv_groups);
-  
-  convertToFP32((struct Tensor*) conv_out);
+
+  void *conv_out =
+      tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups);
+
+  convertToFP32((struct Tensor *)conv_out);
 
   printTensorValues(conv_out);
 }
 
+void testTensorPooling() {
 
-void testTensorPooling(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
   fillTensorWithOnes(x);
 
-  float* data_arr = (float*) ((Tensor*) x)->host_data;
-  for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){
+  float *data_arr = (float *)((Tensor *)x)->host_data;
+  for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) {
     data_arr[i] = i;
   }
 
-  void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
+  void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
   printTensorValues(output);
 }
 
+void testTensorHalfPooling() {
 
-void testTensorHalfPooling(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4);
   fillTensorWithOnes(x);
 
-  float* data_arr = (float*) ((Tensor*) x)->host_data;
-  for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){
+  float *data_arr = (float *)((Tensor *)x)->host_data;
+  for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) {
     data_arr[i] = i;
   }
 
-  void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
-  convertToFP32((struct Tensor*) output);
+  void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2);
+  convertToFP32((struct Tensor *)output);
 
   printTensorValues(output);
 }
 
+void testTensorBatchNorm() {
 
-void testTensorBatchNorm(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
   fillTensorWithVal(x, 3);
 
-  void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(gamma, 1);
 
-  void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(beta, 0);
 
-  void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(mean, 1);
 
-  void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *variance =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(variance, 1);
 
   double epsilon = 1;
   // NOTE: result = (X - mean) / sqrt(epsilon + variance)
-  void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
+  void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
 
-  printTensorValues(output);  
+  printTensorValues(output);
 }
 
+void testTensorHalfBatchNorm() {
 
-void testTensorHalfBatchNorm(){
-
-  void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
+  void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
   fillTensorWithVal(x, 3);
 
-  void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(gamma, 1);
 
-  void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(beta, 0);
 
-  void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(mean, 1);
 
-  void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
+  void *variance =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
   fillTensorWithVal(variance, 1);
 
-
   double epsilon = 1;
   // NOTE: result = (X - mean) / sqrt(epsilon + variance)
-  void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);  
-  convertToFP32((struct Tensor*) output);
+  void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1);
+  convertToFP32((struct Tensor *)output);
 
-  printTensorValues(output);  
+  printTensorValues(output);
 }
 
+void testTensorRelu() {
 
-void testTensorRelu(){
-
-  // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match 
+  // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match
   printf("***** TensorRelu ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);
   fillTensorWithNegOnes(input);
 
-  void* output = tensorRelu(input);
+  void *output = tensorRelu(input);
   printTensorValues(output);
 }
 
-
-void testTensorSoftmax(){
+void testTensorSoftmax() {
 
   printf("***** TensorSoftmax ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
   host_ptr[0] = 0.1;
   host_ptr[1] = 0.2;
   host_ptr[2] = 0.3;
@@ -425,39 +399,36 @@ void testTensorSoftmax(){
   host_ptr[6] = 0.7;
   host_ptr[7] = 2.5;
 
-  void* output = tensorSoftmax(input);
+  void *output = tensorSoftmax(input);
   printTensorValues(output);
 }
 
+void testSoftmaxOutput(void *output_ptr) {
 
-void testSoftmaxOutput(void* output_ptr){
+  struct Tensor *output = (struct Tensor *)output_ptr;
 
-  struct Tensor* output = (struct Tensor*) output_ptr;
-  
   size_t batch_dim = output->dims.dim_sizes[0];
   size_t channels = output->dims.dim_sizes[1];
 
-  float* data = (float*) output->host_data;
-  for(int i = 0; i < batch_dim; i++){
+  float *data = (float *)output->host_data;
+  for (int i = 0; i < batch_dim; i++) {
     float sum = 0.0;
-    for(int j = 0; j < channels; j++){
+    for (int j = 0; j < channels; j++) {
       sum += data[i * channels + j];
     }
     printf("output_sum = %f \n", sum);
   }
-  
 }
 
-
-
-void testPromiseError(){
+void testPromiseError() {
 
   printf("***** TensorQuantize ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
 
-  void* gold_tensor = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
-  float* gold_ptr = (float*) ((struct Tensor*) gold_tensor)->host_data;
+  void *gold_tensor =
+      create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
+  float *gold_ptr = (float *)((struct Tensor *)gold_tensor)->host_data;
 
   gold_ptr[0] = -1;
   gold_ptr[1] = -2;
@@ -472,21 +443,20 @@ void testPromiseError(){
   gold_ptr[10] = 1;
   gold_ptr[11] = 1;
 
-
   int num_elems = 12;
   int num_runs = 1000;
 
-  float* result_ptr = (float*) malloc(sizeof(float) * num_elems);
+  float *result_ptr = (float *)malloc(sizeof(float) * num_elems);
 
-  for (int swing = 1; swing <= 7; swing++){
+  for (int swing = 1; swing <= 7; swing++) {
 
-    for (int j = 0; j < num_elems; j++){
-      result_ptr[j] = 0; 
+    for (int j = 0; j < num_elems; j++) {
+      result_ptr[j] = 0;
     }
 
     float error_sum = 0.0;
-    
-    for (int i = 0; i < 1000; i++){
+
+    for (int i = 0; i < 1000; i++) {
       host_ptr[0] = -1;
       host_ptr[1] = -2;
       host_ptr[2] = -3;
@@ -499,43 +469,39 @@ void testPromiseError(){
       host_ptr[9] = 2;
       host_ptr[10] = 1;
       host_ptr[11] = 1;
- 
-      void* error_out = addPromiseError(input, swing);
-      //printTensorValues(error_out);
+
+      void *error_out = addPromiseError(input, swing);
+      // printTensorValues(error_out);
 
       // Move result data back to the host
       hpvm_request_tensor(input, 0);
-      float* error_out_ptr = (float*) ((struct Tensor*) input)->host_data;
+      float *error_out_ptr = (float *)((struct Tensor *)input)->host_data;
 
-      for (int j = 0; j < num_elems; j++){
-	result_ptr[j] += error_out_ptr[j];
-	error_sum += (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]); 
+      for (int j = 0; j < num_elems; j++) {
+        result_ptr[j] += error_out_ptr[j];
+        error_sum +=
+            (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]);
       }
     }
 
-    printf ("\n\n - Swing %d results : \n", swing);
-    for (int j = 0; j < num_elems; j++){
+    printf("\n\n - Swing %d results : \n", swing);
+    for (int j = 0; j < num_elems; j++) {
       result_ptr[j] = result_ptr[j] / num_runs;
       printf(" %f ", result_ptr[j]);
     }
 
     printf("mean_error = %f \n", error_sum / num_runs);
-    
+
     printf(" \n");
   }
-  
-  
 }
 
-
-
-
-void testQuantization(){
+void testQuantization() {
 
   printf("***** TensorQuantize ***** \n\n");
-  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
+  void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
   host_ptr[0] = -0.1;
   host_ptr[1] = -25;
   host_ptr[2] = 0.2;
@@ -548,13 +514,12 @@ void testQuantization(){
   host_ptr[9] = 7.2;
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
- 
 
-  void* quantize_result1 = quantizeTensorPromise(input, -4, 6);
+  void *quantize_result1 = quantizeTensorPromise(input, -4, 6);
 
-  printf ("\n ** quantizing with range min = %d max = %d \n", -4, 6);
+  printf("\n ** quantizing with range min = %d max = %d \n", -4, 6);
   printTensorValues(quantize_result1);
-  
+
   host_ptr[0] = -0.1;
   host_ptr[1] = -25;
   host_ptr[2] = 0.2;
@@ -568,9 +533,9 @@ void testQuantization(){
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
 
-  void* quantize_result2 = quantizeTensorPromise(input, -2, 2);
+  void *quantize_result2 = quantizeTensorPromise(input, -2, 2);
 
-  printf ("\n ** quantizing with range min = %d max = %d \n", -2, 2);
+  printf("\n ** quantizing with range min = %d max = %d \n", -2, 2);
   printTensorValues(quantize_result2);
 
   host_ptr[0] = -0.1;
@@ -586,13 +551,12 @@ void testQuantization(){
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
 
+  void *quantize_result3 = quantizeTensorPromise(input, -25, 8);
 
-  void* quantize_result3 = quantizeTensorPromise(input, -25, 8);
-
-  printf ("\n ** quantizing with range min = %d max = %d \n", -25, 8);
+  printf("\n ** quantizing with range min = %d max = %d \n", -25, 8);
   printTensorValues(quantize_result3);
 
-  printf ("\n ** quantizing with range min = %d max = %d \n", -10, 10);
+  printf("\n ** quantizing with range min = %d max = %d \n", -10, 10);
 
   host_ptr[0] = -0.1;
   host_ptr[1] = -25;
@@ -607,30 +571,26 @@ void testQuantization(){
   host_ptr[10] = 2.5;
   host_ptr[11] = 3;
 
-
-  void* quantize_result4 = quantizeTensorPromise(input, -10, 10);  
+  void *quantize_result4 = quantizeTensorPromise(input, -10, 10);
   printTensorValues(quantize_result4);
 
-
-  void* quantize_result5 = quantizeTensorPromise(input, -10, 10);
+  void *quantize_result5 = quantizeTensorPromise(input, -10, 10);
   printTensorValues(quantize_result5);
-  
-  //void* error_out = addPromiseError(quantize_result, 1);
-  //printTensorValues(error_out);
 
+  // void* error_out = addPromiseError(quantize_result, 1);
+  // printTensorValues(error_out);
 }
 
-
-
-
-void testSampleFilter(){
+void testSampleFilter() {
 
   printf("***** Tensor Sample Filter ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
-  //fillTensorWithVal(input, 3);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3);
+  // fillTensorWithVal(input, 3);
   fillWithOnesAndTwos(input);
-  
-  Tensor* input2 = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, 2, 32, 32);
+
+  Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
+                                            3, 2, 32, 32);
   fillTensorWithVal(input2, 1);
 
   /*  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
@@ -649,7 +609,7 @@ void testSampleFilter(){
   /*  printf("\n\n");
 
   hpvm_request_tensor(input, DEVICE);
-    
+
   sampleFilter(input, 2, 1);
 
   hpvm_request_tensor(input, HOST);
@@ -657,116 +617,81 @@ void testSampleFilter(){
   printTensorValues(input);
   */
 
-  void* exact_res = tensorConvolution(input2, input, 0, 0,
-				      1, 1, 1, 1);
+  void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1);
   printTensorValues(exact_res);
-  
-  void* res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0);
-
-  //void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3);
- 
-  printTensorValues(res);
-  
-}
-
-
 
+  void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0);
 
+  // void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3);
 
-void testPerforationCalls(void* input, void* filter,
-			  int pad_h, int pad_w,
-			  int stride_h, int stride_w,
-			  int row, int col){
+  printTensorValues(res);
+}
 
+void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w,
+                          int stride_h, int stride_w, int row, int col) {
 
   float interpolation_rate = 1.0;
-  for (int offset = 0; offset < 2; offset++){
-  
-      printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d row = %d col = %d  offset= %d \n\n",
-	     pad_h, pad_w, stride_h, stride_w, row, col, offset);
-
-    
-      void* res_exact = tensorConvolution(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1);
-
-      printf ("tensorConvolution Result :");
-      printTensorValues(res_exact);
-
+  for (int offset = 0; offset < 2; offset++) {
 
-      void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1, 1, 1, 1, 1);
+    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
+           "row = %d col = %d  offset= %d \n\n",
+           pad_h, pad_w, stride_h, stride_w, row, col, offset);
 
-      printf ("\nBaseline Result :");
-      printTensorValues(res_exact2);
+    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1);
 
+    printf("tensorConvolution Result :");
+    printTensorValues(res_exact);
 
-      void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w,
-					       stride_h, stride_w,
-					       1, 1, 1, 1, 1, 1);
-      convertToFP32((struct Tensor*) res_exact3);
+    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, 1, 1, 1, 1);
 
-      printf ("\nFP16_Baseline Result :");
-      printTensorValues(res_exact3);
+    printf("\nBaseline Result :");
+    printTensorValues(res_exact2);
 
-    
-      void* res_sim = tensorConvPerfCuda(input, filter,
-					 pad_h, pad_w,
-					 stride_h, stride_w,
-					 1, 1,
-					 row, col,
-					 offset);
+    void *res_exact3 = tensorConvApproxHalf2(
+        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
+    convertToFP32((struct Tensor *)res_exact3);
 
-      printf ("\nConvPerfCuda Result :");
-      printTensorValues(res_sim);
+    printf("\nFP16_Baseline Result :");
+    printTensorValues(res_exact3);
 
-  
-      void* res = tensorConvApprox(input, filter,
-				   pad_h, pad_w,
-				   stride_h, stride_w,
-				   1, 1,
-				   row, col,
-				   1, offset);
+    void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h,
+                                       stride_w, 1, 1, row, col, offset);
 
+    printf("\nConvPerfCuda Result :");
+    printTensorValues(res_sim);
 
-      printf ("\nConvApprox Result :");
-      printTensorValues(res);
+    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                 stride_w, 1, 1, row, col, 1, offset);
 
+    printf("\nConvApprox Result :");
+    printTensorValues(res);
 
-      void* res_half = tensorConvApproxHalf2(input, filter,
-					     pad_h, pad_w,
-					     stride_h, stride_w,
-					     1, 1,
-					     row, col,
-					     1, offset);
+    void *res_half =
+        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
+                              1, 1, row, col, 1, offset);
 
-      convertToFP32((struct Tensor*) res_half);
+    convertToFP32((struct Tensor *)res_half);
 
-      printf ("\nConvApproxHalf2 Result :");
-      printTensorValues(res_half);
+    printf("\nConvApproxHalf2 Result :");
+    printTensorValues(res_half);
+  }
 
-    }
-  
- 
-  printf ("\n\n\n--- End of Test \n\n\n");
+  printf("\n\n\n--- End of Test \n\n\n");
 }
 
-
-
-
-
 /**** Tests Perforation for a set of different inputs */
-void testPerforation(UnitTestResults& unitTestResults){
+void testPerforation(UnitTestResults &unitTestResults) {
 
-  
   printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
   fillTensorWithVal(input, 1);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
-  fillTensorWithVal(filter, 1);
 
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  fillTensorWithVal(filter, 1);
 
   /*
   float* host_ptr = (float*) ((struct Tensor*) filter)->host_data;
@@ -785,43 +710,33 @@ void testPerforation(UnitTestResults& unitTestResults){
   host_ptr[24] = 2;
   host_ptr[26] = 2;
   */
-  
 
   testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2);
 
   testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1);
 
-
   testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3);
 
   testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1);
 
-
   testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4);
 
   testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1);
-    
 }
 
-
-
-
-
-
-
-
-
-void testSampling(){
+void testSampling() {
 
   printf("***** Testing Sampling ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
   fillTensorWithVal(input, 1);
-  //fillWithOnesAndTwos(input);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  // fillWithOnesAndTwos(input);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
   fillTensorWithVal(filter, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) filter)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
   host_ptr[0] = 2;
   host_ptr[2] = 2;
   host_ptr[4] = 2;
@@ -836,144 +751,124 @@ void testSampling(){
   host_ptr[22] = 2;
   host_ptr[24] = 2;
   host_ptr[26] = 2;
-  //printTensorValues(input);
+  // printTensorValues(input);
+
+  void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
 
-  void* res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1);
-  
   printTensorValues(res);
 
+  void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1);
 
-  void* res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1);
-  
   printTensorValues(res2);
 
+  void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0);
 
-  void* res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0);
-  
   printTensorValues(res2_sim);
 
-  
-  void* res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0);
-  
+  void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0);
+
   printTensorValues(res3);
 
+  void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
 
-  void* res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
-  
   printTensorValues(res4);
 
+  void *res4_half =
+      tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
 
-  void* res4_half = tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0);
-
-  convertToFP32((struct Tensor*) res4_half);
+  convertToFP32((struct Tensor *)res4_half);
 
   printTensorValues(res4_half);
-
 }
 
-
-
-
-void testSamplingCalls(void* input, void* filter,
-		       int pad_h, int pad_w,
-		       int stride_h, int stride_w,
-		       int skip_every, UnitTestResults& unitTestResults){
-
+void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w,
+                       int stride_h, int stride_w, int skip_every,
+                       UnitTestResults &unitTestResults) {
 
   float interpolation_rate = 1.0;
-  for (int offset = 0; offset < 2; offset++){
-
-  
-      printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d skip_every = %d offset= %d interpolation_rate = %f \n\n",
-	     pad_h, pad_w, stride_h, stride_w, skip_every, offset, interpolation_rate);
-
-    
-      void* res_exact = tensorConvolution(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1);
+  for (int offset = 0; offset < 2; offset++) {
 
-      printf ("tensorConvolution Result :");
-      printTensorValues(res_exact);
+    printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d "
+           "skip_every = %d offset= %d interpolation_rate = %f \n\n",
+           pad_h, pad_w, stride_h, stride_w, skip_every, offset,
+           interpolation_rate);
 
+    void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1);
 
-      void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w,
-					  stride_h, stride_w,
-					  1, 1, 1, 1, 1, 1);
+    printf("tensorConvolution Result :");
+    printTensorValues(res_exact);
 
-      printf ("\nBaseline Result :");
-      printTensorValues(res_exact2);
+    void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                        stride_w, 1, 1, 1, 1, 1, 1);
 
+    printf("\nBaseline Result :");
+    printTensorValues(res_exact2);
 
-      void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w,
-					       stride_h, stride_w,
-					       1, 1, 1, 1, 1, 1);
-      convertToFP32((struct Tensor*) res_exact3);
+    void *res_exact3 = tensorConvApproxHalf2(
+        input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1);
+    convertToFP32((struct Tensor *)res_exact3);
 
-      printf ("\nFP16_Baseline Result :");
-      printTensorValues(res_exact3);
+    printf("\nFP16_Baseline Result :");
+    printTensorValues(res_exact3);
 
-    
-      void* res_sim = tensorConvSampSim2(input, filter, pad_h, pad_w,
-					 stride_h, stride_w,
-					 1, 1, skip_every, offset, interpolation_rate);
+    void *res_sim =
+        tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1,
+                           1, skip_every, offset, interpolation_rate);
 
-      printf ("\nConvSampSim Result :");
-      printTensorValues(res_sim);
+    printf("\nConvSampSim Result :");
+    printTensorValues(res_sim);
 
-  
-      void* res = tensorConvApprox(input, filter, pad_h, pad_w,
-				   stride_h, stride_w,
-				   1, 1, 1, 1, skip_every, offset);
+    void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h,
+                                 stride_w, 1, 1, 1, 1, skip_every, offset);
 
+    printf("\nConvApprox Result :");
+    printTensorValues(res);
 
-      printf ("\nConvApprox Result :");
-      printTensorValues(res);
+    void *res_half =
+        tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w,
+                              1, 1, 1, 1, skip_every, offset);
 
+    convertToFP32((struct Tensor *)res_half);
 
-      void* res_half = tensorConvApproxHalf2(input, filter, pad_h, pad_w,
-					     stride_h, stride_w,
-					     1, 1, 1, 1, skip_every, offset);
+    printf("\nConvApproxHalf2 Result :");
+    printTensorValues(res_half);
 
-      convertToFP32((struct Tensor*) res_half);
+    std::string suffix =
+        std::string(" pad_h = ") + std::to_string(pad_h) +
+        std::string(" pad_w = ") + std::to_string(pad_w) +
+        std::string(" stride_h = ") + std::to_string(stride_h) +
+        std::string(" stride_w = ") + std::to_string(stride_w) +
+        std::string(" skip_every = ") + std::to_string(skip_every) +
+        std::string(" offset = ") + std::to_string(offset);
 
-      printf ("\nConvApproxHalf2 Result :");
-      printTensorValues(res_half);
+    std::string test_name = std::string("SAMP_FP32 ") + suffix;
 
-      std::string suffix = std::string(" pad_h = ") + std::to_string(pad_h)
-	+ std::string(" pad_w = ") + std::to_string(pad_w)
-        + std::string(" stride_h = ") + std::to_string(stride_h)
-	+ std::string(" stride_w = ") + std::to_string(stride_w)
-        + std::string(" skip_every = ") + std::to_string(skip_every)
-	+ std::string(" offset = ") + std::to_string(offset);
+    unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.01,
+                                   test_name);
 
-      std::string test_name = std::string("SAMP_FP32 ") + suffix; 
-					  
-      unitTestResults.compareTensors((Tensor*) res, (Tensor*) res_sim, 0.01, test_name);
+    std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix;
+    unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.04,
+                                   fp16_test_name);
+  }
 
-      std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; 
-      unitTestResults.compareTensors((Tensor*) res_half, (Tensor*) res_sim, 0.04, fp16_test_name);
-    }
-  
- 
-  printf ("\n\n\n --- End of Test \n\n\n");
+  printf("\n\n\n --- End of Test \n\n\n");
 }
 
-
-
 /**** Tests Sample for a sample 3 * 3 Filter */
-void testSampling_3_3(UnitTestResults& unitTestResults){
+void testSampling_3_3(UnitTestResults &unitTestResults) {
 
-  
   printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n");
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4);
   fillTensorWithVal(input, 1);
-  //fillWithOnesAndTwos(input);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
-  fillTensorWithVal(filter, 1);
+  // fillWithOnesAndTwos(input);
 
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3);
+  fillTensorWithVal(filter, 1);
 
-  float* host_ptr = (float*) ((struct Tensor*) filter)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)filter)->host_data;
   host_ptr[0] = 2;
   host_ptr[2] = 2;
   host_ptr[4] = 2;
@@ -989,7 +884,6 @@ void testSampling_3_3(UnitTestResults& unitTestResults){
   host_ptr[24] = 2;
   host_ptr[26] = 2;
 
-  
   // Tests with padding = 0 stride = 1
   testSamplingCalls(input, filter, 0, 0, 1, 1, 2, unitTestResults);
 
@@ -1010,27 +904,19 @@ void testSampling_3_3(UnitTestResults& unitTestResults){
   testSamplingCalls(input, filter, 1, 1, 2, 2, 3, unitTestResults);
 
   testSamplingCalls(input, filter, 1, 1, 2, 2, 4, unitTestResults);
-
-    
 }
 
-
-
-
-
-
-
 /**** Tests Sample for a sample 1 * 1 Filter */
-void testSampling_1_1(UnitTestResults& unitTestResults){
+void testSampling_1_1(UnitTestResults &unitTestResults) {
 
-  
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2);
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2);
   fillTensorWithVal(input, 2);
-  //fillWithOnesAndTwos(input);
-  
-  Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1);
+  // fillWithOnesAndTwos(input);
+
+  Tensor *filter =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1);
   fillTensorWithVal(filter, 2);
-  
 
   // Tests with padding = 0 stride = 1
   testSamplingCalls(input, filter, 0, 0, 1, 1, 2, unitTestResults);
@@ -1039,25 +925,20 @@ void testSampling_1_1(UnitTestResults& unitTestResults){
 
   testSamplingCalls(input, filter, 0, 0, 1, 1, 4, unitTestResults);
 
-
   // Tests with padding = 1 stride = 1
   testSamplingCalls(input, filter, 1, 1, 1, 1, 2, unitTestResults);
 
   testSamplingCalls(input, filter, 1, 1, 1, 1, 3, unitTestResults);
 
   testSamplingCalls(input, filter, 1, 1, 1, 1, 4, unitTestResults);
-
-    
 }
 
+void *testTensorArgMax() {
 
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1);
 
-
-void* testTensorArgMax(){
-
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1);
- 
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
 
   // Input 0
   host_ptr[0] = 1;
@@ -1079,37 +960,34 @@ void* testTensorArgMax(){
   host_ptr[10] = 2;
   host_ptr[11] = 8;
 
-  void* argmax_out = tensorArgMax(input);
-  
-  // Expect Output of call below to be:  
+  void *argmax_out = tensorArgMax(input);
+
+  // Expected output of the call below:
   //   1    2    2    0
   printTensorValues(argmax_out);
 
-  return argmax_out; 
+  return argmax_out;
 }
 
+void *testTensorSelect(void *argmax_out) {
 
-
-void* testTensorSelect(void* argmax_out){
-
-  void* select_out = tensorSelect(argmax_out, 2);
-  printf ("***** tensorSelect output \n");
+  void *select_out = tensorSelect(argmax_out, 2);
+  printf("***** tensorSelect output \n");
 
   printTensorValues(select_out);
 
-  return select_out; 
-  
+  return select_out;
 }
 
+void testTensorContract(void *select_out) {
 
-void testTensorContract(void* select_out){
-
-  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1);
-  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+  Tensor *input =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1);
+  float *host_ptr = (float *)((struct Tensor *)input)->host_data;
 
   // Input 0
   host_ptr[0] = 1;
-  host_ptr[1] = 1; 
+  host_ptr[1] = 1;
   host_ptr[2] = 1;
   host_ptr[3] = 1;
 
@@ -1118,51 +996,38 @@ void testTensorContract(void* select_out){
   host_ptr[5] = 2;
   host_ptr[6] = 2;
   host_ptr[7] = 2;
-  
+
   // Input 2
   host_ptr[8] = 3;
   host_ptr[9] = 3;
-  host_ptr[10] = 3; 
-  host_ptr[11] = 3; 
+  host_ptr[10] = 3;
+  host_ptr[11] = 3;
 
   // Input 3
-  host_ptr[12] = 4; 
+  host_ptr[12] = 4;
   host_ptr[13] = 4;
   host_ptr[14] = 4;
   host_ptr[15] = 4;
 
-
-  void* contract_out = tensorContract(input, select_out);
-  printf ("***** tensorContract output \n");
+  void *contract_out = tensorContract(input, select_out);
+  printf("***** tensorContract output \n");
 
   printTensorValues(contract_out);
-
 }
 
+void testNewTensorOps() {
 
-
-void testNewTensorOps(){
-
-  void* argmax_out = testTensorArgMax();
-  void* select_out = testTensorSelect(argmax_out);
+  void *argmax_out = testTensorArgMax();
+  void *select_out = testTensorSelect(argmax_out);
   testTensorContract(select_out);
-  
 }
 
-
-
-
-
-
-
-
-int main(){
+int main() {
 
   llvm_hpvm_initTensorRt(0);
 
-
   UnitTestResults unitTestResults;
-  
+
   // Function call per unit test
   testTensorHgemm(unitTestResults);
   testTensorSgemm(unitTestResults);
@@ -1181,31 +1046,26 @@ int main(){
   testTensorHalfPooling();
 
   */
-  
+
   testSampling_3_3(unitTestResults);
   testSampling_1_1(unitTestResults);
 
   testPerforation(unitTestResults);
 
-  
-
   unitTestResults.printSummary();
-  
 
   // testTensorError();
-  // testQuantization(); 
+  // testQuantization();
   // testTensorGemm();
   // testTensorGemmGPU();
-  // testTensorGemmBias();  
+  // testTensorGemmBias();
   // testTensorConv2();
   // testTensorConv3();
   // testLRN();
   // testSampleFilter();
-  // testNewTensorOps(); 
+  // testNewTensorOps();
   // testQuantization();
   // testPromiseError();
-  
-    
+
   return 0;
 }
-
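Note on the Hgemm/Sgemm tests above: the 5x4 lhs tensor is filled so that every element of row r equals r + 1, and the 4x3 rhs is all ones, so row r of the product is 4 * (r + 1) repeated across the three output columns, which is exactly the expected_result array {4,4,4, 8,8,8, ..., 20,20,20}. The standalone sketch below reproduces that arithmetic and the element-wise epsilon comparison performed by evalTestResult; it is plain C++ with no dependency on the tensor runtime, and the reference GEMM loop is an illustrative re-derivation, not code from this repository.

#include <cmath>
#include <cstdio>

int main() {
  const int M = 5, K = 4, N = 3;
  float lhs[M][K], rhs[K][N], out[M][N];

  // Same fill as testTensorSgemm: flat element i gets (i / 4) + 1,
  // so every element of row r equals r + 1; rhs is all ones.
  for (int i = 0; i < M * K; i++)
    lhs[i / K][i % K] = (float)(i / K) + 1.0f;
  for (int k = 0; k < K; k++)
    for (int n = 0; n < N; n++)
      rhs[k][n] = 1.0f;

  // Reference GEMM: out = lhs * rhs.
  for (int m = 0; m < M; m++)
    for (int n = 0; n < N; n++) {
      out[m][n] = 0.0f;
      for (int k = 0; k < K; k++)
        out[m][n] += lhs[m][k] * rhs[k][n];
    }

  // Expected values from the unit test: 4 * (r + 1) in each column of row r.
  const float expected[15] = {4,  4,  4,  8,  8,  8,  12, 12,
                              12, 16, 16, 16, 20, 20, 20};

  // Element-wise tolerance check, mirroring evalTestResult's epsilon test.
  bool passed = true;
  for (int i = 0; i < M * N; i++)
    if (std::abs(out[i / N][i % N] - expected[i]) > 0.01f)
      passed = false;

  printf("reference GEMM check %s\n", passed ? "passed" : "failed");
  return 0;
}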
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index 98d6d63ead..1ca90cf6f7 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -14,10 +14,10 @@ __global__ void convToGemmApproxHalf(
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -58,7 +58,7 @@ convToGemmPerfRow(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_eff * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out; // output width index (col number)
+  const int w = tx % W_out;                   // output width index (col number)
   int past_start = (h % (x - 1) >= (x - 1 - start));
   const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride -
                   V_pad;                // input height index (row number)
@@ -135,7 +135,7 @@ convToGemmPerfCol(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_eff) / W_eff; // output height index (row
                                               // number)
-  const int w = tx % W_eff; // output width index (col number)
+  const int w = tx % W_eff;                   // output width index (col number)
   int past_start = (w % (x - 1)) >= (x - 1 - start);
   const int inH = h * V_stride - V_pad; // input height index (row number)
   const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride -
@@ -394,7 +394,7 @@ __global__ void convToGemmPerfRowHalf(
                                                             // number
   const int h = tx % (H_eff * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out; // output width index (col number)
+  const int w = tx % W_out;                   // output width index (col number)
   int past_start = (h % (x - 1) >= (x - 1 - start));
   const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride -
                   V_pad;                // input height index (row number)
@@ -469,7 +469,7 @@ __global__ void convToGemmPerfColHalf(
                                                             // number
   const int h = tx % (H_out * W_eff) / W_eff; // output height index (row
                                               // number)
-  const int w = tx % W_eff; // output width index (col number)
+  const int w = tx % W_eff;                   // output width index (col number)
   int past_start = (w % (x - 1)) >= (x - 1 - start);
   const int inH = h * V_stride - V_pad; // input height index (row number)
   const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride -
@@ -557,10 +557,10 @@ __global__ void convToGemmApproxHalfN(
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -832,10 +832,10 @@ convToGemmHalfInput(__half *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -873,10 +873,10 @@ convToGemmHalfInput2(__half *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     const int filter_elem_num = c * KH * KW;
     for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) {
       int i = l / KW;
@@ -1044,10 +1044,10 @@ convToGemmFullInput(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     for (int i = 0; i < KH; i++) {
       for (int j = 0; j < KW; j++) {
         const int filter_elem_num =
@@ -1085,10 +1085,10 @@ convToGemmFullInput2(float *const __restrict__ output,
                                                             // number
   const int h = tx % (H_out * W_out) / W_out; // output height index (row
                                               // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) {                          // is thread id within bounds?
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
     const int filter_elem_num = c * KH * KW;
     for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) {
       int i = l / KW;
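The convToGemm* kernels touched above all begin with the same flat-index decomposition: the thread id tx is split into an image index n, a channel index c, an output row h, and an output column w, and the padded/strided input coordinates are then inH = h * V_stride - V_pad and inW = w * H_stride - H_pad. The host-side sketch below walks through that arithmetic in isolation. It assumes the usual NCHW ordering tx = ((n * C + c) * H_out + h) * W_out + w; the n and c formulas are inferred from the truncated kernel comments, so treat them as an assumption, and decodeGemmIndex is an illustrative helper, not a function from the runtime.

#include <cstdio>

// Host-side sketch of the index arithmetic in the convToGemm* kernels,
// assuming tx = ((n * C + c) * H_out + h) * W_out + w (NCHW ordering).
void decodeGemmIndex(int tx, int C, int H_out, int W_out, int V_stride,
                     int H_stride, int V_pad, int H_pad) {
  const int n = tx / (C * H_out * W_out);                   // output image number
  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output channel
  const int h = tx % (H_out * W_out) / W_out;               // output row
  const int w = tx % W_out;                                 // output column
  const int inH = h * V_stride - V_pad; // input row of the patch's top-left
  const int inW = w * H_stride - H_pad; // input column of the patch's top-left
  printf("tx=%3d -> n=%d c=%d h=%d w=%d inH=%2d inW=%2d\n", tx, n, c, h, w,
         inH, inW);
}

int main() {
  // Example: 2 channels, a 3x3 output map, stride 1, padding 1.
  for (int tx = 0; tx < 2 * 3 * 3; tx++)
    decodeGemmIndex(tx, /*C=*/2, /*H_out=*/3, /*W_out=*/3,
                    /*V_stride=*/1, /*H_stride=*/1, /*V_pad=*/1, /*H_pad=*/1);
  return 0;
}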
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
index 330d97600e..c318a8fb6a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
@@ -3,7 +3,6 @@
 #ifndef APPROXHPVM_RUNTIME_UTILS
 #define APPROXHPVM_RUNTIME_UTILS
 
-
 #include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
 #include "configuration.h"
@@ -17,30 +16,29 @@
 //---                      CPU Approximation handling                      ---//
 //----------------------------------------------------------------------------//
 
-void* handleTensorAddApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, void* bias) {
+void *handleTensorAddApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *bias) {
 
-if (approxTuples.size() == 1) {
+  if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorAddCPU(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorAddCPU(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -53,32 +51,31 @@ if (approxTuples.size() == 1) {
   return NULL;
 }
 
-void* handleTensorMulApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* lhs, void* rhs) {
+void *handleTensorMulApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *lhs, void *rhs) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorGemmCPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorGemmCPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
-      }
+    }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
     abort();
@@ -89,79 +86,72 @@ void* handleTensorMulApproximationTuples_CPU(
   return NULL;
 }
 
-void* handleTensorConvApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter, 
-  int conv_pad_h, int conv_pad_w,
-  int conv_stride_h, int conv_stride_w) {
+void *handleTensorConvApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int conv_pad_h, int conv_pad_w,
+    int conv_stride_h, int conv_stride_w) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApproxCPU(input, filter,
-                               conv_pad_h, conv_pad_w,
-                               conv_stride_h, conv_stride_w,
-                               1, 1,
-                               1, 1, 1, 1);
-
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
-        return t_out;
-        }
-      case CPUNodeConfiguration::APPROX::PERFORATION :
-        {
-          PerfParams params = perfParamSet->getPerfParams(param);
-          INFO("perforation param = %i\n", param);
-          INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-                params.row, params.col, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxCPU(input, filter,
-                                 conv_pad_h, conv_pad_w,
-                                 conv_stride_h, conv_stride_w,
-                                 1, 1,
-                                 params.row, params.col, 1, params.skip_offset);
-
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", pinfo.second);
-          return t_out;
-        }
-      case CPUNodeConfiguration::APPROX::INPUT_SAMPLING :
-        {
-          SampParams params = sampParamSet->getSampParams(param);
-          INFO("sampling param = %i\n", param);
-          INFO("params.skip_rate = %i, params.skip_offset = %i\n",
-                params.skip_rate, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxCPU(input, filter,
-                                 conv_pad_h, conv_pad_w,
-                                 conv_stride_h, conv_stride_w,
-                                 1, 1,
-                                 1, 1,
-                                params.skip_rate, params.skip_offset);
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", pinfo.second);
-          return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
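+      // FP32 baseline: the perforation (row/col) and sampling (skip) knobs in
+      // the trailing arguments are all left at 1, i.e. no approximation is
+      // applied (compare the PERFORATION and INPUT_SAMPLING cases below).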
+      t_out =
+          tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w,
+                              conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
+      return t_out;
+    }
+    case CPUNodeConfiguration::APPROX::PERFORATION: {
+      PerfParams params = perfParamSet->getPerfParams(param);
+      INFO("perforation param = %i\n", param);
+      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
+           params.row, params.col, params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxCPU(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.row, params.col, 1, params.skip_offset);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)",
+                                             pinfo.second);
+      return t_out;
+    }
+    case CPUNodeConfiguration::APPROX::INPUT_SAMPLING: {
+      SampParams params = sampParamSet->getSampParams(param);
+      INFO("sampling param = %i\n", param);
+      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
+           params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w,
+                                  conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                  params.skip_rate, params.skip_offset);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -174,75 +164,73 @@ void* handleTensorConvApproximationTuples_CPU(
   return NULL;
 }
 
-void* handleTensorGroupConvApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride,
-  int conv_mode, int conv_groups) {
+void *handleTensorGroupConvApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int conv_groups) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvCutlassCPU(input, filter,
-                                     vertical_pad, horizontal_pad,
-                                     vertical_stride, horizontal_stride,
-                                     conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvCutlassCPU(input, filter, vertical_pad, horizontal_pad,
+                                   vertical_stride, horizontal_stride,
+                                   conv_mode, conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorBatchNormApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, void* gamma_ptr, void* beta_ptr,
-  void* mean_ptr, void* variance_ptr, double epsilon) {
+void *handleTensorBatchNormApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+    void *variance_ptr, double epsilon) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr,
-                                  mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-    // TODO additional approx methods implemented here
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                                 variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
+      // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
@@ -254,161 +242,154 @@ void* handleTensorBatchNormApproximationTuples_CPU(
   return NULL;
 }
 
-void* handleTensorReluApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorReluApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorReluCPU(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorReluCPU(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorClippedReluApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, float min, float max) {
+void *handleTensorClippedReluApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, float min, float max) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu2CPU(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu2CPU(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorTanhApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorTanhApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorTanhCPU(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorTanhCPU(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorPoolingApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, int poolFunction,
-  int window_height, int window_width,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride) {
+void *handleTensorPoolingApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, int poolFunction, int window_height, int window_width,
+    int vertical_pad, int horizontal_pad, int vertical_stride,
+    int horizontal_stride) {
 
   if (approxTuples.size() == 1) {
     enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case CPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorPoolingCPU(input_ptr,
-                                 poolFunction,
-                                 window_height, window_width,
-                                 vertical_pad, horizontal_pad,
-                                 vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case CPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorPoolingCPU(input_ptr, poolFunction, window_height,
+                               window_width, vertical_pad, horizontal_pad,
+                               vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorSoftmaxApproximationTuples_CPU(
-  std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input_ptr) {
-  void* t_out;
+void *handleTensorSoftmaxApproximationTuples_CPU(
+    std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr) {
+  void *t_out;
   RC->resume_profiler();
   t_out = tensorSoftmaxCPU(input_ptr);
   RC->pause_profiler();
@@ -423,42 +404,40 @@ void* handleTensorSoftmaxApproximationTuples_CPU(
 //---                      GPU Approximation handling                      ---//
 //----------------------------------------------------------------------------//
 
-void* handleTensorAddApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, void* bias) {
+void *handleTensorAddApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *bias) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorAdd(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfAdd(input, bias);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorAdd(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfAdd(input, bias);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -471,44 +450,42 @@ void* handleTensorAddApproximationTuples(
   return NULL;
 }
 
-void* handleTensorMulApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* lhs, void* rhs) {
+void *handleTensorMulApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *lhs, void *rhs) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorGemmGPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfGemmGPU(lhs, rhs);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorGemmGPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfGemmGPU(lhs, rhs);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
-      }
+    }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
     abort();
@@ -519,100 +496,88 @@ void* handleTensorMulApproximationTuples(
   return NULL;
 }
 
-void* handleTensorConvApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter, 
-  int conv_pad_h, int conv_pad_w,
-  int conv_stride_h, int conv_stride_w) {
+void *handleTensorConvApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int conv_pad_h, int conv_pad_w,
+    int conv_stride_h, int conv_stride_w) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApprox(input, filter,
-                                 conv_pad_h, conv_pad_w,
-                                 conv_stride_h, conv_stride_w,
-                                 1, 1,
-                                 1, 1, 1, 1);
-	
-
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvApproxHalf2(input, filter,
-                                     conv_pad_h, conv_pad_w,
-                                     conv_stride_h, conv_stride_w,
-                                     1, 1,
-                                     1, 1, 1, 1);
-	
-
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::PERFORATION :
-      case GPUNodeConfiguration::APPROX::PERFORATION_HP :
-        {
-          PerfParams params = perfParamSet->getPerfParams(param);
-          INFO("perforation param = %i\n", param);
-          INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-                params.row, params.col, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxHalf2(input, filter,
-                                       conv_pad_h, conv_pad_w,
-                                       conv_stride_h, conv_stride_w,
-                                       1, 1,
-                                       params.row, params.col, 1, params.skip_offset);
-
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", pinfo.second);
-          return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING :
-      case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP :
-        {
-          SampParams params = sampParamSet->getSampParams(param);
-          INFO("sampling param = %i\n", param);
-          INFO("params.skip_rate = %i, params.skip_offset = %i\n",
-                params.skip_rate, params.skip_offset);
-          void* t_out;
-          RC->resume_profiler();
-          t_out = tensorConvApproxHalf2(input, filter,
-                                       conv_pad_h, conv_pad_w,
-                                       conv_stride_h, conv_stride_w,
-                                       1, 1,
-                                       1, 1,
-                                       params.skip_rate, params.skip_offset);
-          RC->pause_profiler();
-          std::pair<double, double> pinfo = RC->get_time_energy();
-          RC->reset_profiler();
-          RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", pinfo.first);
-          RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", pinfo.second);
-          return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
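+      // As in the CPU handler, the trailing knob arguments stay at 1, so this
+      // runs the convolution with perforation and input sampling disabled.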
+      t_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w,
+                               conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out =
+          tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf",
+                                             pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::PERFORATION:
+    case GPUNodeConfiguration::APPROX::PERFORATION_HP: {
+      PerfParams params = perfParamSet->getPerfParams(param);
+      INFO("perforation param = %i\n", param);
+      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
+           params.row, params.col, params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxHalf2(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.row, params.col, 1, params.skip_offset);
+
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)",
+                                             pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING:
+    case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: {
+      SampParams params = sampParamSet->getSampParams(param);
+      INFO("sampling param = %i\n", param);
+      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
+           params.skip_offset);
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                    conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                    params.skip_rate, params.skip_offset);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
       // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
@@ -625,103 +590,99 @@ void* handleTensorConvApproximationTuples(
   return NULL;
 }
 
-void* handleTensorGroupConvApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input, void* filter,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride,
-  int conv_mode, int conv_groups) {
+void *handleTensorGroupConvApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, void *filter, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int conv_groups) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorConvCutlass(input, filter,
-                                  vertical_pad, horizontal_pad,
-                                  vertical_stride, horizontal_stride,
-                                  conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfConvCutlass(input, filter,
-                                      vertical_pad, horizontal_pad,
-                                      vertical_stride, horizontal_stride,
-                                      conv_mode, conv_groups);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorConvCutlass(input, filter, vertical_pad, horizontal_pad,
+                                vertical_stride, horizontal_stride, conv_mode,
+                                conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfConvCutlass(input, filter, vertical_pad, horizontal_pad,
+                                    vertical_stride, horizontal_stride,
+                                    conv_mode, conv_groups);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass",
+                                           pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorBatchNormApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, void* gamma_ptr, void* beta_ptr,
-  void* mean_ptr, void* variance_ptr, double epsilon) {
+void *handleTensorBatchNormApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+    void *variance_ptr, double epsilon) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr,
-                               mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr,
-                                   mean_ptr, variance_ptr, epsilon);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-    // TODO additional approx methods implemented here
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                              variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+                                  variance_ptr, epsilon);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm",
+                                             pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
+      abort();
+      // TODO additional approx methods implemented here
     }
   } else if (approxTuples.size() == 2) {
     ERROR("Currently unsupported case");
@@ -733,215 +694,202 @@ void* handleTensorBatchNormApproximationTuples(
   return NULL;
 }
 
-void* handleTensorReluApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorReluApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfRelu(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfRelu(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorClippedReluApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input, float min, float max) {
+void *handleTensorClippedReluApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input, float min, float max) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorRelu2(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfRelu2(input, min, max);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorRelu2(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfRelu2(input, min, max);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorTanhApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input) {
+void *handleTensorTanhApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorTanh(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfTanh(input);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorTanh(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfTanh(input);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorPoolingApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-  void* input_ptr, int poolFunction,
-  int window_height, int window_width,
-  int vertical_pad, int horizontal_pad,
-  int vertical_stride, int horizontal_stride) {
+void *handleTensorPoolingApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr, int poolFunction, int window_height, int window_width,
+    int vertical_pad, int horizontal_pad, int vertical_stride,
+    int horizontal_stride) {
 
   if (approxTuples.size() == 1) {
     enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first;
     int param = approxTuples[0].second;
     switch (approx) {
-      case GPUNodeConfiguration::APPROX::FP32 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorPooling(input_ptr,
-                             poolFunction,
-                             window_height, window_width,
-                             vertical_pad, horizontal_pad,
-                             vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
-        return t_out;
-        }
-      case GPUNodeConfiguration::APPROX::FP16 :
-        {
-        void* t_out;
-        RC->resume_profiler();
-        t_out = tensorHalfPooling(input_ptr,
-                                 poolFunction,
-                                 window_height, window_width,
-                                 vertical_pad, horizontal_pad,
-                                 vertical_stride, horizontal_stride);
-        RC->pause_profiler();
-        std::pair<double, double> pinfo = RC->get_time_energy();
-        RC->reset_profiler();
-        RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
-        RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
-        return t_out;
-        }
-      default :
-        CUSTOM_ASSERT(false && "Unknown approximation type");
-        ERROR("Unknown approximation type");
-        abort();
-      // TODO additional approx methods implemented here
-      }
-    } else if (approxTuples.size() == 2) {
-      ERROR("Currently unsupported case");
-      abort();
-    } else {
-      ERROR("Unsupported case");
+    case GPUNodeConfiguration::APPROX::FP32: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorPooling(input_ptr, poolFunction, window_height,
+                            window_width, vertical_pad, horizontal_pad,
+                            vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorHalfPooling(input_ptr, poolFunction, window_height,
+                                window_width, vertical_pad, horizontal_pad,
+                                vertical_stride, horizontal_stride);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second);
+      return t_out;
+    }
+    default:
+      CUSTOM_ASSERT(false && "Unknown approximation type");
+      ERROR("Unknown approximation type");
       abort();
+      // TODO additional approx methods implemented here
     }
+  } else if (approxTuples.size() == 2) {
+    ERROR("Currently unsupported case");
+    abort();
+  } else {
+    ERROR("Unsupported case");
+    abort();
+  }
   return NULL;
 }
 
-void* handleTensorSoftmaxApproximationTuples(
-  std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples,
-   void* input_ptr) {
-  //TODO: if approximation choices are added for softmax operation,
+void *handleTensorSoftmaxApproximationTuples(
+    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples,
+    void *input_ptr) {
+  // TODO: if approximation choices are added for softmax operation,
   // implement this like the other handle* functions
-  void* t_out;
+  void *t_out;
   RC->resume_profiler();
   t_out = tensorSoftmax(input_ptr);
   RC->pause_profiler();
@@ -952,5 +900,4 @@ void* handleTensorSoftmaxApproximationTuples(
   return t_out;
 }
 
-
 #endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index b4f3d39fae..3b52cce9f6 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -144,7 +144,8 @@ public:
 // - energy
 // - accuracy (compared to golden output)
 // - accuracy loss (compared to baseline)
-// - a hardware choice and set or operations-approximation choices, described in setup
+// - a hardware choice and a set of operations-approximation choices, described
+// in setup
 struct Configuration {
   std::string name;
   float speedup;
@@ -152,7 +153,7 @@ struct Configuration {
   float accuracy;
   float accuracyLoss;
   std::map<std::string, NodeConfiguration *> setup;
-  // map for mapping visc.node.id IDs to HPVM (fused) node approx-configurations 
+  // maps visc.node.id IDs to HPVM (fused) node approx-configurations
   std::map<int, NodeConfiguration *> idConfigMap;
 
   Configuration(std::string &n, float f, float e, float a, float al);
@@ -171,8 +172,8 @@ struct Configuration {
 // Comparison operator definition, in increasing accuracy loss
 // (for std sort, used in pareto optimal computation)
 struct ConfigurationLessThan {
-  bool operator()(
-      const struct Configuration &a, const struct Configuration &b) const;
+  bool operator()(const struct Configuration &a,
+                  const struct Configuration &b) const;
 };
 
 // Comparison operator definition, in increasing accuracy loss
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
index 50a0527def..a766c02d6c 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h
@@ -2,5 +2,6 @@ extern "C" {
 // Functions to be inserted with initializeTensorRT and clearTensorRT
 void llvm_hpvm_initializeRuntimeController(const char *);
 void llvm_hpvm_clearRuntimeController();
-void llvm_hpvm_invokeRtControl(void *result, const char *str, int start, int end);
+void llvm_hpvm_invokeRtControl(void *result, const char *str, int start,
+                               int end);
 }
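
The three declarations above form the controller's entire external lifecycle. The hedged sketch below shows the implied order of calls; the configuration file path, the label string, the batch count, and the start/end indices are placeholders, not values defined by this interface.

  // Hedged sketch: initialize once, invoke control after each batch, tear
  // down once. All literal arguments are illustrative placeholders.
  void controller_lifecycle(void *batch_result) {
    llvm_hpvm_initializeRuntimeController("path/to/confs.txt");
    for (int batch = 0; batch < 10; ++batch) {
      // start/end are assumed to be this batch's input offsets
      llvm_hpvm_invokeRtControl(batch_result, "batch_label", batch * 100,
                                (batch + 1) * 100);
    }
    llvm_hpvm_clearRuntimeController();
  }
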
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
index f8b722ca38..d070d7755c 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
@@ -1,84 +1,79 @@
-//===--------------------------- tensor_cpu_runtime.h -----------------------===//
+//===--------------------------- tensor_cpu_runtime.h ---------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 // This header file comprises of the API to the tensor routines for CPU.
 // This also contains the interfaces to the approximated versions of tensor
 // operations that are supported on CPU.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include <stdio.h>
 #include <cstdlib>
 #include <cmath>
 #include <memory>
 #include <string>
 
-
 #ifndef TENSOR_CPU_HEADER
 #define TENSOR_CPU_HEADER
 
+extern "C" {
+/****  Initialization Routine - Must be inserted at program start (in the
+ * backend)  ****/
+void llvm_hpvm_initTensorRtCPU();
+void llvm_hpvm_cleanupTensorRtCPU();
 
-extern "C"{
-  /****  Initialization Routine - Must be inserted at program start (in the backend)  ****/
-  void llvm_hpvm_initTensorRtCPU();
-  void llvm_hpvm_cleanupTensorRtCPU();
+// Routine for moving tensor data (to and from GPU, CPU)
+void hpvm_request_tensorCPU(void *tensor, int destination);
 
-  // Routine to moving tensor data (from and to GPU,CPU)
-  void hpvm_request_tensorCPU(void* tensor, int destination);
+// NOTE: Currently only using 4-D tensors - 2D and 3D tensors are not
+// supported for cuDNN operations.
+// NOTE: The only data format supported as of now is NCHW
+// (batch_dimension, channels, Height, Width).
+// void* create4DTensor(int data_type, int data_format, size_t dim1_size,
+//                      size_t dim2_size, size_t dim3_size, size_t dim4_size,
+//                      bool freeMemory = true);
 
+void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes);
 
-  // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
-  // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width)
-  //void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-	///	       size_t dim3_size, size_t dim4_size, bool freeMemory = true);
-  
-  void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);
+/********** Tensor Operation API ******/
 
-  /********** Tensor Operation API ******/
+// NOTE: For conv_mode, only value '1' is supported
+void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int compute_precision, int row, int col,
+                           int skip_every, int start);
 
-  // NOTE: For conv_mode, only value '1' is supported
-void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr,
-                          int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride,
-                          int conv_mode, int compute_precision,
-                          int row, int col, int skip_every, int start);
+void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                          int horizontal_pad, int vertical_stride,
+                          int horizontal_stride, int conv_mode,
+                          int compute_precision, int row, int col,
+                          int skip_every, int start);
 
-void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr,
-                          int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride,
-                          int conv_mode, int compute_precision,
-                          int row, int col, int skip_every, int start);
+void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups);
 
-void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups);
-			
- void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-                         void* mean_ptr, void* variance_ptr, double epsilon);
+void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                         void *mean_ptr, void *variance_ptr, double epsilon);
 
+void *tensorPoolingCPU(void *input, int poolFunction, int window_height,
+                       int window_width, int vertical_pad, int horizontal_pad,
+                       int vertical_stride, int horizontal_stride);
 
-  void* tensorPoolingCPU(void* input,
-			 int poolFunction,
-			 int window_height, int window_width,
-			 int vertical_pad, int horizontal_pad,
-			 int vertical_stride, int horizontal_stride);
+void *tensorGemmCPU(void *lhs, void *rhs);
 
-  void* tensorGemmCPU(void* lhs, void* rhs);
+void *tensorAddCPU(void *x, void *bias);
 
-  void* tensorAddCPU(void* x, void* bias);
+void *tensorReluCPU(void *input);
 
-  void* tensorReluCPU(void* input);
+void *tensorRelu2CPU(void *input, float min, float max);
 
-  void* tensorRelu2CPU(void* input, float min, float max);
-  
-  void* tensorTanhCPU(void* input);
-  
-  void* tensorSoftmaxCPU(void* input);
-    
-}
+void *tensorTanhCPU(void *input);
 
+void *tensorSoftmaxCPU(void *input);
+}
 
 #endif
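
As a usage illustration for the CPU API declared above, the hedged sketch below chains one convolution, bias add, ReLU, and pooling through these entry points. The tensor handles are assumed to be valid 4-D tensors created elsewhere (e.g., via the create4DTensor routine that is commented out above); poolFunction = 0 is assumed to mean max pooling; and the approximation knobs mirror the values the autotuner code later in this patch passes when a given approximation is disabled (row = col = 1, skip_every = 1).

  #include "tensor_cpu_runtime.h"

  // Hedged sketch: one conv -> add -> relu -> maxpool stage on the CPU path.
  // llvm_hpvm_initTensorRtCPU()/llvm_hpvm_cleanupTensorRtCPU() are expected
  // to bracket the whole program, so they are not repeated per layer here.
  void *run_conv_block_cpu(void *input, void *filter, void *bias) {
    void *conv = tensorConvApproxCPU(
        input, filter,
        /*vertical_pad=*/1, /*horizontal_pad=*/1,
        /*vertical_stride=*/1, /*horizontal_stride=*/1,
        /*conv_mode=*/1,         // only value '1' is supported (see NOTE above)
        /*compute_precision=*/1, // mirrors the autotuner calls; exact meaning assumed
        /*row=*/1, /*col=*/1,    // assumed "no perforation"
        /*skip_every=*/1, /*start=*/0); // assumed "no sampling"
    void *biased = tensorAddCPU(conv, bias);
    void *activated = tensorReluCPU(biased);
    return tensorPoolingCPU(activated, /*poolFunction=*/0, // assumed max pool
                            /*window_height=*/2, /*window_width=*/2,
                            /*vertical_pad=*/0, /*horizontal_pad=*/0,
                            /*vertical_stride=*/2, /*horizontal_stride=*/2);
  }
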
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
index f05dab738b..1b6e986a47 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
@@ -159,22 +159,14 @@ void *wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
                         int activation_id, // Relu, Tanh, ClipRelu
                         float out_min, float out_max);
 
+void *wrapper_ConvLayer2(
+    const char *hpvm_node_id, void *input, void *filter, void *bias,
+    int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
+    int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
+    int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
+    // NOTE: out_min, out_max are only relevant for ClippedRelu
+    float out_min, float out_max);
 
-void* wrapper_ConvLayer2(const char* hpvm_node_id,
-			  void* input, 
-			  void* filter, 
-			  void* bias, 
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id,
-			  int pool_size_v, int pool_size_h,			 
-			  int pool_pad_v, int pool_pad_h,
-			  int pool_stride_v, int pool_stride_h,
-			  int activation_id,
-			  // NOTE: out_min, out_max are only relevant for ClippedRelu
-			  float out_min, float out_max);
-  
-  
 void *wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
                       void *bias, int activation_id, float out_min,
                       float out_max);
@@ -204,11 +196,8 @@ void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr,
 
 void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr);
 
-
 void *tensor_set_node_id(unsigned int node_id);
-  
-  
-  
+
 // Utilities
 // TODO: separate utils in separate header
 void dumpAccuracyNorms();
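
To make the fused-layer entry point above concrete, the hedged sketch below shows one plausible call: a padding-1, stride-1 convolution followed by 2x2, stride-2 pooling and an activation. The node-id string is made up, and the pool_id/activation_id encodings are assumptions; the header only documents that out_min/out_max matter for ClippedRelu.

  // Hedged sketch of a wrapper_ConvLayer2 call; literal values are
  // illustrative, not taken from this patch.
  void *fused_conv_block(void *input, void *filter, void *bias) {
    return wrapper_ConvLayer2(
        "conv_node_1", input, filter, bias,
        /*conv_pad_h=*/1, /*conv_pad_w=*/1,
        /*conv_stride_h=*/1, /*conv_stride_w=*/1,
        /*pool_id=*/0,                        // assumed: max pooling
        /*pool_size_v=*/2, /*pool_size_h=*/2,
        /*pool_pad_v=*/0, /*pool_pad_h=*/0,
        /*pool_stride_v=*/2, /*pool_stride_h=*/2,
        /*activation_id=*/1,                  // assumed: ReLU
        /*out_min=*/0.0f, /*out_max=*/0.0f);  // only used for ClippedRelu
  }
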
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
index b3abdc0ce4..a3853fda53 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
@@ -1,14 +1,14 @@
-//===--------------------------- tensor_signatures.cc -----------------------===//
+//===--------------------------- tensor_signatures.cc ---------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 // This file contains the declarations of the API to the HPVM tensor runtime.
 // This is compiled to LLVM bitcode file that is loaded by HPVM passes when
 // tensor-based application are compiled through HPVM.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "tensor_runtime.h"
 
 void dummyFunction() {
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
index b272bbcab4..a0ca6f5bb0 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc
@@ -27,17 +27,17 @@ PerfParamSet::PerfParamSet() {
   printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE);
   std::ifstream file(GLOBAL_KNOBS_FILE);
 
-  if (!file){
+  if (!file) {
     ERROR(" Could NOT find global_knobs.txt \n");
   }
-  
+
   std::string line;
   std::string partial;
   std::vector<std::string> tokens;
 
   while (std::getline(file, line)) { // Read each line
 
-    //printf ("***** line === %s ", line);
+    // printf ("***** line === %s ", line);
     std::istringstream iss(line);
     std::string token;
     while (std::getline(iss, token, '\t')) { // Read each token in the line
@@ -64,7 +64,7 @@ PerfParamSet::PerfParamSet() {
         std::getline(token_stream, tok, ',');
         int offset = atoi(tok.c_str());
 
-        //printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob,
+        // printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob,
         //       row, col, offset);
         PerfParams params(row, col, offset);
         perf_knob_map[knob] = params;
@@ -101,10 +101,10 @@ SampParamSet::SampParamSet() {
   printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE);
   std::ifstream file(GLOBAL_KNOBS_FILE);
 
-  if (!file){
+  if (!file) {
     ERROR("Could NOT find global_knobs.txt \n");
   }
-  
+
   std::string line;
   std::string partial;
   std::vector<std::string> tokens;
@@ -124,7 +124,7 @@ SampParamSet::SampParamSet() {
         int index2 = token.find(",");
         std::string knob_str = token.substr(index2 + 1);
         int knob = atoi(knob_str.c_str());
-        //printf("knob = %d \n", knob);
+        // printf("knob = %d \n", knob);
 
         std::getline(iss, token, '\t');
         std::istringstream token_stream(token);
@@ -140,7 +140,7 @@ SampParamSet::SampParamSet() {
         std::getline(token_stream, tok, ',');
         float interpolation_id = atof(tok.c_str());
 
-        //printf("skip_every = %d, offset = %d \n", skip_every, offset);
+        // printf("skip_every = %d, offset = %d \n", skip_every, offset);
         SampParams params(skip_every, offset, interpolation_id);
         samp_knob_map[knob] = params;
       }
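
The two parsers above share the same two-level tokenization of global_knobs.txt: fields are separated by tabs, and the numeric parameters inside a field are separated by commas. The standalone sketch below demonstrates just that splitting logic on a fabricated line; it is not the knobs file format itself.

  #include <iostream>
  #include <sstream>
  #include <string>
  #include <vector>

  // Demonstrates the tab-then-comma splitting used by PerfParamSet and
  // SampParamSet. The input line is made up for illustration only.
  int main() {
    std::string line = "some_knob\t3,1,0\textra_field";
    std::istringstream iss(line);
    std::string token;
    while (std::getline(iss, token, '\t')) {       // split the line on tabs
      std::istringstream token_stream(token);
      std::string tok;
      std::vector<std::string> parts;
      while (std::getline(token_stream, tok, ',')) // split a field on commas
        parts.push_back(tok);
      std::cout << token << " -> " << parts.size() << " part(s)\n";
    }
    return 0;
  }
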
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
index 41aa185284..8a8ff8435d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
@@ -1,14 +1,13 @@
 //===--------------------------- approxs_simulator.cu ---------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the emulations of implementation of software 
-// approximations for tensor convolutions. The approximations implemented are 
-// feature sampling and perforation for FP32 and FP16 compute precisions.  
+//
+// This file consists of emulated implementations of the software
+// approximations for tensor convolutions. The approximations implemented are
+// feature sampling and perforation for FP32 and FP16 compute precisions.
 //
 //===----------------------------------------------------------------------===//
 
-
 #ifndef SIM_HEADER
 #define SIM_HEADER
 
@@ -27,7 +26,6 @@
 #include "global_data.h"
 #include "approx_knob_utils.h"
 
-
 #include <unordered_map>
 #include <sstream>
 #include <fstream>
@@ -36,77 +34,67 @@
 #include <map>
 #include <cassert>
 
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void postInterpolateRow(int N, int n, int c, int h, int w,
-			float* data, int int_row){
+// N is data's size
+// n, c, h, w are the dimensions of data
+__global__ void postInterpolateRow(int N, int n, int c, int h, int w,
+                                   float *data, int int_row) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
 
-    if((row % int_row == 1) && (row != 0) && (row != h-1))
+    if ((row % int_row == 1) && (row != 0) && (row != h - 1))
       data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] +
-	 data[n * (c * h * w) + ch * (h * w) + (row + 1)  * (w) + col]) / 2;
-
+          (data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] +
+           data[n * (c * h * w) + ch * (h * w) + (row + 1) * (w) + col]) /
+          2;
   }
 }
 
-
-
-__global__
-void postInterpolateCol(int N, int n, int c, int h, int w,
-			float* data, int int_col){
+__global__ void postInterpolateCol(int N, int n, int c, int h, int w,
+                                   float *data, int int_col) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
 
-    if((col % int_col == 1) && (col != 0) && (col != w-1))
+    if ((col % int_col == 1) && (col != 0) && (col != w - 1))
       data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-	(data[n * (c * h * w) + ch * (h * w) + row * (w) + (col-1) ] +
-	 data[n * (c * h * w) + ch * (h * w) + row * (w) + (col+1) ])/2;
-
+          (data[n * (c * h * w) + ch * (h * w) + row * (w) + (col - 1)] +
+           data[n * (c * h * w) + ch * (h * w) + row * (w) + (col + 1)]) /
+          2;
   }
 }
 
-
-
-
 // A 'Simulation' of perforated tensor convolution
-void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups,
-			int row, int col){
-  
+void *tensorConvPerfSim(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups,
+                        int row, int col) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   float alpha = 1.0f, beta = 0.0f;
@@ -114,13 +102,13 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-  INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
-       vertical_stride, horizontal_stride);
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+       horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
 
@@ -130,134 +118,111 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr,
   int new_v = vertical_stride + 0;
   int new_h = horizontal_stride + 0;
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     new_v, new_h, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-					     computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      new_v, new_h,                           // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  if(input->data_format == CUDNN_TENSOR_NCHW)
-    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-  else if(input->data_format == CUDNN_TENSOR_NHWC){
+  Tensor *output;
+  if (input->data_format == CUDNN_TENSOR_NCHW)
+    output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type,
+                                      CUDNN_TENSOR_NCHW, n, c, h, w);
+  else if (input->data_format == CUDNN_TENSOR_NHWC) {
     DEBUG("* NHWC Format \n");
-    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
-				      CUDNN_TENSOR_NHWC, n, h, w, c);
-  }
-  else
+    output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type,
+                                      CUDNN_TENSOR_NHWC, n, h, w, c);
+  } else
     ERROR("Unsupported Tensor Type");
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
-
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
 
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // FIXIT: Algo shouldn't be hardcoded
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
-
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, filter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] -
-       filter->dims.dim_sizes[2]) / vertical_stride + 1;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] -
-       filter->dims.dim_sizes[3]) / horizontal_stride + 1;
+       filter->dims.dim_sizes[2]) /
+          vertical_stride +
+      1;
 
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] -
+       filter->dims.dim_sizes[3]) /
+          horizontal_stride +
+      1;
 
-  int numBlocks = (n * c * h * w  + 127) / 128;
+  int numBlocks = (n * c * h * w + 127) / 128;
 
   if (row > 0)
-    postInterpolateRow<<<numBlocks,128>>>(n * c * h * w, n, c, h, w,
-				         (float *) output->gpu_data, row);
+    postInterpolateRow<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w,
+                                           (float *)output->gpu_data, row);
 
   if (col > 0)
-    postInterpolateCol<<<numBlocks,128>>>(n * c * h * w, n, c, h, w,
-				         (float *) output->gpu_data, col);
-
+    postInterpolateCol<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w,
+                                           (float *)output->gpu_data, col);
 
   profileEvent("tensorConv_end", true);
 
   return output;
 }
 
-
-
-
-
-//N is new_data's size
-//n, c, h, w are the dimensions of new_data
-__global__
-void sampleFilterElems(int N,
-		       int n, int c, int h, int w,
-		       float* data,
-		       int skip_elem, int skip_offset,
-		       float mul_factor,
-		       float* newData){
+// N is new_data's size
+// n, c, h, w are the dimensions of new_data
+__global__ void sampleFilterElems(int N, int n, int c, int h, int w,
+                                  float *data, int skip_elem, int skip_offset,
+                                  float mul_factor, float *newData) {
 
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
 
-  
-  for(int i = index; i < N; i += stride){
+  for (int i = index; i < N; i += stride) {
     int col = ((i % (c * h * w)) % (h * w)) % w;
     int row = ((i % (c * h * w)) % (h * w)) / w;
     int ch = (i % (c * h * w)) / (h * w);
@@ -265,75 +230,60 @@ void sampleFilterElems(int N,
 
     int local_index = (ch * (h * w)) + (row * w) + col;
 
-    if(skip_elem == 3 && h == 3 && w == 3){
+    if (skip_elem == 3 && h == 3 && w == 3) {
       skip_offset = (skip_offset + ch) % w; // wrap around skip offsets
     }
 
-    if(local_index % skip_elem  == skip_offset)
-       newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0;
+    if (local_index % skip_elem == skip_offset)
+      newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0;
     else
       newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-      data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor;
-      
+          data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor;
   }
 }
 
-
-
-
-
-void sampleFilter(Tensor* newFilter, Tensor* filter,
-		  int skip_rate, int skip_offset){
+void sampleFilter(Tensor *newFilter, Tensor *filter, int skip_rate,
+                  int skip_offset) {
 
   int n = filter->dims.dim_sizes[0];
   int c = filter->dims.dim_sizes[1];
   int h = filter->dims.dim_sizes[2];
   int w = filter->dims.dim_sizes[3];
-    
-  int numBlocks = (n * c * h * w  + 127) / 128;
-  int N = n * c * h * w;
 
-  float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); 
+  int numBlocks = (n * c * h * w + 127) / 128;
+  int N = n * c * h * w;
 
-  //float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
-  //mul_factor = (mul_factor + 1.0) / 2;
+  float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
 
-  
-  DEBUG ("mul_factor = %f \n", mul_factor);
+  // float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
+  // mul_factor = (mul_factor + 1.0) / 2;
 
-  
-  sampleFilterElems<<<numBlocks,128>>>(N,
-				       n, c, h, w,
-				       (float *) filter->gpu_data,
-				       skip_rate, skip_offset, mul_factor,
-				       (float *) newFilter->gpu_data);
+  DEBUG("mul_factor = %f \n", mul_factor);
 
+  sampleFilterElems<<<numBlocks, 128>>>(
+      N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset,
+      mul_factor, (float *)newFilter->gpu_data);
 }
 
-
-
 // A 'Simulation' of perforated tensor convolution
-void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups,
-			int skip_rate, int skip_offset){
-  
+void *tensorConvSampSim(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups,
+                        int skip_rate, int skip_offset) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
-  
   cudnnConvolutionDescriptor_t convDesc;
-  cudnnConvolutionFwdAlgo_t convAlgo;  
+  cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   float alpha = 1.0f, beta = 0.0f;
@@ -344,24 +294,22 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
   convertToFP32(input);
   convertToFP32(filter);
 
-  Tensor* newFilter;
-  newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
-					CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
-					filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
-					filter->dims.dim_sizes[3]);
-
+  Tensor *newFilter;
+  newFilter = (Tensor *)create4DTensor(
+      (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
+      filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
+      filter->dims.dim_sizes[3]);
 
   // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
   sampleFilter(newFilter, filter, skip_rate, skip_offset);
-  
 
-  INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
-       vertical_stride, horizontal_stride);
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+       horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
 
@@ -371,147 +319,116 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
   int new_v = vertical_stride + 0;
   int new_h = horizontal_stride + 0;
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     new_v, new_h, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-					     computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      new_v, new_h,                           // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-  
+  Tensor *output;
+  output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                    CUDNN_TENSOR_NCHW, n, c, h, w);
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
 
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // NOTE: Using GEMM-based Algo
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, newFilter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-
-
- 
   freeTensor(newFilter);
   profileEvent("tensorConv_end", true);
 
   return output;
 }
 
-
-
-
-
-
-
-
-
-
-void sampleFilter2(Tensor* newFilter, Tensor* filter,
-		   int skip_rate, int skip_offset, float interpolation_rate){
+void sampleFilter2(Tensor *newFilter, Tensor *filter, int skip_rate,
+                   int skip_offset, float interpolation_rate) {
 
   int n = filter->dims.dim_sizes[0];
   int c = filter->dims.dim_sizes[1];
   int h = filter->dims.dim_sizes[2];
   int w = filter->dims.dim_sizes[3];
-    
-  int numBlocks = (n * c * h * w  + 127) / 128;
+
+  int numBlocks = (n * c * h * w + 127) / 128;
   int N = n * c * h * w;
 
   float mul_factor;
   mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
   mul_factor = 1 + (interpolation_rate * (mul_factor - 1.0));
-  DEBUG ("mul_factor = %f \n", mul_factor);
-  
-  sampleFilterElems<<<numBlocks,128>>>(N,
-				       n, c, h, w,
-				       (float *) filter->gpu_data,
-				       skip_rate, skip_offset, mul_factor,
-				       (float *) newFilter->gpu_data);
-}
-
+  DEBUG("mul_factor = %f \n", mul_factor);
 
+  sampleFilterElems<<<numBlocks, 128>>>(
+      N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset,
+      mul_factor, (float *)newFilter->gpu_data);
+}
 
 // A 'Simulation' of perforated tensor convolution
-void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad,
-			 int vertical_stride, int horizontal_stride,
-			 int conv_mode, int conv_groups,
-			 int skip_rate, int skip_offset, float interpolation_rate){
-  
+void *tensorConvSampSim2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int skip_rate, int skip_offset,
+                         float interpolation_rate) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
-  
   cudnnConvolutionDescriptor_t convDesc;
-  cudnnConvolutionFwdAlgo_t convAlgo;  
+  cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   float alpha = 1.0f, beta = 0.0f;
@@ -522,24 +439,22 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
   convertToFP32(input);
   convertToFP32(filter);
 
-  Tensor* newFilter;
-  newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
-					CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
-					filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
-					filter->dims.dim_sizes[3]);
-
+  Tensor *newFilter;
+  newFilter = (Tensor *)create4DTensor(
+      (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
+      filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
+      filter->dims.dim_sizes[3]);
 
   // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
   sampleFilter2(newFilter, filter, skip_rate, skip_offset, interpolation_rate);
-  
 
-  INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
-       vertical_stride, horizontal_stride);
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+       horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
 
@@ -549,166 +464,135 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
   int new_v = vertical_stride + 0;
   int new_h = horizontal_stride + 0;
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     new_v, new_h, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-					     computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      new_v, new_h,                           // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
-  
+  Tensor *output;
+  output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                    CUDNN_TENSOR_NCHW, n, c, h, w);
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
 
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // NOTE: Using GEMM-based Algo
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, newFilter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-
-
- 
   freeTensor(newFilter);
   profileEvent("tensorConv_end", true);
 
   return output;
 }
 
+/************ NOTE: API for ApproxHPVM Wrapper runtime *******/
 
+void *PROMISE_Conv(void *input, float i_min, float i_max, void *filter,
+                   float w_min, float w_max, void *bias, float b_min,
+                   float b_max, int conv_pad_h, int conv_pad_w,
+                   int conv_stride_h, int conv_stride_w, int pool_id,
+                   int pool_size, int pool_stride,
+                   int activation_id, // Relu, Tanh, ClipRelu
+                   float out_min, float out_max, int swing) {
 
+  Tensor *input_t = (Tensor *)input;
+  Tensor *filter_t = (Tensor *)filter;
+  Tensor *bias_t = (Tensor *)bias;
 
-
-
-
-
-
-/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ 
-
-
-void* PROMISE_Conv(void* input, float i_min, float i_max,
-		   void* filter, float w_min, float w_max,
-		   void* bias, float b_min, float b_max,
-		   int conv_pad_h, int conv_pad_w,
-		   int conv_stride_h, int conv_stride_w,
-		   int pool_id, int pool_size, int pool_stride,
-		   int activation_id, // Relu, Tanh, ClipRelu
-		   float out_min, float out_max, int swing){ 
-
-
-  Tensor* input_t = (Tensor*) input;
-  Tensor* filter_t = (Tensor*) filter;
-  Tensor* bias_t = (Tensor*) bias;
-  
   int orig_type = input_t->cur_type;
 
   DEBUG("FP32 conversions \n");
-  
+
   convertToFP32(input_t);
 
   convertToFP32(filter_t);
   convertToFP32(bias_t);
 
   DEBUG("DONE FP32 conversions \n");
-  
 
-  if(swing < 8){
+  if (swing < 8) {
     input = quantizeTensorPromise(input, i_min, i_max);
     filter = quantizeTensorPromise(filter, w_min, w_max);
-    if(bias != NULL)
+    if (bias != NULL)
       bias = quantizeTensorPromise(bias, b_min, b_max);
     // aRead error
-    
+
     input = addPromiseError(input, swing);
   }
 
-  
-  void* conv_out;
-  conv_out = tensorConvolution(input, filter,
-			       conv_pad_h, conv_pad_w,
-			       conv_stride_h, conv_stride_w,
-			       1, 0);
-  
-  void* conv_add;
-  if(bias != NULL){
+  void *conv_out;
+  conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w,
+                               conv_stride_h, conv_stride_w, 1, 0);
+
+  void *conv_add;
+  if (bias != NULL) {
     conv_add = tensorAdd(conv_out, bias);
-  }
-  else{
+  } else {
     conv_add = conv_out;
   }
 
-  void* pool_out;
+  void *pool_out;
   // NOTE: Skip pooling on negative pool sizes
-  if(pool_size > 0){
-    //FIXME: Currently only using MaxPooling
-    //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size);
-    pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_stride, pool_stride);
-  }
-  else{
+  if (pool_size > 0) {
+    // FIXME: Currently only using MaxPooling
+    //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0,
+    //--                          pool_size, pool_size);
+    pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0,
+                             pool_stride, pool_stride);
+  } else {
     pool_out = conv_add;
   }
-  
-  void* activation_out;  
-  switch(activation_id){
+
+  void *activation_out;
+  switch (activation_id) {
   case -1:
     activation_out = pool_out;
     INFO("NO Activation Function \n");
@@ -727,68 +611,54 @@ void* PROMISE_Conv(void* input, float i_min, float i_max,
     break;
   }
 
-
-  if(swing < 8 && activation_id != -1){
+  if (swing < 8 && activation_id != -1) {
     activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
   }
 
-
-
-  //NOTE: Convert back to FP16 if original type
-  if (orig_type == half_type){
-    convertToFP16((Tensor*) activation_out);
+  // NOTE: Convert back to FP16 if original type
+  if (orig_type == half_type) {
+    convertToFP16((Tensor *)activation_out);
   }
 
-  
   return activation_out;
 }
 
+void *PROMISE_FC(void *input, float i_min, float i_max, void *weights,
+                 float w_min, float w_max, void *bias, float b_min, float b_max,
+                 int activation_id, float out_min, float out_max, int swing) {
 
+  Tensor *input_t = (Tensor *)input;
+  Tensor *weights_t = (Tensor *)weights;
+  Tensor *bias_t = (Tensor *)bias;
 
-void* PROMISE_FC(void* input, float i_min, float i_max,
-		 void* weights, float w_min, float w_max,
-		 void* bias, float b_min, float b_max,
-		 int activation_id,
-		 float out_min, float out_max, int swing){
-
-
-  Tensor* input_t = (Tensor*) input;
-  Tensor* weights_t = (Tensor*) weights;
-  Tensor* bias_t = (Tensor*) bias;
-  
   int orig_type = input_t->cur_type;
-  
+
   convertToFP32(input_t);
   convertToFP32(weights_t);
   convertToFP32(bias_t);
-  
-  
-  if(swing < 8){
+
+  if (swing < 8) {
     input = quantizeTensorPromise(input, i_min, i_max);
     weights = quantizeTensorPromise(weights, w_min, w_max);
-    if(bias != NULL)
+    if (bias != NULL)
       bias = quantizeTensorPromise(bias, b_min, b_max);
 
     // NOTE: Modelling aRead error in PROMISE
     input = addPromiseError(input, swing);
   }
 
-
-  
-  void* gemm_out;
+  void *gemm_out;
   gemm_out = tensorGemmGPU(input, weights);
 
-  
-  void* gemmbias_out;
-  if(bias != NULL){
+  void *gemmbias_out;
+  if (bias != NULL) {
     gemmbias_out = tensorAdd(gemm_out, bias);
-  }
-  else{
+  } else {
     gemmbias_out = gemm_out;
   }
- 
-  void* activation_out;
-  switch(activation_id){
+
+  void *activation_out;
+  switch (activation_id) {
 
   case -1:
     activation_out = gemmbias_out;
@@ -807,86 +677,71 @@ void* PROMISE_FC(void* input, float i_min, float i_max,
     ERROR("Activation id %d NOT supported \n", activation_out);
     break;
   }
-  
-  
-  if(swing < 8 && activation_id != -1){
+
+  if (swing < 8 && activation_id != -1) {
     activation_out = quantizeTensorPromise(activation_out, out_min, out_max);
   }
 
-
-  //NOTE: Convert back to FP16 if original type
-  if (orig_type == half_type){
-    convertToFP16((Tensor*) activation_out);
+  // NOTE: Convert back to FP16 if original type
+  if (orig_type == half_type) {
+    convertToFP16((Tensor *)activation_out);
   }
 
-
-  
   return activation_out;
 }
 
-
-
-
-
-// NOTE: Enabling the macro below is used for testing against the old PROMISE wrapper
+// NOTE: Enable the macro below to test against the old PROMISE
+// wrapper
 //#define OLD_MODEL
 
 #ifndef OLD_MODEL
 
+bool isPromiseLayer(int swing) {
 
-
-bool isPromiseLayer(int swing){
-
-  if(swing < 8)
+  if (swing < 8)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isGPULayer(int swing) {
 
-bool isGPULayer(int swing){
-
-  if(swing > 10 ) // PROMISE layers are 1-7
+  if (swing > 10) // PROMISE layers are 1-7
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isFullPrecision(int swing) {
 
-bool isFullPrecision(int swing){
-
-  if(swing == 11)
+  if (swing == 11)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isHalfPrecision(int swing) {
 
-
-bool isHalfPrecision(int swing){
-
-  if(swing == 12)
+  if (swing == 12)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isPerforation(int swing) {
 
-bool isPerforation(int swing){
-
-  if(swing >= 100 && swing <= 200)
+  if (swing >= 100 && swing <= 200)
     return true;
   else
-    return false;      
+    return false;
 }
 
+bool isSampling(int swing) {
 
-bool isSampling(int swing){
-
-  if(swing >= 200 && swing <= 300)
+  if (swing >= 200 && swing <= 300)
     return true;
   else
-    return false;      
+    return false;
 }
 
 bool isReductionSampling(int swing) {
@@ -894,300 +749,227 @@ bool isReductionSampling(int swing) {
   if (swing >= 41 && swing <= 49)
     return true;
   else
-    return false;      
+    return false;
 }
 
-int getSwing(int swing){
+int getSwing(int swing) {
 
-  #ifdef PROMISE_TUNER_ENABLED
+#ifdef PROMISE_TUNER_ENABLED
 
   // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
-  if(!approxhpvm_runtime_mode){
-  
-    if(op_counter >= total_ops){
+  if (!approxhpvm_runtime_mode) {
+
+    if (op_counter >= total_ops) {
       ERROR("No accuracy flag found \n");
     }
-  
+
     swing = op_accuracies[op_counter];
     op_counter++;
   }
 
-  #endif  
+#endif
 
-   DEBUG("---- swing_value = %d \n", swing);  
+  DEBUG("---- swing_value = %d \n", swing);
 
-   return swing;
+  return swing;
 }
 
-
-
-
-//bool FP16_tuning = false;
-
+// bool FP16_tuning = false;
 
 /***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */
 
-
-
-void initializeAutotuner(){
+void initializeAutotuner() {
 
   DEBUG("initializing tuner .... \n");
-  
+
   sampParamSet = new SampParamSet;
-  perfParamSet = new PerfParamSet;  
+  perfParamSet = new PerfParamSet;
 }
 
+void *Autotuner_SampConv(void *input, float i_min, float i_max, void *filter,
+                         float w_min, float w_max, void *bias, float b_min,
+                         float b_max, int conv_pad_h, int conv_pad_w,
+                         int conv_stride_h, int conv_stride_w, int pool_id,
+                         int pool_size,
+                         int activation_id, // Relu, Tanh, ClipRelu
+                         float out_min, float out_max, int swing) {
+
+  SampParams params = sampParamSet->getSampParams(swing);
 
-void* Autotuner_SampConv(void* input, float i_min, float i_max,
-			 void* filter, float w_min, float w_max,
-			 void* bias, float b_min, float b_max,
-			 int conv_pad_h, int conv_pad_w,
-			 int conv_stride_h, int conv_stride_w,
-			 int pool_id, int pool_size,
-			 int activation_id, // Relu, Tanh, ClipRelu
-			 float out_min, float out_max, int swing){
+  DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", params.skip_rate,
+        params.skip_offset);
 
+  void *conv_out;
+
+  if (!FP16_tuning) {
 
-  SampParams params = sampParamSet->getSampParams(swing);
-  
-  DEBUG("params.skip_rate = %d, params.skip_offset = %d \n",
-	params.skip_rate, params.skip_offset);
-  
-  void* conv_out;
-  
-  if (!FP16_tuning){
- 
     /* conv_out = tensorConvSampSim(input, filter,
-				 conv_pad_h, conv_pad_w,
-				 conv_stride_h, conv_stride_w, 1, 1,
-				 params.skip_rate, params.skip_offset);
+                                 conv_pad_h, conv_pad_w,
+                                 conv_stride_h, conv_stride_w, 1, 1,
+                                 params.skip_rate, params.skip_offset);
     */
 
-
-    if (SIMULATION_MODE){
-      conv_out = tensorConvSampSim2(input, filter,
-				    conv_pad_h, conv_pad_w,
-				    conv_stride_h, conv_stride_w, 1, 1,
-				    params.skip_rate, params.skip_offset, params.interpolation_id);
+    if (SIMULATION_MODE) {
+      conv_out = tensorConvSampSim2(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.skip_rate, params.skip_offset, params.interpolation_id);
     }
-        
 
     else {
-      conv_out = tensorConvApprox(input, filter,
-				  conv_pad_h, conv_pad_w,
-				  conv_stride_h, conv_stride_w, 1, 1,
-				  1, 1, params.skip_rate, params.skip_offset);
+      conv_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w,
+                                  conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                  params.skip_rate, params.skip_offset);
     }
-    
-    
-  }
-  else{
-        
-    conv_out = tensorConvApproxHalf2(input, filter,
-				     conv_pad_h, conv_pad_w,
-				     conv_stride_h, conv_stride_w,
-				     1, 1,
-				     1, 1,
-				     params.skip_rate, params.skip_offset);    
-   
+
+  } else {
+
+    conv_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
+                                     conv_stride_h, conv_stride_w, 1, 1, 1, 1,
+                                     params.skip_rate, params.skip_offset);
   }
 
   return conv_out;
 }
 
-
-
-
-void* Autotuner_PerforatedConv(void* input, float i_min, float i_max,
-			       void* filter, float w_min, float w_max,
-			       void* bias, float b_min, float b_max,
-			       int conv_pad_h, int conv_pad_w,
-			       int conv_stride_h, int conv_stride_w,
-			       int pool_id, int pool_size,
-			       int activation_id, // Relu, Tanh, ClipRelu
-			       float out_min, float out_max, int swing){ 
-
+void *Autotuner_PerforatedConv(void *input, float i_min, float i_max,
+                               void *filter, float w_min, float w_max,
+                               void *bias, float b_min, float b_max,
+                               int conv_pad_h, int conv_pad_w,
+                               int conv_stride_h, int conv_stride_w,
+                               int pool_id, int pool_size,
+                               int activation_id, // Relu, Tanh, ClipRelu
+                               float out_min, float out_max, int swing) {
 
   PerfParams params = perfParamSet->getPerfParams(swing);
-  
+
   DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n",
-	params.row, params.col, params.skip_offset);
-    
+        params.row, params.col, params.skip_offset);
 
-  void* conv_out;
-  
-  if (!FP16_tuning){
+  void *conv_out;
 
+  if (!FP16_tuning) {
 
-    if (SIMULATION_MODE){
+    if (SIMULATION_MODE) {
 
-      conv_out = tensorConvPerfCuda(input, filter,
-				    conv_pad_h, conv_pad_w,
-				    conv_stride_h, conv_stride_w, 1, 1,
-				    params.row, params.col, params.skip_offset);
+      conv_out = tensorConvPerfCuda(input, filter, conv_pad_h, conv_pad_w,
+                                    conv_stride_h, conv_stride_w, 1, 1,
+                                    params.row, params.col, params.skip_offset);
 
+    } else {
+
+      conv_out = tensorConvApprox(
+          input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w,
+          1, 1, params.row, params.col, 1, params.skip_offset);
     }
-    else{
-
-      conv_out = tensorConvApprox(input, filter,
-				  conv_pad_h, conv_pad_w,
-				  conv_stride_h, conv_stride_w,
-				  1, 1,
-				  params.row, params.col,
-				  1, params.skip_offset);   
-    }
-    
-    
-  }
-  else{
 
-    conv_out = tensorConvApproxHalf2(input, filter,
-				     conv_pad_h, conv_pad_w,
-				     conv_stride_h, conv_stride_w,
-				     1, 1,
-				     params.row, params.col,
-				     1, params.skip_offset);   
+  } else {
 
+    conv_out = tensorConvApproxHalf2(
+        input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1,
+        1, params.row, params.col, 1, params.skip_offset);
   }
-    
-  return conv_out;  
-}
-
-
-
 
+  return conv_out;
+}
 
+void *Autotuner_ConvOp(void *input, float i_min, float i_max, void *filter,
+                       float w_min, float w_max, void *bias, float b_min,
+                       float b_max, int conv_pad_h, int conv_pad_w,
+                       int conv_stride_h, int conv_stride_w, int pool_id,
+                       int pool_size,
+                       int activation_id, // Relu, Tanh, ClipRelu
+                       float out_min, float out_max, int swing) {
 
-void* Autotuner_ConvOp(void* input, float i_min, float i_max,
-		       void* filter, float w_min, float w_max,
-		       void* bias, float b_min, float b_max,
-		       int conv_pad_h, int conv_pad_w,
-		       int conv_stride_h, int conv_stride_w,
-		       int pool_id, int pool_size,
-		       int activation_id, // Relu, Tanh, ClipRelu
-		       float out_min, float out_max, int swing){ 
+  void *conv_out;
+  if (isPerforation(swing)) {
 
-  
-  void* conv_out;
-  if(isPerforation(swing)){
+    conv_out = Autotuner_PerforatedConv(
+        input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max,
+        conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id,
+        pool_size, activation_id, out_min, out_max, swing);
 
-    conv_out = Autotuner_PerforatedConv(input, i_min, i_max,
-					filter, w_min, w_max,
-					bias, b_min, b_max,
-					conv_pad_h, conv_pad_w,
-					conv_stride_h, conv_stride_w,
-					pool_id, pool_size,
-					activation_id, 
-					out_min, out_max, swing);
-    
   }
 
-  else if(isSampling(swing)){
+  else if (isSampling(swing)) {
 
-    conv_out = Autotuner_SampConv(input, i_min, i_max,
-				  filter, w_min, w_max,
-				  bias, b_min, b_max,
-				  conv_pad_h, conv_pad_w,
-				  conv_stride_h, conv_stride_w,
-				  pool_id, pool_size,
-				  activation_id, 
-				  out_min, out_max, swing);
+    conv_out = Autotuner_SampConv(
+        input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max,
+        conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id,
+        pool_size, activation_id, out_min, out_max, swing);
   }
-  
 
-  else if (isHalfPrecision(swing)){
+  else if (isHalfPrecision(swing)) {
 
-    if (FP16_tuning){
- 
-      conv_out = tensorHalfConvolution(input, filter,
-				     conv_pad_h, conv_pad_w,
-				     conv_stride_h, conv_stride_w,
-				     1, 0);
-    }
-    else{
-      conv_out = tensorConvolution(input, filter,
-				 conv_pad_h, conv_pad_w,
-				 conv_stride_h, conv_stride_w,
-				 1, 0);
+    if (FP16_tuning) {
+
+      conv_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w,
+                                       conv_stride_h, conv_stride_w, 1, 0);
+    } else {
+      conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w,
+                                   conv_stride_h, conv_stride_w, 1, 0);
     }
-    
-  }
 
-  else if (isFullPrecision(swing)){
-    conv_out = tensorConvolution(input, filter,
-				 conv_pad_h, conv_pad_w,
-				 conv_stride_h, conv_stride_w,
-				 1, 0);
   }
 
+  else if (isFullPrecision(swing)) {
+    conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w,
+                                 conv_stride_h, conv_stride_w, 1, 0);
+  }
 
-  return conv_out;  
+  return conv_out;
 }
 
+void *Autotuner_Add(void *input, void *bias, int swing) {
 
+  void *conv_add;
+  if (bias != NULL) {
 
-void* Autotuner_Add(void* input, void* bias, int swing){
-
-  void* conv_add;
-  if(bias != NULL){
-    
-    if( isFullPrecision(swing) || !(FP16_tuning) ){  
+    if (isFullPrecision(swing) || !(FP16_tuning)) {
       conv_add = tensorAdd(input, bias);
-    }
-    else {
+    } else {
       conv_add = tensorHalfAdd(input, bias);
     }
-  }
-  else{
+  } else {
     conv_add = input;
   }
 
   return conv_add;
 }
 
+void *Autotuner_Pooling(void *input, int pool_size, int pool_stride,
+                        int swing) {
 
+  void *pool_out;
 
-void* Autotuner_Pooling(void* input,
-			int pool_size, int pool_stride,
-			int swing){
+  if (pool_size > 0) {
 
-  void* pool_out;
-  
-  if(pool_size > 0){
-    
-    //FIXME: Currently only using MaxPooling
-    if( isFullPrecision(swing) || !(FP16_tuning) ){  
-      pool_out = tensorPooling(input, 0, pool_size, pool_size,
-			       0, 0, pool_stride, pool_stride);
+    // FIXME: Currently only using MaxPooling
+    if (isFullPrecision(swing) || !(FP16_tuning)) {
+      pool_out = tensorPooling(input, 0, pool_size, pool_size, 0, 0,
+                               pool_stride, pool_stride);
 
     }
-	
+
     else {
-      pool_out = tensorHalfPooling(input, 0, pool_size, pool_size,
-				   0, 0, pool_stride, pool_stride);  
+      pool_out = tensorHalfPooling(input, 0, pool_size, pool_size, 0, 0,
+                                   pool_stride, pool_stride);
     }
-    
-   
-  }
-  else{
+
+  } else {
     pool_out = input;
   }
-  
-  
+
   return pool_out;
 }
 
+void *Autotuner_Activation(void *input, int activation_id, int out_min,
+                           int out_max, int swing) {
 
+  void *activation_out;
 
+  if (isFullPrecision(swing) || (!FP16_tuning)) {
 
-void* Autotuner_Activation(void* input, int activation_id,
-			   int out_min, int out_max, int swing){
-
-  void* activation_out;
-
-  if ( isFullPrecision(swing) || (!FP16_tuning) ){
-    
-    switch(activation_id){
+    switch (activation_id) {
     case -1:
       activation_out = input;
       INFO("NO Activation Function \n");
@@ -1206,10 +988,10 @@ void* Autotuner_Activation(void* input, int activation_id,
       break;
     }
   }
-   
-  else{
 
-    switch(activation_id){
+  else {
+
+    switch (activation_id) {
     case -1:
       activation_out = input;
       INFO("NO Activation Function \n");
@@ -1227,167 +1009,116 @@ void* Autotuner_Activation(void* input, int activation_id,
       ERROR("Activation id %d NOT supported \n", activation_id);
       break;
     }
-
   }
 
-
   return activation_out;
 }
 
-void* Autotuner_GPU_ConvLayer(void* input, float i_min, float i_max,
-			      void* filter, float w_min, float w_max,
-			      void* bias, float b_min, float b_max,
-			      int conv_pad_h, int conv_pad_w,
-			      int conv_stride_h, int conv_stride_w,
-			      int pool_id, int pool_size, int pool_stride, 
-			      int activation_id, // Relu, Tanh, ClipRelu
-			      float out_min, float out_max, int swing){ 
-  
-
-  void* conv_out = Autotuner_ConvOp(input, i_min, i_max,
-				    filter, w_min, w_max,
-				    bias, b_min, b_max,
-				    conv_pad_h, conv_pad_w,
-				    conv_stride_h, conv_stride_w,
-				    pool_id, pool_size,
-				    activation_id, 
-				    out_min, out_max, swing);
- 
-  
-  void* conv_add = Autotuner_Add(conv_out, bias, swing);
-
-  void* pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing);
-
-  void* activation_out = Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing);
-  
-
-  return activation_out;  
-}
+void *Autotuner_GPU_ConvLayer(void *input, float i_min, float i_max,
+                              void *filter, float w_min, float w_max,
+                              void *bias, float b_min, float b_max,
+                              int conv_pad_h, int conv_pad_w, int conv_stride_h,
+                              int conv_stride_w, int pool_id, int pool_size,
+                              int pool_stride,
+                              int activation_id, // Relu, Tanh, ClipRelu
+                              float out_min, float out_max, int swing) {
+
+  void *conv_out = Autotuner_ConvOp(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size,
+      activation_id, out_min, out_max, swing);
 
+  void *conv_add = Autotuner_Add(conv_out, bias, swing);
+
+  void *pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing);
+
+  void *activation_out =
+      Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing);
+
+  return activation_out;
+}
 
 /**** Top-level API for Handling Convolution Layers
 
       The granularity of handling is at a layer-level - not tensor-op level
-        
+
 ***/
 
-void* Autotuner_ConvLayer(void* input, float i_min, float i_max,
-			  void* filter, float w_min, float w_max,
-			  void* bias, float b_min, float b_max,
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id, int pool_size, int pool_stride, 
-			  int activation_id, // Relu, Tanh, ClipRelu
-			  float out_min, float out_max, int swing){ 
-
-  if(FP16_tuning){
-    if(ONLINE_PROFILING){
+void *Autotuner_ConvLayer(void *input, float i_min, float i_max, void *filter,
+                          float w_min, float w_max, void *bias, float b_min,
+                          float b_max, int conv_pad_h, int conv_pad_w,
+                          int conv_stride_h, int conv_stride_w, int pool_id,
+                          int pool_size, int pool_stride,
+                          int activation_id, // Relu, Tanh, ClipRelu
+                          float out_min, float out_max, int swing) {
+
+  if (FP16_tuning) {
+    if (ONLINE_PROFILING) {
       ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n");
     }
   }
 
-  swing = getSwing(swing);  
- 
-  if(isPromiseLayer(swing)){
-    
-    return PROMISE_Conv(input, i_min, i_max,
-			filter, w_min, w_max,
-			bias, b_min, b_max,
-			conv_pad_h, conv_pad_w,
-			conv_stride_h, conv_stride_w,
-			pool_id, pool_size, pool_stride,
-			activation_id, 
-			out_min, out_max, swing);
+  swing = getSwing(swing);
+
+  if (isPromiseLayer(swing)) {
+
+    return PROMISE_Conv(input, i_min, i_max, filter, w_min, w_max, bias, b_min,
+                        b_max, conv_pad_h, conv_pad_w, conv_stride_h,
+                        conv_stride_w, pool_id, pool_size, pool_stride,
+                        activation_id, out_min, out_max, swing);
   }
 
   assert(isGPULayer(swing));
 
-  return Autotuner_GPU_ConvLayer(input, i_min, i_max,
-			   filter, w_min, w_max,
-			   bias, b_min, b_max,
-			   conv_pad_h, conv_pad_w,
-			   conv_stride_h, conv_stride_w,
-			   pool_id, pool_size, pool_stride, 
-			   activation_id,
-			   out_min, out_max, swing);
-
+  return Autotuner_GPU_ConvLayer(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride,
+      activation_id, out_min, out_max, swing);
 }
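
The layer-level dispatch in Autotuner_ConvLayer above (PROMISE path vs. GPU tensor-runtime path, chosen by the integer `swing` knob) can be summarized with a small standalone sketch. This is an illustration only, not the runtime's API: `Backend`, `classify_knob`, `run_promise_conv_layer`, `run_gpu_conv_layer`, and `conv_layer` are hypothetical stand-ins for `getSwing`/`isPromiseLayer`/`isGPULayer`, `PROMISE_Conv`, and `Autotuner_GPU_ConvLayer`, and the assumed mapping of knobs 1-7 to the PROMISE path is illustrative, not taken from the runtime.

// Minimal sketch (hypothetical names) of the layer-level granularity: one call
// covers conv + bias-add + pooling + activation, and `swing` picks the backend.
#include <cstdio>

enum class Backend { Promise, Gpu };

// Stand-in for getSwing()/isPromiseLayer()/isGPULayer(); the 1-7 range is an
// assumption for illustration only.
static Backend classify_knob(int swing) {
  return (swing >= 1 && swing <= 7) ? Backend::Promise : Backend::Gpu;
}

// Hypothetical stand-ins for PROMISE_Conv / Autotuner_GPU_ConvLayer.
static void *run_promise_conv_layer(void *input) {
  std::puts("PROMISE path: whole layer handled by the analog/quantized backend");
  return input;
}
static void *run_gpu_conv_layer(void *input) {
  std::puts("GPU path: conv, add, pool, activation issued as tensor ops");
  return input;
}

static void *conv_layer(void *input, int swing) {
  // One call handles the whole layer; `swing` selects backend/approximation.
  return classify_knob(swing) == Backend::Promise ? run_promise_conv_layer(input)
                                                  : run_gpu_conv_layer(input);
}

int main() {
  int dummy = 0;
  conv_layer(&dummy, 3);  // assumed PROMISE knob
  conv_layer(&dummy, 12); // assumed GPU knob
  return 0;
}
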
 
-
-
-
-
 /**** Top-level API Unchanged for backwards compatibility  ***/
 
-void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
-			void* filter, float w_min, float w_max,
-			void* bias, float b_min, float b_max,
-			int conv_pad_h, int conv_pad_w,
-			int conv_stride_h, int conv_stride_w,
-			int pool_id, int pool_size,
-			int activation_id, // Relu, Tanh, ClipRelu
-			float out_min, float out_max, int swing){ 
-
-
-  return Autotuner_ConvLayer(input, i_min, i_max,
-			     filter, w_min, w_max,
-			     bias, b_min, b_max,
-			     conv_pad_h, conv_pad_w,
-			     conv_stride_h, conv_stride_w,
-			     pool_id, pool_size, pool_size, // FIXIT: Assumption pool_size == pool_strides
-			     activation_id,
-			     out_min, out_max, swing);
-  
-
+void *ConvLayer_PROMISE(void *input, float i_min, float i_max, void *filter,
+                        float w_min, float w_max, void *bias, float b_min,
+                        float b_max, int conv_pad_h, int conv_pad_w,
+                        int conv_stride_h, int conv_stride_w, int pool_id,
+                        int pool_size,
+                        int activation_id, // Relu, Tanh, ClipRelu
+                        float out_min, float out_max, int swing) {
+
+  return Autotuner_ConvLayer(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size,
+      pool_size, // FIXIT: Assumption pool_size == pool_strides
+      activation_id, out_min, out_max, swing);
 }
 
-
-
-
-void* ConvLayer_PROMISE2(void* input, float i_min, float i_max,
-			 void* filter, float w_min, float w_max,
-			 void* bias, float b_min, float b_max,
-			 int conv_pad_h, int conv_pad_w,
-			 int conv_stride_h, int conv_stride_w,
-			 int pool_id, int pool_size, int pool_stride, 
-			 int activation_id, // Relu, Tanh, ClipRelu
-			 float out_min, float out_max, int swing){ 
-
-
-  return Autotuner_ConvLayer(input, i_min, i_max,
-			     filter, w_min, w_max,
-			     bias, b_min, b_max,
-			     conv_pad_h, conv_pad_w,
-			     conv_stride_h, conv_stride_w,
-			     pool_id, pool_size, pool_stride, 
-			     activation_id,
-			     out_min, out_max, swing);
-  
-
+void *ConvLayer_PROMISE2(void *input, float i_min, float i_max, void *filter,
+                         float w_min, float w_max, void *bias, float b_min,
+                         float b_max, int conv_pad_h, int conv_pad_w,
+                         int conv_stride_h, int conv_stride_w, int pool_id,
+                         int pool_size, int pool_stride,
+                         int activation_id, // Relu, Tanh, ClipRelu
+                         float out_min, float out_max, int swing) {
+
+  return Autotuner_ConvLayer(
+      input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h,
+      conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride,
+      activation_id, out_min, out_max, swing);
 }
 
+void *
+FCLayer_PROMISE(void *input, float i_min, float i_max, void *weights,
+                float w_min, float w_max, void *bias, float b_min, float b_max,
+                int activation_id, float out_min, float out_max,
+                int swing) { // NOTE: out_min, out_max apply to 'ClippedRelu'
 
+  swing = getSwing(swing);
 
+  if (isPromiseLayer(swing)) {
 
-
-
-
-void* FCLayer_PROMISE(void* input, float i_min, float i_max,
-		      void* weights, float w_min, float w_max,
-		      void* bias, float b_min, float b_max,
-		      int activation_id,
-		      float out_min, float out_max, int swing){ //NOTE: min_val, max_val apply to 'ClippedRelu'
-
-
-  swing = getSwing(swing);
-  
-  if(isPromiseLayer(swing)){
-
-    return PROMISE_FC(input, i_min, i_max,
-		      weights, w_min, w_max,
-		      bias, b_min, b_max,
-		      activation_id,
-		      out_min, out_max, swing);
+    return PROMISE_FC(input, i_min, i_max, weights, w_min, w_max, bias, b_min,
+                      b_max, activation_id, out_min, out_max, swing);
   }
 
   assert(isGPULayer(swing));
@@ -1433,18 +1164,12 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
   }
 
   return activation_out;
-
 }
 
 #endif
 
-
-
 #ifdef OLD_MODEL
 
 #endif
 
-#endif 
-
-
-
+#endif
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
index c1848f1267..b97e5beadb 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
@@ -1,13 +1,12 @@
 //===--------------------------- approxtechniques.cu ---------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file consists of the custom implementation of software approximations
 // for tensor convolutions. The approximations implemented are feature sampling
-// and perforation for FP32 and FP16 compute precisions.  
+// and perforation for FP32 and FP16 compute precisions.
 //
 //===----------------------------------------------------------------------===//
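
Before the kernels, a rough idea of what "perforation" means here may help: some output rows (or columns) are simply never computed and are reconstructed afterwards by interpolating their computed neighbours, which is what convToGemmPerfRow/approxInterpolateRow below do on the GPU. The following CPU-only sketch illustrates that idea on a single column of row values; `perforate_and_interpolate` is a hypothetical helper written for this document, the `x`/`start` parameters merely mirror the kernels' naming, and the exact skip pattern is a simplification rather than the runtime's implementation.

// CPU-only sketch of row perforation + interpolation (hypothetical helper).
// Rows r >= start with (r - start) % x == 0 are never computed; they are
// rebuilt as the mean of their two vertical neighbours afterwards.
#include <cassert>
#include <cstdio>
#include <vector>

std::vector<float> perforate_and_interpolate(const std::vector<float> &rows,
                                             int x, int start) {
  assert(x >= 2 && "x == 1 would perforate adjacent rows; not illustrated here");
  const int h = static_cast<int>(rows.size());
  std::vector<float> out(h, 0.0f);
  std::vector<bool> kept(h, false);
  for (int r = 0; r < h; ++r) { // "compute" only the kept rows
    bool skipped =
        (r >= start) && ((r - start) % x == 0) && r != 0 && r != h - 1;
    if (!skipped) {
      out[r] = rows[r];
      kept[r] = true;
    }
  }
  for (int r = 0; r < h; ++r) { // fill perforated rows by interpolation
    if (kept[r])
      continue;
    out[r] = 0.5f * (out[r - 1] + out[r + 1]); // neighbours exist since x >= 2
  }
  return out;
}

int main() {
  const std::vector<float> rows = {1, 2, 3, 4, 5, 6, 7, 8};
  for (float v : perforate_and_interpolate(rows, /*x=*/3, /*start=*/1))
    std::printf("%.1f ", v);
  std::printf("\n");
  return 0;
}
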
- 
 
 #include "tensor_utils.h"
 #include "approx_utils.h"
@@ -17,406 +16,465 @@
 #include "fp16_conversion.h"
 #include "profiling.h"
 
-extern "C"{
-
-__global__ void convToGemm(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+extern "C" {
+
+__global__ void convToGemm(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmFullInput(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride,
-                    const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen
-                  if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-                      int output_col = filter_elem_num -
-                               ((filter_elem_num + skip_every)/skip_every);
-                        if(skip_every == 1) output_col = filter_elem_num;
-                        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                            output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                        input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                        else         
-                            output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                   }                
-                }              
-            }                
+__global__ void convToGemmFullInput(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int skip_every, const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h =
+      tx % (H_out * W_out) / W_out;     // output height index (row number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) {
+          int output_col =
+              filter_elem_num - ((filter_elem_num + skip_every) / skip_every);
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
+    }
+  }
 }
 
-__global__ void convToGemmHalfInputNew(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-      
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if(filter_elem_num % skip_every != skip_offset) {
-                      int output_col = filter_elem_num -
-                                        (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                     if(skip_every == 1) output_col = filter_elem_num;
-                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                          output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                      else
-                          output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                  }
-              }
-           }
+__global__ void
+convToGemmHalfInputNew(__half *const __restrict__ output,
+                       const __half *const __restrict input, const int N,
+                       const int C, const int H, const int W, const int KH,
+                       const int KW, const int V_pad, const int H_pad,
+                       const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int reduced_filter_elem,
+                       const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_offset) {
+          int output_col =
+              filter_elem_num - (filter_elem_num / skip_every +
+                                 (filter_elem_num % skip_every > skip_offset));
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+        }
       }
+    }
+  }
 }
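
The index arithmetic in convToGemmHalfInputNew above is the subtle part of the sampling transform: filter elements whose index is congruent to skip_offset modulo skip_every are dropped, and the survivors are compacted into consecutive columns via output_col = filter_elem_num - (filter_elem_num / skip_every + (filter_elem_num % skip_every > skip_offset)). The tiny host-side check below (written for this document, not part of the runtime; the 3x3x3 filter size and knob values are arbitrary) confirms that this remapping sends the kept indices onto 0..reduced_filter_elem-1 densely and in order.

// Host-side sanity check (illustration only) for the column compaction used by
// convToGemmHalfInputNew: indices with i % skip_every == skip_offset are
// sampled away; the rest are remapped onto a dense range of reduced columns.
#include <cassert>
#include <cstdio>

int main() {
  const int num_filter_elem = 27; // e.g. C = 3, KH = KW = 3 (arbitrary choice)
  const int skip_every = 3, skip_offset = 1;
  int next_col = 0;
  for (int i = 0; i < num_filter_elem; ++i) {
    if (i % skip_every == skip_offset)
      continue; // this filter element is skipped by the sampling knob
    int output_col = i - (i / skip_every + (i % skip_every > skip_offset));
    assert(output_col == next_col++); // compaction is dense and order-preserving
  }
  std::printf("kept %d of %d filter elements\n", next_col, num_filter_elem);
  return 0;
}
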
 
-
-__global__
-void convToGemmHalf(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW,
-                    const int V_pad, const int H_pad,
-                    const int H_out, const int W_out,
-                    const int V_stride, const int H_stride){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i
-    const int n = tx / (C * H_out * W_out); //output image numbe
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number
-    const int w = tx % W_out; //output width index (col number
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
-                                            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                } else {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
-                }
-            }
+__global__ void convToGemmHalf(__half *const __restrict__ output,
+                               const __half *const __restrict input,
+                               const int N, const int C, const int H,
+                               const int W, const int KH, const int KW,
+                               const int V_pad, const int H_pad,
+                               const int H_out, const int W_out,
+                               const int V_stride, const int H_stride) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        } else {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output,
-                                        const __half * const __restrict input,
-                                        const int N, const int C,
-                                        const int H, const int W,
-                                        const int KH, const int KW, const int V_pad,
-                                        const int H_pad, const int H_out,
-                                        const int W_out, const int V_stride,
-                                        const int H_stride, const int reduced_filter_elem,
-                                        const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                     const int output_col = condition * filter_elem_num 
-                                    + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) 
-                                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));                   		     
-                    const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                       output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                     else
-                       output[out_index] = 0;
-              }
-            }
+__global__ void convToGemmHalfInputNewIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+          const int out_index =
+              ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
+          //((output_col*N + n) * H_out + h) * W_out + w;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output,
-                                                const __half * const __restrict input,
-                                                const int N, const int C,
-                                                const int H, const int W,
-                                                const int KH, const int KW, const int V_pad,
-                                                const int H_pad, const int H_out,
-                                                const int W_out, const int V_stride,
-                                                const int H_stride, const int reduced_filter_elem,
-                                                const int skip_every, const int skip_offset) {
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_out * W_out); //output image number
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-
-	        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                    const int output_col = condition * filter_elem_num
-                                        + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
-                                        - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-
-                    const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
-                    
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
+__global__ void convToGemmHalfInputNewIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+
+          const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
+
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-
-
-__global__ void convToGemmHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void convToGemmHalf2(__half *const __restrict__ output,
+                                const __half *const __restrict input,
+                                const int N, const int C, const int H,
+                                const int W, const int KH, const int KW,
+                                const int V_pad, const int H_pad,
+                                const int H_out, const int W_out,
+                                const int V_stride, const int H_stride,
+                                const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) { 
-      for(int j = 0; j < KW; j++) { 
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element 
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmPerfRow(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void
+convToGemmPerfRow(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-
-    for(int i = 0; i < KH; i++) {
-        for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
-	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[out_index] = 0;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w,
-			  float *old_data, float *new_data, int x, int start){
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-    
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) { 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) { 
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                (old_data[output_index] + old_data[output_index - w]) / 2;
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); 
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - w]) / 2;
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfCol(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
+__global__ void
+convToGemmPerfCol(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
-    const int inW = w_index * H_stride - H_pad; 
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW  + i * KW + j; //index of this filter element
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w,
-			                        float *old_data, float *new_data, int x, int start) { 
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-   	 const int row = tx % (h * w) / w; //output height index (row number)
-    	const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] 
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-       		int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                	    (old_data[output_index] + old_data[output_index - 1]) / 2;
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);  
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-       	 	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
+__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - 1]) / 2;
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void convToGemmPerfRowHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-
-    
-   for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-    	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
           output[out_index] = 0;
       }
@@ -424,844 +482,903 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
   }
 }
 
-__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_eff * W_out); //output image numbe
-    if(n < N) { 
-        const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-        const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        int h_index;                   
-        if(h < start) {                
-            h_index = h;               
-        } else {                       
-            h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;                                                            
-        }                              
-        const int inH = h_index * V_stride - V_pad;
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-
-
-	for(int i = 0; i < KH; i++) {
-	  for(int j = 0; j < KW; j++) {
-	    const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-	    const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
-
-	    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	      output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	    else
-	      output[out_index] = 0;
-
-	  }
-	}
-	
+__global__ void convToGemmPerfRowHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
+    int h_index;
+    if (h < start) {
+      h_index = h;
+    } else {
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inH = h_index * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-				__hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col];
-       } else if(row == h-1) {
-           new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            const int row_index = row - ((row + 1 - start) / x);
-            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      const int row_index = row - ((row + 1 - start) / x);
+      const int output_index =
+          ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      const int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      const int output_index =
+          ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-
-__global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
+__global__ void convToGemmPerfColHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inW = w_index * H_stride - H_pad;
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
           output[out_index] = 0;
-
       }
     }
   }
 }
 
-__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                        const int H, const int W, const int KH, const int KW, const int V_pad,
-                        const int H_pad, const int H_out, const int W_out, const int V_stride,
-                        const int H_stride, const int x, const int start, const int W_eff){
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_eff); //output image number
-      if(n < N) {
-          const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-          const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-          const int w = tx % W_eff; //output width index (col number)
-          int w_index;
-          if(w < start) {
-              w_index = w;
-          } else {
-              w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
-          }
-          const int inW = w_index * H_stride - H_pad;
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-
-
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
-                  const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
-                  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                  else
-                      output[out_index] = 0;
-              }
-        }
+__global__ void convToGemmPerfColHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
+    int w_index;
+    if (w < start) {
+      w_index = w;
+    } else {
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+    }
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
     }
+  }
 }
 
-
-__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-    	const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-        	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-			__hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
-   }
+__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
+    }
+  }
 }
 
-__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //output chan number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-        if(col < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                        = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
-   
-        } else if(col == w - 1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-   
-        } else if (col == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
-   
-        } else if((col - start) % x == 0) {
-            const int col_index = col - ((col + 1 - start) / x);
-            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-        } else {
-            const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-            const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
+
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
+
+    } else if ((col - start) % x == 0) {
+      const int col_index = col - ((col + 1 - start) / x);
+      const int output_index =
+          ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      const int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      const int output_index =
+          ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
+__global__ void
+convToGemmFullInputRegular(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int ch = (fi * C) / reduced_filter_elem;
+      const int offset = (skip_offset + ch) % skip_every;
+      int in_index;
+      if (fi < offset) {
+        in_index = fi;
+      } else {
+        in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                   (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                   offset - 1;
+      }
 
-__global__ void convToGemmFullInputRegular(float * const __restrict__ output,
-				    const float * const __restrict input,
-				    const int N, const int C,
-				    const int H, const int W,
-				    const int KH, const int KW, const int V_pad,
-				    const int H_pad, const int H_out,
-				    const int W_out, const int V_stride,
-				    const int H_stride, const int reduced_filter_elem,
-				    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (H_out * W_out); //output image number
-  if(n < N) {
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         const int ch = (fi * C) / reduced_filter_elem;
-         const int offset = (skip_offset + ch) % skip_every;
-         int in_index;
-         if(fi < offset) {
-             in_index = fi;
-         } else {
-             in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                        + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-        }
-	 
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; 
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            int in_index;
-            if(fi < skip_offset) {
-                in_index = fi;
-            } else {
-                in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                            + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-            }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmFullInputIrregular(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      int in_index;
+      if (fi < skip_offset) {
+        in_index = fi;
+      } else {
+        in_index =
+            ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+            (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+            skip_offset - 1;
+      }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
-
-    
+  }
 }
 
-__global__ void createReducedFiltersFullRegular(float * output,
-						const float * const __restrict input, const int NF,
-						const int num_filter_elem, const int reduced_filter_elem, 
-						const int channels,
-						const int skip_every, const int skip_offset, const float fac) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) { 
-    const int offset = tx % reduced_filter_elem; //offset within filter
+__global__ void createReducedFiltersFullRegular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
-      int in_index;
-      if(offset < channel_offset) {
-        in_index = offset;
-      }
-      else {
-         in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                  + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-      }
-      
-      output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+    int in_index;
+    if (offset < channel_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+           0) +
+          channel_offset - 1;
+    }
+
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
   }
 }
 
-__global__ void createReducedFiltersFullIrregular(float * output,
-                     const float * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int fIdx = tx / reduced_filter_elem; //filter index
-      if(fIdx < NF) {
-        const int offset = tx % reduced_filter_elem; //offset within filter
-        int in_index;
-        if(offset < skip_offset) {
-            in_index = offset;
-        } else {
-            in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; 
-        }
-        output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+__global__ void createReducedFiltersFullIrregular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
+    int in_index;
+    if (offset < skip_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+          skip_offset - 1;
     }
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
+  }
 }
 
-__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-      #pragma unroll
-      for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-         const int fi = ch * (reduced_filter_elem / C) + ki;
-         const int offset = (skip_offset + ch) % skip_every;
-   
-         const bool condition = (fi < offset);
-         const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-  
-         const int i = (in_index % (KW * KH)) / KW;
-         const int j = in_index % KW;
-         const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-         if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
-             output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-         } else {
-            output[out_index] = 0;
-         }
+__global__ void
+convToGemmHalfInputRegular(__half *const __restrict__ output,
+                           const __half *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      const int offset = (skip_offset + ch) % skip_every;
+
+      const bool condition = (fi < offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C, 
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      if(n < N) {
-           const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-          const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-          const int w = tx % W_out; //output width index (col number)
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          const int inW = w * H_stride - H_pad; //input width index (col number)
-          
-          #pragma unroll
-           for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-
-	      const int fi = ch * (reduced_filter_elem / C) + ki;	          
-              const int offset = (skip_offset + ch) % skip_every;
-              const int condition = (fi < offset);
-              const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                          + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-         
-              const int i = (in_index % (KW * KH)) / KW;
-              const int j = in_index % KW;
-              const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-              if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                  output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-              }
-	      else {
-                  output[out_index] = 0;
-             }
-        }
+__global__ void convToGemmHalfInputRegular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      const int offset = (skip_offset + ch) % skip_every;
+      const int condition = (fi < offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-
-	    const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW; 
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            }
-	    else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmHalfInputIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-       #pragma unroll 
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                 + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-      
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmHalfInputIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
+__global__ void createReducedFiltersHalfRegular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
 
-__global__ void createReducedFiltersHalfRegular(__half * output,
-                                         const __half * const __restrict input, const int NF,
-                                         const int num_filter_elem, const int reduced_filter_elem,
-                     			 const int channels,
-                                         const int skip_every, const int skip_offset, const float fac) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
 
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) {
-    const int offset = tx % reduced_filter_elem; //offset within filter
+  const int fIdx = tx / reduced_filter_elem; // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
     const int condition = (offset < channel_offset);
-    const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                          + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
-      
-    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
- }
-  
+    const int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             channel_offset - 1);
+
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
+  }
 }
 
-__global__ void createReducedFiltersHalfIrregular(__half * output,
-                     const __half * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
+__global__ void createReducedFiltersHalfIrregular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
 
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  
-  if(fIdx < NF) {
+  if (fIdx < NF) {
 
-    const int offset = tx % reduced_filter_elem; //offset within filter
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int condition = (offset < skip_offset);
-    
-    int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-        
-    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
-  }
-      
-}
 
+    int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             skip_offset - 1);
 
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
+  }
+}
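+
+// The factor fac = skip_every / (skip_every - 1) passed to the
+// createReducedFilters* kernels above rescales the retained weights, roughly
+// compensating for the filter elements that the sampling drops.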
 
-//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols
-__global__ void convToGemmApprox(float * const __restrict__ output,
-				 const float * const __restrict input, const int N, const int C,
-				 const int H, const int W,
-				 const int KH, const int KW, const int V_pad,
-				 const int H_pad, const int H_out,
-				 const int W_out, const int V_stride,
-				 const int H_stride, const int reduced_filter_elem,
-				 const int skip_every) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-  const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-  const int w = tx % W_out; //output width index (col number)
-  const int inH = h * V_stride - V_pad; //input height index (row number)
-  const int inW = w * H_stride - H_pad; //input width index (col number)
-  if(n < N) { //is thread id within bounds?
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-	if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
-	  const int output_col = filter_elem_num - (filter_elem_num/skip_every); //cal output column, taking skipping into account
-	  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	  else
-	    output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0;
-	}
+// produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem
+// cols
+__global__ void
+convToGemmApprox(float *const __restrict__ output,
+                 const float *const __restrict input, const int N, const int C,
+                 const int H, const int W, const int KH, const int KW,
+                 const int V_pad, const int H_pad, const int H_out,
+                 const int W_out, const int V_stride, const int H_stride,
+                 const int reduced_filter_elem, const int skip_every) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every !=
+            skip_every - 1) { // are we including this filter element?
+          const int output_col =
+              filter_elem_num -
+              (filter_elem_num /
+               skip_every); // compute output column, taking skipping into account
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((n * reduced_filter_elem + output_col) * H_out + h) *
+                       W_out +
+                   w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((n * reduced_filter_elem + output_col) * H_out + h) *
+                       W_out +
+                   w] = 0;
+        }
       }
     }
   }
 }
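+// Worked example of the column mapping above: with skip_every = 3, filter
+// elements 2, 5, 8, ... (index % 3 == 2) are dropped and the remaining
+// elements 0, 1, 3, 4, 6, 7, ... land in output columns 0, 1, 2, 3, 4, 5, ...
+// via output_col = filter_elem_num - filter_elem_num / skip_every.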
 
-
 /// This function serves as an API with the custom implementation of convolution
-/// with the perforation and filter sampling support. The compute precison is FP32.
-/// This routine is invoked by the tuner for tuning approximations for convolutions.
+/// with the perforation and filter sampling support. The compute precision is
+/// FP32. This routine is invoked by the tuner for tuning approximations for
+/// convolutions.
 ///
-void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad, int vertical_stride,
-			 int horizontal_stride, int conv_mode, int conv_groups,
-			 int row, int col, int start){
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int row, int col, int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
-  
-  Tensor* output;
+
+  Tensor *output;
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
   convertToFP32(input);
   convertToFP32(filter);
-  
+
   long int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int rem_row = (h - start) % row > 0;
   int h_eff = h - ((h - start) / row) - rem_row;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int rem_col = (w - start) % col > 0;
   int w_eff = w - ((w - start) / col) - rem_col;
 
-  Tensor* new_output;
-  if(row > 1){
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+  Tensor *new_output;
+  if (row > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float* convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, start, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, start, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w,
-					      num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h_eff * w, c * h_eff * w,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateRow<<<numBlocks, 128>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  }
-  else if(col > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, start, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, start, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData,
-					      h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h * w_eff, c * h * w_eff,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateCol<<<numBlocks, 128>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else { 
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
+  } else {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    convToGemmApprox<<<gridSize, blockSize>>>(convData,
-					      (float *)input->gpu_data, n,
-					      input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2],
-					      input->dims.dim_sizes[3],
-					      KH, KW,
-					      vertical_pad, horizontal_pad, h, w,
-					      vertical_stride, horizontal_stride,
-					      num_filter_elem, c * h * w);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApprox<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, c * h * w);
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication
-    //Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
-    
+    // Do the matrix multiplication
+    // Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w, num_filter_elem * h * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w, c * h * w,
-					      n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w, c * h * w, n));
 
     new_output = output;
     cudaFree(convData);
   }
 
-  //Event("Conv_end"); //, true);
+  // Event("Conv_end"); //, true);
   return new_output;
 }
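+// Illustrative call with hypothetical knob values, assuming `input` and
+// `filter` are NCHW tensors of this runtime: with row = 2 and start = 0,
+// roughly every second output row is dropped from the GEMM and reconstructed
+// afterwards by approxInterpolateRow.
+//
+//   void *out = tensorConvPerfCuda(
+//       input, filter, /*vertical_pad=*/1, /*horizontal_pad=*/1,
+//       /*vertical_stride=*/1, /*horizontal_stride=*/1, /*conv_mode=*/1,
+//       /*conv_groups=*/1, /*row=*/2, /*col=*/1, /*start=*/0);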
 
-__global__
-void switchMatrixFull(int N, int n, int c, int h, int w,
-              float *old_data, float *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-          int col = ((i % (c * h * w)) % (h * w)) % w;
-          int row = ((i % (c * h * w)) % (h * w)) / w;
-          int ch = (i % (c * h * w)) / (h * w);
-          int n_new = i / (c * h * w);
-          
-          new_data[((n_new * c + ch) * h + row ) * w + col] =
-                        old_data[((ch * n + n_new) * h + row ) * w + col];
-        }
-}
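+// Rearranges the batched GEMM result from a per-filter [c][n][h][w] layout
+// into the NCHW layout expected by the output tensor.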
+__global__ void switchMatrixFull(int N, int n, int c, int h, int w,
+                                 float *old_data, float *new_data) {
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
+
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
 
 /// This function serves as an API with the custom implementation of convolution
-/// with the perforation and filter sampling support. The compute precison is FP32.
+/// with the perforation and filter sampling support. The compute precision is
+/// FP32.
 ///
-void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-		       int vertical_pad, int horizontal_pad, int vertical_stride,
-		       int horizontal_stride, int conv_mode, int conv_groups,
-		       int row, int col, int skip_every, int offset){
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode, int conv_groups,
+                       int row, int col, int skip_every, int offset) {
 
   //////INFO("*** TensorConvolution approximation \n");
-  //Event("Conv");
+  // Event("Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -1275,15 +1392,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////Event("H2F_end");
 
   const int n = input->dims.dim_sizes[0];
-  const int c = filter->dims.dim_sizes[0]; //number of filters
+  const int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
   ////INFO("batch: %d\n", n);
@@ -1296,619 +1416,572 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////INFO("horizontal_stride: %d\n", horizontal_stride);
   ////INFO("output height: %d\n", h);
   ////INFO("output width: %d\n", w);
-  if(row > 1) {
+  if (row > 1) {
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
 
-    Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, offset, h_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h_eff * w));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, offset, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                &alpha,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (float *)filter->gpu_data, num_filter_elem, 0,
-                                                &beta,
-                                                (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-                                                n));
-    //interpolate
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateRow<<<numBlocks, blocksize>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(col > 1) {
+  } else if (col > 1) {
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, offset, w_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h * w_eff));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, offset, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
-					      n));
-
-    //interpolate
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateCol<<<numBlocks, blocksize>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(skip_every > 1) {
-    //reduced number after skipping
+  } else if (skip_every > 1) {
+    // reduced number after skipping
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    float* convData;
+    float *convData;
     size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    float* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
-    
+    float *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
+
     const int filtBlockSize = 128;
     ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac =  ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     //////INFO("fac: %f\n", fac);
     const int blockSize = 128;
-    //////INFO("n * h * w : %d\n", (n * h * w ));    
-    const int gridSize = (n * h * w + blockSize - 1) / blockSize;  
-    if(!(KH * KW % skip_every)) {
-       // ////INFO("REGULAR FILTERING\n");
-        createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                (float *)filter->gpu_data,
-								c, num_filter_elem,
-								reduced_filter_elem,
-								input->dims.dim_sizes[1], skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+    //////INFO("n * h * w : %d\n", (n * h * w ));
+    const int gridSize = (n * h * w + blockSize - 1) / blockSize;
+    if (!(KH * KW % skip_every)) {
+      // ////INFO("REGULAR FILTERING\n");
+      createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+          fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputRegular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     } else {
-       // ////INFO("IRREGULAR FILTERING\n");
-        createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (float *)filter->gpu_data,
-                                    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,     
-                                                                input->dims.dim_sizes[1],                                                     
-                                                                input->dims.dim_sizes[2],                                                 
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
+      // ////INFO("IRREGULAR FILTERING\n");
+      createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, skip_every, offset, fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputIrregular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     }
     checkCudaErrors(cudaDeviceSynchronize());
-    
+
     const float alpha = 1.0;
     const float beta = 0.0;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            &alpha,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+        &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter,
+        reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w,
+        c * h * w, n));
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
 
-      //INFO("FP32 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-                               CUDNN_TENSOR_NCHW, n, c, h, w);
+    // INFO("FP32 BASELINE\n");
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w));
-    convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w, vertical_stride, horizontal_stride, 
-                           skip_every, offset);//num_filter_elem);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h * w));
+    convToGemmFullInput<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        skip_every, offset); // num_filter_elem);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     /*
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                          CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, num_filter_elem,
-                                            &alpha,
-                                            convData, h * w, num_filter_elem * h * w,
-                                            (float *)filter->gpu_data, num_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
-    */
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                       n * h * w, c, num_filter_elem,
-                        &alpha,
-                        convData,
-                        CUDA_R_32F, n * h * w,
-                        (float *) filter->gpu_data, CUDA_R_32F,
-                        num_filter_elem,
-                        &beta,
-                        (float *) output->gpu_data,
-                        CUDA_R_32F, n * h * w,
-                        CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-    
-    const int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (float *)output->gpu_data,
-                                    (float *)new_output->gpu_data);
-    
+
+    float alpha = 1.0f, beta = 0.0f;
+    /*
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)new_output->gpu_data, h * w, c * h * w, n));
+    */
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data,
+        CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data,
+        CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (float *)output->gpu_data,
+                                         (float *)new_output->gpu_data);
+
     checkCudaErrors(cudaDeviceSynchronize());
     cudaFree(convData);
   }
 
-  //Event("Conv_end");
+  // Event("Conv_end");
   return new_output;
 }
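+// Illustrative summary of the approximation knobs (sketch): row > 1 selects
+// row perforation, col > 1 column perforation, skip_every > 1 filter sampling
+// (retained filter elements are rescaled by skip_every / (skip_every - 1)),
+// and row = col = skip_every = 1 falls back to the dense FP32 GEMM baseline.
+// A hypothetical filter-sampling call:
+//
+//   void *out = tensorConvApprox(
+//       input, filter, /*vertical_pad=*/1, /*horizontal_pad=*/1,
+//       /*vertical_stride=*/1, /*horizontal_stride=*/1, /*conv_mode=*/1,
+//       /*conv_groups=*/1, /*row=*/1, /*col=*/1, /*skip_every=*/2,
+//       /*offset=*/0);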
 
-__global__
-void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-            int col = ((i % (c * h * w)) % (h * w)) % w;
-            int row = ((i % (c * h * w)) % (h * w)) / w;
-            int ch = (i % (c * h * w)) / (h * w);
-            int n_new = i / (c * h * w);
-            
-            new_data[((n_new * c + ch) * h + row ) * w + col] =
-                            old_data[((ch * n + n_new) * h + row ) * w + col];
-      }
-}
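+// Half-precision counterpart of switchMatrixFull: rearranges data from the
+// per-filter [c][n][h][w] layout into NCHW.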
+__global__ void switchMatrixHalf(int N, int n, int c, int h, int w,
+                                 __half *old_data, __half *new_data) {
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
+
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
 
-/// This function serves as an API to custom implementation of the 
+/// This function serves as an API to the custom implementation of the
 /// half-precision convolution with the perforation and filter sampling
-/// support. 
+/// support.
 ///
-void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad,
-			   int vertical_stride, int horizontal_stride,
-			   int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset) {
-
- //INFO("*** TensorConvolution half approximation \n");
- // profileEvent("#Conv");
-
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups, int row, int col, int skip_every,
+                            int offset) {
+
+  // INFO("*** TensorConvolution half approximation \n");
+  // profileEvent("#Conv");
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
 
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
-  
+
   profileEvent("F2H_start");
-   convertToFP16(input);
-   convertToFP16(filter);
+  convertToFP16(input);
+  convertToFP16(filter);
   profileEvent("F2H_end");
-  
+
   const long int n = input->dims.dim_sizes[0];
-  const long int c = filter->dims.dim_sizes[0]; //number of filters
+  const long int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   changeTensorPlacement(new_output, DEVICE);
-  //INFO("batch: %d\n", n);
+  // INFO("batch: %d\n", n);
   // INFO("channels: %d\n", input->dims.dim_sizes[1]);
   // INFO("num_filters: %d\n", c);
   // INFO("kernel height: %d\n", KH);
-  // INFO("kernel width: %d\n", KW);   
+  // INFO("kernel width: %d\n", KW);
   // INFO("num_filter_elem: %d\n", num_filter_elem);
-   //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
-   //INFO("vertical_stride: %d\n", vertical_stride);
-   //INFO("horizontal_stride: %d\n", horizontal_stride);
+  // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
+  // INFO("vertical_stride: %d\n", vertical_stride);
+  // INFO("horizontal_stride: %d\n", horizontal_stride);
   // INFO("output height: %d\n", h);
   // INFO("output width: %d\n", w);
-   //INFO("skip_every: %d\n", skip_every);
+  // INFO("skip_every: %d\n", skip_every);
   const __half alf = approx_float_to_half(1.0);
   const __half bet = approx_float_to_half(0.0);
   const __half *alpha_half = &alf;
   const __half *beta_half = &bet;
 
-  if(row > 1){
+  if (row > 1) {
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
-    
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW,
-						  n, c, h_eff, w);
+
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w);
     changeTensorPlacement(output_half, DEVICE);
 
-    __half * convData;
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    
+
     const int patchBlockSize = 256;
-    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / patchBlockSize;
+    const int numPatchBlocks =
+        (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) /
+        patchBlockSize;
     const int interpolationBlocksize = 256;
-    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
-    if(h * w <= 64) {
-        //INFO("H *W <= 64\n");
-        convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(convData,
-                                   (__half *)input->gpu_half_data, n,
-                                   input->dims.dim_sizes[1],
-                                   input->dims.dim_sizes[2],
-                                   input->dims.dim_sizes[3],
-                                   KH, KW, vertical_pad,
-                                   horizontal_pad, h, w, vertical_stride,
-                                   horizontal_stride, row, offset, h_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                         n * h_eff * w, c, num_filter_elem,
-                         alpha_half,
-                         convData, CUDA_R_16F, n * h_eff * w,
-                         (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                         beta_half,
-                         (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
-                         CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-        approxInterpolateRowHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-                                        (__half *)output_half->gpu_half_data,
-                                        (__half *)new_output->gpu_half_data,
-                                        row, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-    
-    } else {
-        //INFO("H *W > 64\n");
-        convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(convData,
-						   (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3],
-						   KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, row, offset, h_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                alpha_half,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                                beta_half,
-                                                (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w,
-                                                n));    
-        
-        approxInterpolateRowHalf<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						row, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
+    const int numInterpolationBlocks =
+        (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize;
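+    // Small output maps (h * w <= 64) are packed by the "Half2" im2col-style
+    // kernel and multiplied with a single cublasGemmEx over the whole batch;
+    // larger maps use a per-image strided-batched HGEMM instead.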
+    if (h * w <= 64) {
+      // INFO("H *W <= 64\n");
+      convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      approxInterpolateRowHalf2<<<numInterpolationBlocks,
+                                  interpolationBlocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
 
+    } else {
+      // INFO("H *W > 64\n");
+      convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+          alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n));
+
+      approxInterpolateRowHalf<<<numInterpolationBlocks,
+                                 interpolationBlocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
     }
     freeTensor(output_half);
     cudaFree(convData);
-} else if(col > 1) {
+  } else if (col > 1) {
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff);
     changeTensorPlacement(output_half, DEVICE);
-   
-    __half * convData;
+
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    
+
     const int patchBlockSize = 256;
-    const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / patchBlockSize;
+    const int numPatchBlocks =
+        (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) /
+        patchBlockSize;
     const int interpolationBlocksize = 256;
-    const int numInterpolationBlocks = (n * c * h * w  + interpolationBlocksize - 1) / interpolationBlocksize;
-   if(h * w <= 64) {
-         //INFO("H *W <= 64\n");
-        convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, col, offset, w_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w_eff, c, num_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w_eff,
-                                        (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
-         approxInterpolateColHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                                        (__half *)output_half->gpu_half_data,
-                                                        (__half *)new_output->gpu_half_data,
-                                                        col, offset);
-          checkCudaErrors(cudaDeviceSynchronize());
+    const int numInterpolationBlocks =
+        (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize;
+    if (h * w <= 64) {
+      // INFO("H *W <= 64\n");
+      convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      approxInterpolateColHalf2<<<numInterpolationBlocks,
+                                  interpolationBlocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
     } else {
-        //INFO("H *W > 64\n");
-        convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n,
-		                            				   input->dims.dim_sizes[1],
-                                                       input->dims.dim_sizes[2],
-                                                       input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                       horizontal_pad, h, w, vertical_stride,
-                                                       horizontal_stride, col, offset, w_eff);
-        checkCudaErrors(cudaDeviceSynchronize());
-    
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                              CUBLAS_OP_N, CUBLAS_OP_N,
-                                              h * w_eff, c, num_filter_elem,
-                                              alpha_half,
-                                              convData, h * w_eff, num_filter_elem * h * w_eff,
-                                              (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                              beta_half,
-                                              (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff,
-                                              n));
-
-         approxInterpolateColHalf<<<numInterpolationBlocks,interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                 (__half *)output_half->gpu_half_data,
-                                 (__half *)new_output->gpu_half_data,
-                                 col, offset);
-         checkCudaErrors(cudaDeviceSynchronize());
+      // INFO("H *W > 64\n");
+      convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+          alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n));
+
+      approxInterpolateColHalf<<<numInterpolationBlocks,
+                                 interpolationBlocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
+      checkCudaErrors(cudaDeviceSynchronize());
     }
 
     freeTensor(output_half);
     cudaFree(convData);
-  } else if(skip_every > 1) {
+  } else if (skip_every > 1) {
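+    // Filter-sampling path: roughly one in skip_every filter elements (past
+    // `offset`) is dropped, a scale factor fac = skip_every / (skip_every - 1)
+    // is passed to the reduced-filter kernels, and the GEMMs run over
+    // reduced_filter_elem-wide patches.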
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    __half* convData;
+    __half *convData;
     size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    __half* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+    __half *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
 
     const int filtBlockSize = 256;
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac =  ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     const int blockSize = 256;
-    //const int gridSize = (n * h * w + blockSize - 1) / blockSize;
-   // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
-   // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
+    // const int gridSize = (n * h * w + blockSize - 1) / blockSize;
+    // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
+    // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
     const __half alf = approx_float_to_half(1.0);
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-    if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { 
-      if(!(KH * KW % skip_every)) {
-        //INFO("---REGULAR FILTERING\n");
-        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                (__half *)filter->gpu_half_data,
-								c, num_filter_elem,
-                                                                reduced_filter_elem,
-                                                                input->dims.dim_sizes[1], skip_every, offset, fac);
+    if (c * num_filter_elem <
+        500000) { // 250) {//c * reduced_filter_elem < 150000) {
+      if (!(KH * KW % skip_every)) {
+        // INFO("---REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
         checkCudaErrors(cudaDeviceSynchronize());
-	
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
       } else {
-        //INFO("---IRREGULAR FILTERING\n");
-        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (__half *)filter->gpu_half_data,
-				    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
+        // INFO("---IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
         checkCudaErrors(cudaDeviceSynchronize());
-        
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1]  + blockSize - 1) / blockSize;
-	    //convToGemmHalfInputIrregular
-        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,  
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-     }   
-     checkCudaErrors(cudaDeviceSynchronize());
-
-     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            alpha_half,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            beta_half,
-                                            (__half *)new_output->gpu_half_data, h * w, c * h * w,
-                                            n));
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        // convToGemmHalfInputIrregular
+        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+          alpha_half, convData, h * w, reduced_filter_elem * h * w,
+          reducedFilter, reduced_filter_elem, 0, beta_half,
+          (__half *)new_output->gpu_half_data, h * w, c * h * w, n));
     } else {
-        Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                 CUDNN_TENSOR_NCHW, n, c, h, w);
-        changeTensorPlacement(output_half, DEVICE);
-
-        if(!(KH * KW % skip_every)) {
-           //INFO("REGULAR FILTERING\n");
-            createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                        (__half *)filter->gpu_half_data,
-                                                        c, num_filter_elem,
-                                                        reduced_filter_elem,
-                                                        input->dims.dim_sizes[1], skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-        } else {
-           // INFO("IRREGULAR FILTERING\n");
-            createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                            (__half *)filter->gpu_half_data,
-                                                                            c, num_filter_elem,
-                                                                            reduced_filter_elem,
-                                                                            skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-            }
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w, c, reduced_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w,
-                                         reducedFilter, CUDA_R_16F, reduced_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-            
-            int numBlocks = (n * c * h * w  + 255) / 256;
-            switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (__half *)output_half->gpu_half_data,
-                                    (__half *)new_output->gpu_half_data);
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            freeTensor(output_half);
+      Tensor *output_half = (Tensor *)create4DTensor(
+          (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
+      changeTensorPlacement(output_half, DEVICE);
+
+      if (!(KH * KW % skip_every)) {
+        // INFO("REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      } else {
+        // INFO("IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
+          reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half,
+          (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w,
+          CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      int numBlocks = (n * c * h * w + 255) / 256;
+      switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                           (__half *)output_half->gpu_half_data,
+                                           (__half *)new_output->gpu_half_data);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      freeTensor(output_half);
     }
-    
+
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
-       //INFO("FP16 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                   CUDNN_TENSOR_NCHW, n, c, h, w);
-      
-      changeTensorPlacement(output, DEVICE);
-      __half * convData;
-      long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
-      checkCudaErrors(cudaMalloc(&convData, convDataSize));
-      
-      const int blockSize = 256;
-      const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-      //convToGemmHalf
-      convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData,
-                                                (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3],
-                                                KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, num_filter_elem,
-                                                skip_every, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        const __half alf = approx_float_to_half(1.0);
-        const __half bet = approx_float_to_half(0.0);
-        const __half *alpha_half = &alf;
-        const __half *beta_half = &bet;
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                    n * h * w, c, num_filter_elem,
-                                    alpha_half,
-                                    convData, CUDA_R_16F, n * h * w,
-                                    (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                    beta_half,
-                                    (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w,
-                                    CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        
-        const int numBlocks = (n * c * h * w  + 255) / 256;
-        switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data,
-                                            (__half *)new_output->gpu_half_data);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        freeTensor(output);
-        cudaFree(convData);
+    // INFO("FP16 BASELINE\n");
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    changeTensorPlacement(output, DEVICE);
+    __half *convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    // convToGemmHalf
+    convToGemmHalfInputNew<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, skip_every, offset);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        alpha_half, convData, CUDA_R_16F, n * h * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F,
+        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (__half *)output->gpu_half_data,
+                                         (__half *)new_output->gpu_half_data);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    freeTensor(output);
+    cudaFree(convData);
   }
 
   profileEvent("H2F_start");
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
index 6e9f88bb54..bdcfb2c568 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
@@ -7,429 +7,489 @@
 #include "fp16_conversion.h"
 #include "profiling.h"
 
-extern "C"{
-
-__global__ void convToGemm(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+extern "C" {
+
+__global__ void convToGemm(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int num_filter_elem) {
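+  // Dense im2col: one thread per (image, input channel, output pixel); the
+  // patch matrix is laid out per image as [N][C*KH*KW][H_out][W_out], with
+  // zeros written for taps that fall outside the padded input.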
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmFullInput(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride,
-                    const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen
-                  if(filter_elem_num % skip_every != skip_every-1-skip_offset) {
-                      int output_col = filter_elem_num -
-                                ((filter_elem_num + skip_every)/skip_every);
-                       if(skip_every == 1)
-                           output_col = filter_elem_num;
-                        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                            output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                        input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                        else         
-                            output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                    }                
-                }              
-            }                
+__global__ void convToGemmFullInput(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int skip_every, const int skip_offset) {
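+  // im2col that also drops filter elements with
+  // filter_elem_num % skip_every == skip_every - 1 - skip_offset, packing the
+  // surviving elements into consecutive columns of an
+  // [output_col][N][H_out][W_out] buffer.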
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h =
+      tx % (H_out * W_out) / W_out;     // output height index (row number)
+  const int w = tx % W_out;             // output width index (col number)
+  const int inH = h * V_stride - V_pad; // input height index (row number)
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) {
+          int output_col =
+              filter_elem_num - ((filter_elem_num + skip_every) / skip_every);
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
+    }
+  }
 }
 
-__global__ void convToGemmHalfInputNew(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-      
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if(filter_elem_num % skip_every != skip_offset) {
-                      int output_col = filter_elem_num -
-                                        (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                      if(skip_every == 1)
-                          output_col = filter_elem_num;
-                      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                          output[((output_col*N + n) * H_out + h) * W_out + w] =
-                                    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                      else
-                          output[((output_col*N + n) * H_out + h) * W_out + w] = 0;
-                  }
-                }
-           }
+__global__ void
+convToGemmHalfInputNew(__half *const __restrict__ output,
+                       const __half *const __restrict input, const int N,
+                       const int C, const int H, const int W, const int KH,
+                       const int KW, const int V_pad, const int H_pad,
+                       const int H_out, const int W_out, const int V_stride,
+                       const int H_stride, const int reduced_filter_elem,
+                       const int skip_every, const int skip_offset) {
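+  // FP16 im2col with regular filter sampling: elements where
+  // filter_elem_num % skip_every == skip_offset are dropped and the rest are
+  // renumbered densely (e.g. skip_every = 2, skip_offset = 0 keeps elements
+  // 1, 3, 5, ... as columns 0, 1, 2, ...).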
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (filter_elem_num % skip_every != skip_offset) {
+          int output_col =
+              filter_elem_num - (filter_elem_num / skip_every +
+                                 (filter_elem_num % skip_every > skip_offset));
+          if (skip_every == 1)
+            output_col = filter_elem_num;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[((output_col * N + n) * H_out + h) * W_out + w] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
+        }
       }
+    }
+  }
 }
 
-
-__global__
-void convToGemmHalf(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW,
-                    const int V_pad, const int H_pad,
-                    const int H_out, const int W_out,
-                    const int V_stride, const int H_stride){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i
-    const int n = tx / (C * H_out * W_out); //output image numbe
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number
-    const int w = tx % W_out; //output width index (col number
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
-                                            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                } else {
-                    output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
-                }
-            }
+__global__ void convToGemmHalf(__half *const __restrict__ output,
+                               const __half *const __restrict input,
+                               const int N, const int C, const int H,
+                               const int W, const int KH, const int KW,
+                               const int V_pad, const int H_pad,
+                               const int H_out, const int W_out,
+                               const int V_stride, const int H_stride) {
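+  // Plain FP16 im2col (no sampling): writes a [C*KH*KW][N][H_out][W_out]
+  // patch buffer, zero-filling taps that land in the padding region.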
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;
+  const int inW = w * H_stride - H_pad; // input width index (col number)
+  if (n < N) {                          // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        } else {
+          output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output,
-                                        const __half * const __restrict input,
-                                        const int N, const int C,
-                                        const int H, const int W,
-                                        const int KH, const int KW, const int V_pad,
-                                        const int H_pad, const int H_out,
-                                        const int W_out, const int V_stride,
-                                        const int H_stride, const int reduced_filter_elem,
-                                        const int skip_every, const int skip_offset) {
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-      const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-      const int w = tx % W_out; //output width index (col number)
-      const int inH = h * V_stride - V_pad; //input height index (row number)
-      const int inW = w * H_stride - H_pad; //input width index (col number)
-      if(n < N) { //is thread id within bounds?
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  //const int ki = c * KH * KW + i;
-                  //const int kj = c * KH * KW + j;
-                  const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                  if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                     const int output_col = condition * filter_elem_num 
-                                    + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) 
-                                                         - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));                   
-                  //if(filter_elem_num % skip_every != skip_offset) {
-                  // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                   //if(skip_every == 1)
-                   //    output_col = filter_elem_num;
-                    const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                       output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                     else
-                       output[out_index] = 0;
-              }
-            }
+__global__ void convToGemmHalfInputNewIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
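+  // FP16 im2col with irregular filter sampling: elements where
+  // (filter_elem_num - skip_offset) % skip_every == 0 are dropped and the
+  // survivors renumbered densely (e.g. skip_every = 2, skip_offset = 1 keeps
+  // elements 0, 2, 4, ... as columns 0, 1, 2, ...); the output is per-image
+  // [N][reduced_filter_elem][H_out][W_out].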
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        // const int ki = c * KH * KW + i;
+        // const int kj = c * KH * KW + j;
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+          // if(filter_elem_num % skip_every != skip_offset) {
+          // int output_col = filter_elem_num -
+          //  (filter_elem_num/skip_every + (filter_elem_num % skip_every >
+          //  skip_offset));
+          // if(skip_every == 1)
+          //    output_col = filter_elem_num;
+          const int out_index =
+              ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
+          //((output_col*N + n) * H_out + h) * W_out + w;
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output,
-                                                const __half * const __restrict input,
-                                                const int N, const int C,
-                                                const int H, const int W,
-                                                const int KH, const int KW, const int V_pad,
-                                                const int H_pad, const int H_out,
-                                                const int W_out, const int V_stride,
-                                                const int H_stride, const int reduced_filter_elem,
-                                                const int skip_every, const int skip_offset) {
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_out * W_out); //output image number
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    if(n < N) { //is thread id within bounds?
-        for(int i = 0; i < KH; i++) {
-            for(int j = 0; j < KW; j++) {
-                //const int ki = c * KH * KW + i;
-                //const int kj = c * KH * KW + j;
-                const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
-                if((filter_elem_num - skip_offset) % skip_every) {
-                    const int condition = (filter_elem_num < skip_offset);
-                    const int output_col = condition * filter_elem_num
-                                        + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
-                                        - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-                    //if(filter_elem_num % skip_every != skip_offset) {
-                    // int output_col = filter_elem_num -
-                    //  (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-                    //if(skip_every == 1)
-                    //    output_col = filter_elem_num;
-                    const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
-                    //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-                    //((output_col*N + n) * H_out + h) * W_out + w
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
+__global__ void convToGemmHalfInputNewIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
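+  // Same element-dropping rule as convToGemmHalfInputNewIrregular, but the
+  // patch buffer is laid out batch-wide as [output_col][N][H_out][W_out] to
+  // feed a single flattened GEMM.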
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
+                                                            // number
+  const int h = tx % (H_out * W_out) / W_out; // output height index (row
+                                              // number)
+  const int w = tx % W_out;                   // output width index (col number)
+  const int inH = h * V_stride - V_pad;       // input height index (row number)
+  const int inW = w * H_stride - H_pad;       // input width index (col number)
+  if (n < N) {                                // is thread id within bounds?
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        // const int ki = c * KH * KW + i;
+        // const int kj = c * KH * KW + j;
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        if ((filter_elem_num - skip_offset) % skip_every) {
+          const int condition = (filter_elem_num < skip_offset);
+          const int output_col =
+              condition * filter_elem_num +
+              (!condition) *
+                  (filter_elem_num -
+                   ((filter_elem_num + 1 - skip_offset) / skip_every) -
+                   ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
+          // if(filter_elem_num % skip_every != skip_offset) {
+          // int output_col = filter_elem_num -
+          //  (filter_elem_num/skip_every + (filter_elem_num % skip_every >
+          //  skip_offset));
+          // if(skip_every == 1)
+          //    output_col = filter_elem_num;
+          const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
+          //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
+          //((output_col*N + n) * H_out + h) * W_out + w
+          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+            output[out_index] =
+                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+          else
+            output[out_index] = 0;
         }
+      }
     }
+  }
 }
 
-
-
-__global__ void convToGemmHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void convToGemmHalf2(__half *const __restrict__ output,
+                                const __half *const __restrict input,
+                                const int N, const int C, const int H,
+                                const int W, const int KH, const int KW,
+                                const int V_pad, const int H_pad,
+                                const int H_out, const int W_out,
+                                const int V_stride, const int H_stride,
+                                const int num_filter_elem) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     const int inH = h * V_stride - V_pad;
     const int inW = w * H_stride - H_pad;
-    for(int i = 0; i < KH; i++) { 
-      for(int j = 0; j < KW; j++) { 
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element 
-        const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            (c * KH + i) * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-            output[out_index] = 0;
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmPerfRow(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void
+convToGemmPerfRow(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int H_eff) {
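+  // Row-perforated im2col: only H_eff of the H_out output rows are computed;
+  // starting at `start`, every x-th output row is skipped (e.g. x = 2,
+  // start = 1 keeps rows 0, 2, 4, ...) and is later reconstructed by
+  // approxInterpolateRow.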
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-   //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-      //  int j = ki % KW;
-    for(int i = 0; i < KH; i++) {
-        for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
-    const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[out_index] = 0;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+    //#pragma unroll
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //  int i = ki / KW;
+    //  int j = ki % KW;
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w,
-			  float *old_data, float *new_data, int x, int start){
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-        const int ch = tx % (c * h * w) / (h * w); //filter number
-        const int row = tx % (h * w) / w; //output height index (row number)
-        const int col = tx % w; //output width index (col number)
-    
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) { 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) { 
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; 
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                (old_data[output_index] + old_data[output_index - w]) / 2;
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); 
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
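+  // Reconstructs the rows skipped by convToGemmPerfRow: a skipped row
+  // ((row - start) % x == 0, away from the borders) becomes the average of
+  // the two adjacent computed rows; every other row is copied through from
+  // the densely packed old_data.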
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // filter number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - w]) / 2;
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfCol(float * const __restrict__ output,
-		       const float * const __restrict input, const int N, const int C,
-		       const int H, const int W, const int KH, const int KW, const int V_pad,
-		       const int H_pad, const int H_out, const int W_out, const int V_stride,
-		       const int H_stride, const int x, const int start, const int W_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) { 
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
+__global__ void
+convToGemmPerfCol(float *const __restrict__ output,
+                  const float *const __restrict input, const int N, const int C,
+                  const int H, const int W, const int KH, const int KW,
+                  const int V_pad, const int H_pad, const int H_out,
+                  const int W_out, const int V_stride, const int H_stride,
+                  const int x, const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
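+    // w indexes the compacted grid of W_eff columns that are actually
+    // computed; w_index maps it back to the corresponding column of the
+    // full W_out-wide output, skipping every x-th column after `start`
+    // (those columns are filled in later by approxInterpolateCol).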
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
-    const int inW = w_index * H_stride - H_pad; 
-    const int inH = h * V_stride - V_pad; //input height index (row number)
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
     //#pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-      //  int i = ki / KW;
-       // int j = ki % KW;
-    
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-	const int filter_elem_num = c * KH * KW  + i * KW + j; //index of this filter element
-	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-	    input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-	else
-	  output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //  int i = ki / KW;
+    // int j = ki % KW;
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = 0;
       }
     }
   }
 }
 
-__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w,
-			                        float *old_data, float *new_data, int x, int start) { 
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (c * h * w); //output image number
-    if(n < N) {
-    	const int ch = tx % (c * h * w) / (h * w); //output chan number
-   	 const int row = tx % (h * w) / w; //output height index (row number)
-    	const int col = tx % w; //output width index (col number)
-
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] 
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-       		int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
-                	    (old_data[output_index] + old_data[output_index - 1]) / 2;
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);  
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-       	 	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
+__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h,
+                                     int w, float *old_data, float *new_data,
+                                     int x, int start) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (c * h * w);                       // output image number
+  if (n < N) {
+    const int ch = tx % (c * h * w) / (h * w); // output chan number
+    const int row = tx % (h * w) / w; // output height index (row number)
+    const int col = tx % w;           // output width index (col number)
+
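+    // Column-wise counterpart of approxInterpolateRow above: skipped
+    // columns are reconstructed by averaging the neighbouring computed
+    // columns.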
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          (old_data[output_index] + old_data[output_index - 1]) / 2;
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_eff * W_out); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-    const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
+__global__ void convToGemmPerfRowHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
     int h_index;
-    if(h < start) {
-        h_index = h;
+    if (h < start) {
+      h_index = h;
     } else {
-         h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inH = h_index * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-  // #pragma unroll
-    //for (int ki = 0; ki < KH * KW; ki++) {
-     //   int i = ki / KW; 
-     //   int j = ki % KW;
-   
-   for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-    	const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-    	if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+    // #pragma unroll
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //   int i = ki / KW;
+    //   int j = ki % KW;
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
           output[out_index] = 0;
       }
@@ -437,872 +497,941 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
   }
 }
 
-__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int H_eff){
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (C * H_eff * W_out); //output image numbe
-    if(n < N) { 
-        const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number
-        const int h = tx % (H_eff * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        int h_index;                   
-        if(h < start) {                
-            h_index = h;               
-        } else {                       
-            h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1;                                                            
-        }                              
-        const int inH = h_index * V_stride - V_pad;
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        // #pragma unroll
-        //for (int ki = 0; ki < KH * KW; ki++) {
-            //   int i = ki / KW; 
-            //   int j = ki % KW; 
-            for(int i = 0; i < KH; i++) {
-                for(int j = 0; j < KW; j++) {
-                    const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-                    const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
-                    //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-                    if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                    else
-                        output[out_index] = 0;
-                }
-            }
+__global__ void convToGemmPerfRowHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int H_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_eff * W_out);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number
+    const int h =
+        tx % (H_eff * W_out) / W_out; // output height index (row number)
+    const int w = tx % W_out;         // output width index (col number)
+    int h_index;
+    if (h < start) {
+      h_index = h;
+    } else {
+      h_index = ((h - start + 1) * x) / (x - 1) +
+                (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
-}
-
-__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image number
-    const int stride = blockDim.x * gridDim.x;
-    //if(n < N) {
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
-
-        //const int ch = tx % (c * h * w) / (h * w); //filter number
-        //const int row = tx % (h * w) / w; //output height index (row number)
-        //const int col = tx % w; //output width index (col number)
-
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
-        } else if(row == h-1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            int row_index = row - ((row + 1 - start) / x);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-				__hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+    const int inH = h_index * V_stride - V_pad;
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+    // #pragma unroll
+    // for (int ki = 0; ki < KH * KW; ki++) {
+    //   int i = ki / KW;
+    //   int j = ki % KW;
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
+        //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
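+        // The commented-out index above is the batch-major layout used by
+        // convToGemmPerfRowHalf; this variant instead writes the GEMM matrix
+        // with the filter element as the outermost dimension, followed by
+        // the batch index n.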
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h, int w,
-                          __half *old_data, __half *new_data, int x, int start) {
-    
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    //const int n = tx / (c * h * w); //output image numbe
-    const int stride = blockDim.x * gridDim.x;
-    //if(n < N) {
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
-        
-        //const int ch = tx % (c * h * w) / (h * w); //filter number
-        //const int row = tx % (h * w) / w; //output height index (row number)
-        //const int col = tx % w; //output width index (col number
-        if(row < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col];
-       } else if(row == h-1) {
-           new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col];
-        } else if (row == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col];
-        } else if((row - start) % x == 0) {
-            const int row_index = row - ((row + 1 - start) / x);
-            const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                    __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
-        } else {
-            const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
-            const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-            //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
+
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  // const int n = tx / (c * h * w); //output image number
+  const int stride = blockDim.x * gridDim.x;
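+  // Grid-stride loop: unlike the float approxInterpolateRow above, N here
+  // counts output elements rather than images, and each element index i is
+  // decomposed into (n, ch, row, col) below.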
+  // if(n < N) {
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
+
+    // const int ch = tx % (c * h * w) / (h * w); //filter number
+    // const int row = tx % (h * w) / w; //output height index (row number)
+    // const int col = tx % w; //output width index (col number)
+
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      int row_index = row - ((row + 1 - start) / x);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      int output_index =
+          n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
+__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
 
-__global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                       const int H, const int W, const int KH, const int KW, const int V_pad,
-                       const int H_pad, const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int x, const int start, const int W_eff){
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  // const int n = tx / (c * h * w); //output image number
+  const int stride = blockDim.x * gridDim.x;
+  // if(n < N) {
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
+
+    // const int ch = tx % (c * h * w) / (h * w); //filter number
+    // const int row = tx % (h * w) / w; //output height index (row number)
+    // const int col = tx % w; //output width index (col number)
+    if (row < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col];
+    } else if (row == h - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) +
+                   col];
+    } else if (row == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col];
+    } else if ((row - start) % x == 0) {
+      const int row_index = row - ((row + 1 - start) / x);
+      const int output_index =
+          ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
+    } else {
+      const int row_index =
+          row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
+      const int output_index =
+          ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col;
+      // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
+    }
+  }
+}
 
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_eff); //output image number
-  if(n < N) {
-    const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-    const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-    const int w = tx % W_eff; //output width index (col number)
+__global__ void convToGemmPerfColHalf(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
     int w_index;
-    if(w < start) {
+    if (w < start) {
       w_index = w;
     } else {
-      w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
     const int inW = w_index * H_stride - H_pad;
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-     //#pragma unroll
-    //  for (int ki = 0; ki < KH * KW; ki++) {               
-      //    int i = ki / KW;
-       //   int j = ki % KW; 
-    
-    for(int i = 0; i < KH; i++) {
-      for(int j = 0; j < KW; j++) {
-        const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] =
-            input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    //#pragma unroll
+    //  for (int ki = 0; ki < KH * KW; ki++) {
+    //    int i = ki / KW;
+    //   int j = ki % KW;
+
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
         else
-          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0;
-
+          output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff +
+                 w] = 0;
       }
     }
   }
 }
 
-__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
-                       const __half * const __restrict input, const int N, const int C,
-                        const int H, const int W, const int KH, const int KW, const int V_pad,
-                        const int H_pad, const int H_out, const int W_out, const int V_stride,
-                        const int H_stride, const int x, const int start, const int W_eff){
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_eff); //output image number
-      if(n < N) {
-          const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number
-          const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number)
-          const int w = tx % W_eff; //output width index (col number)
-          int w_index;
-          if(w < start) {
-              w_index = w;
-          } else {
-              w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
-          }
-          const int inW = w_index * H_stride - H_pad;
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          //#pragma unroll
-          //  for (int ki = 0; ki < KH * KW; ki++) {               
-              //    int i = ki / KW;
-              //   int j = ki % KW; 
-          for(int i = 0; i < KH; i++) {
-              for(int j = 0; j < KW; j++) {
-                  const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
-                  const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
-                  if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-                        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-                  else
-                      output[out_index] = 0;
-              }
-        }
+__global__ void convToGemmPerfColHalf2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride, const int x,
+    const int start, const int W_eff) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_eff);               // output image number
+  if (n < N) {
+    const int c =
+        tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number
+    const int h =
+        tx % (H_out * W_eff) / W_eff; // output height index (row number)
+    const int w = tx % W_eff;         // output width index (col number)
+    int w_index;
+    if (w < start) {
+      w_index = w;
+    } else {
+      w_index = ((w - start + 1) * x) / (x - 1) +
+                (((w - start + 1) * x) % (x - 1) > 0) + start - 1;
     }
+    const int inW = w_index * H_stride - H_pad;
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    //#pragma unroll
+    //  for (int ki = 0; ki < KH * KW; ki++) {
+    //    int i = ki / KW;
+    //   int j = ki % KW;
+    for (int i = 0; i < KH; i++) {
+      for (int j = 0; j < KW; j++) {
+        const int filter_elem_num =
+            c * KH * KW + i * KW + j; // index of this filter element
+        const int out_index =
+            ((filter_elem_num * N + n) * H_out + h) * W_eff + w;
+        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+          output[out_index] =
+              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+        else
+          output[out_index] = 0;
+      }
+    }
+  }
 }
 
+__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
+                                         int w, __half *old_data,
+                                         __half *new_data, int x, int start) {
 
-__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int stride = blockDim.x * gridDim.x;
 
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int stride = blockDim.x * gridDim.x;
-    
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
 
-    //const int n = tx / (c * h * w); //output image number
-    //if(n < N) {
-    	//const int ch = tx % (c * h * w) / (h * w); //output chan number
-    	//const int row = tx % (h * w) / w; //output height index (row number)
+    // const int n = tx / (c * h * w); //output image number
+    // if(n < N) {
+    // const int ch = tx % (c * h * w) / (h * w); //output chan number
+    // const int row = tx % (h * w) / w; //output height index (row number)
     //	const int col = tx % w; //output width index (col number)
 
-    	if(col < start) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                	= old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-    	} else if(col == w - 1) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-    	} else if (col == 0) {
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-            		old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-    	} else if((col - start) % x == 0) {
-        	int col_index = col - ((col + 1 - start) / x);
-        	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-			__hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-    	} else {
-        	int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-         	int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-        	new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-    	}
-   }
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      int col_index = col - ((col + 1 - start) / x);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      int output_index =
+          n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
+    }
+  }
 }
 
-__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
-                                                __half *old_data, __half *new_data, int x, int start) {
-    
-    const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int stride = blockDim.x * gridDim.x;
-    
-    for(int i = index; i < N; i += stride){
-        const int col = ((i % (c * h * w)) % (h * w)) % w;
-        const int row = ((i % (c * h * w)) % (h * w)) / w;
-        const int ch = (i % (c * h * w)) / (h * w);
-        const int n = i / (c * h * w);
-        //const int n = tx / (c * h * w); //output image number
-        //if(n < N) {
-            //const int ch = tx % (c * h * w) / (h * w); //output chan number
-            //const int row = tx % (h * w) / w; //output height index (row number)
-            //  const int col = tx % w; //output width index (col number)
-        if(col < start) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
-                        = old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
-        } else if(col == w - 1) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-                            //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
-        } else if (col == 0) {
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                        old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)];
-                        //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
-        } else if((col - start) % x == 0) {
-            const int col_index = col - ((col + 1 - start) / x);
-            const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
-                            __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
-        } else {
-            const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
-            const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-            //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
-            new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
-        }
+__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
+                                          int w, __half *old_data,
+                                          __half *new_data, int x, int start) {
+
+  const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int stride = blockDim.x * gridDim.x;
+
+  for (int i = index; i < N; i += stride) {
+    const int col = ((i % (c * h * w)) % (h * w)) % w;
+    const int row = ((i % (c * h * w)) % (h * w)) / w;
+    const int ch = (i % (c * h * w)) / (h * w);
+    const int n = i / (c * h * w);
+    // const int n = tx / (c * h * w); //output image number
+    // if(n < N) {
+    // const int ch = tx % (c * h * w) / (h * w); //output chan number
+    // const int row = tx % (h * w) / w; //output height index (row number)
+    //  const int col = tx % w; //output width index (col number)
+    if (col < start) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col];
+      // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+    } else if (col == w - 1) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) +
+                   old_w - 1];
+      // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+    } else if (col == 0) {
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)];
+      // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+    } else if ((col - start) % x == 0) {
+      const int col_index = col - ((col + 1 - start) / x);
+      const int output_index =
+          ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
+    } else {
+      const int col_index =
+          col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
+      const int output_index =
+          ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index;
+      // const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row *
+      // old_w + col_index;
+      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
+          old_data[output_index];
     }
+  }
 }
 
-
-__global__ void convToGemmFullInputRegular(float * const __restrict__ output,
-				    const float * const __restrict input,
-				    const int N, const int C,
-				    const int H, const int W,
-				    const int KH, const int KW, const int V_pad,
-				    const int H_pad, const int H_out,
-				    const int W_out, const int V_stride,
-				    const int H_stride, const int reduced_filter_elem,
-				    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (H_out * W_out); //output image number
-  if(n < N) {
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         const int ch = (fi * C) / reduced_filter_elem;
-         const int offset = (skip_offset + ch) % skip_every;
-         int in_index;
-         if(fi < offset) {
-             in_index = fi;
-         } else {
-             in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                        + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-        }
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; 
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
+__global__ void
+convToGemmFullInputRegular(float *const __restrict__ output,
+                           const float *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int ch = (fi * C) / reduced_filter_elem;
+      const int offset = (skip_offset + ch) % skip_every;
+      int in_index;
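+      // fi walks the reduced (subsampled) filter elements; in_index maps it
+      // back to the original filter-element index, skipping one element out
+      // of every skip_every, with the skip pattern rotated per channel by
+      // skip_offset.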
+      if (fi < offset) {
+        in_index = fi;
+      } else {
+        in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                   (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                   offset - 1;
+      }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
-                    const float * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-    
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            int in_index;
-            if(fi < skip_offset) {
-                in_index = fi;
-            } else {
-                in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                            + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-            }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
+__global__ void convToGemmFullInputIrregular(
+    float *const __restrict__ output, const float *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      int in_index;
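+      // "Irregular" variant: the same skip_offset is applied to the whole
+      // flattened filter (no per-channel rotation), and the source channel
+      // is recovered from in_index.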
+      if (fi < skip_offset) {
+        in_index = fi;
+      } else {
+        in_index =
+            ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+            (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+            skip_offset - 1;
+      }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void createReducedFiltersFullRegular(float * output,
-					 const float * const __restrict input, const int NF,
-					 const int num_filter_elem, const int reduced_filter_elem, 
-                     const int channels,
-					 const int skip_every, const int skip_offset, const float fac) {
-  
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) { 
-    const int offset = tx % reduced_filter_elem; //offset within filter
+__global__ void createReducedFiltersFullRegular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
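+    // Each filter is subsampled with the same per-channel skip pattern used
+    // on the input patches; the kept weights are scaled by fac, presumably
+    // to compensate for the dropped filter elements.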
-      int in_index;
-      if(offset < channel_offset) {
-        in_index = offset;
-     } else {
-         in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                  + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-     }
-    output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+    int in_index;
+    if (offset < channel_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+           0) +
+          channel_offset - 1;
+    }
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
   }
 }
 
-__global__ void createReducedFiltersFullIrregular(float * output,
-                     const float * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int fIdx = tx / reduced_filter_elem; //filter index
-      if(fIdx < NF) {
-        const int offset = tx % reduced_filter_elem; //offset within filter
-        int in_index;
-        if(offset < skip_offset) {
-            in_index = offset;
-        } else {
-            in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; 
-        }
-        output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+__global__ void createReducedFiltersFullIrregular(
+    float *output, const float *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int fIdx = tx / reduced_filter_elem;            // filter index
+  if (fIdx < NF) {
+    const int offset = tx % reduced_filter_elem; // offset within filter
+    int in_index;
+    if (offset < skip_offset) {
+      in_index = offset;
+    } else {
+      in_index =
+          ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+          (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+          skip_offset - 1;
     }
+    output[fIdx * reduced_filter_elem + offset] =
+        fac * input[num_filter_elem * fIdx + in_index];
+  }
 }
 
-__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  const int n = tx / (C * H_out * W_out); //output image number
-  if(n < N) {
-    const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-    const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-    const int w = tx % W_out; //output width index (col number)
-    const int inH = h * V_stride - V_pad; //input height index (row number)
-    const int inW = w * H_stride - H_pad; //input width index (col number)
-    
-    #pragma unroll
-    //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-         //const int ch = (fi * C) / reduced_filter_elem;
-      for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-        const int fi = ch * (reduced_filter_elem / C) + ki;
-        const int offset = (skip_offset + ch) % skip_every;
-         //int in_index;
-         const bool condition = (fi < offset);
-         const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-         //if(fi < offset) {
-         //    in_index = fi;
-         //} else {
-         //    in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-           //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-       // }
-        const int i = (in_index % (KW * KH)) / KW;
-        const int j = in_index % KW;
-        const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-        if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { 
-            output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-        } else {
-            output[out_index] = 0;
-        }
+__global__ void
+convToGemmHalfInputRegular(__half *const __restrict__ output,
+                           const __half *const __restrict input, const int N,
+                           const int C, const int H, const int W, const int KH,
+                           const int KW, const int V_pad, const int H_pad,
+                           const int H_out, const int W_out, const int V_stride,
+                           const int H_stride, const int reduced_filter_elem,
+                           const int skip_every, const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    // for(int fi = 0; fi < reduced_filter_elem; fi++) {
+    // const int ch = (fi * C) / reduced_filter_elem;
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      const int offset = (skip_offset + ch) % skip_every;
+      // int in_index;
+      const bool condition = (fi < offset);
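+      // Branch-free select equivalent to the if/else kept in the comment
+      // below: use fi directly when fi < offset, otherwise the remapped
+      // index.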
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+      // if(fi < offset) {
+      //    in_index = fi;
+      //} else {
+      //    in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+      //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0)
+      //             + offset - 1;
+      // }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
       }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C, 
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-      const int n = tx / (C * H_out * W_out); //output image number
-      if(n < N) {
-           const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number
-          const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-          const int w = tx % W_out; //output width index (col number)
-          const int inH = h * V_stride - V_pad; //input height index (row number)
-          const int inW = w * H_stride - H_pad; //input width index (col number)
-          
-          #pragma unroll
-           for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-               const int fi = ch * (reduced_filter_elem / C) + ki;
-          //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-           //   const int ch = (fi * C) / reduced_filter_elem;
-              const int offset = (skip_offset + ch) % skip_every;
-              const int condition = (fi < offset);
-             const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
-                                                          + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-             // int in_index;
-              //if(fi < offset) {
-               //   in_index = fi;
-              //} else {
-               //   in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-                 //               + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-             // }
-              const int i = (in_index % (KW * KH)) / KW;
-              const int j = in_index % KW;
-              const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-              if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                  output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-              } else {
-                  output[out_index] = 0;
-             }
-        }
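+// Same regular (per-channel) skip pattern as convToGemmHalfInputRegular, but
+// the patch matrix is written with the filter-element index outermost
+// (out_index = ((fi * N + n) * H_out + h) * W_out + w), which appears to
+// target the single large GEMM path rather than the strided-batched one.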
+__global__ void convToGemmHalfInputRegular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (C * H_out * W_out);               // output image number
+  if (n < N) {
+    const int ch =
+        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int ki = 0; ki < reduced_filter_elem / C; ki++) {
+      const int fi = ch * (reduced_filter_elem / C) + ki;
+      // for(int fi = 0; fi < reduced_filter_elem; fi++) {
+      //   const int ch = (fi * C) / reduced_filter_elem;
+      const int offset = (skip_offset + ch) % skip_every;
+      const int condition = (fi < offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               offset - 1);
+      // int in_index;
+      // if(fi < offset) {
+      //   in_index = fi;
+      //} else {
+      //   in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+      //               + (((fi - offset + 1) * skip_every) % (skip_every - 1) >
+      //               0) + offset - 1;
+      // }
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
-                    const __half * const __restrict input,
-                    const int N, const int C,
-                    const int H, const int W,
-                    const int KH, const int KW, const int V_pad,
-                    const int H_pad, const int H_out,
-                    const int W_out, const int V_stride,
-                    const int H_stride, const int reduced_filter_elem,
-                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-        
-        #pragma unroll
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-            //int in_index;
-            //if(fi < skip_offset) {
-             //   in_index = fi;
-            //} else {        
-              //  in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-              //              + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW; 
-            const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
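+// Builds the half-precision im2col matrix for the "irregular" sampling case:
+// filter elements are skipped based on skip_offset alone (not per channel),
+// and the result is laid out per image for the strided-batched GEMM.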
+__global__ void convToGemmHalfInputIrregular(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+      // int in_index;
+      // if(fi < skip_offset) {
+      //   in_index = fi;
+      //} else {
+      //  in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+      //              + (((fi - skip_offset + 1) * skip_every) % (skip_every -
+      //              1) > 0) + skip_offset - 1;
+      // }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index =
+          ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
-__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output,
-                                    const __half * const __restrict input,
-                                    const int N, const int C,
-                                    const int H, const int W,
-                                    const int KH, const int KW, const int V_pad,
-                                    const int H_pad, const int H_out,
-                                    const int W_out, const int V_stride,
-                                    const int H_stride, const int reduced_filter_elem,
-                                    const int skip_every, const int skip_offset) {
-
-    const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-    const int n = tx / (H_out * W_out); //output image number
-    if(n < N) {
-        const int h = tx % (H_out * W_out) / W_out; //output height index (row number)
-        const int w = tx % W_out; //output width index (col number)
-        const int inH = h * V_stride - V_pad; //input height index (row number)
-        const int inW = w * H_stride - H_pad; //input width index (col number)
-       #pragma unroll 
-        for(int fi = 0; fi < reduced_filter_elem; fi++) {
-            const int condition = (fi < skip_offset);
-            const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                                 + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-           // int in_index;
-           // if(fi < skip_offset) {
-           //     in_index = fi;
-           // } else {
-            //    in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-                   //             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-           // }
-            const int ch = in_index / (KW * KH);
-            const int i = (in_index % (KW * KH)) / KW;
-            const int j = in_index % KW;
-            const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-            //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-            if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-                output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-            } else {
-                output[out_index] = 0;
-            }
-        }
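+// Irregular-sampling counterpart of convToGemmHalfInputRegular2: same skip
+// mapping as convToGemmHalfInputIrregular, but with the filter-element index
+// outermost in the output layout, presumably for the single-GEMM path.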
+__global__ void convToGemmHalfInputIrregular2(
+    __half *const __restrict__ output, const __half *const __restrict input,
+    const int N, const int C, const int H, const int W, const int KH,
+    const int KW, const int V_pad, const int H_pad, const int H_out,
+    const int W_out, const int V_stride, const int H_stride,
+    const int reduced_filter_elem, const int skip_every,
+    const int skip_offset) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int n = tx / (H_out * W_out);                   // output image number
+  if (n < N) {
+    const int h =
+        tx % (H_out * W_out) / W_out;     // output height index (row number)
+    const int w = tx % W_out;             // output width index (col number)
+    const int inH = h * V_stride - V_pad; // input height index (row number)
+    const int inW = w * H_stride - H_pad; // input width index (col number)
+#pragma unroll
+    for (int fi = 0; fi < reduced_filter_elem; fi++) {
+      const int condition = (fi < skip_offset);
+      const int in_index =
+          condition * fi +
+          (!condition) *
+              (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) +
+               (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
+               skip_offset - 1);
+      // int in_index;
+      // if(fi < skip_offset) {
+      //     in_index = fi;
+      // } else {
+      //    in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+      //             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1)
+      //             > 0) + skip_offset - 1;
+      // }
+      const int ch = in_index / (KW * KH);
+      const int i = (in_index % (KW * KH)) / KW;
+      const int j = in_index % KW;
+      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
+      // const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) *
+      // W_out + w;
+      if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+        output[out_index] =
+            input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+      } else {
+        output[out_index] = 0;
+      }
     }
+  }
 }
 
+__global__ void createReducedFiltersHalfRegular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int channels, const int skip_every, const int skip_offset,
+    const float fac) {
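+  // Compacts each filter to its reduced_filter_elem kept weights using the
+  // regular per-channel skip mapping, scaling every weight by `fac` to
+  // compensate for the dropped elements.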
 
-__global__ void createReducedFiltersHalfRegular(__half * output,
-                                         const __half * const __restrict input, const int NF,
-                                         const int num_filter_elem, const int reduced_filter_elem,
-                     			 const int channels,
-                                         const int skip_every, const int skip_offset, const float fac) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
   const int stride = blockDim.x * gridDim.x;
-  
+
   //#pragma unroll
   for (int i = tx; i < NF; i += stride) {
-    const int fIdx = i / reduced_filter_elem; //filter index
-  //if(fIdx < NF) {
-    const int offset = i % reduced_filter_elem; //offset within filter
+    const int fIdx = i / reduced_filter_elem; // filter index
+    // if(fIdx < NF) {
+    const int offset = i % reduced_filter_elem; // offset within filter
     const int ch = (offset * channels) / reduced_filter_elem;
     const int channel_offset = (skip_offset + ch) % skip_every;
     const int condition = (offset < channel_offset);
-    const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-                          + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
-      
-     // int in_index;
-     // if(offset < channel_offset) {
-      //  in_index = offset;
-     //} else {
-       //  in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-         //         + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
+    const int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             channel_offset - 1);
+
+    // int in_index;
+    // if(offset < channel_offset) {
+    //  in_index = offset;
+    //} else {
+    //  in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every -
+    //  1)
+    //         + (((offset - channel_offset + 1) * skip_every) % (skip_every -
+    //         1) > 0) + channel_offset -1;
     // }
-    output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
   }
 }
 
-__global__ void createReducedFiltersHalfIrregular(__half * output,
-                     const __half * const __restrict input, const int NF,
-                     const int num_filter_elem, const int reduced_filter_elem,
-                     const int skip_every, const int skip_offset, const float fac) {
-
-      const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-     const int stride = blockDim.x * gridDim.x;
-      //#pragma unroll
-      for (int i = tx; i < NF; i += stride) { 
-  
-      const int fIdx = i / reduced_filter_elem; //filter index
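+// Same filter compaction as createReducedFiltersHalfRegular, but with the
+// irregular (channel-independent) skip mapping.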
+__global__ void createReducedFiltersHalfIrregular(
+    __half *output, const __half *const __restrict input, const int NF,
+    const int num_filter_elem, const int reduced_filter_elem,
+    const int skip_every, const int skip_offset, const float fac) {
+
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
+  const int stride = blockDim.x * gridDim.x;
+  //#pragma unroll
+  for (int i = tx; i < NF; i += stride) {
+
+    const int fIdx = i / reduced_filter_elem; // filter index
     // if(fIdx < NF) {
-        const int offset = i % reduced_filter_elem; //offset within filter
-        const int condition = (offset < skip_offset);
-        int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
-                     + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-        //}
-        output[fIdx * reduced_filter_elem + offset] =  __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); 
+    const int offset = i % reduced_filter_elem; // offset within filter
+    const int condition = (offset < skip_offset);
+    int in_index =
+        condition * offset +
+        (!condition) *
+            (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) +
+             (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) >
+              0) +
+             skip_offset - 1);
+    //}
+    output[fIdx * reduced_filter_elem + offset] =
+        __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
     //}
   }
 }
 
-void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
-			 int vertical_pad, int horizontal_pad, int vertical_stride,
-			 int horizontal_stride, int conv_mode, int conv_groups,
-			 int row, int col, int start){
+void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad,
+                         int horizontal_pad, int vertical_stride,
+                         int horizontal_stride, int conv_mode, int conv_groups,
+                         int row, int col, int start) {
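+  // FP32 perforated convolution: performs im2col + batched SGEMM over a
+  // reduced set of output rows (row > 1) or columns (col > 1) starting at
+  // `start`, then interpolates the skipped rows/columns; with row == col == 1
+  // it falls back to the exact im2col + SGEMM computation.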
 
   //////INFO("*** TensorConvolution (output perforation) \n");
-  //Event("Conv");
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  // Event("Conv");
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
-  
-  Tensor* output;
+
+  Tensor *output;
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-  //Event("H2F_start");
+  // Event("H2F_start");
   convertToFP32(input);
   convertToFP32(filter);
-  //Event("H2F_end");
-  
+  // Event("H2F_end");
+
   long int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
-  c = filter->dims.dim_sizes[0]; //number of filters
+  c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
 
   h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
   int rem_row = (h - start) % row > 0;
   int h_eff = h - ((h - start) / row) - rem_row;
-  
-  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+
+  w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   int rem_col = (w - start) % col > 0;
   int w_eff = w - ((w - start) / col) - rem_col;
 
-  Tensor* new_output;
-  if(row > 1){
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+  Tensor *new_output;
+  if (row > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float* convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, start, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, start, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h_eff * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h_eff * w,
-					      num_filter_elem * h_eff * w,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h_eff * w, c * h_eff * w,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateRow<<<numBlocks, 128>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(col > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+  } else if (col > 1) {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, start, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, start, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData,
-					      h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data,
-					      num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data,
-					      h * w_eff, c * h * w_eff,
-					      n));
-
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-					 CUDNN_TENSOR_NCHW, n, c, h, w);
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    new_output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
 
-    //interpolate
-    int numBlocks = (n * c * h * w  + 127) / 128;
-    approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, start);
+    // interpolate
+    int numBlocks = (n * c * h * w + 127) / 128;
+    approxInterpolateCol<<<numBlocks, 128>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, start);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else { 
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
+  } else {
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
-    //total number of filter elem
+    // total number of filter elem
     const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    convToGemmApprox<<<gridSize, blockSize>>>(convData,
-					      (float *)input->gpu_data, n,
-					      input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2],
-					      input->dims.dim_sizes[3],
-					      KH, KW,
-					      vertical_pad, horizontal_pad, h, w,
-					      vertical_stride, horizontal_stride,
-					      num_filter_elem, c * h * w);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    convToGemmApprox<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, c * h * w);
     checkCudaErrors(cudaDeviceSynchronize());
-    //Do the matrix multiplication
-    //Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
-    
+    // Do the matrix multiplication
+    // Want to multiply convData by filter->gpu_data[f * chan * KH * KW]
+
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w, num_filter_elem * h * w,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w, c * h * w,
-					      n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
+        &alpha, convData, h * w, num_filter_elem * h * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w, c * h * w, n));
 
     new_output = output;
     cudaFree(convData);
   }
 
-  //Event("Conv_end"); //, true);
+  // Event("Conv_end"); //, true);
   return new_output;
 }
 
-__global__
-void switchMatrixFull(int N, int n, int c, int h, int w,
-              float *old_data, float *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-          int col = ((i % (c * h * w)) % (h * w)) % w;
-          int row = ((i % (c * h * w)) % (h * w)) / w;
-          int ch = (i % (c * h * w)) / (h * w);
-          int n_new = i / (c * h * w);
-          
-          new_data[((n_new * c + ch) * h + row ) * w + col] =
-                        old_data[((ch * n + n_new) * h + row ) * w + col];
-        }
-}
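+// Reorders the single-GEMM result, which is produced with the filter/channel
+// dimension outermost, back into standard NCHW layout.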
+__global__ void switchMatrixFull(int N, int n, int c, int h, int w,
+                                 float *old_data, float *new_data) {
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
 
-void* tensorConvApprox(void* input_ptr, void* filter_ptr,
-		       int vertical_pad, int horizontal_pad, int vertical_stride,
-		       int horizontal_stride, int conv_mode, int conv_groups,
-		       int row, int col, int skip_every, int offset){
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
+
+void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad,
+                       int horizontal_pad, int vertical_stride,
+                       int horizontal_stride, int conv_mode, int conv_groups,
+                       int row, int col, int skip_every, int offset) {
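+  // FP32 approximate convolution entry point: dispatches to row perforation,
+  // column perforation, filter-element sampling (skip_every > 1) with
+  // rescaled reduced filters, or the exact FP32 baseline when no
+  // approximation is requested.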
 
   //////INFO("*** TensorConvolution approximation \n");
-  //Event("Conv");
+  // Event("Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
@@ -1316,15 +1445,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////Event("H2F_end");
 
   const int n = input->dims.dim_sizes[0];
-  const int c = filter->dims.dim_sizes[0]; //number of filters
+  const int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
   ////INFO("batch: %d\n", n);
@@ -1337,327 +1469,299 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
   ////INFO("horizontal_stride: %d\n", horizontal_stride);
   ////INFO("output height: %d\n", h);
   ////INFO("output width: %d\n", w);
-  if(row > 1) {
+  if (row > 1) {
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
 
-    Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h_eff, w);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-    convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w,
-					       vertical_stride, horizontal_stride,
-					       row, offset, h_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h_eff * w));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+    convToGemmPerfRow<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        row, offset, h_eff);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                &alpha,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (float *)filter->gpu_data, num_filter_elem, 0,
-                                                &beta,
-                                                (float *)output->gpu_data, h_eff * w, c * h_eff * w,
-                                                n));
-    //interpolate
+
+    float alpha = 1.0f, beta = 0.0f;
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+        &alpha, convData, h_eff * w, num_filter_elem * h_eff * w,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h_eff * w, c * h_eff * w, n));
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *) output->gpu_data,
-					    (float *) new_output->gpu_data,
-					    row, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateRow<<<numBlocks, blocksize>>>(
+        n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, row, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(col > 1) {
+  } else if (col > 1) {
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff));
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-
-    convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3], KH, KW,
-					       vertical_pad, horizontal_pad, h, w,
-					       vertical_stride, horizontal_stride,
-					       col, offset, w_eff);
+    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h * w_eff));
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+
+    convToGemmPerfCol<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        col, offset, w_eff);
     checkCudaErrors(cudaDeviceSynchronize());
 
     float alpha = 1.0f, beta = 0.0f;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-					      CUBLAS_OP_N, CUBLAS_OP_N,
-					      h * w_eff, c, num_filter_elem,
-					      &alpha,
-					      convData, h * w_eff, num_filter_elem * h * w_eff,
-					      (float *)filter->gpu_data, num_filter_elem, 0,
-					      &beta,
-					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
-					      n));
-
-    //interpolate
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
+        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
+        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));
+
+    // interpolate
     int blocksize = 128;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-					    (float *)output->gpu_data,
-					    (float *)new_output->gpu_data,
-					    col, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    approxInterpolateCol<<<numBlocks, blocksize>>>(
+        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
+        (float *)new_output->gpu_data, col, offset);
     cudaDeviceSynchronize();
 
     freeTensor(output);
     cudaFree(convData);
-  } else if(skip_every > 1) {
-    //reduced number after skipping
+  } else if (skip_every > 1) {
+    // reduced number after skipping
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    float* convData;
+    float *convData;
     size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    float* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
-    
+    float *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));
+
     const int filtBlockSize = 128;
     ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac = ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     //////INFO("fac: %f\n", fac);
     const int blockSize = 128;
-    //////INFO("n * h * w : %d\n", (n * h * w ));    
-    const int gridSize = (n * h * w + blockSize - 1) / blockSize;  
-    if(!(KH * KW % skip_every)) {
-       // ////INFO("REGULAR FILTERING\n");
-        createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                (float *)filter->gpu_data,
-								c, num_filter_elem,
-								reduced_filter_elem,
-								input->dims.dim_sizes[1], skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+    //////INFO("n * h * w : %d\n", (n * h * w ));
+    const int gridSize = (n * h * w + blockSize - 1) / blockSize;
+    if (!(KH * KW % skip_every)) {
+      // ////INFO("REGULAR FILTERING\n");
+      createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+          fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputRegular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     } else {
-       // ////INFO("IRREGULAR FILTERING\n");
-        createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (float *)filter->gpu_data,
-                                    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
-        checkCudaErrors(cudaDeviceSynchronize());
-        convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,     
-                                                                input->dims.dim_sizes[1],                                                     
-                                                                input->dims.dim_sizes[2],                                                 
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
+      // ////INFO("IRREGULAR FILTERING\n");
+      createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(
+          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
+          reduced_filter_elem, skip_every, offset, fac);
+      checkCudaErrors(cudaDeviceSynchronize());
+      convToGemmFullInputIrregular<<<gridSize, blockSize>>>(
+          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, reduced_filter_elem, skip_every, offset);
     }
     checkCudaErrors(cudaDeviceSynchronize());
-    
+
     const float alpha = 1.0;
     const float beta = 0.0;
-    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            &alpha,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
+    checkCudaErrors(cublasSgemmStridedBatched(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+        &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter,
+        reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w,
+        c * h * w, n));
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
-      //INFO("FP32 BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
-                               CUDNN_TENSOR_NCHW, n, c, h, w);
+    // INFO("FP32 BASELINE\n");
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
     changeTensorPlacement(new_output, DEVICE);
 
-    float * convData;
+    float *convData;
     long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
 
     const int blockSize = 128;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w));
-    convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n,
-					       input->dims.dim_sizes[1],
-					       input->dims.dim_sizes[2],
-					       input->dims.dim_sizes[3],
-					       KH, KW, vertical_pad, horizontal_pad,
-					       h, w, vertical_stride, horizontal_stride, 
-                           skip_every, offset);//num_filter_elem);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h * w));
+    convToGemmFullInput<<<gridSize, blockSize>>>(
+        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        skip_every, offset); // num_filter_elem);
     checkCudaErrors(cudaDeviceSynchronize());
-     
-     float alpha = 1.0f, beta = 0.0f;
-     /*
-     checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
-                                          CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, num_filter_elem,
-                                            &alpha,
-                                            convData, h * w, num_filter_elem * h * w,
-                                            (float *)filter->gpu_data, num_filter_elem, 0,
-                                            &beta,
-                                            (float *)new_output->gpu_data, h * w, c * h * w,
-                                            n));
-    */
-    checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                       n * h * w, c, num_filter_elem,
-                        &alpha,
-                        convData,
-                        CUDA_R_32F, n * h * w,
-                        (float *) filter->gpu_data, CUDA_R_32F,
-                        num_filter_elem,
-                        &beta,
-                        (float *) output->gpu_data,
-                        CUDA_R_32F, n * h * w,
-                        CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-    
-    const int numBlocks = (n * c * h * w  + 255) / 256;
-    switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (float *)output->gpu_data,
-                                    (float *)new_output->gpu_data);
-    
+
+    float alpha = 1.0f, beta = 0.0f;
+    /*
+    checkCudaErrors(cublasSgemmStridedBatched(cublasHandle,
+                                         CUBLAS_OP_N, CUBLAS_OP_N,
+                                           h * w, c, num_filter_elem,
+                                           &alpha,
+                                           convData, h * w, num_filter_elem * h
+    * w, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float
+    *)new_output->gpu_data, h * w, c * h * w, n));
+   */
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data,
+        CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data,
+        CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (float *)output->gpu_data,
+                                         (float *)new_output->gpu_data);
+
     checkCudaErrors(cudaDeviceSynchronize());
     cudaFree(convData);
   }
 
-  //Event("Conv_end");
+  // Event("Conv_end");
   return new_output;
 }
 
-__global__
-void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){
-
-      int i = blockIdx.x * blockDim.x + threadIdx.x;
-      if(i < N){
-            int col = ((i % (c * h * w)) % (h * w)) % w;
-            int row = ((i % (c * h * w)) % (h * w)) / w;
-            int ch = (i % (c * h * w)) / (h * w);
-            int n_new = i / (c * h * w);
-            
-            new_data[((n_new * c + ch) * h + row ) * w + col] =
-                            old_data[((ch * n + n_new) * h + row ) * w + col];
-      }
-}
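+// Half-precision version of switchMatrixFull: converts the filter-major GEMM
+// output back to NCHW.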
+__global__ void switchMatrixHalf(int N, int n, int c, int h, int w,
+                                 __half *old_data, __half *new_data) {
 
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < N) {
+    int col = ((i % (c * h * w)) % (h * w)) % w;
+    int row = ((i % (c * h * w)) % (h * w)) / w;
+    int ch = (i % (c * h * w)) / (h * w);
+    int n_new = i / (c * h * w);
 
-void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
-			   int vertical_pad, int horizontal_pad,
-			   int vertical_stride, int horizontal_stride,
-			   int conv_mode, int conv_groups,
-			   int row, int col, int skip_every, int offset) {
+    new_data[((n_new * c + ch) * h + row) * w + col] =
+        old_data[((ch * n + n_new) * h + row) * w + col];
+  }
+}
+
+void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups, int row, int col, int skip_every,
+                            int offset) {
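+  // Half-precision counterpart of tensorConvApprox: same perforation and
+  // sampling knobs, with operands converted to FP16; for small outputs
+  // (h * w <= 64) it uses the single cublasGemmEx layout instead of the
+  // strided-batched Hgemm.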
 
- //INFO("*** TensorConvolution half approximation \n");
- // profileEvent("#Conv");
+  // INFO("*** TensorConvolution half approximation \n");
+  // profileEvent("#Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
-  //FIXME: Current hack to preserve backward compatibilty
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
 
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
- // INFO("CONVERT\n");
+  // INFO("CONVERT\n");
   profileEvent("F2H_start");
-   convertToFP16(input);
-   convertToFP16(filter);
+  convertToFP16(input);
+  convertToFP16(filter);
   profileEvent("F2H_end");
-//INFO("CONVERTED\n");
+  // INFO("CONVERTED\n");
   const long int n = input->dims.dim_sizes[0];
-  const long int c = filter->dims.dim_sizes[0]; //number of filters
+  const long int c = filter->dims.dim_sizes[0]; // number of filters
   const int KH = filter->dims.dim_sizes[2];
   const int KW = filter->dims.dim_sizes[3];
-  const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-  const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+  const long int h =
+      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
+  const long int w =
+      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
+      1;
   const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
 
-  Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-					       CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                                CUDNN_TENSOR_NCHW, n, c, h, w);
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(new_output, DEVICE);
-  //INFO("batch: %d\n", n);
+  // INFO("batch: %d\n", n);
   // INFO("channels: %d\n", input->dims.dim_sizes[1]);
   // INFO("num_filters: %d\n", c);
   // INFO("kernel height: %d\n", KH);
-  // INFO("kernel width: %d\n", KW);   
+  // INFO("kernel width: %d\n", KW);
   // INFO("num_filter_elem: %d\n", num_filter_elem);
-   //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
-   //INFO("vertical_stride: %d\n", vertical_stride);
-   //INFO("horizontal_stride: %d\n", horizontal_stride);
+  // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
+  // INFO("vertical_stride: %d\n", vertical_stride);
+  // INFO("horizontal_stride: %d\n", horizontal_stride);
   // INFO("output height: %d\n", h);
   // INFO("output width: %d\n", w);
-   //INFO("skip_every: %d\n", skip_every);
-  if(row > 1){
+  // INFO("skip_every: %d\n", skip_every);
+  if (row > 1) {
     const int rem_row = (h - offset) % row > 0;
     const int h_eff = h - ((h - offset) / row) - rem_row;
-    
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW,
-						  n, c, h_eff, w);
+
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
 
-    __half * convData;
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w));
+    ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h_eff * w));
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
-    
-    if(h * w <= 64) {
-        convToGemmPerfRowHalf2<<<gridSize, blockSize>>>(convData,
-                                   (__half *)input->gpu_half_data, n,
-                                   input->dims.dim_sizes[1],
-                                   input->dims.dim_sizes[2],
-                                   input->dims.dim_sizes[3],
-                                   KH, KW, vertical_pad,
-                                   horizontal_pad, h, w, vertical_stride,
-                                   horizontal_stride, row, offset, h_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;
+
+    if (h * w <= 64) {
+      convToGemmPerfRowHalf2<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
     } else {
-        convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData,
-						   (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3],
-						   KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, row, offset, h_eff);
+      convToGemmPerfRowHalf<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, row, offset, h_eff);
     }
     checkCudaErrors(cudaDeviceSynchronize());
 
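The row-perforation branch reformatted above first computes the usual convolution output extent (h, w) and then a reduced height h_eff, since every `row`-th output row starting at `offset` is skipped and later reconstructed by an interpolation kernel; the column branch further below is symmetric with w_eff. A minimal host-only C++ sketch of that size arithmetic follows (the helper names are hypothetical and the code is not part of the runtime); it checks that the patch's h_eff expression equals h minus the number of perforated rows.

// Standalone sketch of the perforated-row size arithmetic (hypothetical helper
// names; mirrors the expressions in tensorConvApproxHalf2 above).
#include <cassert>
#include <cstdio>

// Standard convolution output extent along one dimension.
static long convOutSize(long in, int kernel, int pad, int stride) {
  return (2L * pad + in - kernel) / stride + 1;
}

// Number of output rows skipped when every `row`-th row (starting at `offset`)
// is perforated, i.e. ceil((h - offset) / row).
static long perforatedRows(long h, int row, int offset) {
  return (h - offset + row - 1) / row;
}

int main() {
  const long in_h = 32; // example input height
  const int KH = 3, pad = 1, stride = 1, row = 2, offset = 1;

  const long h = convOutSize(in_h, KH, pad, stride);

  // Formula used in the patch:
  const int rem_row = (h - offset) % row > 0;
  const long h_eff = h - ((h - offset) / row) - rem_row;

  // Equivalent formulation: h minus the number of perforated rows.
  assert(h_eff == h - perforatedRows(h, row, offset));
  std::printf("h = %ld, h_eff = %ld computed rows, %ld rows interpolated\n",
              h, h_eff, h - h_eff);
  return 0;
}
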
@@ -1665,74 +1769,68 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-    if(h * w <= 64) {
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                     n * h_eff * w, c, num_filter_elem,
-                     alpha_half,
-                     convData, CUDA_R_16F, n * h_eff * w,
-                     (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                     beta_half,
-                     (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w,
-                     CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    if (h * w <= 64) {
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
-     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                                CUBLAS_OP_N, CUBLAS_OP_N,
-                                                h_eff * w, c, num_filter_elem,
-                                                alpha_half,
-                                                convData, h_eff * w, num_filter_elem * h_eff * w,
-                                                (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                                beta_half,
-                                                (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w,
-                                                n));    
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
+          alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n));
     }
-    //interpolate
+    // interpolate
     int blocksize = 256;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    if(h * w <= 64) {
-        approxInterpolateRowHalf2<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-                                (__half *)output_half->gpu_half_data,
-                                (__half *)new_output->gpu_half_data,
-                                row, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    if (h * w <= 64) {
+      approxInterpolateRowHalf2<<<numBlocks, blocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
     } else {
-        approxInterpolateRowHalf<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						row, offset);
+      approxInterpolateRowHalf<<<numBlocks, blocksize>>>(
+          n * c * h * w, h_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, row, offset);
     }
     checkCudaErrors(cudaDeviceSynchronize());
 
     freeTensor(output_half);
     cudaFree(convData);
-} else if(col > 1) {
+  } else if (col > 1) {
     const int rem_col = (w - offset) % col > 0;
     const int w_eff = w - ((w - offset) / col) - rem_col;
 
-    Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-						  CUDNN_TENSOR_NCHW, n, c, h, w_eff);
+    Tensor *output_half = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output_half, DEVICE);
-   
-    __half * convData;
+
+    __half *convData;
     long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff));
+    ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n *
+    /// input->dims.dim_sizes[1] * h * w_eff));
     const int blockSize = 256;
-    const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
-    if(h * w <= 64) {
-        convToGemmPerfColHalf2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3], KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, col, offset, w_eff);
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
+    if (h * w <= 64) {
+      convToGemmPerfColHalf2<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
     } else {
-        convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-						   input->dims.dim_sizes[1],
-						   input->dims.dim_sizes[2],
-						   input->dims.dim_sizes[3], KH, KW, vertical_pad,
-						   horizontal_pad, h, w, vertical_stride,
-						   horizontal_stride, col, offset, w_eff);
+      convToGemmPerfColHalf<<<gridSize, blockSize>>>(
+          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+          vertical_pad, horizontal_pad, h, w, vertical_stride,
+          horizontal_stride, col, offset, w_eff);
     }
     checkCudaErrors(cudaDeviceSynchronize());
 
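The GEMM calls in the hunk above are the im2col lowering of the perforated convolution: each retained output pixel becomes one row of an (h_eff * w) x num_filter_elem patch matrix, each filter a flattened num_filter_elem vector, and their product yields an (h_eff * w) x c feature map per image. The patch keeps two strategies, one cublasHgemmStridedBatched call over the batch when the spatial output is large, and a single flattened (n * h_eff * w) x c cublasGemmEx when h * w <= 64, presumably because very small per-image GEMMs underutilize the GPU. The row-major CPU reference below only illustrates the dot-product structure; it deliberately ignores the column-major layout, leading dimensions, strides, and half precision of the real calls, and its function name is hypothetical.

// Row-major CPU reference for the im2col GEMM: out[p][f] = dot(patch p, filter f).
// Hypothetical sketch only; the runtime does this in half precision on the GPU.
#include <cstdio>
#include <vector>

// patches: (rows = numPixels) x (cols = filterElems), one im2col patch per output pixel
// filters: (rows = numFilters) x (cols = filterElems), one flattened filter per row
// returns: numPixels x numFilters feature map (row-major)
static std::vector<float> im2colGemm(const std::vector<float> &patches,
                                     const std::vector<float> &filters,
                                     int numPixels, int numFilters,
                                     int filterElems) {
  std::vector<float> out(static_cast<size_t>(numPixels) * numFilters, 0.0f);
  for (int p = 0; p < numPixels; ++p)
    for (int f = 0; f < numFilters; ++f) {
      float acc = 0.0f;
      for (int k = 0; k < filterElems; ++k)
        acc += patches[p * filterElems + k] * filters[f * filterElems + k];
      out[p * numFilters + f] = acc;
    }
  return out;
}

int main() {
  // Toy sizes: 2 output pixels, 3 filters, 4 elements per filter patch.
  std::vector<float> patches = {1, 0, 2, 1,   0, 1, 1, 3};
  std::vector<float> filters = {1, 1, 1, 1,   2, 0, 0, 2,   0, 1, 0, 1};
  std::vector<float> out = im2colGemm(patches, filters, 2, 3, 4);
  for (float v : out)
    std::printf("%.1f ", v); // per-pixel responses for each filter
  std::printf("\n");
  return 0;
}
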
@@ -1740,229 +1838,211 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-    if(h * w <= 64) {
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                         n * h * w_eff, c, num_filter_elem,
-                         alpha_half,
-                         convData, CUDA_R_16F, n * h * w_eff,
-                         (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                         beta_half,
-                         (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff,
-                         CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
+    if (h * w <= 64) {
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c,
+          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff,
+          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
+          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
+          n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
-        checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                              CUBLAS_OP_N, CUBLAS_OP_N,
-                                              h * w_eff, c, num_filter_elem,
-                                              alpha_half,
-                                              convData, h * w_eff, num_filter_elem * h * w_eff,
-                                              (__half *)filter->gpu_half_data, num_filter_elem, 0,
-                                              beta_half,
-                                              (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff,
-                                              n));
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
+          alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff,
+          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
+          (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n));
     }
-    //interpolate
+    // interpolate
     int blocksize = 256;
-    int numBlocks = (n * c * h * w  + blocksize - 1) / blocksize;
-    if(h * w <= 64) {
-        approxInterpolateColHalf2<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-                                (__half *)output_half->gpu_half_data,
-                                (__half *)new_output->gpu_half_data,
-                                col, offset);
+    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
+    if (h * w <= 64) {
+      approxInterpolateColHalf2<<<numBlocks, blocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
 
     } else {
-        approxInterpolateColHalf<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w,
-						(__half *)output_half->gpu_half_data,
-						(__half *)new_output->gpu_half_data,
-						col, offset);
-   }
-   checkCudaErrors(cudaDeviceSynchronize());
+      approxInterpolateColHalf<<<numBlocks, blocksize>>>(
+          n * c * h * w, w_eff, n, c, h, w,
+          (__half *)output_half->gpu_half_data,
+          (__half *)new_output->gpu_half_data, col, offset);
+    }
+    checkCudaErrors(cudaDeviceSynchronize());
 
     freeTensor(output_half);
     cudaFree(convData);
-  } else if(skip_every > 1) {
+  } else if (skip_every > 1) {
     const int remainder = ((num_filter_elem - offset) % skip_every > 0);
-    const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
+    const int reduced_filter_elem =
+        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;
 
-    __half* convData;
+    __half *convData;
     size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
     checkCudaErrors(cudaMalloc(&convData, convDataSize));
-    __half* reducedFilter;
-    checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
+    __half *reducedFilter;
+    checkCudaErrors(
+        cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));
 
     const int filtBlockSize = 256;
-    const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
-    const float fac = ((float) skip_every) / ((float) skip_every - 1);
+    const int filtGridSize =
+        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
+    const float fac = ((float)skip_every) / ((float)skip_every - 1);
     const int blockSize = 256;
-    //const int gridSize = (n * h * w + blockSize - 1) / blockSize;
-   // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
-   // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
+    // const int gridSize = (n * h * w + blockSize - 1) / blockSize;
+    // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
+    // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
     const __half alf = approx_float_to_half(1.0);
     const __half bet = approx_float_to_half(0.0);
     const __half *alpha_half = &alf;
     const __half *beta_half = &bet;
-   if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { 
-      if(!(KH * KW % skip_every)) {
-        //INFO("REGULAR FILTERING\n");
-        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                (__half *)filter->gpu_half_data,
-								c, num_filter_elem,
-                                                                reduced_filter_elem,
-                                                                input->dims.dim_sizes[1], skip_every, offset, fac);
+    if (c * num_filter_elem <
+        500000) { // 250) {//c * reduced_filter_elem < 150000) {
+      if (!(KH * KW % skip_every)) {
+        // INFO("REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
         checkCudaErrors(cudaDeviceSynchronize());
-	
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                        input->dims.dim_sizes[1],
-                                                        input->dims.dim_sizes[2],
-                                                        input->dims.dim_sizes[3],
-                                                        KH, KW, vertical_pad, horizontal_pad,
-                                                        h, w, vertical_stride, horizontal_stride,
-                                                        reduced_filter_elem, skip_every, offset);
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
       } else {
-        //INFO("IRREGULAR FILTERING\n");
-        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                    (__half *)filter->gpu_half_data,
-				    c, num_filter_elem,
-                                    reduced_filter_elem,
-                                    skip_every, offset, fac);
+        // INFO("IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
         checkCudaErrors(cudaDeviceSynchronize());
-        
-        const int gridSize = (n * h * w * input->dims.dim_sizes[1]  + blockSize - 1) / blockSize;
-	    //convToGemmHalfInputIrregular
-        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,  
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-     }   
-     checkCudaErrors(cudaDeviceSynchronize());
-
-     checkCudaErrors(cublasHgemmStridedBatched(cublasHandle,
-                                            CUBLAS_OP_N, CUBLAS_OP_N,
-                                            h * w, c, reduced_filter_elem,
-                                            alpha_half,
-                                            convData, h * w, reduced_filter_elem * h * w,
-                                            reducedFilter, reduced_filter_elem, 0,
-                                            beta_half,
-                                            (__half *)new_output->gpu_half_data, h * w, c * h * w,
-                                            n));
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        // convToGemmHalfInputIrregular
+        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasHgemmStridedBatched(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
+          alpha_half, convData, h * w, reduced_filter_elem * h * w,
+          reducedFilter, reduced_filter_elem, 0, beta_half,
+          (__half *)new_output->gpu_half_data, h * w, c * h * w, n));
     } else {
-        Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                 CUDNN_TENSOR_NCHW, n, c, h, w);
-        changeTensorPlacement(output_half, DEVICE);
-
-        if(!(KH * KW % skip_every)) {
-           // INFO("REGULAR FILTERING\n");
-            createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                        (__half *)filter->gpu_half_data,
-                                                        c, num_filter_elem,
-                                                        reduced_filter_elem,
-                                                        input->dims.dim_sizes[1], skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-        } else {
-            //INFO("IRREGULAR FILTERING\n");
-            createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter,
-                                                                            (__half *)filter->gpu_half_data,
-                                                                            c, num_filter_elem,
-                                                                            reduced_filter_elem,
-                                                                            skip_every, offset, fac);
-            checkCudaErrors(cudaDeviceSynchronize());
-            
-            const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
-            convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n,
-                                                                input->dims.dim_sizes[1],
-                                                                input->dims.dim_sizes[2],
-                                                                input->dims.dim_sizes[3],
-                                                                KH, KW, vertical_pad, horizontal_pad,
-                                                                h, w, vertical_stride, horizontal_stride,
-                                                                reduced_filter_elem, skip_every, offset);
-            }
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                        n * h * w, c, reduced_filter_elem,
-                                        alpha_half,
-                                        convData, CUDA_R_16F, n * h * w,
-                                         reducedFilter, CUDA_R_16F, reduced_filter_elem,
-                                        beta_half,
-                                        (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w,
-                                        CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-            
-            int numBlocks = (n * c * h * w  + 255) / 256;
-            switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w,
-                                    (__half *)output_half->gpu_half_data,
-                                    (__half *)new_output->gpu_half_data);
-            checkCudaErrors(cudaDeviceSynchronize());
-
-            freeTensor(output_half);
+      Tensor *output_half = (Tensor *)create4DTensor(
+          (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
+      changeTensorPlacement(output_half, DEVICE);
+
+      if (!(KH * KW % skip_every)) {
+        // INFO("REGULAR FILTERING\n");
+        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
+            fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      } else {
+        // INFO("IRREGULAR FILTERING\n");
+        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
+            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
+            reduced_filter_elem, skip_every, offset, fac);
+        checkCudaErrors(cudaDeviceSynchronize());
+
+        const int gridSize =
+            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
+        convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(
+            convData, (__half *)input->gpu_half_data, n,
+            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
+            w, vertical_stride, horizontal_stride, reduced_filter_elem,
+            skip_every, offset);
+      }
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      checkCudaErrors(cublasGemmEx(
+          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
+          reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
+          reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half,
+          (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w,
+          CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+      int numBlocks = (n * c * h * w + 255) / 256;
+      switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                           (__half *)output_half->gpu_half_data,
+                                           (__half *)new_output->gpu_half_data);
+      checkCudaErrors(cudaDeviceSynchronize());
+
+      freeTensor(output_half);
     }
-    
+
     cudaFree(convData);
     cudaFree(reducedFilter);
   } else {
     //    INFO("BASELINE\n");
-      Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type,
-                                   CUDNN_TENSOR_NCHW, n, c, h, w);
-      
-      changeTensorPlacement(output, DEVICE);
-      __half * convData;
-      long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
-      checkCudaErrors(cudaMalloc(&convData, convDataSize));
-      
-      const int blockSize = 256;
-      const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
-      //convToGemmHalf
-      convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData,
-                                                (__half *)input->gpu_half_data, n,
-                                                input->dims.dim_sizes[1],
-                                                input->dims.dim_sizes[2],
-                                                input->dims.dim_sizes[3],
-                                                KH, KW, vertical_pad,
-                                                horizontal_pad, h, w, vertical_stride,
-                                                horizontal_stride, num_filter_elem,
-                                                skip_every, offset);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        const __half alf = approx_float_to_half(1.0);
-        const __half bet = approx_float_to_half(0.0);
-        const __half *alpha_half = &alf;
-        const __half *beta_half = &bet;
-        checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-                                    n * h * w, c, num_filter_elem,
-                                    alpha_half,
-                                    convData, CUDA_R_16F, n * h * w,
-                                    (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
-                                    beta_half,
-                                    (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w,
-                                    CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-        
-        const int numBlocks = (n * c * h * w  + 255) / 256;
-        switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data,
-                                            (__half *)new_output->gpu_half_data);
-        checkCudaErrors(cudaDeviceSynchronize());
-        
-        freeTensor(output);
-        cudaFree(convData);
+    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                              CUDNN_TENSOR_NCHW, n, c, h, w);
+
+    changeTensorPlacement(output, DEVICE);
+    __half *convData;
+    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
+    checkCudaErrors(cudaMalloc(&convData, convDataSize));
+
+    const int blockSize = 256;
+    const int gridSize =
+        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
+    // convToGemmHalf
+    convToGemmHalfInputNew<<<gridSize, blockSize>>>(
+        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
+        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
+        num_filter_elem, skip_every, offset);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    const __half alf = approx_float_to_half(1.0);
+    const __half bet = approx_float_to_half(0.0);
+    const __half *alpha_half = &alf;
+    const __half *beta_half = &bet;
+    checkCudaErrors(cublasGemmEx(
+        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
+        alpha_half, convData, CUDA_R_16F, n * h * w,
+        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
+        (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F,
+        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    const int numBlocks = (n * c * h * w + 255) / 256;
+    switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
+                                         (__half *)output->gpu_half_data,
+                                         (__half *)new_output->gpu_half_data);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    freeTensor(output);
+    cudaFree(convData);
   }
-//    INFO("CONV DONE\n");
+  //    INFO("CONV DONE\n");
   profileEvent("H2F_start");
   convertToFP32_offline(new_output);
-  //convertToFP32(input);
-  //convertToFP32(filter);
+  // convertToFP32(input);
+  // convertToFP32(filter);
   profileEvent("H2F_end");
-  //profileEvent("#Conv_end");
-  //INFO("CONVOLUTION END\n");
+  // profileEvent("#Conv_end");
+  // INFO("CONVOLUTION END\n");
   return new_output;
 }
 
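The filter-sampling (skip_every) branch of the function above drops part of each flattened filter, shrinking the GEMM's inner dimension from num_filter_elem to reduced_filter_elem, and rescales the surviving weights by fac = skip_every / (skip_every - 1) so the dot products keep roughly their original magnitude. The 1-D host sketch below is an assumption-laden analogue of createReducedFiltersHalf*: it samples out positions offset, offset + skip, ..., which is what the size formula implies, though the real kernels additionally distinguish regular and irregular cases, and the helper name is hypothetical.

// Simplified 1-D analogue of the skip_every filter-sampling path above:
// drop every `skip`-th weight (starting at `offset`) and rescale the rest by
// fac = skip / (skip - 1) so the dot product keeps roughly the same magnitude.
// Hypothetical helper name; the runtime does this per filter on the GPU.
#include <cstdio>
#include <vector>

static std::vector<float> sampleFilter(const std::vector<float> &w, int skip,
                                       int offset) {
  const float fac = static_cast<float>(skip) / (skip - 1);
  std::vector<float> reduced;
  for (int i = 0; i < static_cast<int>(w.size()); ++i)
    if (i < offset || (i - offset) % skip != 0) // keep non-sampled positions
      reduced.push_back(w[i] * fac);
  return reduced;
}

int main() {
  const std::vector<float> w = {1, 1, 1, 1, 1, 1, 1, 1, 1}; // 9 weights (e.g. 3x3)
  const int skip = 3, offset = 0;
  std::vector<float> r = sampleFilter(w, skip, offset);

  // Matches the patch's size formula:
  const int num_filter_elem = static_cast<int>(w.size());
  const int remainder = ((num_filter_elem - offset) % skip > 0);
  const int reduced_filter_elem =
      num_filter_elem - ((num_filter_elem - offset) / skip) - remainder;
  std::printf("kept %zu of %d weights (formula says %d)\n", r.size(),
              num_filter_elem, reduced_filter_elem);
  return 0;
}
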
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
index fd1492fe68..c18ffcea26 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
@@ -1,13 +1,13 @@
-//===--------------------------- configuration.cpp -------------------------===//
+//===--------------------------- configuration.cpp
+//-------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the definitions of API to get information about 
+//
+//  This file consists of the definitions of the API to get information about
 // configurations for rest of the tensor runtime to use.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "configuration.h"
 
 using G_APPROX = GPUNodeConfiguration::APPROX;
@@ -31,9 +31,8 @@ void GPUNodeConfiguration::pushNewTensorOperation(G_TENSOR_OP top) {
 void GPUNodeConfiguration::pushNewApproximationChoiceForOperation(
     G_APPROX approx, int u) {
   unsigned size = ApproxChoices.size();
-  CUSTOM_ASSERT(
-      size >= 1 &&
-      "Cannot apply approximation choice to non existent operation.");
+  CUSTOM_ASSERT(size >= 1 &&
+                "Cannot apply approximation choice to non existent operation.");
   ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
 }
 
@@ -55,9 +54,8 @@ void CPUNodeConfiguration::pushNewTensorOperation(C_TENSOR_OP top) {
 void CPUNodeConfiguration::pushNewApproximationChoiceForOperation(
     C_APPROX approx, int u) {
   unsigned size = ApproxChoices.size();
-  CUSTOM_ASSERT(
-      size >= 1 &&
-      "Cannot apply approximation choice to non existent operation.");
+  CUSTOM_ASSERT(size >= 1 &&
+                "Cannot apply approximation choice to non existent operation.");
   ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u));
 }
 
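Both node-configuration classes follow the same pattern: pushNewTensorOperation appends a (tensor op, empty approximation list) entry, and pushNewApproximationChoiceForOperation attaches an (approximation, knob) pair to the most recently pushed op, asserting that one exists. A trimmed stand-in of that bookkeeping is sketched below; the enum values and class name are illustrative, not the runtime's.

// Minimal stand-in for the ApproxChoices bookkeeping in the hunks above:
// a list of (tensor op, list of (approximation, knob)) pairs, where an
// approximation can only be attached to the most recently pushed operation.
#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

enum class TensorOp { Conv, Mul, Add };
enum class Approx { FP32, FP16, Perforation, Sampling };

class NodeConfigSketch {
  std::vector<std::pair<TensorOp, std::vector<std::pair<Approx, int>>>> choices;

public:
  void pushNewTensorOperation(TensorOp op) { choices.push_back({op, {}}); }
  void pushNewApproximationChoiceForOperation(Approx a, int knob) {
    assert(!choices.empty() &&
           "Cannot apply approximation choice to non existent operation.");
    choices.back().second.push_back({a, knob});
  }
  size_t numOps() const { return choices.size(); }
};

int main() {
  NodeConfigSketch cfg;
  cfg.pushNewTensorOperation(TensorOp::Conv);
  cfg.pushNewApproximationChoiceForOperation(Approx::Perforation, 2);
  cfg.pushNewApproximationChoiceForOperation(Approx::FP16, 0);
  std::printf("ops configured: %zu\n", cfg.numOps());
  return 0;
}
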
@@ -71,8 +69,8 @@ CPUNodeConfiguration::CPUNodeConfiguration() {
 }
 CPUNodeConfiguration::~CPUNodeConfiguration() {}
 
-Configuration::Configuration(
-    std::string &n, float f, float e, float a, float al)
+Configuration::Configuration(std::string &n, float f, float e, float a,
+                             float al)
     : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {}
 
 float Configuration::getSpeedup() { return speedup; }
@@ -82,20 +80,20 @@ float Configuration::getEnergy() { return energy; }
 float Configuration::getAccuracy() { return accuracy; }
 
 float Configuration::getAccuracyLoss() { return accuracyLoss; }
-bool ConfigurationLessThan::
-operator()(const struct Configuration &a, const struct Configuration &b) const {
+bool ConfigurationLessThan::operator()(const struct Configuration &a,
+                                       const struct Configuration &b) const {
   return (a.accuracyLoss < b.accuracyLoss);
 }
-bool ConfigurationLessThan_AL::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_AL::operator()(const struct Configuration *a,
+                                          const float &b) const {
   return (a->accuracyLoss < b);
 }
-bool ConfigurationLessThan_SP::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_SP::operator()(const struct Configuration *a,
+                                          const float &b) const {
   return (a->speedup < b);
 }
-bool ConfigurationLessThan_E::
-operator()(const struct Configuration *a, const float &b) const {
+bool ConfigurationLessThan_E::operator()(const struct Configuration *a,
+                                         const float &b) const {
   return (a->energy < b);
 }
 
@@ -286,9 +284,8 @@ void CPUNodeConfiguration::print() {
 void Configuration::print() {
 
   printf("+++++\n");
-  printf(
-      "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
-      accuracyLoss);
+  printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy,
+         accuracyLoss);
   for (std::map<std::string, NodeConfiguration *>::const_iterator it =
            setup.begin();
        it != setup.end(); ++it) {
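
The comparator boilerplate reformatted above (ConfigurationLessThan and the _AL/_SP/_E variants) lets the controller keep configurations ordered and binary-search them by accuracy loss, speedup, or energy against a plain float threshold. The sketch below shows the typical std::lower_bound use of such a heterogeneous comparator; the Configuration struct here is a trimmed stand-in containing only the fields visible in this patch, and the example data is invented.

// Trimmed stand-in for Configuration with only the fields visible in this patch.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

struct Configuration {
  std::string name;
  float speedup, energy, accuracy, accuracyLoss;
};

// Heterogeneous comparator: Configuration* on the left, plain float on the
// right, mirroring ConfigurationLessThan_SP above.
struct LessThanSpeedup {
  bool operator()(const Configuration *a, const float &b) const {
    return a->speedup < b;
  }
};

int main() {
  Configuration c1{"conf1", 1.0f, 1.0f, 84.2f, 0.0f};
  Configuration c2{"conf2", 1.8f, 0.7f, 83.9f, 0.3f};
  Configuration c3{"conf3", 2.5f, 0.5f, 82.7f, 1.5f};
  std::vector<Configuration *> bysp = {&c1, &c2, &c3}; // sorted by speedup

  // First configuration whose speedup is not below the requested threshold.
  const float wanted = 1.5f;
  auto it = std::lower_bound(bysp.begin(), bysp.end(), wanted, LessThanSpeedup());
  if (it != bysp.end())
    std::printf("first config with speedup >= %.1f: %s (%.1fx)\n", wanted,
                (*it)->name.c_str(), (*it)->speedup);
  return 0;
}
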
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
index 3e4aecb824..1abf5432b9 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc
@@ -5,7 +5,7 @@
 
 #define LOG_DEBUG 0 // Sets the debug logging to true
 #define LOG_INFO 1  // Sets the info logging to true
-#define LOG_ERROR 1  // Print Errors 
+#define LOG_ERROR 1 // Print Errors
 #define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro)
 
 #include "debug.h"
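
debug.h itself is not touched by the hunk above, so the following is only an assumption about the usual shape of macros gated by flags like LOG_DEBUG / LOG_INFO / LOG_ERROR: variadic printf-style wrappers that compile away when their flag is 0. It illustrates the pattern, not the runtime's actual INFO/DEBUG/ERROR definitions.

// Illustrative only: one common way flags like LOG_DEBUG / LOG_INFO / LOG_ERROR
// are consumed. The real macros live in debug.h, which this patch does not show.
#include <cstdio>
#include <cstdlib>

#define LOG_DEBUG 0
#define LOG_INFO 1
#define LOG_ERROR 1

#if LOG_DEBUG
#define DEBUG(...) std::fprintf(stderr, __VA_ARGS__)
#else
#define DEBUG(...) ((void)0) // compiled out when LOG_DEBUG == 0
#endif

#if LOG_INFO
#define INFO(...) std::fprintf(stdout, __VA_ARGS__)
#else
#define INFO(...) ((void)0)
#endif

#if LOG_ERROR
#define ERROR(...)                                                             \
  do {                                                                         \
    std::fprintf(stderr, __VA_ARGS__);                                         \
    std::abort();                                                              \
  } while (0)
#else
#define ERROR(...) ((void)0)
#endif

int main() {
  INFO("info logging is on (LOG_INFO = %d)\n", LOG_INFO);
  DEBUG("this line disappears at compile time\n"); // LOG_DEBUG == 0
  return 0;
}
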
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
index 0e05813bb6..032443bd7a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
@@ -12,8 +12,8 @@
 #define CASE_FUNC(ename, fname)                                                \
   case MathOp::ename: {                                                        \
     void *v_func_ptr = nullptr;                                                \
-    checkCudaErrors(cudaMemcpyFromSymbol(                                      \
-        &v_func_ptr, _internal::fname##_ptr, sizeof(void *)));                 \
+    checkCudaErrors(cudaMemcpyFromSymbol(&v_func_ptr, _internal::fname##_ptr,  \
+                                         sizeof(void *)));                     \
     return v_func_ptr;                                                         \
   }
 
@@ -120,7 +120,7 @@ template <> void *mathOpToFunc<float2>(MathOp op) {
     CASE_FUNC(Mul, f2mul)
   default:
     ERROR("Float2 function not found\n");
-    return nullptr;  // For some compilers
+    return nullptr; // For some compilers
   }
 }
 
@@ -129,7 +129,7 @@ template <> void *mathOpToFunc<half2>(MathOp op) {
     CASE_FUNC(Mul, h2mul)
   default:
     ERROR("Half2 function not found\n");
-    return nullptr;  // For some compilers
+    return nullptr; // For some compilers
   }
 }
 
@@ -151,7 +151,7 @@ template <> void *mathOpToFunc<float>(MathOp op) {
   default:
     ERROR("Float function not found\n");
   }
-  return nullptr;  // For some compilers
+  return nullptr; // For some compilers
 }
 
 template <> void *mathOpToFunc<half>(MathOp op) {
@@ -169,7 +169,7 @@ template <> void *mathOpToFunc<half>(MathOp op) {
   default:
     ERROR("Half function not found\n");
   }
-  return nullptr;  // For some compilers
+  return nullptr; // For some compilers
 }
 
 template <> half reduceOpToIdentity<half>(MathOp op) {
@@ -185,5 +185,5 @@ template <> half reduceOpToIdentity<half>(MathOp op) {
   default:
     ERROR("Operator does not have id value\n");
   }
-  return 0.0f;  // For some compilers
+  return 0.0f; // For some compilers
 }
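
The last hunk above touches reduceOpToIdentity<half>, which hands every reduction operator its neutral starting value and keeps a trailing return only to placate some compilers. A host-side float analogue is sketched below; the MathOp enumerators are illustrative stand-ins, since the runtime's actual enum lives in a header this section does not show.

// Host-side float analogue of reduceOpToIdentity: every reduction needs a
// neutral starting value. The MathOp enumerators below are illustrative only.
#include <cstdio>
#include <limits>

enum class MathOp { Add, Max, Min };

static float reduceIdentity(MathOp op) {
  switch (op) {
  case MathOp::Add:
    return 0.0f; // x + 0 == x
  case MathOp::Max:
    return -std::numeric_limits<float>::infinity(); // max(x, -inf) == x
  case MathOp::Min:
    return std::numeric_limits<float>::infinity(); // min(x, +inf) == x
  }
  return 0.0f; // unreachable; mirrors the "for some compilers" returns above
}

int main() {
  std::printf("Add identity: %f\n", reduceIdentity(MathOp::Add));
  std::printf("Max identity: %f\n", reduceIdentity(MathOp::Max));
  return 0;
}
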
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu
index 4afed4c287..638e06e786 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu
@@ -2,7 +2,6 @@
 #ifndef ERROR_HEADER
 #define ERROR_HEADER
 
-
 #include <stdio.h>
 #include <stdarg.h>
 #include <cstdio>
@@ -23,7 +22,6 @@
 #include <math.h>
 #include <assert.h>
 
-
 #include "debug.h"
 #include "tensor.h"
 #include "profiling.h"
@@ -31,39 +29,33 @@
 #include "global_data.h"
 #include "error.h"
 
+extern "C" {
 
+void readSkipTensors(int *skip_tensor_ids, int op_count) {
 
-extern "C"{
-  
-  
-void readSkipTensors(int* skip_tensor_ids, int op_count){
-
-  for(int i = 0; i < op_count; i++){
+  for (int i = 0; i < op_count; i++) {
     int tensor_id = skip_tensor_ids[i];
     skip_tensors[tensor_id] = 1;
   }
-
 }
 
-
-
-void readOpenTunerFlags(const char* file_name){
+void readOpenTunerFlags(const char *file_name) {
 
   total_ops = 0;
   op_counter = 0;
   op_accuracies.clear();
-  
-  FILE* fp = fopen(file_name, "r");
-  if(fp == NULL){
+
+  FILE *fp = fopen(file_name, "r");
+  if (fp == NULL) {
     DEBUG("\n WARNING: File 'opentuner_flags' not found \n\n\n");
     return;
   }
-    
+
   int retVal = 200;
-  while(retVal != EOF){
+  while (retVal != EOF) {
 
     int op_acc;
-    if(fp != NULL)
+    if (fp != NULL)
       retVal = fscanf(fp, "%d", &op_acc);
     else
       op_acc = 0;
@@ -75,24 +67,23 @@ void readOpenTunerFlags(const char* file_name){
   fclose(fp);
 }
 
-
-void readQuantRanges(char* file_name){
+void readQuantRanges(char *file_name) {
 
   total_ops = 0;
   op_counter = 0;
   quant_ranges.clear();
-  
-  FILE* fp = fopen(file_name, "r");
-  if(fp == NULL){
+
+  FILE *fp = fopen(file_name, "r");
+  if (fp == NULL) {
     ERROR("File %s not found \n", file_name);
   }
-    
+
   int retVal = 200;
-  while(retVal != EOF && retVal != -1){
+  while (retVal != EOF && retVal != -1) {
 
     int min;
     int max;
-    if(fp != NULL){
+    if (fp != NULL) {
       retVal = fscanf(fp, "%d", &min);
       printf("min =% d \n", min);
 
@@ -100,22 +91,18 @@ void readQuantRanges(char* file_name){
       printf("max =% d \n", max);
     }
 
-    if(retVal != -1){
-      struct Range* range = (struct Range*) malloc(sizeof(struct Range));
+    if (retVal != -1) {
+      struct Range *range = (struct Range *)malloc(sizeof(struct Range));
       range->min = min;
       range->max = max;
       quant_ranges.push_back(range);
       total_ops++;
     }
   }
-  
+
   fclose(fp);
 }
 
-
-
-
-
 /*__device__ inline void atomicAdd(float* address, float value)
 
 {
@@ -133,11 +120,7 @@ void readQuantRanges(char* file_name){
 };
 */
 
-
-
-
-
-Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNorms(Tensor *x, Tensor *x_orig) {
 
   deviceToHostCopy(x);
   deviceToHostCopy(x_orig);
@@ -148,18 +131,18 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
   float inf_norm = -1.0;
   double total = 0.0;
 
-  float* arr1 = (float*) x->host_data;
-  float* arr2 = (float*) x_orig->host_data;
-  
-  for(unsigned int i = 0; i < x->num_elems; i++){
+  float *arr1 = (float *)x->host_data;
+  float *arr2 = (float *)x_orig->host_data;
+
+  for (unsigned int i = 0; i < x->num_elems; i++) {
 
     total = total + arr2[i];
-    
+
     float diff = abs(arr1[i] - arr2[i]);
     l1_norm += diff;
-    l2_norm += (arr1[i] - arr2[i]) *  (arr1[i] - arr2[i]);
+    l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]);
 
-    if(inf_norm < diff)
+    if (inf_norm < diff)
       inf_norm = diff;
   }
 
@@ -170,12 +153,11 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
   l1_norm = l1_norm / distribution_mean;
   l2_norm = l2_norm / distribution_mean;
 
-    
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
   norms->l1_norm = l1_norm;
   norms->l2_norm = l2_norm;
-  norms->inf_norm = inf_norm;  
-  
+  norms->inf_norm = inf_norm;
+
   INFO("l1_norm = %f \n", l1_norm);
   INFO("l2_norm = %f \n", l2_norm);
   INFO("inf_norm = %f \n", inf_norm);
@@ -183,9 +165,7 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){
   return norms;
 }
 
-
-
-Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNorms2(Tensor *x, Tensor *x_orig) {
 
   deviceToHostCopy(x);
   deviceToHostCopy(x_orig);
@@ -196,50 +176,49 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
 
   double l1_norm_A = 0.0;
   double l1_norm_B = 0.0;
-  
+
   double l2_norm_A = 0.0;
   double l2_norm_B = 0.0;
   float inf_norm = -1.0;
   float orig_inf_norm = -1.0;
   double total_diff = 0.0;
   double total_diff_squared = 0.0;
- 
-  float* arr1 = (float*) x->host_data;
-  float* arr2 = (float*) x_orig->host_data;
-  
-  for(unsigned int i = 0; i < x->num_elems; i++){
 
-    if(arr2[i] != 0.0)
+  float *arr1 = (float *)x->host_data;
+  float *arr2 = (float *)x_orig->host_data;
+
+  for (unsigned int i = 0; i < x->num_elems; i++) {
+
+    if (arr2[i] != 0.0)
       l0_norm_A = l0_norm_A + 1.0;
-    if(arr1[i] != 0.0)
+    if (arr1[i] != 0.0)
       l0_norm_B = l0_norm_B + 1.0;
-        
+
     l1_norm_A = l1_norm_A + abs(arr2[i]);
     l1_norm_B = l1_norm_B + abs(arr1[i]);
 
     l2_norm_A = l2_norm_A + (arr2[i] * arr2[i]);
     l2_norm_B = l2_norm_B + (arr1[i] * arr1[i]);
-      
+
     float diff = abs(arr1[i] - arr2[i]);
     total_diff = total_diff + diff;
     float diff_squared = diff * diff;
-    total_diff_squared = total_diff_squared + diff_squared; 
-
+    total_diff_squared = total_diff_squared + diff_squared;
 
-    if(orig_inf_norm < diff){
+    if (orig_inf_norm < diff) {
       orig_inf_norm = diff;
     }
-    
+
     // Relative difference value
-    float normalized_diff = diff / arr2[i];   
-    if(inf_norm < normalized_diff){
+    float normalized_diff = diff / arr2[i];
+    if (inf_norm < normalized_diff) {
       inf_norm = normalized_diff;
-    }    
+    }
   }
 
   // Relative L1 and Mean L1 norms of the difference Matrix
-  float mean_l1 = ( total_diff ) / x->num_elems;
-  float relative_l1 = ( total_diff ) / l1_norm_A;
+  float mean_l1 = (total_diff) / x->num_elems;
+  float relative_l1 = (total_diff) / l1_norm_A;
 
   // Computing Relative L2 norm - i.e., Euclidean distance
   double norm_root_A = sqrt(l2_norm_A);
@@ -248,8 +227,9 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
   float relative_l2 = diff_root / norm_root_A;
 
   // Packing computed norms in Norm_t struct
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
-  // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
+  // Mean metrics - not normalized for the distribution - suitable for precision
+  // tuning hardware
   norms->mean_l1 = mean_l1;
   norms->mean_l2 = mean_l2;
   norms->orig_inf_norm = orig_inf_norm;
@@ -257,8 +237,8 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
   // Relative metrics (relative to distribution) - suitable for PROMISE
   norms->l1_norm = relative_l1;
   norms->l2_norm = relative_l2;
-  norms->inf_norm = inf_norm;  
-  
+  norms->inf_norm = inf_norm;
+
   INFO("l1_norm = %f \n", relative_l1);
   INFO("l2_norm = %f \n", relative_l2);
   INFO("inf_norm = %f \n", inf_norm);
@@ -266,33 +246,28 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){
   return norms;
 }
 
-
-
-
-
-__global__ void normComputeKernel(float* A, float * B, double* l1_A, double* l2_A,
-				  double* l1_diff, double* l2_diff, unsigned int n){
+__global__ void normComputeKernel(float *A, float *B, double *l1_A,
+                                  double *l2_A, double *l1_diff,
+                                  double *l2_diff, unsigned int n) {
 
   int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(i < n){
-    
+  if (i < n) {
+
     double diff = fabsf(A[i] - B[i]);
-    double diff_squared = diff * diff;   
+    double diff_squared = diff * diff;
 
-    atomicAdd( l1_A,  fabsf(A[i]) );
-    atomicAdd( l2_A, (A[i] * A[i]) );
+    atomicAdd(l1_A, fabsf(A[i]));
+    atomicAdd(l2_A, (A[i] * A[i]));
 
-    atomicAdd( l1_diff, diff);
-    atomicAdd( l2_diff, diff_squared);
+    atomicAdd(l1_diff, diff);
+    atomicAdd(l2_diff, diff_squared);
   }
 }
 
-
-
 __inline__ __device__ double warpReduceSum(double val) {
 
-  for (int offset = warpSize/2; offset > 0; offset /= 2)
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
     val += __shfl_down_sync(0xFFFFFFFF, val, offset);
 
   return val;
@@ -304,36 +279,34 @@ __inline__ __device__ double blockReduceSum(double val) {
   int lane = threadIdx.x % warpSize;
   int wid = threadIdx.x / warpSize;
 
-  val = warpReduceSum(val);     // Each warp performs partial reduction
+  val = warpReduceSum(val); // Each warp performs partial reduction
 
   if (lane == 0)
-    shared[wid]=val; // Write reduced value to shared memory
+    shared[wid] = val; // Write reduced value to shared memory
 
-  
-  __syncthreads();              // Wait for all partial reductions
+  __syncthreads(); // Wait for all partial reductions
 
-  
-  //read from shared memory only if that warp existed
+  // read from shared memory only if that warp existed
   val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
 
-  if (wid == 0) val = warpReduceSum(val); //Final reduce within first warp
+  if (wid == 0)
+    val = warpReduceSum(val); // Final reduce within first warp
 
   return val;
-
 }
 
-
-
-__global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N,
-					      double* A_l1, double* A_l2,
-					      double* diff_l1, double* diff_l2) {
+__global__ void deviceReduceBlockAtomicKernel(float *A, float *B, int N,
+                                              double *A_l1, double *A_l2,
+                                              double *diff_l1,
+                                              double *diff_l2) {
 
   double sum_A_l1 = double(0);
   double sum_A_l2 = double(0);
   double sum_diff_l1 = double(0);
   double sum_diff_l2 = double(0);
 
-  for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
 
     sum_A_l1 += fabsf(A[i]);
     sum_A_l2 += (A[i] * A[i]);
@@ -347,31 +320,28 @@ __global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N,
   sum_A_l2 = blockReduceSum(sum_A_l2);
   sum_diff_l1 = blockReduceSum(sum_diff_l1);
   sum_diff_l2 = blockReduceSum(sum_diff_l2);
-  
-  if (threadIdx.x == 0){
+
+  if (threadIdx.x == 0) {
     atomicAdd(A_l1, sum_A_l1);
     atomicAdd(A_l2, sum_A_l2);
     atomicAdd(diff_l1, sum_diff_l1);
     atomicAdd(diff_l2, sum_diff_l2);
-  }   
+  }
 }
 
-
-void deviceReduce(float* A, float* B, int N,
-		  double* A_l1, double* A_l2,
-		  double* diff_l1, double* diff_l2) {
+void deviceReduce(float *A, float *B, int N, double *A_l1, double *A_l2,
+                  double *diff_l1, double *diff_l2) {
 
   int threads = 512;
   int blocks = min((N + threads - 1) / threads, 1024);
 
-  deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2, diff_l1, diff_l2);
+  deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2,
+                                                     diff_l1, diff_l2);
   //-- deviceReduceKernel<<<1, 1024>>>(out, out, blocks);
 }
 
-
-
 // Compute Norms on the GPU
-Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNormsTreeReduction(Tensor *x, Tensor *x_orig) {
 
   hostToDeviceCopy(x);
   hostToDeviceCopy(x_orig);
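
The reduction code above accumulates four scalars over a tensor pair, sum |A|, sum A^2, sum |A - B| and sum (A - B)^2: per warp with __shfl_down_sync, per block in shared memory, and finally with atomicAdd into global memory. The calculateNorms* routines then fold those sums into mean and relative L1/L2 error metrics. A plain CPU reference of the same arithmetic is sketched below (hypothetical function name; the GPU path exists purely for throughput), measuring the error of the second tensor relative to the first.

// CPU reference for the four sums produced by deviceReduceBlockAtomicKernel and
// for the norm formulas used by the calculateNorms* routines above.
// computeNormsRef is a hypothetical name; only the arithmetic is mirrored here.
#include <cmath>
#include <cstdio>
#include <vector>

struct NormsRef {
  double mean_l1;     // average absolute difference
  double relative_l1; // L1 of the difference relative to L1 of the first tensor
  double relative_l2; // Euclidean distance relative to the first tensor's norm
};

static NormsRef computeNormsRef(const std::vector<float> &A,
                                const std::vector<float> &B) {
  double A_l1 = 0.0, A_l2 = 0.0, diff_l1 = 0.0, diff_l2 = 0.0;
  for (size_t i = 0; i < A.size(); ++i) {
    const double d = std::fabs(double(A[i]) - double(B[i]));
    A_l1 += std::fabs(double(A[i]));
    A_l2 += double(A[i]) * double(A[i]);
    diff_l1 += d;
    diff_l2 += d * d;
  }
  NormsRef n;
  n.mean_l1 = diff_l1 / A.size();
  n.relative_l1 = diff_l1 / A_l1;
  n.relative_l2 = std::sqrt(diff_l2) / std::sqrt(A_l2);
  return n;
}

int main() {
  const std::vector<float> A = {1.0f, -2.0f, 3.0f, 4.0f};
  const std::vector<float> B = {1.1f, -1.9f, 2.8f, 4.0f};
  const NormsRef n = computeNormsRef(A, B);
  std::printf("mean_l1 = %f, relative_l1 = %f, relative_l2 = %f\n", n.mean_l1,
              n.relative_l1, n.relative_l2);
  return 0;
}
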
@@ -388,26 +358,27 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){
   double *l2_norm_A_d;
   double *l1_diff_d;
   double *l2_diff_d;
-  
-  cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l1_diff_d, sizeof(double));
-  cudaMalloc( (void**) &l2_diff_d, sizeof(double));
- 
-    
-  float* arr1 = (float*) x->gpu_data;
-  float* arr2 = (float*) x_orig->gpu_data;
-
-  //normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
-  deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d);
-  
+
+  cudaMalloc((void **)&l1_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l2_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l1_diff_d, sizeof(double));
+  cudaMalloc((void **)&l2_diff_d, sizeof(double));
+
+  float *arr1 = (float *)x->gpu_data;
+  float *arr2 = (float *)x_orig->gpu_data;
+
+  // normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d,
+  // l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
+  deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d,
+               l2_diff_d);
+
   cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
 
   INFO("l1_norm_A = %f, l2_norm_A = %f, l1_diff = %f, l2_diff = %f \n",
-       l1_norm_A, l2_norm_A,l1_diff, l2_diff);
+       l1_norm_A, l2_norm_A, l1_diff, l2_diff);
 
   // Relative L1 and Mean L1 norms of the difference Matrix
   float mean_l1 = l1_diff / x->num_elems;
@@ -420,34 +391,32 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){
   float relative_l2 = diff_root / norm_root_A;
 
   // Packing computed norms in Norm_t struct
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
-  // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
+  // Mean metrics - not normalized for the distribution - suitable for precision
+  // tuning hardware
   norms->mean_l1 = mean_l1;
   norms->mean_l2 = mean_l2;
   norms->orig_inf_norm = 0.0;
 
-  // Relative metrics (relative to distribution) 
+  // Relative metrics (relative to distribution)
   norms->l1_norm = relative_l1;
   norms->l2_norm = relative_l2;
-  norms->inf_norm = 0.0;  
-  
+  norms->inf_norm = 0.0;
+
   INFO("l1_norm = %f \n", relative_l1);
   INFO("l2_norm = %f \n", relative_l2);
 
   return norms;
 }
 
-
-
-
 // Compute Norms on the GPU
-Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
+Norm_t *calculateNormsGPU(Tensor *x, Tensor *x_orig) {
 
   hostToDeviceCopy(x);
   hostToDeviceCopy(x_orig);
 
   // FIXIT: Move all floats to doubles - overflow is possible
-  
+
   double l1_norm_A;
   double l2_norm_A;
 
@@ -459,27 +428,26 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
   double *l2_norm_A_d;
   double *l1_diff_d;
   double *l2_diff_d;
-  
-  cudaMalloc( (void**) &l1_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l2_norm_A_d, sizeof(double));
-  cudaMalloc( (void**) &l1_diff_d, sizeof(double));
-  cudaMalloc( (void**) &l2_diff_d, sizeof(double));
- 
-    
-  float* arr1 = (float*) x->gpu_data;
-  float* arr2 = (float*) x_orig->gpu_data;
+
+  cudaMalloc((void **)&l1_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l2_norm_A_d, sizeof(double));
+  cudaMalloc((void **)&l1_diff_d, sizeof(double));
+  cudaMalloc((void **)&l2_diff_d, sizeof(double));
+
+  float *arr1 = (float *)x->gpu_data;
+  float *arr2 = (float *)x_orig->gpu_data;
 
   int blockSize = 1024;
-  int gridSize = (int) ceil ((float) x->num_elems / blockSize);
+  int gridSize = (int)ceil((float)x->num_elems / blockSize);
   INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
 
-  normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
+  normComputeKernel<<<gridSize, blockSize>>>(
+      arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems);
 
   cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
   cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost);
-  
 
   // Relative L1 and Mean L1 norms of the difference Matrix
   float mean_l1 = l1_diff / x->num_elems;
@@ -492,8 +460,9 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
   float relative_l2 = diff_root / norm_root_A;
 
   // Packing computed norms in Norm_t struct
-  Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t));
-  // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware
+  Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t));
+  // Mean metrics - not normalized for the distribution - suitable for precision
+  // tuning hardware
   norms->mean_l1 = mean_l1;
   norms->mean_l2 = mean_l2;
   norms->orig_inf_norm = 0.0;
@@ -501,54 +470,47 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){
   // Relative metrics (relative to distribution) - suitable for PROMISE
   norms->l1_norm = relative_l1;
   norms->l2_norm = relative_l2;
-  norms->inf_norm = 0.0;  
-  
+  norms->inf_norm = 0.0;
+
   INFO("l1_norm = %f \n", relative_l1);
   INFO("l2_norm = %f \n", relative_l2);
 
   return norms;
 }
 
-
-
-
-__global__ void vecConstMul(float* A, float mul_factor, int n){
+__global__ void vecConstMul(float *A, float mul_factor, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    A[id] = A[id] * mul_factor; 
+  if (id < n)
+    A[id] = A[id] * mul_factor;
 }
 
-
-__global__ void vecRound(float* A, int n){
+__global__ void vecRound(float *A, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    A[id] = roundf(A[id]); 
+  if (id < n)
+    A[id] = roundf(A[id]);
 }
 
-
-__global__ void vecConstDiv(float* A, float div_factor, int n){
+__global__ void vecConstDiv(float *A, float div_factor, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    A[id] = A[id] / div_factor; 
+  if (id < n)
+    A[id] = A[id] / div_factor;
 }
 
-
-
-__global__ void vecMul(float* A, float* B, int n){
+__global__ void vecMul(float *A, float *B, int n) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-  if(id < n)
-    B[id] = A[id] * B[id]; 
+  if (id < n)
+    B[id] = A[id] * B[id];
 }
 
-void initPromiseRandValues(Tensor* bias, int error_scale){
+void initPromiseRandValues(Tensor *bias, int error_scale) {
 
   float scaling_values[10];
 
@@ -558,98 +520,91 @@ void initPromiseRandValues(Tensor* bias, int error_scale){
   scaling_values[2] = 0.336;
   scaling_values[3] = 0.21;
   scaling_values[4] = 0.168;
-  scaling_values[5] = 0.14;  
+  scaling_values[5] = 0.14;
   scaling_values[6] = 0.11;
   scaling_values[7] = 0.0784;
   scaling_values[8] = 0.005;
   scaling_values[9] = 0.000;
 
-  
   curandGenerator_t gen;
   struct timespec ts;
-  if(timespec_get(&ts, TIME_UTC) == 0){
+  if (timespec_get(&ts, TIME_UTC) == 0) {
     printf("crashed \n");
     abort();
   }
 
   curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
-  curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec);
-  curandGenerateNormal(gen,
-		       (float*) bias->gpu_data,
-		       bias->num_elems, 0.0,
-		       1.0 * scaling_values[error_scale]);
-  
+  curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec ^ ts.tv_sec);
+  curandGenerateNormal(gen, (float *)bias->gpu_data, bias->num_elems, 0.0,
+                       1.0 * scaling_values[error_scale]);
 }
 
-
 // NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16
 // Routine for Adding PROMISE bitline swing error
-void* addPromiseError(void* x_ptr, int error_scale){
+void *addPromiseError(void *x_ptr, int error_scale) {
 
-  if(error_scale > 10 || error_scale < 0){
+  if (error_scale > 10 || error_scale < 0) {
     ERROR("Error Scale out of bounds for PROMISE - 8 Swing values \n");
   }
-      
-  INFO("*** addPromiseError \n");  
+
+  INFO("*** addPromiseError \n");
   profileEvent("addPromiseError");
 
-  Tensor* x = (Tensor*) x_ptr;
-  
-  size_t* dim_sizes = x->dims.dim_sizes;
-  Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format,
-					  dim_sizes[0], dim_sizes[1],
-					  dim_sizes[2], dim_sizes[3]);
- 
+  Tensor *x = (Tensor *)x_ptr;
+
+  size_t *dim_sizes = x->dims.dim_sizes;
+  Tensor *bias =
+      (Tensor *)create4DTensor(x->cur_type, x->data_format, dim_sizes[0],
+                               dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+
   // NOTE: Error scale is used to generate the bias matrix
-  initPromiseRandValues(bias, error_scale);  
+  initPromiseRandValues(bias, error_scale);
 
   hostToDeviceCopy(x);
-  //hostToDeviceCopy(bias);
- 
+  // hostToDeviceCopy(bias);
+
   int blockSize = 1024;
-  int gridSize = (int) ceil ((float) x->num_elems / blockSize);
+  int gridSize = (int)ceil((float)x->num_elems / blockSize);
   INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
 
   // NOTE: Check if a large gridSize will work with really large tensors
-  vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems);
-  
+  vecMul<<<gridSize, blockSize>>>((float *)x->gpu_data, (float *)bias->gpu_data,
+                                  x->num_elems);
+
   float alpha = 1.0f;
-  //float beta = 0.0f;    
+  // float beta = 0.0f;
   checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
-			    bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));
+                            bias->gpu_data, &alpha, x->tensor_desc,
+                            x->gpu_data));
 
   profileEvent("addPromiseError_end", true);
-  
-  return (void*) x;
-}
-
-
 
+  return (void *)x;
+}
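
For reference, the net effect of addPromiseError above is a multiplicative Gaussian error: vecMul overwrites the bias tensor with x * bias, and the cudnnAddTensor call (alpha = 1 on both terms) then produces x + x * bias, i.e. every element becomes x * (1 + noise) with noise drawn from N(0, scaling_values[error_scale]). A minimal host-side sketch of the same model, assuming error_scale indexes the 10-entry scaling table (0..9); the helper name is hypothetical and not part of the runtime:

  #include <random>
  #include <vector>

  std::vector<float> addPromiseErrorHost(std::vector<float> x, int error_scale) {
    static const float scaling_values[10] = {0.75,  0.64, 0.336,  0.21,  0.168,
                                             0.14,  0.11, 0.0784, 0.005, 0.000};
    float sigma = scaling_values[error_scale];
    if (sigma == 0.0f) // the last entry adds no error
      return x;
    std::mt19937 gen(std::random_device{}());
    std::normal_distribution<float> noise(0.0f, sigma);
    for (float &v : x)
      v *= 1.0f + noise(gen); // same as vecMul followed by cudnnAddTensor
    return x;
  }
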
 
-__global__ void quantizeAndClip(float* A, int n, float mul_factor, float min, float max){
+__global__ void quantizeAndClip(float *A, int n, float mul_factor, float min,
+                                float max) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
-  if(id < n){
+  if (id < n) {
     int temp = (A[id] - min) / mul_factor;
     float result = temp * 1.0 * mul_factor;
     result = result + min;
     A[id] = result;
 
-    if(A[id] > max){
+    if (A[id] > max) {
       A[id] = max;
     }
-    if(A[id] < min){
+    if (A[id] < min) {
       A[id] = min;
     }
-    
   }
 }
 
-
-__global__ void quantizeElem(float* A, int n, float mul_factor, float min){
+__global__ void quantizeElem(float *A, int n, float mul_factor, float min) {
 
   int id = blockIdx.x * blockDim.x + threadIdx.x;
-  if(id < n){
+  if (id < n) {
     int temp = (A[id] - min) / mul_factor;
     float result = temp * 1.0 * mul_factor;
     result = result + min;
@@ -657,32 +612,27 @@ __global__ void quantizeElem(float* A, int n, float mul_factor, float min){
   }
 }
 
-
-void* quantizeTensorPromise(void* input_ptr, float min, float max){
+void *quantizeTensorPromise(void *input_ptr, float min, float max) {
 
   INFO("QuantizeTensorPROMISE \n");
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
-  
   int quantize_range = 256;
   float input_range = max - min;
   float mul_factor = input_range / quantize_range;
   INFO("mul_factor = %f \n", mul_factor);
 
   int blockSize = 1024;
-  int gridSize = (int) ceil ((float) input->num_elems / blockSize);
+  int gridSize = (int)ceil((float)input->num_elems / blockSize);
   INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize);
 
   hostToDeviceCopy(input);
 
-  quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data,
-					   input->num_elems, mul_factor, min, max);
+  quantizeAndClip<<<gridSize, blockSize>>>(
+      (float *)input->gpu_data, input->num_elems, mul_factor, min, max);
 
-  
   return input;
 }
-
 }
-  
 
 #endif
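
For a worked reference on the quantization above: quantizeTensorPromise snaps every element to one of 256 uniformly spaced levels over [min, max] and clips the result. With min = -2 and max = 2 the step is 4 / 256 = 0.015625, so a value of 0.30 maps to level 147 and comes back as 0.296875. A minimal host-side equivalent of the quantizeAndClip kernel (hypothetical helper, shown for illustration only):

  #include <algorithm>
  #include <vector>

  void quantizeAndClipHost(std::vector<float> &data, float min, float max) {
    const int quantize_range = 256;
    const float mul_factor = (max - min) / quantize_range; // level step size
    for (float &v : data) {
      int level = (int)((v - min) / mul_factor); // truncate to a level index
      v = level * mul_factor + min;              // snap back to the level value
      v = std::min(std::max(v, min), max);       // clip, as the kernel does
    }
  }
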
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
index 4392839f7f..00334f8ecc 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
@@ -1,7 +1,7 @@
 //===--------------------------- fp16_gemm.cu -----------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file consists of the custom implementation of quantization kernels.
 // This helps HPVM to switch compute precision for tensor operations between
 // FP32 and FP16.
@@ -17,236 +17,199 @@
 #include <cuda_fp16.h>
 #include "fp16_emu.h"
 
-
-
 inline cudaError_t checkCuda(cudaError_t result) {
-    if (result != cudaSuccess)
-        std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n";
-    return result;
+  if (result != cudaSuccess)
+    std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n";
+  return result;
 }
 
 inline cublasStatus_t checkCublas(cublasStatus_t result) {
-    if (result != CUBLAS_STATUS_SUCCESS)
-        std::cerr << "cuBLAS Error: " << result << "\n";
-    return result;
+  if (result != CUBLAS_STATUS_SUCCESS)
+    std::cerr << "cuBLAS Error: " << result << "\n";
+  return result;
 }
 
 template <typename T>
-inline void printArray(const T * const __restrict__ array,
+inline void printArray(const T *const __restrict__ array,
                        const unsigned elements) {
-    for (unsigned i = 0; i < elements; i++)
-        std::cout << std::to_string(array[i]) << "\n";
+  for (unsigned i = 0; i < elements; i++)
+    std::cout << std::to_string(array[i]) << "\n";
 }
 
 // initialization
 template <typename T>
-__global__ void initKernel(T * const __restrict__ array,
+__global__ void initKernel(T *const __restrict__ array,
                            const unsigned elements) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        array[idx] = 1.2;
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < elements)
+    array[idx] = 1.2;
 }
 
 template <typename T>
-void init(T * const __restrict__ array,
-          const unsigned elements) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    initKernel<<<num_blocks, block_size>>>(array, elements);
-    checkCuda(cudaDeviceSynchronize());
+void init(T *const __restrict__ array, const unsigned elements) {
+  const unsigned block_size = 512;
+  const unsigned num_blocks = (elements + block_size - 1) / block_size;
+  initKernel<<<num_blocks, block_size>>>(array, elements);
+  checkCuda(cudaDeviceSynchronize());
 }
 
 // float to half
-__global__ void f2hKernel(const float * const __restrict__ input,
+__global__ void f2hKernel(const float *const __restrict__ input,
                           const unsigned elements,
-                          half * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __float2half_rn(input[idx]);
+                          half *const __restrict__ output) {
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < elements)
+    output[idx] = __float2half_rn(input[idx]);
 }
 
-void f2h(const float * const __restrict__ input,
-         const unsigned elements,
-         half * const __restrict__ output) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    f2hKernel<<<num_blocks, block_size>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
+void f2h(const float *const __restrict__ input, const unsigned elements,
+         half *const __restrict__ output) {
+  const unsigned block_size = 512;
+  const unsigned num_blocks = (elements + block_size - 1) / block_size;
+  f2hKernel<<<num_blocks, block_size>>>(input, elements, output);
+  checkCuda(cudaDeviceSynchronize());
 }
 
 // half to float
-__global__ void h2fKernel(const half * const __restrict__ input,
+__global__ void h2fKernel(const half *const __restrict__ input,
                           const unsigned elements,
-                          float * const __restrict__ output) {
-    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < elements)
-        output[idx] = __half2float(input[idx]);
+                          float *const __restrict__ output) {
+  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < elements)
+    output[idx] = __half2float(input[idx]);
 }
 
-void h2f(const half * const __restrict__ input,
-         const unsigned elements,
-         float * const __restrict__ output) {
-    const unsigned block_size = 512;
-    const unsigned num_blocks = (elements + block_size - 1) / block_size;
-    h2fKernel<<<num_blocks, block_size>>>(input, elements, output);
-    checkCuda(cudaDeviceSynchronize());
+void h2f(const half *const __restrict__ input, const unsigned elements,
+         float *const __restrict__ output) {
+  const unsigned block_size = 512;
+  const unsigned num_blocks = (elements + block_size - 1) / block_size;
+  h2fKernel<<<num_blocks, block_size>>>(input, elements, output);
+  checkCuda(cudaDeviceSynchronize());
 }
 
-void sgemm(const float * const __restrict__ a,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ b,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ c) {
-    const unsigned iterations = 10;
-    float kernel_time;
-    cudaEvent_t start;
-    cudaEvent_t stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    cublasHandle_t handle;
-    checkCublas(cublasCreate(&handle));
-
-    // Enable Tensor Cores
-    checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-
-    const float alpha_ = 1.0;
-    const float beta_  = 0.0;
-    const float *alpha = &alpha_;
-    const float *beta  = &beta_;
-
-    cudaEventRecord(start, 0);
-    for (unsigned i = 0; i < iterations; i++) {
-        checkCublas(cublasGemmEx(handle,
-                                 CUBLAS_OP_N,
-                                 CUBLAS_OP_N,
-                                 // Dimensions
-                                 num_rows_a,
-                                 num_cols_b,
-                                 num_cols_a,
-                                 alpha,
-                                 // A
-                                 a,
-                                 CUDA_R_32F,
-                                 num_rows_a,
-                                 // B
-                                 b,
-                                 CUDA_R_32F,
-                                 num_rows_b,
-                                 beta,
-                                 // C
-                                 c,
-                                 CUDA_R_32F,
-                                 num_rows_a,
-                                 // Compute precision and algorithm
-                                 CUDA_R_32F,
-                                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&kernel_time, start, stop);
-
-    std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations) << " ms\n";
+void sgemm(const float *const __restrict__ a, const unsigned num_rows_a,
+           const unsigned num_cols_a, const float *const __restrict__ b,
+           const unsigned num_rows_b, const unsigned num_cols_b,
+           float *const __restrict__ c) {
+  const unsigned iterations = 10;
+  float kernel_time;
+  cudaEvent_t start;
+  cudaEvent_t stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  cublasHandle_t handle;
+  checkCublas(cublasCreate(&handle));
+
+  // Enable Tensor Cores
+  checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+
+  const float alpha_ = 1.0;
+  const float beta_ = 0.0;
+  const float *alpha = &alpha_;
+  const float *beta = &beta_;
+
+  cudaEventRecord(start, 0);
+  for (unsigned i = 0; i < iterations; i++) {
+    checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             // Dimensions
+                             num_rows_a, num_cols_b, num_cols_a, alpha,
+                             // A
+                             a, CUDA_R_32F, num_rows_a,
+                             // B
+                             b, CUDA_R_32F, num_rows_b, beta,
+                             // C
+                             c, CUDA_R_32F, num_rows_a,
+                             // Compute precision and algorithm
+                             CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  }
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&kernel_time, start, stop);
+
+  std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations)
+            << " ms\n";
 }
 
-void hgemm(const float * const __restrict__ af,
-           const unsigned num_rows_a,
-           const unsigned num_cols_a,
-           const float * const __restrict__ bf,
-           const unsigned num_rows_b,
-           const unsigned num_cols_b,
-           float * const __restrict__ cf) {
-    const unsigned iterations = 10;
-
-    const unsigned num_elements_a = num_rows_a * num_cols_a;
-    const unsigned num_elements_b = num_rows_b * num_cols_b;
-    const unsigned num_elements_c = num_rows_a * num_cols_b;
-
-    float to_fp16_time;
-    float to_fp32_time;
-    float kernel_time;
-    float total_time;
-
-    cudaEvent_t start;
-    cudaEvent_t stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    half *a;
-    half *b;
-    half *c;
-
-    checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a));
-    checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b));
-    checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c));
-
-    init(a, num_elements_a);
-    init(b, num_elements_b);
-    init(c, num_elements_c);
-
-    // Convert floats to halfs
-    cudaEventRecord(start, 0);
-    f2h(af, num_elements_a, a);
-    f2h(bf, num_elements_b, b);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&to_fp16_time, start, stop);
-
-    cublasHandle_t handle;
-    checkCublas(cublasCreate(&handle));
-    checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-
-    const half alpha_ = cpu_float2half_rn(1.0);
-    const half beta_  = cpu_float2half_rn(0.0);
-    const half *alpha = &alpha_;
-    const half *beta  = &beta_;
-
-    cudaEventRecord(start, 0);
-    for (unsigned i = 0; i < iterations; i++) {
-        checkCublas(cublasGemmEx(handle,
-                                 CUBLAS_OP_N,
-                                 CUBLAS_OP_N,
-                                 // Dimensions
-                                 num_rows_a,
-                                 num_cols_b,
-                                 num_cols_a,
-                                 alpha,
-                                 // A
-                                 a,
-                                 CUDA_R_16F,
-                                 num_rows_a,
-                                 // B
-                                 b,
-                                 CUDA_R_16F,
-                                 num_rows_b,
-                                 beta,
-                                 // C
-                                 c,
-                                 CUDA_R_16F,
-                                 num_rows_a,
-                                 // Compute precision and algorithm
-                                 CUDA_R_16F,
-                                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-    }
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&kernel_time, start, stop);
-
-    cudaEventRecord(start, 0);
-    h2f(c, num_elements_c, cf);
-    cudaEventRecord(stop, 0);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&to_fp32_time, start, stop);
-
-    total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time;
-    std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n";
-    std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n";
-    std::cout << "\tKernel : " << std::to_string(kernel_time / iterations) << " ms\n";
-    std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n";
+void hgemm(const float *const __restrict__ af, const unsigned num_rows_a,
+           const unsigned num_cols_a, const float *const __restrict__ bf,
+           const unsigned num_rows_b, const unsigned num_cols_b,
+           float *const __restrict__ cf) {
+  const unsigned iterations = 10;
+
+  const unsigned num_elements_a = num_rows_a * num_cols_a;
+  const unsigned num_elements_b = num_rows_b * num_cols_b;
+  const unsigned num_elements_c = num_rows_a * num_cols_b;
+
+  float to_fp16_time;
+  float to_fp32_time;
+  float kernel_time;
+  float total_time;
+
+  cudaEvent_t start;
+  cudaEvent_t stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  half *a;
+  half *b;
+  half *c;
+
+  checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a));
+  checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b));
+  checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c));
+
+  init(a, num_elements_a);
+  init(b, num_elements_b);
+  init(c, num_elements_c);
+
+  // Convert floats to halfs
+  cudaEventRecord(start, 0);
+  f2h(af, num_elements_a, a);
+  f2h(bf, num_elements_b, b);
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&to_fp16_time, start, stop);
+
+  cublasHandle_t handle;
+  checkCublas(cublasCreate(&handle));
+  checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+
+  const half alpha_ = cpu_float2half_rn(1.0);
+  const half beta_ = cpu_float2half_rn(0.0);
+  const half *alpha = &alpha_;
+  const half *beta = &beta_;
+
+  cudaEventRecord(start, 0);
+  for (unsigned i = 0; i < iterations; i++) {
+    checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             // Dimensions
+                             num_rows_a, num_cols_b, num_cols_a, alpha,
+                             // A
+                             a, CUDA_R_16F, num_rows_a,
+                             // B
+                             b, CUDA_R_16F, num_rows_b, beta,
+                             // C
+                             c, CUDA_R_16F, num_rows_a,
+                             // Compute precision and algorithm
+                             CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+  }
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&kernel_time, start, stop);
+
+  cudaEventRecord(start, 0);
+  h2f(c, num_elements_c, cf);
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&to_fp32_time, start, stop);
+
+  total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time;
+  std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n";
+  std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n";
+  std::cout << "\tKernel : " << std::to_string(kernel_time / iterations)
+            << " ms\n";
+  std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n";
 }
 
-
- 
 #endif
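
A usage sketch for the two benchmark entry points above (hypothetical driver, assuming the declarations in this file are visible and managed memory is supported on the device): sgemm times an FP32 cublasGemmEx directly, while hgemm wraps the same call in f2h / h2f conversions and reports the conversion overhead separately.

  void compareGemm(unsigned m, unsigned k, unsigned n) {
    float *a, *b, *c;
    checkCuda(cudaMallocManaged(&a, sizeof(float) * m * k));
    checkCuda(cudaMallocManaged(&b, sizeof(float) * k * n));
    checkCuda(cudaMallocManaged(&c, sizeof(float) * m * n));
    init(a, m * k); // fill with the constant value used by initKernel
    init(b, k * n);
    init(c, m * n);
    sgemm(a, m, k, b, k, n, c); // FP32 path, Tensor Cores enabled via cuBLAS
    hgemm(a, m, k, b, k, n, c); // FP16 path: f2h -> cublasGemmEx -> h2f
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
  }
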
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
index 4902043b7c..b812a51d7e 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc
@@ -48,4 +48,3 @@ PerfParamSet *perfParamSet;
 SampParamSet *sampParamSet;
 
 unsigned int currentTensorID = -1;
-
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
index ab8896369a..6a3fcc12e0 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
@@ -1,14 +1,14 @@
-//===--------------------------- group_conv.cu -----------------------------===//
+//===--------------------------- group_conv.cu ----------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  group convolutions with FP16 and FP32 compute precisions. 
+//
+//  This file implements group convolutions in FP16 and FP32 compute precision.
 // Note that group convolutions, unlike regular convolutions, are not
 // approximable in any other way in HPVM.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "tensor_utils.h"
 #include "fp16_gemm.h"
 #include "debug.h"
@@ -17,31 +17,26 @@
 #include "op_overheads.h"
 #include "error.h"
 
+extern "C" {
 
-extern "C"{
-
-
+__global__ void depthwise_convNew8(
+    float *const __restrict__ y, const float *const __restrict__ x,
+    const float *const __restrict__ w, const int B, const int M, const int H,
+    const int W, const int KH, const int KW, const int H_out, const int W_out,
+    const int H_pad, const int W_pad, const int H_stride, const int W_stride) {
 
-__global__ void depthwise_convNew8(float* const __restrict__ y,
-				   const float* const __restrict__ x,
-				   const float* const __restrict__ w,
-				   const int B, const int M,
-				   const int H, const int W, const int KH,
-				   const int KW, const int H_out, const int W_out,
-				   const int H_pad, const int W_pad,
-				   const int H_stride, const int W_stride)
-{
-
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+#define y4d(i3, i2, i1, i0)                                                    \
+  y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+#define x4d(i3, i2, i1, i0)                                                    \
+  x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
   const int num = 8;
 
   const int b = num * blockIdx.x;
-  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
-	
-  if(m < M){
-    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
+
+  if (m < M) {
+    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
 
     const int start_h = (tx / W_out) * H_stride - H_pad;
     const int start_w = (tx % W_out) * W_stride - W_pad;
@@ -54,80 +49,73 @@ __global__ void depthwise_convNew8(float* const __restrict__ y,
     float c5 = 0;
     float c6 = 0;
     float c7 = 0;
-	
-    const float* weights = &w[m * KH * KW];
+
+    const float *weights = &w[m * KH * KW];
 
     for (int k = 0; k < KH * KW; k++) {
       int p = k / KW;
       int q = k % KW;
 
-      if (start_h + p > -1 && start_h + p < H &&
-	  start_w + q > -1 && start_w + q < W) {
-
-	c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
-	if(b + 1 < B)
-	  c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
-	if(b + 2 < B)
-	  c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
-	if(b + 3 < B)
-	  c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
-	if(b + 4 < B)
-	  c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
-	if(b + 5 < B)
-	  c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
-	if(b + 6 < B)
-	  c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
-	if(b + 7 < B)
-	  c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
-    
-
+      if (start_h + p > -1 && start_h + p < H && start_w + q > -1 &&
+          start_w + q < W) {
+
+        c0 += x4d(b, m, start_h + p, start_w + q) * weights[k];
+        if (b + 1 < B)
+          c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k];
+        if (b + 2 < B)
+          c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k];
+        if (b + 3 < B)
+          c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k];
+        if (b + 4 < B)
+          c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k];
+        if (b + 5 < B)
+          c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k];
+        if (b + 6 < B)
+          c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k];
+        if (b + 7 < B)
+          c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k];
       }
     }
 
-    y4d(b, m, 0, tx) = c0;	
-    if(b + 1 < B)
+    y4d(b, m, 0, tx) = c0;
+    if (b + 1 < B)
       y4d(b + 1, m, 0, tx) = c1;
-    if(b + 2 < B)
+    if (b + 2 < B)
       y4d(b + 2, m, 0, tx) = c2;
-    if(b + 3 < B)
+    if (b + 3 < B)
       y4d(b + 3, m, 0, tx) = c3;
-    if(b + 4 < B)
+    if (b + 4 < B)
       y4d(b + 4, m, 0, tx) = c4;
-    if(b + 5 < B)
+    if (b + 5 < B)
       y4d(b + 5, m, 0, tx) = c5;
-    if(b + 6 < B)
+    if (b + 6 < B)
       y4d(b + 6, m, 0, tx) = c6;
-    if(b + 7 < B)
+    if (b + 7 < B)
       y4d(b + 7, m, 0, tx) = c7;
   }
-	
-  #undef y4d 
-  #undef x4d
-}
-
-
 
+#undef y4d
+#undef x4d
+}
 
-__global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
-					const __half* const __restrict__ x,
-					const __half* const __restrict__ w,
-					const int B, const int M,
-					const int H, const int W, const int KH,
-					const int KW, const int H_out, const int W_out,
-					const int H_pad, const int W_pad,
-					const int H_stride, const int W_stride)
-{
+__global__ void depthwise_convNew8_half2(
+    __half *const __restrict__ y, const __half *const __restrict__ x,
+    const __half *const __restrict__ w, const int B, const int M, const int H,
+    const int W, const int KH, const int KW, const int H_out, const int W_out,
+    const int H_pad, const int W_pad, const int H_stride, const int W_stride) {
 
-  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
-  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+#define y4d(i3, i2, i1, i0)                                                    \
+  y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+#define x4d(i3, i2, i1, i0)                                                    \
+  x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
 
   const int num = 8;
 
   const int b = num * blockIdx.x;
-  const int m = (blockIdx.y * blockDim.x  + threadIdx.x)/ (H_out * W_out);
-	
-  if(m < M){
-    const int tx = (blockIdx.y * blockDim.x  + threadIdx.x) % (H_out * W_out);
+  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
+
+  if (m < M) {
+    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
 
     const int start_h = (tx / W_out) * H_stride - H_pad;
     const int start_w = (tx % W_out) * W_stride - W_pad;
@@ -136,111 +124,112 @@ __global__ void depthwise_convNew8_half2(__half* const __restrict__ y,
     __half2 c1 = __half2half2(0);
     __half2 c2 = __half2half2(0);
     __half2 c3 = __half2half2(0);
-    	
-    const __half* weights = &w[m * KH * KW];
+
+    const __half *weights = &w[m * KH * KW];
 
     for (int k = 0; k < KH * KW; k++) {
       int p = k / KW;
       int q = k % KW;
-      if (start_h + p > -1 && start_h + p < H &&
-	  start_w + q > -1 && start_w + q < W) {
-
-      
-	__half2 t1;
-	__half2 t2;
-	__half2 t3;
-	__half2 t4;
-	if(b + 7 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	  t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q));
-	}
-	else if(b + 6 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	  t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 5 < B){
-	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	    t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q));
-	}
-	else if(b + 4 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	  t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 3 < B){
-	    t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	    t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q));
-	 }
-	else if(b + 2 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	  t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
-
-	}
-	else if(b + 1 < B){
-	  t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q));
-	}
-	else{
-	  t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
-
-	 }
-
-	
-	c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
-	c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
-	c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
-	c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
-
+      if (start_h + p > -1 && start_h + p < H && start_w + q > -1 &&
+          start_w + q < W) {
+
+        __half2 t1;
+        __half2 t2;
+        __half2 t3;
+        __half2 t4;
+        if (b + 7 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q),
+                              x4d(b + 4, m, start_h + p, start_w + q));
+          t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q),
+                              x4d(b + 6, m, start_h + p, start_w + q));
+        } else if (b + 6 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q),
+                              x4d(b + 4, m, start_h + p, start_w + q));
+          t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q));
+
+        } else if (b + 5 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q),
+                              x4d(b + 4, m, start_h + p, start_w + q));
+        } else if (b + 4 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+          t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q));
+
+        } else if (b + 3 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q),
+                              x4d(b + 2, m, start_h + p, start_w + q));
+        } else if (b + 2 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+          t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q));
+
+        } else if (b + 1 < B) {
+          t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q),
+                              x4d(b, m, start_h + p, start_w + q));
+        } else {
+          t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q));
+        }
+
+        c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0);
+        c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1);
+        c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2);
+        c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
       }
     }
 
-    y4d(b, m, 0, tx) = __high2half(c0);	
-    if(b + 1 < B)
+    y4d(b, m, 0, tx) = __high2half(c0);
+    if (b + 1 < B)
       y4d(b + 1, m, 0, tx) = __low2half(c0);
-    if(b + 2 < B)
+    if (b + 2 < B)
       y4d(b + 2, m, 0, tx) = __high2half(c1);
-    if(b + 3 < B)
+    if (b + 3 < B)
       y4d(b + 3, m, 0, tx) = __low2half(c1);
-    if(b + 4 < B)
+    if (b + 4 < B)
       y4d(b + 4, m, 0, tx) = __high2half(c2);
-    if(b + 5 < B)
+    if (b + 5 < B)
       y4d(b + 5, m, 0, tx) = __low2half(c2);
-    if(b + 6 < B)
+    if (b + 6 < B)
       y4d(b + 6, m, 0, tx) = __high2half(c3);
-    if(b + 7 < B)
+    if (b + 7 < B)
       y4d(b + 7, m, 0, tx) = __low2half(c3);
   }
-	
-  #undef y4d 
-  #undef x4d
-}
-
-
 
-void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){
+#undef y4d
+#undef x4d
+}
 
+void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups) {
 
   INFO("*** TensorConvolution \n");
   profileEvent("Conv");
 
-  Tensor* input = (Tensor*)input_ptr;
-  Tensor* filter = (Tensor*)filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
-  //FIXME: Current hack to preserve backward compatibilty
+  // FIXME: Current hack to preserve backward compatibility
   if (conv_groups == 0) {
     conv_groups = 1;
   }
 
-  Tensor* output;
+  Tensor *output;
 
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
@@ -248,43 +237,43 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
   convertToFP32(input);
   convertToFP32(filter);
 
-  
   if (conv_groups > 32) {
-    // TODO: Support other cases;  
+    // TODO: Support other cases;
     hostToDeviceCopy(input);
     hostToDeviceCopy(filter);
 
-    int n, c, h, w; // output dimensions  
+    int n, c, h, w; // output dimensions
     n = input->dims.dim_sizes[0];
     c = input->dims.dim_sizes[1];
     const int KH = filter->dims.dim_sizes[2];
     const int KW = filter->dims.dim_sizes[3];
-    h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-    w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
+    h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride +
+        1;
+    w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) /
+            horizontal_stride +
+        1;
 
-    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				     CUDNN_TENSOR_NCHW, n, c, h, w);
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)float_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
 
-
-		
     int blockSize;
     blockSize = 64;
-		
-    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
+
+    dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize);
     dim3 block(blockSize);
-    depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data,
-					 (float*)input->gpu_data, (float*)filter->gpu_data,
-					 input->dims.dim_sizes[0], input->dims.dim_sizes[1],
-					 input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					 KH, KW, h, w, vertical_pad, horizontal_pad,
-					 vertical_stride, horizontal_stride);
+    depthwise_convNew8<<<grid, block>>>(
+        (float *)output->gpu_data, (float *)input->gpu_data,
+        (float *)filter->gpu_data, input->dims.dim_sizes[0],
+        input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+        input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad,
+        vertical_stride, horizontal_stride);
 
-  }
-  else {
+  } else {
 
     cudnnConvolutionDescriptor_t convDesc;
     cudnnConvolutionFwdAlgo_t convAlgo;
@@ -297,130 +286,119 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     // FIXIT: Need to be more aware of the implications of alpha and beta
     float alpha = 1.0f, beta = 0.0f;
 
-    // TODO: Support other cases;  
+    // TODO: Support other cases;
     hostToDeviceCopy(input);
     hostToDeviceCopy(filter);
 
-    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
+    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+         horizontal_stride);
 
     checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
     // NOTE: Adding support for grouped convolution
     checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-
     cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
     // FIXIT: Think if upscaling values need to be configurable?
-    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					       vertical_pad, horizontal_pad, // conv padding
-					       vertical_stride, horizontal_stride, // conv strides
-					       1, 1, // upscaling values
-					       mode, // mode is configurable
-					       computeType)); // defines compute precision
-
-    int n, c, h, w; // output dimensions  
-    // Find dimension of convolution output
-    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     &n, &c, &h, &w));
+    // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE
+    // should be used?
+    checkCUDNN(cudnnSetConvolution2dDescriptor(
+        convDesc, vertical_pad, horizontal_pad, // conv padding
+        vertical_stride, horizontal_stride,     // conv strides
+        1, 1,                                   // upscaling values
+        mode,                                   // mode is configurable
+        computeType));                          // defines compute precision
 
+    int n, c, h, w; // output dimensions
+    // Find dimension of convolution output
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+        convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h,
+          w);
 
     if (input->data_format == CUDNN_TENSOR_NCHW)
-      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
-				       CUDNN_TENSOR_NCHW, n, c, h, w);
+      output = (Tensor *)create4DTensor(
+          (cudnnDataType_t)float_type, // input->data_type,
+          CUDNN_TENSOR_NCHW, n, c, h, w);
     else if (input->data_format == CUDNN_TENSOR_NHWC) {
       DEBUG("* NHWC Format \n");
-      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
-				       CUDNN_TENSOR_NHWC, n, h, w, c);
-    }
-    else
+      output = (Tensor *)create4DTensor(
+          (cudnnDataType_t)float_type, // input->data_type,
+          CUDNN_TENSOR_NHWC, n, h, w, c);
+    } else
       ERROR("Unsupported Tensor Type");
 
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
 
-    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	  output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H "
+          "= %d, W = %d \n",
+          output->data_type, output->data_format, output->dims.dim_sizes[0],
+          output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+          output->dims.dim_sizes[3]);
 
     if (convDesc == NULL || input->tensor_desc == NULL ||
-	filter->filter_desc == NULL || output->tensor_desc == NULL)
+        filter->filter_desc == NULL || output->tensor_desc == NULL)
       ERROR("NULL descriptor! \n");
 
-
-    // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-    checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   convDesc,
-						   output->tensor_desc,
-						   CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-						   //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						   0,
-						   &convAlgo));
-
+    // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN
+    // support is lacking
+    checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+        cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+        output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+        // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+        0, &convAlgo));
 
     DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	  CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	  CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-
+          CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
     // FIXIT: Algo shouldn't be hardcoded
     convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
     size_t workspace_size;
-    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						       input->tensor_desc,
-						       filter->filter_desc,
-						       convDesc,
-						       output->tensor_desc,
-						       convAlgo,
-						       &workspace_size));
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+        cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+        output->tensor_desc, convAlgo, &workspace_size));
 
     // Allocating memory for the convolution workspace
-    void* workspace;
+    void *workspace;
     checkCudaErrors(cudaMalloc(&workspace, workspace_size));
     DEBUG("workspace size = %d \n", workspace_size);
 
-
-    checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				       input->gpu_data, filter->filter_desc, filter->gpu_data,
-				       convDesc, convAlgo, workspace, workspace_size,
-				       &beta, output->tensor_desc, output->gpu_data));
+    checkCUDNN(cudnnConvolutionForward(
+        cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+        filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace,
+        workspace_size, &beta, output->tensor_desc, output->gpu_data));
   }
 
   cudaDeviceSynchronize();
   profileEvent("Conv_end", true);
 
   return output;
-
-
 }
 
 // FIXME: Need to properly fix the new HALF type conversion
-void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
-			    int vertical_pad, int horizontal_pad,
-			    int vertical_stride, int horizontal_stride,
-			    int conv_mode, int conv_groups){
+void *tensorHalfConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups) {
 
   INFO("*** TensorHConvolution \n");
   profileEvent("#Conv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  
-  if(conv_mode == 0)
+
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   // FIXIT: Need to be more aware of the implications of alpha and beta
@@ -432,33 +410,34 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-
   // Float-Half Conversions
   profileEvent("F2H_start");
 
   convertToFP16(input);
-  convertToFP16(filter);  
+  convertToFP16(filter);
 
   profileEvent("F2H_end");
   /******* END OF INPUT DATA CONVERSIONS*/
 
-  
   Tensor *output;
-  if(conv_groups > 1){
+  if (conv_groups > 1) {
     int n = input->dims.dim_sizes[0];
     int c = input->dims.dim_sizes[1];
     const int KH = filter->dims.dim_sizes[2];
     const int KW = filter->dims.dim_sizes[3];
-    int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
-    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
-    
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-    
+    int h =
+        (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride +
+        1;
+    int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) /
+                horizontal_stride +
+            1;
+
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h,
+          w);
 
-    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, 
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
+    output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
+                                      CUDNN_TENSOR_NCHW, n, c, h, w);
 
-  
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
@@ -466,117 +445,90 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
     int blockSize;
     blockSize = 128;
 
-    dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize);
+    dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize);
     dim3 block(blockSize);
-    depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data,
-					      (__half*) input->gpu_half_data,
-					      (__half*) filter->gpu_half_data,
-					      input->dims.dim_sizes[0], input->dims.dim_sizes[1],
-					      input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					      KH, KW, h, w,
-					      vertical_pad, horizontal_pad,
-					      vertical_stride, horizontal_stride);
+    depthwise_convNew8_half2<<<grid, block>>>(
+        (__half *)output->gpu_half_data, (__half *)input->gpu_half_data,
+        (__half *)filter->gpu_half_data, input->dims.dim_sizes[0],
+        input->dims.dim_sizes[1], input->dims.dim_sizes[2],
+        input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad,
+        vertical_stride, horizontal_stride);
     cudaDeviceSynchronize();
 
-    
-  }
-  else{    
+  } else {
     checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-    //FIXME: Current hack to preserve backward compatibilty
-    if(conv_groups == 0){
+    // FIXME: Current hack to preserve backward compatibility
+    if (conv_groups == 0) {
       conv_groups = 1;
     }
-  
+
     // NOTE: Adding support for grouped convolution
     checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-  
-    checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					       vertical_pad, horizontal_pad, // conv padding
-					       vertical_stride, horizontal_stride, // conv strides
-					       1, 1, // upscaling values
-					       mode, // mode is configurable
-					       computeType)); // defines compute precision
+    checkCUDNN(cudnnSetConvolution2dDescriptor(
+        convDesc, vertical_pad, horizontal_pad, // conv padding
+        vertical_stride, horizontal_stride,     // conv strides
+        1, 1,                                   // upscaling values
+        mode,                                   // mode is configurable
+        computeType));                          // defines compute precision
 
     int n, c, h, w; // output dimensions
     // Find dimension of convolution output
-    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						     input->tensor_half_desc,
-						     filter->filter_half_desc,
-						     &n, &c, &h, &w));
-    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
-
+    checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+        convDesc, input->tensor_half_desc, filter->filter_half_desc, &n, &c, &h,
+        &w));
+    DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h,
+          w);
 
-    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type,
-				      CUDNN_TENSOR_NCHW, n, c, h, w);
+    output = (Tensor *)create4DTensor(
+        (cudnnDataType_t)half_type, // input->data_type,
+        CUDNN_TENSOR_NCHW, n, c, h, w);
 
-  
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(output, DEVICE);
     // NOTE: Necessary to insert the above call for every output tensor
 
-    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
-	  output->data_type, output->data_format,
-	  output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	  output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+    DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W "
+          "= %d, C = %d \n",
+          output->data_type, output->data_format, output->dims.dim_sizes[0],
+          output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+          output->dims.dim_sizes[3]);
 
-    if(convDesc == NULL || input->tensor_desc == NULL ||
-       filter->filter_desc == NULL || output->tensor_desc == NULL)
+    if (convDesc == NULL || input->tensor_desc == NULL ||
+        filter->filter_desc == NULL || output->tensor_desc == NULL)
       ERROR("NULL descriptor! \n");
 
-
     // NOTE: The following algo works with TRUE half precision
     convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+    // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
-  
     size_t workspace_size;
-    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						       input->tensor_half_desc,
-						       filter->filter_half_desc,
-						       convDesc,
-						       output->tensor_half_desc,
-						       convAlgo,
-						       &workspace_size));
+    checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+        cudnnHandle, input->tensor_half_desc, filter->filter_half_desc,
+        convDesc, output->tensor_half_desc, convAlgo, &workspace_size));
 
     // Allocating memory for the convolution workspace
     DEBUG("workspace size = %d \n", workspace_size);
-    void* workspace;
+    void *workspace;
     checkCudaErrors(cudaMalloc(&workspace, workspace_size));
 
-
-
-
-    checkCUDNN(cudnnConvolutionForward(cudnnHandle,
-				       &alpha,
-				       input->tensor_half_desc,
-				       input->gpu_half_data,
-				       filter->filter_half_desc,
-				       filter->gpu_half_data,
-				       convDesc, convAlgo, workspace, workspace_size,
-				       &beta,
-				       output->tensor_half_desc,
-				       output->gpu_half_data));
-
+    checkCUDNN(cudnnConvolutionForward(
+        cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data,
+        filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo,
+        workspace, workspace_size, &beta, output->tensor_half_desc,
+        output->gpu_half_data));
   }
-  
+
   profileEvent("H2F_start");
 
   convertToFP32_offline(output);
-  
-  profileEvent("H2F_end");
 
+  profileEvent("H2F_end");
 
   profileEvent("#Conv_end");
 
-  
   return output;
-
 }
 
-
-  
-
-}// End of Extern C
-
+} // End of Extern C
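
For reference, the depthwise path above sizes its launch so that each thread produces one output pixel of one channel for up to 8 consecutive images of the batch (the num = 8 unrolling in depthwise_convNew8): grid.x covers the batch in groups of 8 and grid.y covers all channel-by-pixel positions in chunks of blockSize. A minimal sketch of that geometry (hypothetical helper; the FP32 path uses blockSize = 64, the FP16 path 128):

  void depthwiseLaunchGeometry(int n, int c, int in_h, int in_w, int KH, int KW,
                               int vertical_pad, int horizontal_pad,
                               int vertical_stride, int horizontal_stride) {
    // Output spatial dimensions, as computed in tensorConvCutlass.
    int h = (2 * vertical_pad + in_h - KH) / vertical_stride + 1;
    int w = (2 * horizontal_pad + in_w - KW) / horizontal_stride + 1;

    int blockSize = 64;
    dim3 grid((n + 7) / 8, (c * h * w + blockSize - 1) / blockSize);
    dim3 block(blockSize);

    // depthwise_convNew8<<<grid, block>>>(y, x, weights, n, c, in_h, in_w,
    //                                     KH, KW, h, w, vertical_pad,
    //                                     horizontal_pad, vertical_stride,
    //                                     horizontal_stride);
    (void)grid;
    (void)block;
  }
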
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
index e706080051..8324b18e04 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
@@ -1,9 +1,11 @@
-//===--------------------------- half_precision_api.cu --------------------------===//
+//===----------------------- half_precision_api.cu ------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the custom implementation of tensor precision changing
-// kernels useful for approximated and non-approximated versions of tensor 
+//
+//  This file consists of the custom implementation of the tensor
+// precision-changing kernels useful for approximated and non-approximated
+// versions of tensor
 // operations. This file also contains API for tensor operations operating on
 // tensors with half-precision.
 //
@@ -12,7 +14,6 @@
 #ifndef HALF_API_HEADER
 #define HALF_API_HEADER
 
-
 #include <stdio.h>
 #include <stdarg.h>
 #include <cstdio>
@@ -37,7 +38,6 @@
 #include <cuda_fp16.h>
 #include <driver_types.h>
 
-
 // Tensor runtime header files
 #include "../include/tensor_runtime.h"
 #include "../include/tensor_utils.h"
@@ -48,15 +48,13 @@
 #include "../include/fp16_gemm.h"
 #include "../include/fp16_conversion.h"
 
-
-
-void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){
+void *tensorHalfGemm(void *lhs_ptr, void *rhs_ptr) {
 
   INFO("*** TensorHalfGemm \n");
   profileEvent("#Mul");
 
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
 
   DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims);
   DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims);
@@ -64,65 +62,60 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){
   hostToDeviceCopy(lhs);
   hostToDeviceCopy(rhs);
 
-  
   profileEvent("F2H_start");
 
   convertToFP16(lhs);
   convertToFP16(rhs);
-  
-  profileEvent("F2H_end");
 
+  profileEvent("F2H_end");
 
   // 'm' holds the batch dimension - assuming NCHW format Tensors
   int m = lhs->dims.dim_sizes[0];
   // The rhs last dimension must contain the neurons
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
   int k = 1;
 
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
     k = k * lhs->dims.dim_sizes[j]; // input neurons
   }
 
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
   // Dimension-note: Check if k is same across the two tensors
   DEBUG("m = %d, n = %d, k = %d \n", m, n, k);
-  if(rhs_k != k){
+  if (rhs_k != k) {
     ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
   }
 
-  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines
-  Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW,
-					    m, n, 1, 1);
+  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN
+  // routines
+  Tensor *output =
+      (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, m, n, 1, 1);
 
   changeTensorPlacement(output, DEVICE);
 
-  //convertToFP16(output);
-
+  // convertToFP16(output);
 
   // INFO: cuBlas uses column-major format
   // INFO: The leading dimension is just the FIRST Dimension
-  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects
+  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN
+  // expects
   const __half alf = approx_float_to_half(1.0);
   const __half bet = approx_float_to_half(0.0);
   const __half *alpha_half = &alf;
   const __half *beta_half = &bet;
 
-
-  checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-			       n, m, k,
-			       alpha_half,
-			       (__half*) rhs->gpu_half_data, CUDA_R_16F, n,
-			       (__half*) lhs->gpu_half_data, CUDA_R_16F, k,
-			       beta_half,
-			       (__half*) output->gpu_half_data, CUDA_R_16F, n,
-			       CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) );
-
+  checkCudaErrors(cublasGemmEx(
+      cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, alpha_half,
+      (__half *)rhs->gpu_half_data, CUDA_R_16F, n, (__half *)lhs->gpu_half_data,
+      CUDA_R_16F, k, beta_half, (__half *)output->gpu_half_data, CUDA_R_16F, n,
+      CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(output);
 
-  //h2f((half*) output_half->gpu_data, output->num_elems, (float*) output->gpu_data);
+  // h2f((half*) output_half->gpu_data, output->num_elems, (float*)
+  // output->gpu_data);
 
   profileEvent("H2F_end");
 
@@ -131,32 +124,28 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){
   return output;
 }
 
-
-
-void* tensorHalfGemmGPU(void* lhs_ptr, void* rhs_ptr){
+void *tensorHalfGemmGPU(void *lhs_ptr, void *rhs_ptr) {
   return tensorHalfGemm(lhs_ptr, rhs_ptr);
 }
 
-
-
 // FIXIT: Generalize all of the routines for types {half, float, double}
-void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
-			    int vertical_pad, int horizontal_pad,
-			    int vertical_stride, int horizontal_stride,
-			    int conv_mode, int conv_groups){
+void *tensorHalfConvolution(void *input_ptr, void *filter_ptr, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride, int conv_mode,
+                            int conv_groups) {
 
   INFO("*** TensorHConvolution \n");
   profileEvent("#Conv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
 
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  if(conv_mode == 0)
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   // FIXIT: Need to be more aware of the implications of alpha and beta
@@ -168,7 +157,6 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
-
   /***** CONVERSIONS from FP32 to FP16 - on the GPU */
   profileEvent("F2H_start");
 
@@ -178,95 +166,76 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
   profileEvent("F2H_end");
   /******* END OF INPUT DATA CONVERSIONS*/
 
-  
-
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
   }
-  
+
   // NOTE: Adding support for grouped convolution
   checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-  
   // FIXIT: Think if upscaling values need to be configurable?
   // IMP-FIXIT:  CUDNN Cross correlation is only used in the Lenet context
-  // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used?
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     vertical_stride, horizontal_stride, // conv strides
-					     1, 1, // upscaling values
-					     mode, // mode is configurable
-					     computeType)); // defines compute precision
+  // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE
+  // should be used?
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      vertical_stride, horizontal_stride,     // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   int n, c, h, w; // output dimensions
   // Find dimension of convolution output
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
-  
-  DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
 
+  DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, // input->data_type,
-					    CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *output =
+      (Tensor *)create4DTensor((cudnnDataType_t)half_type, // input->data_type,
+                               CUDNN_TENSOR_NCHW, n, c, h, w);
 
   // NOTE: Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
 
-  //convertToFP16(output);
+  // convertToFP16(output);
 
-  
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
-	output->data_type, output->data_format,
-	output->dims.dim_sizes[0], output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = "
+        "%d, C = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
 
-  if(convDesc == NULL || input->tensor_half_desc == NULL ||
-     filter->filter_half_desc == NULL || output->tensor_half_desc == NULL)
+  if (convDesc == NULL || input->tensor_half_desc == NULL ||
+      filter->filter_half_desc == NULL || output->tensor_half_desc == NULL)
     ERROR("NULL descriptor! \n");
 
-
   // NOTE: The following algo works with TRUE half precision
 
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
 
-  //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+  // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
-  
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_half_desc,
-						     filter->filter_half_desc,
-						     convDesc,
-						     output->tensor_half_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_half_desc, filter->filter_half_desc, convDesc,
+      output->tensor_half_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
   DEBUG("workspace size = %d \n", workspace_size);
-  void* workspace;
+  void *workspace;
   checkCudaErrors(cudaMalloc(&workspace, workspace_size));
 
-
-
-
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle,
-				     &alpha,
-				     input->tensor_half_desc,
-				     input->gpu_half_data,
-				     filter->filter_half_desc,
-				     filter->gpu_half_data,
-				     convDesc, convAlgo,
-				     workspace, workspace_size,
-				     &beta,
-				     output->tensor_half_desc,
-				     output->gpu_half_data));
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data,
+      filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo,
+      workspace, workspace_size, &beta, output->tensor_half_desc,
+      output->gpu_half_data));
 
   profileEvent("H2F_start");
 
@@ -279,21 +248,18 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr,
   return output;
 }
 
-
-
-
-void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-           		  void* mean_ptr, void* variance_ptr, double epsilon){
+void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                          void *mean_ptr, void *variance_ptr, double epsilon) {
 
   INFO("*** TensorHalfBatchNorm \n");
   profileEvent("#BatchNorm");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* gamma = (Tensor*) gamma_ptr;
-  Tensor* beta = (Tensor*) beta_ptr;
-  Tensor* mean = (Tensor*) mean_ptr;
-  Tensor* variance = (Tensor*) variance_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *gamma = (Tensor *)gamma_ptr;
+  Tensor *beta = (Tensor *)beta_ptr;
+  Tensor *mean = (Tensor *)mean_ptr;
+  Tensor *variance = (Tensor *)variance_ptr;
+
   float alpha_val = 1.0f, beta_val = 0.0f;
   hostToDeviceCopy(input);
   hostToDeviceCopy(gamma);
@@ -301,56 +267,37 @@ void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
   hostToDeviceCopy(mean);
   hostToDeviceCopy(variance);
 
-  
   profileEvent("F2H_start");
 
   convertToFP16(input);
 
   profileEvent("F2H_end");
-  
-
-
-  checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL,
-						     &alpha_val, &beta_val,
-						     input->tensor_half_desc,
-						     input->gpu_half_data,
-						     input->tensor_half_desc,
-						     input->gpu_half_data,
-						     gamma->tensor_desc, gamma->gpu_data,
-						     beta->gpu_data, mean->gpu_data,
-						     variance->gpu_data, epsilon));
-
 
+  checkCUDNN(cudnnBatchNormalizationForwardInference(
+      cudnnHandle, CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val,
+      input->tensor_half_desc, input->gpu_half_data, input->tensor_half_desc,
+      input->gpu_half_data, gamma->tensor_desc, gamma->gpu_data, beta->gpu_data,
+      mean->gpu_data, variance->gpu_data, epsilon));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(input);
-  
-  profileEvent("H2F_end");
 
+  profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfBatchNorm_end", true);
 
-
   return input;
 }
 
-
-
-
-void* tensorHalfPooling(void* input_ptr,
-			int poolFunction,
-			int window_height, int window_width,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride){
-
-  
+void *tensorHalfPooling(void *input_ptr, int poolFunction, int window_height,
+                        int window_width, int vertical_pad, int horizontal_pad,
+                        int vertical_stride, int horizontal_stride) {
 
   INFO("*** TensorHalfPooling \n");
   profileEvent("#Pool");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   hostToDeviceCopy(input);
 
@@ -366,218 +313,185 @@ void* tensorHalfPooling(void* input_ptr,
   // FIXIT: Need to be more aware of the implications of alpha and beta
   float alpha = 1.0f, beta = 0.0f;
 
-
   checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc));
 
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
-  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride;
+  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) /
+          vertical_stride;
   h = h + 1;
-  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride;
+  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) /
+          horizontal_stride;
   w = w + 1;
 
   DEBUG("n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
   // FIXIT: Don't be specific to floats
-  Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *output =
+      (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
   // Changing output tensor placement from host to device
   changeTensorPlacement(output, DEVICE);
 
-  //convertToFP16(output);
+  // convertToFP16(output);
 
   // FIXIT: Fix being specific to CUDNN_DATA_FLOAT and NCHW format
   // FIXIT: Is this setTensor even needed?
   checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_half_desc,
-					CUDNN_TENSOR_NCHW,
-					CUDNN_DATA_HALF,
-					n, c,
-					h, w));
+                                        CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, n,
+                                        c, h, w));
 
   cudnnPoolingMode_t pool_mode;
-  if(poolFunction == 0)
+  if (poolFunction == 0)
     pool_mode = CUDNN_POOLING_MAX;
-  else if(poolFunction == 1)
+  else if (poolFunction == 1)
     pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
 
-
   // FIXIT: Make the pool function (max, min, avg) configurable
-  checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
-					 pool_mode,
-					 CUDNN_PROPAGATE_NAN,
-					 window_height, window_width,
-					 vertical_pad, horizontal_pad,
-					 vertical_stride, horizontal_stride));
-
-  
-  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha,
-				 input->tensor_half_desc,
-				 input->gpu_half_data, &beta,
-				 output->tensor_half_desc, output->gpu_half_data));
-
+  checkCUDNN(cudnnSetPooling2dDescriptor(
+      poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width,
+      vertical_pad, horizontal_pad, vertical_stride, horizontal_stride));
 
+  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha,
+                                 input->tensor_half_desc, input->gpu_half_data,
+                                 &beta, output->tensor_half_desc,
+                                 output->gpu_half_data));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(output);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfPooling_end", true);
 
   return output;
 }
 
-
-
-
-
-void* tensorHalfRelu2(void* input_ptr, float min, float max){
+void *tensorHalfRelu2(void *input_ptr, float min, float max) {
 
   INFO("*** TensorClippedRelu \n");
   profileEvent("#Relu");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
   hostToDeviceCopy(input);
 
-
   //**** Floating point to half conversions
   profileEvent("F2H_start");
 
   convertToFP16(input);
-  
+
   profileEvent("F2H_end");
   /*** End of data type conversion **/
 
-
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
-  checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU,
-					  CUDNN_PROPAGATE_NAN, 2.0));
-
-  checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_half_desc, input->gpu_half_data, &beta,
-				    input->tensor_half_desc, input->gpu_half_data));
+  checkCUDNN(cudnnSetActivationDescriptor(
+      reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, 2.0));
 
+  checkCUDNN(cudnnActivationForward(
+      cudnnHandle, reluDesc, &alpha, input->tensor_half_desc,
+      input->gpu_half_data, &beta, input->tensor_half_desc,
+      input->gpu_half_data));
 
   profileEvent("H2F_start");
   // NOTE: Transforming half precision output to single precision
 
   convertToFP32_offline(input);
-  
+
   profileEvent("H2F_end");
 
   profileEvent("#tensorHalfClippedRelu_end");
 
-
   return input;
 }
 
-
-
-
-void* tensorHalfRelu(void* input_ptr){
+void *tensorHalfRelu(void *input_ptr) {
 
   INFO("*** TensorHalfRelu \n");
   profileEvent("#Relu");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
   hostToDeviceCopy(input);
 
-
   //**** Floating point to half conversions
   profileEvent("F2H_start");
 
   convertToFP16(input);
-	    
+
   profileEvent("F2H_end");
   /*** End of data type conversion **/
 
-
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
-  checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_half_desc, input->gpu_half_data, &beta,
-				    input->tensor_half_desc, input->gpu_half_data));
+  checkCUDNN(cudnnActivationForward(
+      cudnnHandle, reluDesc, &alpha, input->tensor_half_desc,
+      input->gpu_half_data, &beta, input->tensor_half_desc,
+      input->gpu_half_data));
 
- 
   profileEvent("H2F_start");
 
   convertToFP32_offline(input);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfRelu_end");
 
-  
   return input;
 }
 
-
-
-
-
-
-void* tensorHalfTanh(void* input_ptr){
+void *tensorHalfTanh(void *input_ptr) {
 
   INFO("*** TensorHalfTanh \n");
   profileEvent("#Tanh");
 
-
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnActivationDescriptor_t tanhDesc;
   float alpha = 1.0f, beta = 0.0f;
   hostToDeviceCopy(input);
 
-
   //**** Data conversion from float to half
   profileEvent("F2H_start");
 
   convertToFP16(input);
-  
+
   profileEvent("F2H_end");
   /**** End of data type conversion ****/
 
-
   checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
-  checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha,
-				    input->tensor_half_desc, input->gpu_half_data, &beta,
-				    input->tensor_half_desc, input->gpu_half_data));
+  checkCUDNN(cudnnActivationForward(
+      cudnnHandle, tanhDesc, &alpha, input->tensor_half_desc,
+      input->gpu_half_data, &beta, input->tensor_half_desc,
+      input->gpu_half_data));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(input);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfTanh_end");
 
-
   return input;
 }
 
+void *tensorHalfAdd(void *x_ptr, void *bias_ptr) {
 
-
-void* tensorHalfAdd(void* x_ptr, void* bias_ptr){
-
-  Tensor* x = (Tensor*) x_ptr;
-  Tensor* bias = (Tensor*) bias_ptr;
+  Tensor *x = (Tensor *)x_ptr;
+  Tensor *bias = (Tensor *)bias_ptr;
 
   INFO("*** TensorHalfAdd \n");
   profileEvent("#Add");
@@ -587,36 +501,29 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){
   hostToDeviceCopy(x);
   hostToDeviceCopy(bias);
 
-
   //**** Data conversion from float to half
   profileEvent("F2H_start");
 
   convertToFP16(x);
   convertToFP16(bias);
-  
+
   profileEvent("F2H_end");
   /*** End of data type conversions ****/
 
-
   // FIXIT: routine fails for 3D tensors
   checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_half_desc,
-			    bias->gpu_half_data, &alpha,
-			    x->tensor_half_desc, x->gpu_half_data));
-
+                            bias->gpu_half_data, &alpha, x->tensor_half_desc,
+                            x->gpu_half_data));
 
   profileEvent("H2F_start");
 
   convertToFP32_offline(x);
-  
+
   profileEvent("H2F_end");
 
-  
   profileEvent("#tensorHalfAdd_end");
 
-
   return x;
 }
 
-
-
 #endif
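Every tensorHalf* routine in this file follows the same skeleton: copy the FP32 operands to the device, convert them to FP16 between the F2H_start/F2H_end profile markers, run the cuDNN or cuBLAS kernel against the half-precision descriptors, then convert the result back to FP32 between H2F_start/H2F_end. A condensed sketch of that skeleton (illustrative only; tensorHalfUnaryOpSketch is not a real API, and descriptor setup and error checks are omitted):

    // Sketch of the shared tensorHalf* structure; not an actual runtime function.
    void *tensorHalfUnaryOpSketch(void *input_ptr) {
      Tensor *input = (Tensor *)input_ptr;

      hostToDeviceCopy(input); // ensure the FP32 buffer is resident on the GPU

      profileEvent("F2H_start");
      convertToFP16(input);    // fills input->gpu_half_data
      profileEvent("F2H_end");

      // ... launch the cuDNN/cuBLAS kernel on input->tensor_half_desc and
      //     input->gpu_half_data here ...

      profileEvent("H2F_start");
      convertToFP32_offline(input); // write the result back as FP32
      profileEvent("H2F_end");

      return input;
    }
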
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index ae92f12335..c7237c0076 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -1,47 +1,45 @@
-//===--------------------------- hpvm-rt-controller.cpp ---------------------===//
+//===--------------------------- hpvm-rt-controller.cpp
+//---------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file contains code for that allows the tensor runtime to adapt 
+//
+//  This file contains code that allows the tensor runtime to adapt
 // in response to external changes in conditions (such as frequency changes)
 // by helping to choose correct approximation configurations. It also provides
 // routines for the rest of the runtime to get performance and energy profiling.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "hpvm-rt-controller.h"
 #include "global_data.h"
 #include <fstream>
 
 //-------- Functionality to read and update frequency on Jetson board -------//
 /*const char* available_freqs[] = {"140250000", "229500000", "318750000",
-                                 "408000000", "497250000", "586500000", 
+                                 "408000000", "497250000", "586500000",
                                  "675750000", "765000000", "854250000",
                                  "943500000", "1032750000", "1122000000",
                                  "1211250000", "1300500000"};
 
 */
 
-
 const int available_freqs[] = {
-140250000, // 0
-229500000, // 1
-318750000, // 2
-408000000, // 3
-497250000, // 4
-586500000, // 5
-675750000, // 6
-765000000, // 7
-854250000, // 8
-943500000, // 9
-1032750000,// 10
-1122000000,// 11
-1211250000,// 12
-1300500000 // 13
+    140250000,  // 0
+    229500000,  // 1
+    318750000,  // 2
+    408000000,  // 3
+    497250000,  // 4
+    586500000,  // 5
+    675750000,  // 6
+    765000000,  // 7
+    854250000,  // 8
+    943500000,  // 9
+    1032750000, // 10
+    1122000000, // 11
+    1211250000, // 12
+    1300500000  // 13
 };
 
-
 /*void updateJetsonGPUFreq(int freq_level) {
 
   if (freq_level < 0 || freq_level > 13) {
@@ -49,7 +47,7 @@ const int available_freqs[] = {
     abort();
   }
 
-  const char* freq_val = available_freqs[freq_level]; 
+  const char* freq_val = available_freqs[freq_level];
   printf("freq-val[0] = %s \n", freq_val);
 
   FILE* max_file =
@@ -59,7 +57,7 @@ const int available_freqs[] = {
   }
   fwrite(freq_val, strlen(freq_val), 1, max_file);
   fclose(max_file);
-  
+
   FILE* min_file =
     fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+");
   if (min_file == NULL){
@@ -80,7 +78,7 @@ unsigned long int readJetsonGPUFreq() {
 
   char buf[50];
   char* ptr;
-  
+
   fread(buf, 50, 1, cur_freq_file);
   unsigned long cur_freq = strtoul(buf, &ptr, 10);
   fclose(cur_freq_file);
@@ -89,14 +87,15 @@ unsigned long int readJetsonGPUFreq() {
 
 */
 
-
 // Sets frequency
 void setFreq(unsigned freq_index) {
 
   unsigned target_freq = available_freqs[freq_index];
-  
-  const char * const min_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
-  const char * const max_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
+
+  const char *const min_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq";
+  const char *const max_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq";
 
   std::ofstream min_stream;
   std::ofstream max_stream;
@@ -115,7 +114,8 @@ void setFreq(unsigned freq_index) {
 unsigned recordFreq() {
 
   // Current frequency file
-  const char * const cur_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
+  const char *const cur_freq_file =
+      "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq";
   std::ifstream cur_stream;
   cur_stream.open(cur_freq_file, std::ifstream::in);
 
@@ -128,10 +128,6 @@ unsigned recordFreq() {
   return cur_freq;
 }
 
-
-
-
-
 //---------------------------------------------------------------------------//
 
 /*
@@ -145,13 +141,13 @@ bool fileExists(const std::string &file) {
 
 // There will be no frequency request for the first batch
 // Therefore, we skip the first element by initializing to 1, not 0.
-FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) :
-  idx_list(il), rep_factor(rf), count(1), idx(0) {}
+FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf)
+    : idx_list(il), rep_factor(rf), count(1), idx(0) {}
 
 unsigned FrequencyIndexList::getNextIndex() {
   if (count == rep_factor) {
     count = 0;
-    idx = (idx+1) % idx_list.size();
+    idx = (idx + 1) % idx_list.size();
   }
   count++;
   return idx_list[idx];
@@ -218,7 +214,7 @@ void ProfileInfo::readIterationFrequency() {
   frequency_current_iteration = recordFreq();
 #else
   frequency_current_iteration = 0;
-#endif //JETSON_EXECUTION
+#endif // JETSON_EXECUTION
 }
 
 unsigned long ProfileInfo::getIterationFrequency() {
@@ -285,15 +281,14 @@ void ProfileInfo::printToFile() {
   // to have equal sizes, in outer and inner vectors both,
   // and all time_info and energy_info vectors must have the same size.
   unsigned iterations = tensor_time_info.size();
-  CUSTOM_ASSERT(
-      (tensor_time_info.size() == iterations) &&
-      (tensor_energy_info.size() == iterations) &&
-      (control_time_info.size() == iterations) &&
-      (control_energy_info.size() == iterations) &&
-      (config_time_info.size() == iterations) &&
-      (config_energy_info.size() == iterations) &&
-      (frequency_info.size() == iterations) &&
-      "time_info, energy_info, frequency_info size: \
+  CUSTOM_ASSERT((tensor_time_info.size() == iterations) &&
+                (tensor_energy_info.size() == iterations) &&
+                (control_time_info.size() == iterations) &&
+                (control_energy_info.size() == iterations) &&
+                (config_time_info.size() == iterations) &&
+                (config_energy_info.size() == iterations) &&
+                (frequency_info.size() == iterations) &&
+                "time_info, energy_info, frequency_info size: \
                    iteration number does not match.");
 
   for (unsigned i = 0; i < tensor_time_info.size(); i++) {
@@ -343,8 +338,8 @@ ProfileInfo::ProfileInfo()
       time_control_current_iteration(0.0), time_config_current_iteration(0.0),
       energy_compute_current_iteration(0.0),
       energy_control_current_iteration(0.0),
-      energy_config_current_iteration(0.0),
-      frequency_current_iteration(0), in_iteration(false) {}
+      energy_config_current_iteration(0.0), frequency_current_iteration(0),
+      in_iteration(false) {}
 
 Slowdowns::Slowdowns() {
   idx = 0;
@@ -386,52 +381,50 @@ void RuntimeController::stop_profiler() {
     profiler->stop_profiler();
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getSpeedupConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getSpeedupConfigurations() {
   return SpeedupConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getEnergyConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getEnergyConfigurations() {
   return EnergyConfigurations;
 }
 // For testing purposes only - do not use widely
-std::vector<struct Configuration *> &RuntimeController::
-getThreeDCurveConfigurations() {
+std::vector<struct Configuration *> &
+RuntimeController::getThreeDCurveConfigurations() {
   return ThreeDCurveConfigurations;
 }
 // For testing purposes only - do not use widely
 unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; }
 
 double RuntimeController::getCurrentConfigurationSpeedup() {
-  return (double) (*Configurations)[configurationIdx]->speedup;
+  return (double)(*Configurations)[configurationIdx]->speedup;
 }
 
 double RuntimeController::getCurrentConfigurationEnergy() {
-  return (double) (*Configurations)[configurationIdx]->energy;
+  return (double)(*Configurations)[configurationIdx]->energy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracy() {
-  return (double) (*Configurations)[configurationIdx]->accuracy;
+  return (double)(*Configurations)[configurationIdx]->accuracy;
 }
 
 double RuntimeController::getCurrentConfigurationAccuracyLoss() {
-  return (double) (*Configurations)[configurationIdx]->accuracyLoss;
+  return (double)(*Configurations)[configurationIdx]->accuracyLoss;
 }
 
 NodeConfiguration *RuntimeController::getNodeConfiguration(const char *data) {
 
   // if visc.node.id Not specified for this HPVM Node
-  if (currentTensorID == -1){
+  if (currentTensorID == -1) {
     std::string s(data);
     // All nodes are expected to have a configuration
     return (*Configurations)[configurationIdx]->setup.at(s);
-  }
-  else{
-    DEBUG("-- currentTensorID = \%u \n", currentTensorID); 
+  } else {
+    DEBUG("-- currentTensorID = \%u \n", currentTensorID);
     return (*Configurations)[configurationIdx]->idConfigMap.at(currentTensorID);
   }
-  
 }
 
 void RuntimeController::init(const char *Cstr) {
@@ -440,7 +433,8 @@ void RuntimeController::init(const char *Cstr) {
   setProfileInfoFilename(Cstr);
   readConfigurationFile(Cstr);
 
-  // NOTE: Configurations is pareto-configs. InitialConfigurations is the full list (config file)
+  // NOTE: Configurations is pareto-configs. InitialConfigurations is the full
+  // list (config file)
   Configurations = NULL;
   computeParetoConfigurationPoints();
   //    compute3DParetoConfigurationPoints(); Not using 3D curve
@@ -461,8 +455,10 @@ void RuntimeController::init(const char *Cstr) {
   // Pseudo random variable (when we did few experiments)
   // or true random numbers for probabilistic control
   pseudo_rd = 0.0;
-  std::random_device rd;  //Will be used to obtain a seed for the random number engine
-  generator = std::mt19937 (rd()); //Standard mersenne_twister_engine seeded with rd()
+  std::random_device
+      rd; // Will be used to obtain a seed for the random number engine
+  generator =
+      std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd()
   distr = std::uniform_real_distribution<>(0.0, 1.0);
 
   g_freq = available_freqs[13];
@@ -484,8 +480,8 @@ void RuntimeController::end_iteration() {
     PI->end_iteration();
 }
 
-void RuntimeController::addToCurrentIterationComputeTime(
-    const char *s, double t) {
+void RuntimeController::addToCurrentIterationComputeTime(const char *s,
+                                                         double t) {
   if (PI)
     PI->addToCurrentIterationComputeTime(s, t);
 }
@@ -500,8 +496,8 @@ void RuntimeController::addToCurrentIterationConfigTime(double t) {
     PI->addToCurrentIterationConfigTime(t);
 }
 
-void RuntimeController::addToCurrentIterationComputeEnergy(
-    const char *s, double e) {
+void RuntimeController::addToCurrentIterationComputeEnergy(const char *s,
+                                                           double e) {
   if (PI)
     PI->addToCurrentIterationComputeEnergy(s, e);
 }
@@ -539,8 +535,8 @@ void RuntimeController::updateFrequency() {
   //--- updateJetsonGPUFreq(freq_idx);
 
   setFreq(freq_idx);
-  
-#endif //JETSON_EXECUTION
+
+#endif // JETSON_EXECUTION
 }
 
 void RuntimeController::writeProfileInfo() {
@@ -573,11 +569,9 @@ std::pair<double, double> RuntimeController::fc_profile(
     const unsigned num_rows_a, const unsigned num_cols_a,
     const unsigned num_rows_b, const unsigned num_cols_b,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (
-      promise ? promise->fc_profile(
-                    num_rows_a, num_cols_a, num_rows_b, num_cols_b,
-                    voltage_swing, patch_factor)
-              : std::make_pair(0.0, 0.0));
+  return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b,
+                                        num_cols_b, voltage_swing, patch_factor)
+                  : std::make_pair(0.0, 0.0));
 }
 
 std::pair<double, double> RuntimeController::conv_profile(
@@ -585,17 +579,16 @@ std::pair<double, double> RuntimeController::conv_profile(
     const unsigned c_out, const unsigned c_in, const unsigned k_h,
     const unsigned k_w, const unsigned s_h, const unsigned s_w,
     const unsigned voltage_swing, const unsigned patch_factor) {
-  return (
-      promise ? promise->conv_profile(
-                    n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing,
-                    patch_factor)
-              : std::make_pair(0.0, 0.0));
+  return (promise ? promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w,
+                                          s_h, s_w, voltage_swing, patch_factor)
+                  : std::make_pair(0.0, 0.0));
 }
 
 // Constructor and descructor
 RuntimeController::RuntimeController() {
   configurationIdx = 0;
-  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10);
+  FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+                               10);
 #ifdef ACTIVE_PROFILING
   PI = new ProfileInfo();
   profiler = new Profiler();
@@ -679,16 +672,14 @@ void RuntimeController::readConfigurationFile(const char *str) {
   std::getline(qin, first_line);
   DEBUG("first_line: %s\n", first_line.c_str());
 
-  try{
+  try {
     baseline_time = std::stod(first_line);
     DEBUG("Baseline time: %lf\n\n", baseline_time);
-  }
-  catch(...){
+  } catch (...) {
     ERROR("Please Add/Fix Baseline Time at Top of Config File.. ");
   }
 
-  
-  unsigned int firstTensorID = 1;  
+  unsigned int firstTensorID = 1;
   for (std::string line; std::getline(qin, line);) {
     DEBUG("line: %s\n", line.c_str());
 
@@ -721,10 +712,10 @@ void RuntimeController::readConfigurationFile(const char *str) {
       // Read first line, to create the new configuration struct
       readingFirstLine = false;
       firstTensorID = 1; // reset first tensor ID for new config
-      
-      InitialConfigurations.push_back(Configuration(
-          tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
-          std::stof(tokens[3]), std::stof(tokens[4])));
+
+      InitialConfigurations.push_back(
+          Configuration(tokens[0], std::stof(tokens[1]), std::stof(tokens[2]),
+                        std::stof(tokens[3]), std::stof(tokens[4])));
       continue;
     }
 
@@ -732,9 +723,8 @@ void RuntimeController::readConfigurationFile(const char *str) {
       DEBUG("Found gpu configuration\n");
 
       // There must be at least one operation, with an approximation option
-      CUSTOM_ASSERT(
-          (tokens.size() >= 5) &&
-          "Not enough operations - approximation options.");
+      CUSTOM_ASSERT((tokens.size() >= 5) &&
+                    "Not enough operations - approximation options.");
 
       GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -745,7 +735,7 @@ void RuntimeController::readConfigurationFile(const char *str) {
       InitialConfigurations.back().idConfigMap.insert(
           std::make_pair(firstTensorID, NodeConf));
       DEBUG("*** firstTensorID = %d \n\n", firstTensorID);
-      
+
       unsigned idx = 2;
       while (idx < tokens.size()) {
         if (tokens[idx] == "add") {
@@ -894,14 +884,13 @@ void RuntimeController::readConfigurationFile(const char *str) {
 
       // Update first TensorID using number of tensor ops in current node
       firstTensorID += NodeConf->getApproxChoices().size();
-      
+
     } else if (tokens[1] == "cpu") {
       DEBUG("Found gpu configuration\n");
 
       // There must be at least one operation, with an approximation option
-      CUSTOM_ASSERT(
-          (tokens.size() >= 5) &&
-          "Not enough operations - approximation options.");
+      CUSTOM_ASSERT((tokens.size() >= 5) &&
+                    "Not enough operations - approximation options.");
 
       CPUNodeConfiguration *NodeConf = new CPUNodeConfiguration();
       InitialConfigurations.back().setup.insert(
@@ -1017,9 +1006,8 @@ void RuntimeController::computeParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(
-      InitialConfigurations.begin() + 1, InitialConfigurations.end(),
-      ConfigurationLessThan());
+  std::sort(InitialConfigurations.begin() + 1, InitialConfigurations.end(),
+            ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) {
@@ -1053,14 +1041,12 @@ void RuntimeController::computeParetoConfigurationPoints() {
         en_idx = i;
       }
     }
-    DEBUG(
-        "accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
-        InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
+    DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n",
+          InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx);
     // Found best speedup for this accuracy point (not dominated by any of
     // these).
-    DEBUG(
-        "accuracy loss = %f, energy = %f, at en_idx = %d\n",
-        InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
+    DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n",
+          InitialConfigurations[en_idx].accuracyLoss, en, en_idx);
     // Found best energy for this accuracy point (not dominated by any of
     // these).
 
@@ -1130,9 +1116,8 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
 
   // Sort the configurations according to accuracy loss
   INFO("Sorting autotuner configurations...\n");
-  std::sort(
-      InitialConfigurations.begin(), InitialConfigurations.end(),
-      ConfigurationLessThan());
+  std::sort(InitialConfigurations.begin(), InitialConfigurations.end(),
+            ConfigurationLessThan());
   INFO("Done sorting.\n");
 
   for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) {
@@ -1166,11 +1151,10 @@ void RuntimeController::compute3DParetoConfigurationPoints() {
         }
       }
       if (!dominated) {
-        DEBUG(
-            "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
-            InitialConfigurations[i].accuracyLoss,
-            InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
-            i);
+        DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n",
+              InitialConfigurations[i].accuracyLoss,
+              InitialConfigurations[i].speedup, InitialConfigurations[i].energy,
+              i);
         Indices.push_back(i);
       }
     }
@@ -1229,31 +1213,22 @@ void RuntimeController::printConfigurations(
   }
 }
 
-unsigned long RuntimeController::getLastFrequency() {
-  return g_freq;
-}
+unsigned long RuntimeController::getLastFrequency() { return g_freq; }
 
-void RuntimeController::setLastFrequency(unsigned long f) {
-  g_freq = f;
-}
+void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; }
 
-double RuntimeController::getLastSpeedup() {
-  return g_speedup;
-}
+double RuntimeController::getLastSpeedup() { return g_speedup; }
 
-void RuntimeController::setLastSpeedup(double s) {
-  g_speedup = s;
-}
+void RuntimeController::setLastSpeedup(double s) { g_speedup = s; }
 
 void RuntimeController::findNextConfiguration() {
   configurationIdx = (configurationIdx + 1) % Configurations->size();
-  DEBUG(
-      "findNextConfiguration: Updated configurationIdx to %u.\n",
-      configurationIdx);
+  DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n",
+        configurationIdx);
 }
 
-void RuntimeController::findTargetConfiguration(
-    float goal, enum SEARCH_KIND sk) {
+void RuntimeController::findTargetConfiguration(float goal,
+                                                enum SEARCH_KIND sk) {
   // We search in range begin(), end()-1 . It is OK to decrement end(), because
   // the configurations vector always points to one of the pareto curves, and
   // they are never empty - we have always pushed at least one configuration.
@@ -1264,25 +1239,25 @@ void RuntimeController::findTargetConfiguration(
   case SPEEDUP: {
     // Assigning one of Pareto configs to 'Configurations' class attribute
     Configurations = &SpeedupConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_SP());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_SP());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ENERGY: {
     Configurations = &EnergyConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_E());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_E());
     configurationIdx = low_it - Configurations->begin();
     break;
   }
   case ACCURACY_LOSS: {
     Configurations = &SpeedupConfigurations;
-    low_it = std::lower_bound(
-        Configurations->begin(), Configurations->end() - 1, goal,
-        ConfigurationLessThan_AL());
+    low_it =
+        std::lower_bound(Configurations->begin(), Configurations->end() - 1,
+                         goal, ConfigurationLessThan_AL());
     if ((*low_it)->accuracyLoss > goal)
       --low_it;
     configurationIdx = low_it - Configurations->begin();
@@ -1297,9 +1272,8 @@ void RuntimeController::findTargetConfiguration(
   // After search, low_it points to the Configuration to the element with the
   // goal value or the immediately lower value if it does not exist
 
-  DEBUG(
-      "findTargetConfiguration: Updated configurationIdx to %u.\n",
-      configurationIdx);
+  DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n",
+        configurationIdx);
 }
 
 void RuntimeController::adjustTargetConfiguration(float goal) {
@@ -1310,8 +1284,8 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
   // Find configuration before the selected one.
   // There is always one, unless goal is 1. Then, we would pick baseline, and
   //  both upper and lower should be the same configuration, at index 0.
-  unsigned prev_conf_idx = configurationIdx > 0 ? configurationIdx - 1
-                                                : configurationIdx;
+  unsigned prev_conf_idx =
+      configurationIdx > 0 ? configurationIdx - 1 : configurationIdx;
   // Get the two configurations' speedup, and compute the appropriate ranges
   float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup;
   float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup;
@@ -1330,32 +1304,32 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
 
     //***--- Probability adjustment strategy 1 ---***//
     // No big adjustments at edges of probability range
-//    float adjust_val = 0.0;
-//    if (low_pb < high_pb) {
-//      adjust_val = low_pb * 0.2;
-//    } else {
-//      adjust_val = high_pb * 0.2;
-//    }
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    //    float adjust_val = 0.0;
+    //    if (low_pb < high_pb) {
+    //      adjust_val = low_pb * 0.2;
+    //    } else {
+    //      adjust_val = high_pb * 0.2;
+    //    }
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 2 ---***//
     // No big adjustment at high edge of probability range
-//    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    //    float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2;
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 3 ---***//
-    //Similar to 2, but higher always increases, more significantly
-//    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
-//    low_pb -= adjust_val;
-//    high_pb += adjust_val;
+    // Similar to 2, but higher always increases, more significantly
+    //    float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5;
+    //    low_pb -= adjust_val;
+    //    high_pb += adjust_val;
     //***---                                   ---***//
 
     //***--- Probability adjustment strategy 4 ---***//
-    //Similar to 2, but higher always increases, more significantly
+    // Similar to 2, but higher always increases, more significantly
     // Low end, high end a bit less aggressive than total range
     float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6;
     adjust_val = adjust_val > high_pb ? high_pb : adjust_val;
@@ -1364,20 +1338,18 @@ void RuntimeController::adjustTargetConfiguration(float goal) {
     //***---                                   ---***//
   }
 
-  DEBUG(
-      "**---- adjustTargetConfiguration: upper conf = %s with probability: "
-      "%f.\n",
-      ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
-  DEBUG(
-      "**---- adjustTargetConfiguration: lower conf = %s with probability: "
-      "%f.\n\n",
-      ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
+  DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: "
+        "%f.\n",
+        ((*Configurations)[configurationIdx]->name).c_str(), high_pb);
+  DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: "
+        "%f.\n\n",
+        ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb);
 
   // Select a random number from 0 to 1
   // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1)
   // to the upper
   // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ;
-  //float rd = pseudo_rd;
+  // float rd = pseudo_rd;
   float rd = distr(generator);
   if (rd < low_pb) {
     // If the probability is in the low range
@@ -1411,8 +1383,8 @@ extern "C" void llvm_hpvm_clearRuntimeController() {
 //*** Methods to compute accuracy of a tensor by the runtime controller   ***//
 uint32_t *labels_from_file = NULL;
 
-uint32_t *
-hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) {
+uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start,
+                                         int end) {
 
   // Initialize buffer
   if (!labels_from_file) {
@@ -1485,10 +1457,10 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
-  
-  average_accuracy = accuracy + (average_accuracy * num_executations); 
+
+  average_accuracy = accuracy + (average_accuracy * num_executations);
   num_executations++;
-  average_accuracy = average_accuracy/num_executations;
+  average_accuracy = average_accuracy / num_executations;
 
   FILE *fp = fopen("final_accuracy", "w+");
   if (fp != NULL) {
@@ -1510,8 +1482,8 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 //#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl
 //#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl
 
-extern "C" void llvm_hpvm_invokeRtControl_BASE(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str,
+                                               int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1528,16 +1500,15 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ITERATE(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str,
+                                                  int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1561,16 +1532,15 @@ extern "C" void llvm_hpvm_invokeRtControl_ITERATE(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str,
+                                                 int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1613,17 +1583,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST(
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result,
+                                                    const char *str, int start,
+                                                    int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1667,17 +1637,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(
   RC->addToCurrentIterationConfigEnergy(pinfo2.second);
   //*                                                                        */
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("target speedup = %lf\n\n", target_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result,
+                                                   const char *str, int start,
+                                                   int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1704,21 +1674,20 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO(
-      "Swapping to next configuration: %s with speedup %f\n\n",
-      next_conf_name.c_str(), next_conf_speedup);
+  INFO("Swapping to next configuration: %s with speedup %f\n\n",
+       next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result,
+                                                      const char *str,
+                                                      int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1746,21 +1715,19 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(
   float next_conf_speedup =
       RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup;
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n",
+       current_iteration_time, current_iteration_energy);
   INFO("slowdown (target speedup) = %f\n", slowdown);
   INFO("Previous configuration: %s\n", prev_conf_name.c_str());
-  INFO(
-      "Swapping to next configuration: %s with speedup %f\n\n",
-      next_conf_name.c_str(), next_conf_speedup);
+  INFO("Swapping to next configuration: %s with speedup %f\n\n",
+       next_conf_name.c_str(), next_conf_speedup);
 
   // Note the end of iteration
   RC->end_iteration();
 }
 
-extern "C" void llvm_hpvm_invokeRtControl_RAND(
-    void *result, const char *str, int start, int end) {
+extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str,
+                                               int start, int end) {
 
   uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end);
   hpvm_rt_computeAccuracy3(labels_cached, result);
@@ -1778,9 +1745,8 @@ extern "C" void llvm_hpvm_invokeRtControl_RAND(
   RC->addToCurrentIterationControlTime(pinfo.first);
   RC->addToCurrentIterationControlEnergy(pinfo.second);
 
-  INFO(
-      "current iteration time = %f, current iteration energy = %f\n\n",
-      current_iteration_time, current_iteration_energy);
+  INFO("current iteration time = %f, current iteration energy = %f\n\n",
+       current_iteration_time, current_iteration_energy);
 
   // Note the end of iteration
   RC->end_iteration();
@@ -1791,7 +1757,7 @@ static void writeVectorToFile(const char *path, const std::vector<T> &vec) {
   std::ofstream of(path, std::ofstream::out | std::ofstream::app);
   if (!of.good())
     ERROR("Cannot write to %s file", path);
-  for (float f: vec)
+  for (float f : vec)
     of << f << ' ';
   of << '\n';
 }
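adjustTargetConfiguration above splits the probability mass between the Pareto configuration just below the speedup goal and the one just above it, nudges it with strategy 4, and then draws a uniform random number to pick one. A self-contained sketch of that arithmetic follows; the linear interpolation used to seed low_pb/high_pb is an assumption (those lines fall outside this hunk), while the strategy-4 clamp and the biased draw mirror the code shown above, and pickConfigurationSketch itself is not part of the controller's interface.

    #include <algorithm>
    #include <random>

    // Illustrative only: the seeding of low_pb/high_pb is assumed; the
    // strategy-4 adjustment and the rd < low_pb test follow the code above.
    unsigned pickConfigurationSketch(float goal, float prev_speedup,
                                     float curr_speedup, unsigned prev_idx,
                                     unsigned curr_idx, std::mt19937 &generator) {
      // Assumed seeding: the slower (previous) configuration gets more weight
      // as the goal moves toward its speedup.
      float low_pb = (curr_speedup - goal) / (curr_speedup - prev_speedup);
      float high_pb = 1.0f - low_pb;

      // Strategy 4: shift some probability toward the faster configuration,
      // capped at 0.2 and never by more than high_pb itself.
      float adjust_val = std::min(low_pb * 0.6f, 0.2f);
      adjust_val = std::min(adjust_val, high_pb);
      low_pb -= adjust_val;
      high_pb += adjust_val; // the controller reports high_pb for the upper config

      std::uniform_real_distribution<> distr(0.0, 1.0);
      return distr(generator) < low_pb ? prev_idx : curr_idx;
    }
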
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
index 74ee15c2dc..b322ee2be3 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc
@@ -68,9 +68,7 @@ void llvm_hpvm_initApproxhpvmRt(int gpuid) {
 
 void llvm_hpvm_cleanupApproxhpvmRt() {}
 
-void dumpAccuracyNorms() {
-  dump_result("accuracy_summary");
-}
+void dumpAccuracyNorms() { dump_result("accuracy_summary"); }
 
 // Returns the number of GPUs active on the platform
 unsigned int getGPUCount() {
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
index ad1d2e137d..08f13bf0f8 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
@@ -1,13 +1,12 @@
 //===----------------------------- profling.cc  ---------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file contains code provides the definition of the interface for
 // applications to start and stop profiling for energy and performance.
 //
 //===----------------------------------------------------------------------===//
 
-
 #ifndef PROFILING_HEADER
 #define PROFILING_HEADER
 
@@ -52,7 +51,7 @@ void stopProfiling() {
 void profileEvent(const char *event_name, bool compare_previous = false) {
 
   checkCudaErrors(cudaDeviceSynchronize());
-  
+
   auto it = func_counters.find(event_name);
   if (it == func_counters.end()) {
     func_counters[event_name] = 1;
@@ -73,7 +72,7 @@ void profileEvent(const char *event_name, bool compare_previous = false) {
       time_reading - zero_time;
 
   DEBUG("AbsoluteTime, Event = %s, Time = %f \n", event_name,
-       current_time.count());
+        current_time.count());
   profile_data.append(event_name);
   profile_data.append(event_count);
   profile_data.append("\t");
@@ -86,14 +85,13 @@ void profileEvent(const char *event_name, bool compare_previous = false) {
     profile_data.append("\t");
     profile_data.append(std::to_string(duration_time.count()));
     DEBUG("TimeDuration, Event = %s, Time = %f \n", event_name,
-         duration_time.count());
+          duration_time.count());
   }
 
   profile_data.append("\n");
 
   previous_time = time_reading; // set the previous time reading to the current
                                 // profiled time
-
 }
 }
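
For readers unfamiliar with this interface, the following is a minimal usage sketch (not part of the patch) of how an application might drive it. It assumes a startProfiling() counterpart to stopProfiling() and is compiled against the tensor runtime; profileEvent() logs an absolute timestamp for the named event and, when compare_previous is true, also the duration since the previous event, as the hunks above suggest. The driver and runOneBatch() are hypothetical.

// Hypothetical driver exercising the profiling interface sketched above.
// startProfiling() is an assumed counterpart of stopProfiling(); the
// profileEvent() declaration mirrors the signature shown in the diff.
extern void startProfiling();                         // assumed to exist
extern void stopProfiling();
extern void profileEvent(const char *event_name, bool compare_previous = false);

static void runOneBatch() { /* application's tensor work goes here */ }

int main() {
  startProfiling();
  for (int i = 0; i < 3; i++) {
    profileEvent("batch_begin");          // absolute timestamp only
    runOneBatch();
    profileEvent("batch_end", true);      // also logs time since batch_begin
  }
  stopProfiling();
  return 0;
}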
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 9250810a20..7a1acd2ba0 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -1,11 +1,11 @@
 //===--------------------------- tensor_runtime_cpu.cc --------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-//  This file  consists of the custom implementation of non-approximated and 
-// approximated  versions of tensor operations to execute on CPUs. The 
-// software approximations implemented for tensor convolutions are feature 
-// sampling and perforation for FP32 compute precisions only.  
+//
+//  This file consists of the custom implementations of non-approximated and
+// approximated versions of tensor operations that execute on CPUs. The
+// software approximations implemented for tensor convolutions are feature
+// sampling and perforation, for FP32 compute precision only.
 //
 //===----------------------------------------------------------------------===//
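
As an aside for readers of this file, the compensation used by the feature-sampling routines below can be illustrated on a plain dot product: every skip_every-th element is dropped and the surviving terms are rescaled by skip_every / (skip_every - 1), the same factor `fac` that appears in the sampled-convolution code. This is an illustrative sketch only, not part of the patch, and the helper name sampledDot is hypothetical.

// Illustrative sketch: feature sampling on a dot product (hypothetical helper).
// Dropping every skip_every-th term and rescaling the rest approximates the
// exact sum, which is the idea behind the reduced-filter convolutions below.
#include <cstdio>
#include <vector>

static float sampledDot(const std::vector<float> &x,
                        const std::vector<float> &w, int skip_every) {
  float fac = (float)skip_every / (float)(skip_every - 1);
  float sum = 0.0f;
  for (size_t i = 0; i < x.size(); i++) {
    if (i % skip_every == (size_t)(skip_every - 1))
      continue;       // skipped (sampled-out) element
    sum += x[i] * w[i];
  }
  return fac * sum;   // compensate for the dropped terms
}

int main() {
  std::vector<float> x(12, 1.0f), w(12, 0.5f);
  printf("exact = %f, sampled = %f\n", 12 * 0.5f, sampledDot(x, w, 3));
  return 0;
}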
 
@@ -29,7 +29,7 @@
 #include <string>
 #include <vector>
 #include <math.h>
-#include<bits/stdc++.h>
+#include <bits/stdc++.h>
 #include <pthread.h>
 #include <omp.h>
 
@@ -39,1081 +39,1140 @@
 #include "tensor_cpu_runtime.h"
 
 void llvm_hpvm_initTensorRtCPU() {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
 
 void llvm_hpvm_cleanupTensorRtCPU() {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
 
 void hpvm_request_tensorCPU(void *tensor, int destination) {
-    // NOTE: Do Nothing
+  // NOTE: Do Nothing
 }
-  
+
 std::vector<void *> PtrVect;
 
 void freeBatchMemory() {
-    for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
-        free(*it);
-    }
-    PtrVect.erase(PtrVect.begin(), PtrVect.end());
+  for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
+    free(*it);
+  }
+  PtrVect.erase(PtrVect.begin(), PtrVect.end());
 }
 
-
-int getTypeSizeCPU(int data_type)  __attribute__((always_inline));
+int getTypeSizeCPU(int data_type) __attribute__((always_inline));
 inline int getTypeSizeCPU(int data_type) {
-    return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
+  return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
 }
 
-void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline));
-inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) {
-    int type_size = getTypeSizeCPU(data_type);
-    size_t size_in_bytes = type_size * num_elems;
-    tensor->size_in_bytes = size_in_bytes;
+void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems)
+    __attribute__((always_inline));
+inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type,
+                              size_t num_elems) {
+  int type_size = getTypeSizeCPU(data_type);
+  size_t size_in_bytes = type_size * num_elems;
+  tensor->size_in_bytes = size_in_bytes;
 }
 
-void allocateMemCPU(struct Tensor *tensor, int data_type, 
-                    size_t num_elems, bool freeMemory = true) __attribute__((always_inline));
-inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) {
-    setSizeInBytesCPU(tensor, data_type, num_elems);
-    tensor->data_type = data_type;
-    tensor->num_elems = num_elems;
-    tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
-    if(freeMemory)
-        PtrVect.push_back(tensor->host_data);
+void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
+                    bool freeMemory = true) __attribute__((always_inline));
+inline void allocateMemCPU(struct Tensor *tensor, int data_type,
+                           size_t num_elems, bool freeMemory) {
+  setSizeInBytesCPU(tensor, data_type, num_elems);
+  tensor->data_type = data_type;
+  tensor->num_elems = num_elems;
+  tensor->host_data =
+      (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
+  if (freeMemory)
+    PtrVect.push_back(tensor->host_data);
 }
 
-void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) __attribute__((always_inline));
-inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
-    Tensor *tensor = (Tensor *)tensor_ptr;
-    if (tensor->size_in_bytes != size_in_bytes) {
-        printf("The destination and source sizes don't match");
-    }
-    memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough?
+void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes)
+    __attribute__((always_inline));
+inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr,
+                              size_t size_in_bytes) {
+  Tensor *tensor = (Tensor *)tensor_ptr;
+  if (tensor->size_in_bytes != size_in_bytes) {
+    printf("The destination and source sizes don't match");
+  }
+  memcpy(tensor->host_data, data_ptr,
+         size_in_bytes); // Is this efficient enough?
 }
 
 void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
-                     size_t dim2_size, size_t dim3_size, size_t dim4_size, 
-                    bool freeMemory = true) __attribute__((always_inline));
-inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,         
-                                    size_t dim2_size, size_t dim3_size, 
-                                    size_t dim4_size, bool freeMemory) {
-    struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-    if(freeMemory)
-        PtrVect.push_back(tensor);
-    allocateMemCPU(tensor, data_type, num_elems, freeMemory);
-    
-    // Setting the tensor dimensions
-    size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    dim_sizes[2] = dim3_size;
-    dim_sizes[3] = dim4_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 4;
-    tensor->data_placement = HOST;    
-    return tensor;
+                        size_t dim2_size, size_t dim3_size, size_t dim4_size,
+                        bool freeMemory = true) __attribute__((always_inline));
+inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
+                               size_t dim2_size, size_t dim3_size,
+                               size_t dim4_size, bool freeMemory) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  if (freeMemory)
+    PtrVect.push_back(tensor);
+  allocateMemCPU(tensor, data_type, num_elems, freeMemory);
+
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  dim_sizes[3] = dim4_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 4;
+  tensor->data_placement = HOST;
+  return tensor;
 }
 
-void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                    int horizontal_pad, int vertical_stride,
-                                    int horizontal_stride, int conv_mode,
-                                    int compute_precision) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-    int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int output_size = output_width * output_height;
-    printf("--CREATE 4D TENSOR\n");    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    printf("CREATED 4D TENSOR\n");
-    long int conv_data_size = 
-        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(conv_data_size);
-    printf("host data: %p\n", host_data);
-    printf("number of batches: %d\n", batch_size);
-    omp_set_num_threads(4);
-     #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                        + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                                && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                        + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+  int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
+                          horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int output_size = output_width * output_height;
+  printf("--CREATE 4D TENSOR\n");
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+  printf("CREATED 4D TENSOR\n");
+  long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(conv_data_size);
+  printf("host data: %p\n", host_data);
+  printf("number of batches: %d\n", batch_size);
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        for (int w = 0; w < output_width; w++) {
+          const int inH = h * vertical_stride - vertical_pad;
+          const int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
-        for (int p = 0; p < num_filters; ++p) {
-             for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+      }
+    }
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(host_data);
-    printf("END: %p\n", output);
-    return output;
+  }
+  free(host_data);
+  printf("END: %p\n", output);
+  return output;
 }
 
-void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
-                                                int vertical_pad, int horizontal_pad, 
-                                                int vertical_stride, int horizontal_stride, 
-                                                int conv_mode, int compute_precision, 
-                                                int skip_every, int start) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
-    const int kernel_height = filter->dims.dim_sizes[2];
-    const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int num_filter_elem = kernel_height * kernel_width * channels;
-
-    const int remainder = ((num_filter_elem - start) % skip_every > 0);
-    const int reduced_num_filter_elem = 
-            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
-                                    * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-   
-    const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
-    float *reduced_kernels = (float *) malloc(reduced_filer_size);
-   
-    float fac =  (((float) skip_every) / ((float) skip_every - 1));
-    int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-    // Create reduced filter
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int f = 0; f < num_filters; f++) {
-        for(int i = 0; i < reduced_num_filter_elem; i++) {
-            int ch = i / reduced_filter_dim;
-            int offset  = (start + ch) % skip_every; 
-            int in_index;
-            if(i < offset) {
-                in_index = i;
-            } else {
-                in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) 
-                        + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1;
-            }
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                                fac * host_filter[num_filter_elem * f + in_index];
+void *tensorRegularFilterSamplingConvolutionCPU(
+    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int compute_precision, int skip_every, int start) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int num_filter_elem = kernel_height * kernel_width * channels;
+
+  const int remainder = ((num_filter_elem - start) % skip_every > 0);
+  const int reduced_num_filter_elem =
+      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  const int reduced_filter_size =
+      sizeof(float) * num_filters * reduced_num_filter_elem;
+  float *reduced_kernels = (float *)malloc(reduced_filter_size);
+
+  float fac = (((float)skip_every) / ((float)skip_every - 1));
+  int reduced_filter_dim = reduced_num_filter_elem / channels;
+
+  // Create reduced filter
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int f = 0; f < num_filters; f++) {
+    for (int i = 0; i < reduced_num_filter_elem; i++) {
+      int ch = i / reduced_filter_dim;
+      int offset = (start + ch) % skip_every;
+      int in_index;
+      if (i < offset) {
+        in_index = i;
+      } else {
+        in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) +
+                   (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                   offset - 1;
+      }
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          fac * host_filter[num_filter_elem * f + in_index];
+    }
+  }
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int h = 0; h < output_height; h++) {
+      for (int w = 0; w < output_width; w++) {
+        const int inH = h * vertical_stride - vertical_pad;
+        const int inW = w * horizontal_stride - horizontal_pad;
+        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
+          int in_index;
+          const int ch = fi / reduced_filter_dim;
+          const int offset = (start + ch) % skip_every;
+          if (fi < offset) {
+            in_index = fi;
+          } else {
+            in_index =
+                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                offset - 1;
+          }
+          const int i =
+              (in_index % (kernel_width * kernel_height)) / kernel_width;
+          const int j = in_index % kernel_width;
+          const int output_index = h * output_width + w;
+          const int out_index = b * reduced_num_filter_elem * output_size +
+                                output_index * reduced_num_filter_elem + fi;
+          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+              inW + j < image_width) {
+            host_data[out_index] =
+                host_image[((b * channels + ch) * image_height + (inH + i)) *
+                               image_width +
+                           (inW + j)];
+          } else {
+            host_data[out_index] = 0;
+          }
         }
+      }
     }
 
-    omp_set_num_threads(4);   
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
-                        int in_index;
-                        const int ch = fi / reduced_filter_dim;
-                        const int offset  = (start + ch) % skip_every;
-                        if(fi < offset) {
-                            in_index = fi;
-                        } else {
-                            in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-                                + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-                        }
-                        const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; 
-                        const int j = in_index % kernel_width;
-                        const int output_index = h * output_width + w;
-                        const int out_index = b * reduced_num_filter_elem * output_size 
-                                            + output_index * reduced_num_filter_elem + fi;
-                        if(inH + i >= 0 && inH + i < image_height 
-                        && inW + j >= 0 && inW + j < image_width) {
-                            host_data[out_index] = 
-                                host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                        } else {
-                            host_data[out_index] = 0;
-                        }
-                }
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < reduced_num_filter_elem; ++k) {
+          int input_index = k + reduced_num_filter_elem * m +
+                            b * reduced_num_filter_elem * output_size;
+          sum += host_data[input_index] *
+                 reduced_kernels[p * reduced_num_filter_elem + k];
         }
-
-         // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < reduced_num_filter_elem; ++k) {
-                    int input_index = k + reduced_num_filter_elem * m 
-                                    + b * reduced_num_filter_elem * output_size;
-                    sum += host_data[input_index] 
-                            * reduced_kernels[p * reduced_num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
-        }
-
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(reduced_kernels);
-    free(host_data);
-  
-    return output;
+  }
+  free(reduced_kernels);
+  free(host_data);
+
+  return output;
 }
 
-void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, 
-                                                  int vertical_pad, int horizontal_pad, 
-                                                  int vertical_stride, int horizontal_stride, 
-                                                  int conv_mode, int compute_precision, 
-                                                  int skip_every, int start) {
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
-    const int kernel_height = filter->dims.dim_sizes[2];
-    const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int num_filter_elem = kernel_height * kernel_width * channels;
-
-    const int remainder = ((num_filter_elem - start) % skip_every > 0);
-    const int reduced_num_filter_elem = 
-            num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int host_data_size = sizeof(float) * reduced_num_filter_elem 
-                                    * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-   
-    const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
-    float *reduced_kernels = (float *) malloc(reduced_filer_size);
-   
-    float fac =  (((float) skip_every) / ((float) skip_every - 1));
-    int reduced_filter_dim = reduced_num_filter_elem / channels;
-
-    // Create Reduced filter
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int f = 0; f < num_filters; f++) {
-        for(int i = 0; i < start; i++) {
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                                        host_filter[num_filter_elem * f + i];
-        }
-        #pragma omp simd
-        for(int i = start; i < reduced_num_filter_elem; i++) {
-            int in_index = ((i - start + 1) * skip_every) / (skip_every - 1)
-                    + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1;
-            reduced_kernels[f * reduced_num_filter_elem + i] = 
-                            fac * host_filter[num_filter_elem * f + in_index];
+void *tensorIrregularFilterSamplingConvolutionCPU(
+    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
+    int vertical_stride, int horizontal_stride, int conv_mode,
+    int compute_precision, int skip_every, int start) {
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int num_filter_elem = kernel_height * kernel_width * channels;
+
+  const int remainder = ((num_filter_elem - start) % skip_every > 0);
+  const int reduced_num_filter_elem =
+      num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int host_data_size = sizeof(float) * reduced_num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  const int reduced_filter_size =
+      sizeof(float) * num_filters * reduced_num_filter_elem;
+  float *reduced_kernels = (float *)malloc(reduced_filter_size);
+
+  float fac = (((float)skip_every) / ((float)skip_every - 1));
+  int reduced_filter_dim = reduced_num_filter_elem / channels;
+
+  // Create Reduced filter
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int f = 0; f < num_filters; f++) {
+    for (int i = 0; i < start; i++) {
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          host_filter[num_filter_elem * f + i];
+    }
+#pragma omp simd
+    for (int i = start; i < reduced_num_filter_elem; i++) {
+      int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) +
+                     (((i - start + 1) * skip_every) % (skip_every - 1) > 0) +
+                     start - 1;
+      reduced_kernels[f * reduced_num_filter_elem + i] =
+          fac * host_filter[num_filter_elem * f + in_index];
+    }
+  }
+
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int h = 0; h < output_height; h++) {
+      for (int w = 0; w < output_width; w++) {
+        const int inH = h * vertical_stride - vertical_pad;
+        const int inW = w * horizontal_stride - horizontal_pad;
+        for (int fi = 0; fi < reduced_num_filter_elem; fi++) {
+          int in_index;
+          int offset = start;
+          if (fi < offset) {
+            in_index = fi;
+          } else {
+            in_index =
+                ((fi - offset + 1) * skip_every) / (skip_every - 1) +
+                (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) +
+                offset - 1;
+          }
+          const int ch = in_index / (kernel_width * kernel_height);
+          const int i =
+              (in_index % (kernel_width * kernel_height)) / kernel_width;
+          const int j = in_index % kernel_width;
+          const int output_index = h * output_width + w;
+          const int out_index = b * reduced_num_filter_elem * output_size +
+                                output_index * reduced_num_filter_elem + fi;
+          if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+              inW + j < image_width) {
+            host_data[out_index] =
+                host_image[((b * channels + ch) * image_height + (inH + i)) *
+                               image_width +
+                           (inW + j)];
+          } else {
+            host_data[out_index] = 0;
+          }
         }
+      }
     }
 
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
-                        int in_index;
-                        int offset = start;
-                        if(fi < offset) {
-                            in_index = fi;
-                        } else {
-                            in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) 
-                             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-                        }
-                        const int ch = in_index / (kernel_width * kernel_height);
-                        const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; 
-                        const int j = in_index % kernel_width;
-                        const int output_index = h * output_width + w;
-                        const int out_index = b * reduced_num_filter_elem * output_size 
-                                            + output_index * reduced_num_filter_elem + fi;
-                        if(inH + i >= 0 && inH + i < image_height 
-                        && inW + j >= 0 && inW + j < image_width) {
-                            host_data[out_index] = 
-                                host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                        } else {
-                            host_data[out_index] = 0;
-                        }
-                }
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < reduced_num_filter_elem; ++k) {
+          int input_index = k + reduced_num_filter_elem * m +
+                            b * reduced_num_filter_elem * output_size;
+          sum += host_data[input_index] *
+                 reduced_kernels[p * reduced_num_filter_elem + k];
         }
-
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < reduced_num_filter_elem; ++k) {
-                    int input_index = k + reduced_num_filter_elem * m 
-                                    + b * reduced_num_filter_elem * output_size;
-                    sum += host_data[input_index] 
-                                * reduced_kernels[p * reduced_num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
-        }
-
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(reduced_kernels);
-    free(host_data);
-  
-    return output;
-}
+  }
+  free(reduced_kernels);
+  free(host_data);
 
-void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
-                                int conv_mode, int compute_precision, int row, int start) {
-    
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-
-    int full_output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int full_output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int full_output_size = full_output_height * full_output_width;
-
-    Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                            full_output_height, full_output_width);
-    float * __restrict__ full_output_data = (float *)full_output->host_data;
-   
-    int remainder = (full_output_height - start) % row > 0;
-    int output_height = 
-            full_output_height - ((full_output_height - start) / row) - remainder;
-
-    int output_width = full_output_width;
-    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
-                                                * output_height * output_width);   
-    int output_size = output_width * output_height;
-    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
-                                                        * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
+  return output;
+}
 
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                int inH;
-                if(h < start) {
-                    inH = h * vertical_stride - vertical_pad;
-                } else {
-                    int h_index = ((h - start + 1) * row) / (row - 1) 
-                                + (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
-                    inH = h_index * vertical_stride - vertical_pad;
-                }
-                for(int w = 0; w < output_width; w++) {
-                    int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = 
-                                        (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                    + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                            && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision, int row,
+                                  int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+
+  int full_output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int full_output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int full_output_size = full_output_height * full_output_width;
+
+  Tensor *full_output = (Tensor *)create4DTensorCPU(
+      0, 0, batch_size, num_filters, full_output_height, full_output_width);
+  float *__restrict__ full_output_data = (float *)full_output->host_data;
+
+  int remainder = (full_output_height - start) % row > 0;
+  int output_height =
+      full_output_height - ((full_output_height - start) / row) - remainder;
+
+  int output_width = full_output_width;
+  float *output_data = (float *)malloc(
+      sizeof(float) * batch_size * num_filters * output_height * output_width);
+  int output_size = output_width * output_height;
+  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        int inH;
+        if (h < start) {
+          inH = h * vertical_stride - vertical_pad;
+        } else {
+          int h_index = ((h - start + 1) * row) / (row - 1) +
+                        (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
+          inH = h_index * vertical_stride - vertical_pad;
+        }
+        for (int w = 0; w < output_width; w++) {
+          int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
+      }
+    }
 
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
+    }
 
-        // Interpolate
-        for (int p = 0; p < num_filters; ++p) {
-            for(int h = 0; h < full_output_height; h++) { 
-                for(int w = 0; w < full_output_width; w++) {
-                   int full_output_index = b * num_filters * full_output_size 
-                            + p * full_output_size + h * full_output_width  + w;
-                   if(h < start) {
-                       int output_index = b * num_filters * output_size 
-                                        + p * output_size + h * output_width  + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                   } else if(h == full_output_height - 1) {
-                       int output_index = b * num_filters * output_size + p * output_size 
-                                                + (output_height - 1) * output_width  + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                    } else if(h == 0) {
-                        int output_index = b * num_filters * output_size 
-                                            + p * output_size + 0 * output_width  + w;
-                        full_output_data[full_output_index] = output_data[output_index]; 
-                    } else if((h - start) % row == 0) {
-                        int row_index = h - ((h + 1 - start) / row); 
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + row_index * output_width + w;
-                        full_output_data[full_output_index] = 
-                            (output_data[output_index] + output_data[output_index - output_width]) / 2;
-                   } else {
-                       int remainder = ((h + 1 - start) % row) > 0;
-                       int row_index = h - ((h + 1 - start) / row) - remainder;
-                       int output_index = b * num_filters * output_size + p * output_size 
-                                                        + row_index * output_width + w;
-                       full_output_data[full_output_index] = output_data[output_index];
-                  }
-                }
-            }
-         }
+    // Interpolate
+    for (int p = 0; p < num_filters; ++p) {
+      for (int h = 0; h < full_output_height; h++) {
+        for (int w = 0; w < full_output_width; w++) {
+          int full_output_index = b * num_filters * full_output_size +
+                                  p * full_output_size + h * full_output_width +
+                                  w;
+          if (h < start) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (h == full_output_height - 1) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               (output_height - 1) * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (h == 0) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               0 * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if ((h - start) % row == 0) {
+            int row_index = h - ((h + 1 - start) / row);
+            int output_index = b * num_filters * output_size + p * output_size +
+                               row_index * output_width + w;
+            full_output_data[full_output_index] =
+                (output_data[output_index] +
+                 output_data[output_index - output_width]) /
+                2;
+          } else {
+            int remainder = ((h + 1 - start) % row) > 0;
+            int row_index = h - ((h + 1 - start) / row) - remainder;
+            int output_index = b * num_filters * output_size + p * output_size +
+                               row_index * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          }
+        }
+      }
     }
-    free(output_data);
-    free(host_data);
+  }
+  free(output_data);
+  free(host_data);
 
-    return full_output;
+  return full_output;
 }
 
-void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                                int horizontal_pad, int vertical_stride, int horizontal_stride, 
-                                int conv_mode, int compute_precision, int col, int start) {
-    
-    Tensor *input = (Tensor *)input_ptr;
-    Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-    
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int num_filters = filter->dims.dim_sizes[0];
-    int kernel_height = filter->dims.dim_sizes[2];
-    int kernel_width = filter->dims.dim_sizes[3];
-    int full_output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    int full_output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    int num_filter_elem = kernel_height * kernel_width * channels;
-    int full_output_size = full_output_height * full_output_width;
-
-    Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, 
-                                                    full_output_height, full_output_width);
-    float * __restrict__ full_output_data = (float *)full_output->host_data;
-
-    int remainder = (full_output_width - start) % col > 0;
-    int output_width = full_output_width - ((full_output_width - start) / col) - remainder;
-
-    int output_height = full_output_height;
-    float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters 
-                                                    * output_height * output_width);
-    int output_size = output_width * output_height;
-    long int host_data_size = sizeof(float) * num_filter_elem * output_height 
-                                                        * output_width * batch_size;
-    float *host_data = (float *) malloc(host_data_size);
-
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                int inH = h * vertical_stride - vertical_pad;
-                for(int w = 0; w < output_width; w++) {
-                    int inW;
-                    if(w < start) {
-                        inW = w * horizontal_stride - horizontal_pad;
-                    } else {
-                        int w_index = ((w - start + 1) * col) / (col - 1) 
-                                + (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
-                        inW = w_index * horizontal_stride - horizontal_pad;
-                    }
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = 
-                                        (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                    + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                            && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                            + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        // Tensor Multiply
-        for (int p = 0; p < num_filters; ++p) {
-            for (int m = 0; m < output_size; ++m) {
-                float sum = 0;
-                #pragma omp simd reduction(+:sum)
-                for (int k = 0; k < num_filter_elem; ++k) {
-                    int input_index = k + num_filter_elem * m 
-                                            + b * num_filter_elem * output_size;
-                    sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
-                }
-                output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
+void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                  int vertical_pad, int horizontal_pad,
+                                  int vertical_stride, int horizontal_stride,
+                                  int conv_mode, int compute_precision, int col,
+                                  int start) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int num_filters = filter->dims.dim_sizes[0];
+  int kernel_height = filter->dims.dim_sizes[2];
+  int kernel_width = filter->dims.dim_sizes[3];
+  int full_output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  int full_output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  int num_filter_elem = kernel_height * kernel_width * channels;
+  int full_output_size = full_output_height * full_output_width;
+
+  Tensor *full_output = (Tensor *)create4DTensorCPU(
+      0, 0, batch_size, num_filters, full_output_height, full_output_width);
+  float *__restrict__ full_output_data = (float *)full_output->host_data;
+
+  int remainder = (full_output_width - start) % col > 0;
+  int output_width =
+      full_output_width - ((full_output_width - start) / col) - remainder;
+
+  int output_height = full_output_height;
+  float *output_data = (float *)malloc(
+      sizeof(float) * batch_size * num_filters * output_height * output_width);
+  int output_size = output_width * output_height;
+  long int host_data_size = sizeof(float) * num_filter_elem * output_height *
+                            output_width * batch_size;
+  float *host_data = (float *)malloc(host_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        int inH = h * vertical_stride - vertical_pad;
+        for (int w = 0; w < output_width; w++) {
+          int inW;
+          if (w < start) {
+            inW = w * horizontal_stride - horizontal_pad;
+          } else {
+            int w_index = ((w - start + 1) * col) / (col - 1) +
+                          (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
+            inW = w_index * horizontal_stride - horizontal_pad;
+          }
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
+      }
+    }
 
-        // Interpolate
-        for (int p = 0; p < num_filters; ++p) {
-            for(int h = 0; h < full_output_height; h++) {
-                for(int w = 0; w < full_output_width; w++) {
-                    int full_output_index = b * num_filters * full_output_size 
-                                + p * full_output_size + h * full_output_width  + w;
-                     if(w < start) {
-                         int output_index = b * num_filters * output_size 
-                                        + p * output_size + h * output_width + w;
-                         full_output_data[full_output_index] = output_data[output_index];
-                    } else if(w == full_output_width - 1) {
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                    + h * output_width  + output_width - 1;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    } else if(w == 0) {
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                                + h * output_width  + 0;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    } else if((w - start) % col == 0) {
-                        int col_index = w - ((w + 1 - start) / col);
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + h * output_width + col_index;
-                        full_output_data[full_output_index] = 
-                            (output_data[output_index] + output_data[output_index - 1]) / 2;
-                    } else {
-                        int remainder = ((w + 1 - start) % col) > 0;
-                        int col_index = w - ((w + 1 - start) / col) - remainder;
-                        int output_index = b * num_filters * output_size + p * output_size 
-                                                            + h * output_width + col_index;
-                        full_output_data[full_output_index] = output_data[output_index];
-                    }
-                }
-            }
+    // Tensor Multiply
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < num_filter_elem; ++k) {
+          int input_index =
+              k + num_filter_elem * m + b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
         }
+        output_data[b * (output_size * num_filters) + p * output_size + m] =
+            sum;
+      }
     }
-    free(output_data);
-    free(host_data);
 
-    return full_output;
-}
-
-void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, 
-                          int vertical_pad, int horizontal_pad, 
-                          int vertical_stride, int horizontal_stride, 
-                          int conv_mode, int compute_precision, 
-                          int row, int col, int skip_every, int start) {
-    if(row > 1) {
-        printf("ROW PERFORATION\n");
-        return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                        horizontal_pad, vertical_stride, horizontal_stride, conv_mode, 
-                        compute_precision, row, start);
-    } 
-    if(col > 1) {
-     printf("COL PERFORATION\n");
-     return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                             horizontal_pad, vertical_stride, horizontal_stride, conv_mode, 
-                            compute_precision, col, start);
-    }  
-    if(skip_every > 1) {
-        printf("INPUT FILTERING\n");
-        Tensor *input = (Tensor *)input_ptr;
-        Tensor *filter = (Tensor *)filter_ptr;
-
-        const int kernel_height = filter->dims.dim_sizes[2];
-        const int kernel_width = filter->dims.dim_sizes[3];
-
-        if(!(kernel_height * kernel_width % skip_every)) {
-            return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
-                                    vertical_pad, horizontal_pad, vertical_stride,
-                                    horizontal_stride, conv_mode, 
-                                    compute_precision, skip_every, start);
+    // Interpolate
+    for (int p = 0; p < num_filters; ++p) {
+      for (int h = 0; h < full_output_height; h++) {
+        for (int w = 0; w < full_output_width; w++) {
+          int full_output_index = b * num_filters * full_output_size +
+                                  p * full_output_size + h * full_output_width +
+                                  w;
+          if (w < start) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + w;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (w == full_output_width - 1) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + output_width - 1;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if (w == 0) {
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + 0;
+            full_output_data[full_output_index] = output_data[output_index];
+          } else if ((w - start) % col == 0) {
+            int col_index = w - ((w + 1 - start) / col);
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + col_index;
+            full_output_data[full_output_index] =
+                (output_data[output_index] + output_data[output_index - 1]) / 2;
+          } else {
+            int remainder = ((w + 1 - start) % col) > 0;
+            int col_index = w - ((w + 1 - start) / col) - remainder;
+            int output_index = b * num_filters * output_size + p * output_size +
+                               h * output_width + col_index;
+            full_output_data[full_output_index] = output_data[output_index];
+          }
         }
-        return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, 
-                                    vertical_pad, horizontal_pad, vertical_stride, 
-                                    horizontal_stride, conv_mode, 
-                                    compute_precision, skip_every, start);
+      }
     }
-    printf("---REGULAR CONV\n");
-    return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
-                                 horizontal_pad, vertical_stride, 
-                                 horizontal_stride, conv_mode, compute_precision);
+  }
+  free(output_data);
+  free(host_data);
+
+  return full_output;
 }
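+// Illustrative usage sketch (comment only; not part of the original sources).
+// Column perforation computes only a subset of output columns -- every col-th
+// column after `start` is skipped in the im2col + GEMM step -- and the skipped
+// columns are then filled in by the interpolation loop above. The tensor
+// shapes and knob values below are assumptions chosen only to show the calling
+// convention used elsewhere in this file:
+//
+//   void *in  = create4DTensorCPU(0, 0, /*N=*/1,  /*C=*/3, /*H=*/32, /*W=*/32);
+//   void *flt = create4DTensorCPU(0, 0, /*K=*/16, /*C=*/3, /*R=*/3,  /*S=*/3);
+//   void *out = tensorColPerfConvolutionCPU(in, flt, /*vertical_pad=*/1,
+//                                           /*horizontal_pad=*/1,
+//                                           /*vertical_stride=*/1,
+//                                           /*horizontal_stride=*/1,
+//                                           /*conv_mode=*/1, /*precision=*/0,
+//                                           /*col=*/2, /*start=*/1);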
 
-void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){
-	
+void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                          int horizontal_pad, int vertical_stride,
+                          int horizontal_stride, int conv_mode,
+                          int compute_precision, int row, int col,
+                          int skip_every, int start) {
+  if (row > 1) {
+    printf("ROW PERFORATION\n");
+    return tensorRowPerfConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, row, start);
+  }
+  if (col > 1) {
+    printf("COL PERFORATION\n");
+    return tensorColPerfConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, col, start);
+  }
+  if (skip_every > 1) {
+    printf("INPUT FILTERING\n");
     Tensor *input = (Tensor *)input_ptr;
     Tensor *filter = (Tensor *)filter_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_filter = (float *)filter->host_data;
-
-    const int batch_size = input->dims.dim_sizes[0];
-    const int channels = input->dims.dim_sizes[1];
-    const int image_height = input->dims.dim_sizes[2];
-    const int image_width = input->dims.dim_sizes[3];
-    const int num_filters = filter->dims.dim_sizes[0];
+
     const int kernel_height = filter->dims.dim_sizes[2];
     const int kernel_width = filter->dims.dim_sizes[3];
-    const int output_height = 
-        1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-    const int output_width = 
-        1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
-    const int filter_dim = kernel_height * kernel_width;
-    const int num_filter_elem = filter_dim * channels;
-    const int output_size = output_width * output_height;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, channels, 
-                                                    output_height * output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-    
-    const long int conv_data_size = 
-        sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
-    float *host_data = (float *) malloc(conv_data_size);
-   
-    omp_set_num_threads(4);
-     #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            for(int h = 0; h < output_height; h++) {
-                for(int w = 0; w < output_width; w++) {
-                    const int inH = h * vertical_stride - vertical_pad;
-                    const int inW = w * horizontal_stride - horizontal_pad;
-                    for(int i = 0; i < kernel_height; i++) {
-                        for(int j = 0; j < kernel_width; j++) {
-                            const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
-                            const int output_index = h * output_width + w;
-                            const int out_index = b * num_filter_elem * output_size 
-                                        + output_index * num_filter_elem + filter_elem_num;
-                            if(inH + i >= 0 && inH + i < image_height 
-                                && inW + j >= 0 && inW + j < image_width) {
-                                host_data[out_index] = 
-                                    host_image[((b * channels + ch) * image_height 
-                                        + (inH + i)) * image_width + (inW + j)];
-                            } else {
-                                host_data[out_index] = 0;
-                            }
-                        }
-                    }
-                }
+
+    if (!(kernel_height * kernel_width % skip_every)) {
+      return tensorRegularFilterSamplingConvolutionCPU(
+          input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+          horizontal_stride, conv_mode, compute_precision, skip_every, start);
+    }
+    return tensorIrregularFilterSamplingConvolutionCPU(
+        input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+        horizontal_stride, conv_mode, compute_precision, skip_every, start);
+  }
+  printf("---REGULAR CONV\n");
+  return tensorRegularConvolutionCPU(
+      input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
+      horizontal_stride, conv_mode, compute_precision);
+}
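+// Dispatch summary and usage sketch (comment only; not from the original
+// sources): row > 1 selects row perforation, col > 1 selects column
+// perforation, skip_every > 1 selects filter sampling (the regular variant
+// when kernel_height * kernel_width is divisible by skip_every, the irregular
+// variant otherwise), and all knobs at 1 fall through to the exact
+// convolution. The argument values below are illustrative assumptions:
+//
+//   void *exact  = tensorConvApproxCPU(in, flt, 1, 1, 1, 1, /*conv_mode=*/1,
+//                                      /*precision=*/0, /*row=*/1, /*col=*/1,
+//                                      /*skip_every=*/1, /*start=*/0);
+//   void *approx = tensorConvApproxCPU(in, flt, 1, 1, 1, 1, 1, 0,
+//                                      /*row=*/2, /*col=*/1, /*skip_every=*/1,
+//                                      /*start=*/0); // row perforation path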
+
+void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+                           int horizontal_pad, int vertical_stride,
+                           int horizontal_stride, int conv_mode,
+                           int conv_groups) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_filter = (float *)filter->host_data;
+
+  const int batch_size = input->dims.dim_sizes[0];
+  const int channels = input->dims.dim_sizes[1];
+  const int image_height = input->dims.dim_sizes[2];
+  const int image_width = input->dims.dim_sizes[3];
+  const int num_filters = filter->dims.dim_sizes[0];
+  const int kernel_height = filter->dims.dim_sizes[2];
+  const int kernel_width = filter->dims.dim_sizes[3];
+  const int output_height =
+      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
+  const int output_width =
+      1 +
+      ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
+  const int filter_dim = kernel_height * kernel_width;
+  const int num_filter_elem = filter_dim * channels;
+  const int output_size = output_width * output_height;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(
+      0, 0, batch_size, num_filters, channels, output_height * output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  const long int conv_data_size = sizeof(float) * num_filter_elem *
+                                  output_height * output_width * batch_size;
+  float *host_data = (float *)malloc(conv_data_size);
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      for (int h = 0; h < output_height; h++) {
+        for (int w = 0; w < output_width; w++) {
+          const int inH = h * vertical_stride - vertical_pad;
+          const int inW = w * horizontal_stride - horizontal_pad;
+          for (int i = 0; i < kernel_height; i++) {
+            for (int j = 0; j < kernel_width; j++) {
+              const int filter_elem_num =
+                  (ch * kernel_height + i) * kernel_width + j;
+              const int output_index = h * output_width + w;
+              const int out_index = b * num_filter_elem * output_size +
+                                    output_index * num_filter_elem +
+                                    filter_elem_num;
+              if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 &&
+                  inW + j < image_width) {
+                host_data[out_index] =
+                    host_image[((b * channels + ch) * image_height +
+                                (inH + i)) *
+                                   image_width +
+                               (inW + j)];
+              } else {
+                host_data[out_index] = 0;
+              }
             }
+          }
         }
-        for (int p = 0; p < num_filters; ++p) {
-             for (int m = 0; m < output_size; ++m) {
-                for (int ch = 0; ch < channels; ch++) {
-                    float sum = 0;
-                    #pragma omp simd reduction(+:sum)
-                    for (int k = 0; k < filter_dim; ++k) {
-                        int input_index = k + ch * filter_dim + num_filter_elem * m + b * num_filter_elem * output_size;
-                        sum += host_data[input_index] * host_filter[p * num_filter_elem + ch * filter_dim + k];
-                    }
-                    output_data[b * (output_size * num_filters * channels) + p * output_size * channels + ch * output_size + m] = sum;
-                }
-            }
+      }
+    }
+    for (int p = 0; p < num_filters; ++p) {
+      for (int m = 0; m < output_size; ++m) {
+        for (int ch = 0; ch < channels; ch++) {
+          float sum = 0;
+#pragma omp simd reduction(+ : sum)
+          for (int k = 0; k < filter_dim; ++k) {
+            int input_index = k + ch * filter_dim + num_filter_elem * m +
+                              b * num_filter_elem * output_size;
+            sum += host_data[input_index] *
+                   host_filter[p * num_filter_elem + ch * filter_dim + k];
+          }
+          output_data[b * (output_size * num_filters * channels) +
+                      p * output_size * channels + ch * output_size + m] = sum;
         }
+      }
     }
+  }
 
-    free(host_data);
-    return output;
+  free(host_data);
+  return output;
 }
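+// Note and usage sketch (comment only; not from the original sources): unlike
+// the regular convolution, this variant keeps a separate partial result per
+// input channel -- the output tensor is created as
+// (batch, num_filters, channels, output_height * output_width) and the
+// reduction over channels is left to the caller. Illustrative call, with the
+// argument values being assumptions:
+//
+//   void *out = tensorConvCutlassCPU(in, flt, /*vertical_pad=*/1,
+//                                    /*horizontal_pad=*/1,
+//                                    /*vertical_stride=*/1,
+//                                    /*horizontal_stride=*/1,
+//                                    /*conv_mode=*/1, /*conv_groups=*/1);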
 
-void* tensorAddCPU(void *x_ptr, void *bias_ptr) {
-    Tensor *x = (Tensor *)x_ptr;
-    Tensor *bias = (Tensor *)bias_ptr;
-    
-    float * __restrict__ x_data = (float *)x->host_data;
-    float * __restrict__ bias_data = (float *)bias->host_data;
-    int n = x->dims.dim_sizes[0];
-    int c = x->dims.dim_sizes[1];
-    int h = x->dims.dim_sizes[2];
-    int w = x->dims.dim_sizes[3];
-    
-    if(x->num_elems == bias->num_elems) {
-        int const1 = c * h * w;
-        int const2 = h * w;
-         omp_set_num_threads(4);
-        #pragma omp parallel for
-        for (int i = 0; i < n; i++) { 
-            for (int j = 0; j < c; j++) {
-                 #pragma omp simd collapse(2)
-                for (int k = 0; k < h; k++) {
-                    for (int l = 0; l < w; l++) {
-                        x_data[i * const1 + j * const2 + (k * w)  + l] += 
-                                bias_data[i * const1 + j * const2 + (k*w) + l];
-                    }
-                }
-            }
+void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
+  Tensor *x = (Tensor *)x_ptr;
+  Tensor *bias = (Tensor *)bias_ptr;
+
+  float *__restrict__ x_data = (float *)x->host_data;
+  float *__restrict__ bias_data = (float *)bias->host_data;
+  int n = x->dims.dim_sizes[0];
+  int c = x->dims.dim_sizes[1];
+  int h = x->dims.dim_sizes[2];
+  int w = x->dims.dim_sizes[3];
+
+  if (x->num_elems == bias->num_elems) {
+    int const1 = c * h * w;
+    int const2 = h * w;
+    omp_set_num_threads(4);
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+#pragma omp simd collapse(2)
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            x_data[i * const1 + j * const2 + (k * w) + l] +=
+                bias_data[i * const1 + j * const2 + (k * w) + l];
+          }
         }
-    } else {
-         omp_set_num_threads(4);
-        #pragma omp parallel for
-        for (int i = 0; i < n; i++) {
-            for (int j = 0; j < c; j++) {
-                #pragma omp simd collapse(2)
-                for (int k = 0; k < h; k++) {
-                    for (int l = 0; l < w; l++) {
-                        x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
-                    }
-                }
-            }
-        }   
+      }
+    }
+  } else {
+    omp_set_num_threads(4);
+#pragma omp parallel for
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < c; j++) {
+#pragma omp simd collapse(2)
+        for (int k = 0; k < h; k++) {
+          for (int l = 0; l < w; l++) {
+            x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
+          }
+        }
+      }
     }
-    
-    return x;
+  }
+
+  return x;
 }
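+// Usage sketch (comment only; not from the original sources). When the two
+// tensors have the same number of elements the add is purely elementwise;
+// otherwise `bias` is treated as a per-channel vector and broadcast over the
+// N, H and W dimensions. The shapes below are illustrative assumptions:
+//
+//   // x: (N=1, C=16, H=32, W=32), bias: (1, 16, 1, 1) -> per-channel broadcast
+//   void *y = tensorAddCPU(x, bias); // adds in place and returns x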
 
 float max(float v1, float v2) __attribute__((always_inline));
-inline float maximum(float v1, float v2){
-    return (v1 < v2) ? v2 : v1;
-}
+inline float maximum(float v1, float v2) { return (v1 < v2) ? v2 : v1; }
 
 void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
-             int window_width, int vertical_pad, int horizontal_pad,
-                          int vertical_stride, int horizontal_stride) {
-   
-    Tensor *input = (Tensor *)input_ptr;
-    float * __restrict__ input_data = (float *)input->host_data;
-    
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    
-    int output_height = 
-        1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
-    int output_width = 
-        1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride);
-    
-    int center_x = (window_width - 1) / 2 - horizontal_pad;
-    int center_y = (window_height - 1) / 2 - vertical_pad;
-    int x_radius = (window_width - 1) / 2;
-    int y_radius = (window_height - 1) / 2;
-    
-    Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, channels, 
-                                                output_height, output_width);
-    float * __restrict__ output_data = (float *)output->host_data;
-   
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for (int b = 0; b < batch_size; b++) {
-        for (int ch = 0; ch < channels; ch++) {
-            int ii = 0, jj = 0;
-            for (int r = center_y; r < image_height + vertical_pad - y_radius; 
-                                                        r += vertical_stride) {
-                for (int c = center_x; c < image_width + horizontal_pad - x_radius; 
-                                                            c += horizontal_stride) {
-                    float val = (poolFunction == 0) ? -3.40282e+38 : 0;
-                    int y_radius_var = y_radius - r;
-                    int y_radius_var_max = y_radius_var + image_height;
-                    int x_radius_var = x_radius - c;
-                    int x_radius_var_max = x_radius_var + image_width;
-                    int ki_min = (y_radius_var > 0) ? 
-                        ((y_radius_var < window_height) ? y_radius_var : -1) : 0;
-                    int ki_max = (y_radius_var_max < window_height) ? 
-                                 ((y_radius_var_max >= 0) ?  y_radius_var_max : -1) : window_height;
-                    int kj_min = (x_radius_var > 0) ? 
-                                ((x_radius_var < window_width) ? x_radius_var : -1) : 0;
-                    int kj_max = (x_radius_var_max < window_width) ? 
-                                    ((x_radius_var_max >= 0) ?  x_radius_var_max : -1) : window_width;
-                                        
-                    if(ki_min != ki_max && kj_min != kj_max && ki_min != -1 
-                            && ki_max != -1 && kj_min != -1 && kj_max != -1) {
-                        if(!poolFunction) {
-                            for (int ki = 0; ki < window_height; ki++) {
-                                for (int kj = 0; kj < window_width; kj++) {
-                                    val = maximum(
-                                    val,
-                                    input_data[b * (channels * image_height * image_width) +
-                                    ch * (image_height * image_width) +
-                                    (r - y_radius + ki) * image_width + (c - x_radius + kj)]);
-                                }
-                            }
-                        } else {
-                            for (int ki = 0; ki < window_height; ki++) {
-                                for (int kj = 0; kj < window_width; kj++) {
-                                    val += input_data[b * (channels * image_height * image_width) 
-                                            + ch * (image_height * image_width) +
-                                            (r - y_radius + ki) * image_width + (c - x_radius + kj)];
-                                }
-                            }
-                        }
-                    }
-                    if (poolFunction == 1) {
-                        val /= window_height * window_width;
-                    }
-                    output_data[b * (channels * output_height * output_width) +
-                        ch * (output_height * output_width) + ii * output_width + jj] = val;
-                    jj++;
-                    if (jj == output_width) {
-                        jj = 0;
-                        ii++;
-                    }
+                       int window_width, int vertical_pad, int horizontal_pad,
+                       int vertical_stride, int horizontal_stride) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  float *__restrict__ input_data = (float *)input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+
+  int output_height =
+      1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
+  int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) /
+                          horizontal_stride);
+
+  int center_x = (window_width - 1) / 2 - horizontal_pad;
+  int center_y = (window_height - 1) / 2 - vertical_pad;
+  int x_radius = (window_width - 1) / 2;
+  int y_radius = (window_height - 1) / 2;
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels,
+                                               output_height, output_width);
+  float *__restrict__ output_data = (float *)output->host_data;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      int ii = 0, jj = 0;
+      for (int r = center_y; r < image_height + vertical_pad - y_radius;
+           r += vertical_stride) {
+        for (int c = center_x; c < image_width + horizontal_pad - x_radius;
+             c += horizontal_stride) {
+          float val = (poolFunction == 0) ? -3.40282e+38 : 0;
+          int y_radius_var = y_radius - r;
+          int y_radius_var_max = y_radius_var + image_height;
+          int x_radius_var = x_radius - c;
+          int x_radius_var_max = x_radius_var + image_width;
+          int ki_min =
+              (y_radius_var > 0)
+                  ? ((y_radius_var < window_height) ? y_radius_var : -1)
+                  : 0;
+          int ki_max = (y_radius_var_max < window_height)
+                           ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
+                           : window_height;
+          int kj_min = (x_radius_var > 0)
+                           ? ((x_radius_var < window_width) ? x_radius_var : -1)
+                           : 0;
+          int kj_max = (x_radius_var_max < window_width)
+                           ? ((x_radius_var_max >= 0) ? x_radius_var_max : -1)
+                           : window_width;
+
+          if (ki_min != ki_max && kj_min != kj_max && ki_min != -1 &&
+              ki_max != -1 && kj_min != -1 && kj_max != -1) {
+            if (!poolFunction) {
+              for (int ki = 0; ki < window_height; ki++) {
+                for (int kj = 0; kj < window_width; kj++) {
+                  val = maximum(
+                      val,
+                      input_data[b * (channels * image_height * image_width) +
+                                 ch * (image_height * image_width) +
+                                 (r - y_radius + ki) * image_width +
+                                 (c - x_radius + kj)]);
+                }
+              }
+            } else {
+              for (int ki = 0; ki < window_height; ki++) {
+                for (int kj = 0; kj < window_width; kj++) {
+                  val +=
+                      input_data[b * (channels * image_height * image_width) +
+                                 ch * (image_height * image_width) +
+                                 (r - y_radius + ki) * image_width +
+                                 (c - x_radius + kj)];
                 }
+              }
             }
+          }
+          if (poolFunction == 1) {
+            val /= window_height * window_width;
+          }
+          output_data[b * (channels * output_height * output_width) +
+                      ch * (output_height * output_width) + ii * output_width +
+                      jj] = val;
+          jj++;
+          if (jj == output_width) {
+            jj = 0;
+            ii++;
+          }
         }
+      }
     }
-  
-    return output;
+  }
+
+  return output;
 }
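+// Usage sketch (comment only; not from the original sources). poolFunction 0
+// selects max pooling (the accumulator starts near -FLT_MAX), poolFunction 1
+// selects average pooling (the window sum is divided by
+// window_height * window_width). A 2x2, stride-2, unpadded call, with all
+// values being assumptions:
+//
+//   void *pooled = tensorPoolingCPU(in, /*poolFunction=*/0,
+//                                   /*window_height=*/2, /*window_width=*/2,
+//                                   /*vertical_pad=*/0, /*horizontal_pad=*/0,
+//                                   /*vertical_stride=*/2,
+//                                   /*horizontal_stride=*/2);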
 
 void *tensorTanhCPU(void *input_ptr) {
-    Tensor *input = (Tensor *)input_ptr;
-    
-    float *input_data = (float *)input->host_data;
-    size_t num_elems = input->num_elems;
-    
-     omp_set_num_threads(4);
-     #pragma omp parallel for
-    for (size_t i = 0; i < num_elems; i++) {
-        input_data[i] = tanhf(input_data[i]);
-    }
-   
-    return input;
+  Tensor *input = (Tensor *)input_ptr;
+
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = tanhf(input_data[i]);
+  }
+
+  return input;
 }
 
 void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
-    Tensor *lhs = (Tensor *)lhs_ptr;
-    Tensor *rhs = (Tensor *)rhs_ptr;
-    
-    int m = lhs->dims.dim_sizes[0];
-    int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
-    int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
-    
-    Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
-
-    float * __restrict__ lhs_arr = (float *)lhs->host_data;
-    float * __restrict__ rhs_arr = (float *)rhs->host_data;
-    float * __restrict__ output_arr = (float *)output->host_data;
-    
-    int k = 1;
-    #pragma unroll 4   // Can we unroll more???
-    for (int j = 1; j < lhs->dims.num_dims; j++) {
-        k = k * lhs->dims.dim_sizes[j]; // input neurons
-    }
-    float *tran_rhs = (float *) malloc(sizeof(float) * k * n);
-    omp_set_num_threads(4);
-    #pragma omp parallel for simd
-    for (int l = 0; l < k; l++) {
-        for (int j = 0; j < n; j++) {
-            tran_rhs[j * k + l] = rhs_arr[l * n + j];
-        }   
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
+
+  int m = lhs->dims.dim_sizes[0];
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
+
+  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
+
+  float *__restrict__ lhs_arr = (float *)lhs->host_data;
+  float *__restrict__ rhs_arr = (float *)rhs->host_data;
+  float *__restrict__ output_arr = (float *)output->host_data;
+
+  int k = 1;
+#pragma unroll 4 // Can we unroll more???
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
+    k = k * lhs->dims.dim_sizes[j]; // input neurons
+  }
+  float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
+  omp_set_num_threads(4);
+#pragma omp parallel for simd
+  for (int l = 0; l < k; l++) {
+    for (int j = 0; j < n; j++) {
+      tran_rhs[j * k + l] = rhs_arr[l * n + j];
     }
-    
-    #pragma omp parallel for
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-           float sum = 0.0;
-          #pragma omp simd reduction(+:sum)
-           for (int l = 0; l < k; l++) {
-                sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
-            }
-            output_arr[i * n + j] = sum;
-        }
+  }
+
+#pragma omp parallel for
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < n; j++) {
+      float sum = 0.0;
+#pragma omp simd reduction(+ : sum)
+      for (int l = 0; l < k; l++) {
+        sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
+      }
+      output_arr[i * n + j] = sum;
     }
-    free(tran_rhs);
-    return output;
+  }
+  free(tran_rhs);
+  return output;
 }
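+// Usage sketch (comment only; not from the original sources). All lhs
+// dimensions after the batch dimension are flattened into k, and rhs is first
+// transposed into tran_rhs so the reduction loop walks both operands
+// contiguously -- which is what the `omp simd` reduction above relies on.
+// The shapes below are illustrative assumptions:
+//
+//   // lhs: (m=128, k=256, 1, 1), rhs: (1, 1, 256, n=10) -> output (128, 10, 1, 1)
+//   void *out = tensorGemmCPU(lhs, rhs);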
 
 void *tensorSoftmaxCPU(void *input_ptr) {
-    Tensor *input = (Tensor *)input_ptr;
-    
-    float *logits = (float *)input->host_data;
-    int n = input->dims.dim_sizes[0];
-    int c = input->dims.dim_sizes[1];
-    
-     omp_set_num_threads(4);
-    #pragma omp parallel for
-    for (int i = 0; i < n; i++) {
-        float x = 0;
-        for(int j = i*c; j < c + i*c; j++) {
-            logits[j] = expf(logits[j]);
-        }
-       
-        #pragma omp simd reduction(+:x)
-        for(int j = i*c; j < i*c+c; j++) {
-            x += logits[j];
-        }
-        
-         #pragma omp simd
-        for(int j = i*c; j < i*c + c; j++) {
-            logits[j] /= x;
-        }                                                                                                                                                   
+  Tensor *input = (Tensor *)input_ptr;
+
+  float *logits = (float *)input->host_data;
+  int n = input->dims.dim_sizes[0];
+  int c = input->dims.dim_sizes[1];
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int i = 0; i < n; i++) {
+    float x = 0;
+    for (int j = i * c; j < c + i * c; j++) {
+      logits[j] = expf(logits[j]);
     }
 
-    return input;
-}
-
-void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-                         void* mean_ptr, void* variance_ptr, double epsilon) {
-    
-    Tensor* input = (Tensor*) input_ptr;
-    Tensor* gamma = (Tensor*) gamma_ptr;
-    Tensor* beta = (Tensor*) beta_ptr;
-    Tensor* mean = (Tensor*) mean_ptr;
-    Tensor* variance = (Tensor*) variance_ptr;
-    
-    float * __restrict__ host_image = (float *)input->host_data;
-    float * __restrict__ host_beta = (float *)beta->host_data;
-    float * __restrict__ host_gamma = (float *)gamma->host_data;
-    float * __restrict__ host_mean = (float *)mean->host_data;
-    float * __restrict__ host_variance = (float *)variance->host_data;
-    
-    float alpha_val = 1.0f, beta_val = 0.0f;
-    size_t num_elems = input->num_elems;
-
-    int batch_size = input->dims.dim_sizes[0];
-    int channels = input->dims.dim_sizes[1];
-    int image_height = input->dims.dim_sizes[2];
-    int image_width = input->dims.dim_sizes[3];
-    int image_dim = image_height * image_width;
+#pragma omp simd reduction(+ : x)
+    for (int j = i * c; j < i * c + c; j++) {
+      x += logits[j];
+    }
 
-    omp_set_num_threads(4);
-    #pragma omp parallel for
-    for(int b = 0; b < batch_size; b++) {
-        for(int ch = 0; ch < channels; ch++) {
-            float mean = 0;
-            #pragma omp simd reduction(+:mean)
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                mean += host_image[index];
-            }
-            mean = mean / channels;
-         
-            float variance = 0;
-            #pragma omp simd reduction(+:variance)
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                float tmp = host_image[index] - mean;
-                variance += (tmp * tmp);  
-            }
-            variance = variance / channels;
-            
-           #pragma omp simd 
-            for(int i = 0; i < image_dim; i++) {
-                int index = b * channels * image_dim + ch * image_dim + i;
-                host_image[index] = host_beta[ch] 
-                                  + (host_gamma[ch] * ((host_image[index] - mean) / sqrt(epsilon + variance)));
-            }
-        }
+#pragma omp simd
+    for (int j = i * c; j < i * c + c; j++) {
+      logits[j] /= x;
     }
-    return input;
+  }
+
+  return input;
 }
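+// Note and usage sketch (comment only; not from the original sources): the
+// softmax is computed in place, row by row, as
+// logits[j] = exp(logits[j]) / sum_k exp(logits[k]) over each row of length c;
+// the usual max-subtraction for numerical stability is not applied here.
+// Illustrative call, with the shape being an assumption:
+//
+//   // input: (N=128, C=10, 1, 1) class scores -> per-row probabilities
+//   void *probs = tensorSoftmaxCPU(scores);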
 
- void *tensorReluCPU(void *input_ptr) {
-     Tensor *input = (Tensor *)input_ptr;
-     float *input_data = (float *)input->host_data;
-     size_t num_elems = input->num_elems;
-     
-     #pragma omp simd
-     for (size_t i = 0; i < num_elems; i++) {
-         input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                         void *mean_ptr, void *variance_ptr, double epsilon) {
+
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *gamma = (Tensor *)gamma_ptr;
+  Tensor *beta = (Tensor *)beta_ptr;
+  Tensor *mean = (Tensor *)mean_ptr;
+  Tensor *variance = (Tensor *)variance_ptr;
+
+  float *__restrict__ host_image = (float *)input->host_data;
+  float *__restrict__ host_beta = (float *)beta->host_data;
+  float *__restrict__ host_gamma = (float *)gamma->host_data;
+  float *__restrict__ host_mean = (float *)mean->host_data;
+  float *__restrict__ host_variance = (float *)variance->host_data;
+
+  float alpha_val = 1.0f, beta_val = 0.0f;
+  size_t num_elems = input->num_elems;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+  int image_height = input->dims.dim_sizes[2];
+  int image_width = input->dims.dim_sizes[3];
+  int image_dim = image_height * image_width;
+
+  omp_set_num_threads(4);
+#pragma omp parallel for
+  for (int b = 0; b < batch_size; b++) {
+    for (int ch = 0; ch < channels; ch++) {
+      float mean = 0;
+#pragma omp simd reduction(+ : mean)
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        mean += host_image[index];
+      }
+      mean = mean / channels;
+
+      float variance = 0;
+#pragma omp simd reduction(+ : variance)
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        float tmp = host_image[index] - mean;
+        variance += (tmp * tmp);
+      }
+      variance = variance / channels;
+
+#pragma omp simd
+      for (int i = 0; i < image_dim; i++) {
+        int index = b * channels * image_dim + ch * image_dim + i;
+        host_image[index] =
+            host_beta[ch] + (host_gamma[ch] * ((host_image[index] - mean) /
+                                               sqrt(epsilon + variance)));
+      }
     }
+  }
+  return input;
+}
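+// Note and usage sketch (comment only; not from the original sources): this
+// CPU path normalises each (batch, channel) plane with statistics computed on
+// the fly from that plane and then applies the affine transform
+// beta[ch] + gamma[ch] * (x - mean) / sqrt(epsilon + variance); the mean_ptr
+// and variance_ptr arguments are accepted but not read here. Illustrative
+// call, with shapes and epsilon being assumptions:
+//
+//   // input: (N, C, H, W); gamma/beta/mean/variance: (1, C, 1, 1)
+//   void *normed = tensorBatchNormCPU(in, gamma, beta, mean, var,
+//                                     /*epsilon=*/1e-5);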
 
-    return input;
+void *tensorReluCPU(void *input_ptr) {
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+#pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
+  }
+
+  return input;
 }
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
-    Tensor *input = (Tensor *)input_ptr;
-    float *input_data = (float *)input->host_data;
-    size_t num_elems = input->num_elems;
-    
-    #pragma omp simd
-    for (size_t i = 0; i < num_elems; i++) {
-        input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ? 
-                                                        max : input_data[i]);
-    }       
-
-    return input;
-}         
+  Tensor *input = (Tensor *)input_ptr;
+  float *input_data = (float *)input->host_data;
+  size_t num_elems = input->num_elems;
+
+#pragma omp simd
+  for (size_t i = 0; i < num_elems; i++) {
+    input_data[i] = (input_data[i] < min)
+                        ? min
+                        : ((input_data[i] > max) ? max : input_data[i]);
+  }
+
+  return input;
+}
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 1b28ccaa19..253f761433 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -1,8 +1,9 @@
-/* This file includes the API implementation of the HPVM tensor runtime built on cublas, cudnn
-**
-**  Author: Hashim Sharif
-**  Email: hsharif3@illinois.edu
-*/
+/* This file includes the API implementation of the HPVM tensor runtime built
+ * on cublas, cudnn
+ **
+ **  Author: Hashim Sharif
+ **  Email: hsharif3@illinois.edu
+ */
 
 #include <stdio.h>
 #include <stdarg.h>
@@ -31,7 +32,6 @@
 #include <cuda_fp16.h>
 #include <driver_types.h>
 
-
 // Tensor runtime header files
 #include "tensor_runtime.h"
 #include "tensor_utils.h"
@@ -46,202 +46,177 @@
 #include "half_precision_api.h"
 #include "approx_simulation.h"
 
+// FIXIT: tensorAdd currently only works for 4D tensors
+void *tensorAdd(void *x_ptr, void *bias_ptr) {
 
+  Tensor *x = (Tensor *)x_ptr;
+  Tensor *bias = (Tensor *)bias_ptr;
 
-
-
-// FIXIT: tensorAdd currently only works for 4D tensors
-void* tensorAdd(void* x_ptr, void* bias_ptr){
-  
-  Tensor* x = (Tensor*) x_ptr;
-  Tensor* bias = (Tensor*) bias_ptr;
-  
-  INFO("*** TensorAdd \n");  
+  INFO("*** TensorAdd \n");
   profileEvent("Add");
-    
+
   float alpha = 1.0f;
-  //float beta = 0.0f;
+  // float beta = 0.0f;
   hostToDeviceCopy(x);
   hostToDeviceCopy(bias);
 
   convertToFP32(x);
   convertToFP32(bias);
 
-  
   DEBUG("x->num_elems = %d \n", x->num_elems);
   DEBUG("bias->num_elems = %d \n", bias->num_elems);
 
-  if(cudnnHandle == NULL){
-    ERROR("cudnnHandle NOT initialized!! \n");    
+  if (cudnnHandle == NULL) {
+    ERROR("cudnnHandle NOT initialized!! \n");
   }
-  
+
   // FIXIT: routine fails for 3D tensors
   checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc,
-			    bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data));
+                            bias->gpu_data, &alpha, x->tensor_desc,
+                            x->gpu_data));
 
   profileEvent("Add_end", true);
 
   return x;
 }
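+// Usage sketch (comment only; not from the original sources): copies both
+// operands to the GPU if needed, converts them to FP32, and adds `bias` into
+// `x` in place through cudnnAddTensor. The shapes below are assumptions:
+//
+//   // x: (N, C, H, W) conv output, bias: (1, C, 1, 1) broadcast per channel
+//   void *y = tensorAdd(conv_out, bias); // returns the updated x tensor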
 
-
 // FIXIT: Generalize all of the routines for types {half, float, double}
-void* tensorConvolution(void* input_ptr, void* filter_ptr,
-			int vertical_pad, int horizontal_pad,
-			int vertical_stride, int horizontal_stride,
-			int conv_mode, int conv_groups){  
-  
+void *tensorConvolution(void *input_ptr, void *filter_ptr, int vertical_pad,
+                        int horizontal_pad, int vertical_stride,
+                        int horizontal_stride, int conv_mode, int conv_groups) {
+
   INFO("*** TensorConvolution \n");
   profileEvent("Conv");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* filter = (Tensor*) filter_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *filter = (Tensor *)filter_ptr;
+
   cudnnConvolutionDescriptor_t convDesc;
   cudnnConvolutionFwdAlgo_t convAlgo;
   cudnnConvolutionMode_t mode;
-  if(conv_mode == 0)
+  if (conv_mode == 0)
     mode = CUDNN_CONVOLUTION;
-  else if(conv_mode == 1)
+  else if (conv_mode == 1)
     mode = CUDNN_CROSS_CORRELATION;
 
   mode = CUDNN_CROSS_CORRELATION;
   // FIXIT: Need to be more aware of the implications of alpha and beta
   float alpha = 1.0f, beta = 0.0f;
-  
-  // TODO: Support other cases;  
+
+  // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
   convertToFP32(input);
   convertToFP32(filter);
 
-  
-  DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);  
+  DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+        horizontal_stride);
 
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
-  //FIXME: Current hack to preserve backward compatibilty
-  if(conv_groups == 0){
+  // FIXME: Current hack to preserve backward compatibility
+  if (conv_groups == 0) {
     conv_groups = 1;
-  }  
-  
-  
+  }
+
   cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
-  
-  checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
-					     vertical_pad, horizontal_pad, // conv padding
-					     vertical_stride, horizontal_stride, // conv strides
-					     1, 1, // upscaling values
-					     mode , // mode is configurable
-                                             computeType)); // defines compute precision
+
+  checkCUDNN(cudnnSetConvolution2dDescriptor(
+      convDesc, vertical_pad, horizontal_pad, // conv padding
+      vertical_stride, horizontal_stride,     // conv strides
+      1, 1,                                   // upscaling values
+      mode,                                   // mode is configurable
+      computeType));                          // defines compute precision
 
   // NOTE: Set conv groups for grouped convolution e.g., depthwise convolution
   checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
 
-  int n, c, h, w; // output dimensions  
+  int n, c, h, w; // output dimensions
   // Find dimension of convolution output
 
-  if(input->tensor_desc == NULL || filter->filter_desc == NULL)
+  if (input->tensor_desc == NULL || filter->filter_desc == NULL)
     ERROR("Input or Filter descriptor is NULL");
-    
-  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
-						   input->tensor_desc,
-						   filter->filter_desc,
-						   &n, &c, &h, &w));
 
-    
+  checkCUDNN(cudnnGetConvolution2dForwardOutputDim(
+      convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w));
+
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
-  Tensor* output;
-  if(input->data_format == CUDNN_TENSOR_NCHW)
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type,  
-			              CUDNN_TENSOR_NCHW, n, c, h, w);
-  else if(input->data_format == CUDNN_TENSOR_NHWC){
+  Tensor *output;
+  if (input->data_format == CUDNN_TENSOR_NCHW)
+    output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                      CUDNN_TENSOR_NCHW, n, c, h, w);
+  else if (input->data_format == CUDNN_TENSOR_NHWC) {
     DEBUG("* NHWC Format \n");
-    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-			              CUDNN_TENSOR_NHWC, n, h, w, c);
-  }
-  else
+    output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
+                                      CUDNN_TENSOR_NHWC, n, h, w, c);
+  } else
     ERROR("Unsupported Tensor Type");
 
   // NOTE: Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
-    
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
-	output->data_type, output->data_format, output->dims.dim_sizes[0],
-	output->dims.dim_sizes[1],
-	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
-
-  if(convDesc == NULL || input->tensor_desc == NULL ||
-     filter->filter_desc == NULL || output->tensor_desc == NULL)
-    ERROR("NULL descriptor! \n");
 
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = "
+        "%d, W = %d \n",
+        output->data_type, output->data_format, output->dims.dim_sizes[0],
+        output->dims.dim_sizes[1], output->dims.dim_sizes[2],
+        output->dims.dim_sizes[3]);
+
+  if (convDesc == NULL || input->tensor_desc == NULL ||
+      filter->filter_desc == NULL || output->tensor_desc == NULL)
+    ERROR("NULL descriptor! \n");
 
   // Debugging info prints
   printTensorDescInfo(input);
   printTensorDescInfo(filter);
   printTensorDescInfo(output);
 
-  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
-  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
-						 input->tensor_desc,
-						 filter->filter_desc,
-						 convDesc,
-						 output->tensor_desc,
-						 CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,	 
-						 //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
-						 0,
-						 &convAlgo));
-
-  
+  // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support
+  // is lacking
+  checkCUDNN(cudnnGetConvolutionForwardAlgorithm(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+      // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
+      0, &convAlgo));
+
   DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
-	 CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
-	 CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
-	 
+        CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
+        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
 
   // NOTE: Currently using GEMM based convolution - other algorithms available
-  // TODO: Benchmark other convolution algorithms e.g., winograd 
+  // TODO: Benchmark other convolution algorithms e.g., winograd
   convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
 
   size_t workspace_size;
-  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
-						     input->tensor_desc,
-						     filter->filter_desc,
-						     convDesc,
-						     output->tensor_desc,
-						     convAlgo,
-						     &workspace_size));
+  checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc,
+      output->tensor_desc, convAlgo, &workspace_size));
 
   // Allocating memory for the convolution workspace
-  void* workspace;
-  checkCudaErrors(cudaMalloc(&workspace, workspace_size)); 
+  void *workspace;
+  checkCudaErrors(cudaMalloc(&workspace, workspace_size));
   DEBUG("workspace size = %d \n", workspace_size);
 
+  checkCUDNN(cudnnConvolutionForward(
+      cudnnHandle, &alpha, input->tensor_desc, input->gpu_data,
+      filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace,
+      workspace_size, &beta, output->tensor_desc, output->gpu_data));
 
-  checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
-				     input->gpu_data, filter->filter_desc, filter->gpu_data,
-				     convDesc, convAlgo, workspace, workspace_size,
-				     &beta, output->tensor_desc, output->gpu_data));
-		       
   profileEvent("Conv_end", true);
   return output;
 }
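+// Usage sketch (comment only; not from the original sources). The routine sets
+// up a cuDNN convolution descriptor (cross-correlation, FP32 compute), queries
+// the output shape, forces the IMPLICIT_GEMM forward algorithm, allocates the
+// reported workspace and runs cudnnConvolutionForward. The argument values
+// below are illustrative assumptions:
+//
+//   void *out = tensorConvolution(in, flt, /*vertical_pad=*/1,
+//                                 /*horizontal_pad=*/1, /*vertical_stride=*/1,
+//                                 /*horizontal_stride=*/1, /*conv_mode=*/1,
+//                                 /*conv_groups=*/1);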
 
-
-
 // NOTE: Supports Max and Avg Pooling
-void* tensorPooling(void* input_ptr,
-		    int poolFunction,
-		    int window_height, int window_width,
-		    int vertical_pad, int horizontal_pad,
-		    int vertical_stride, int horizontal_stride){
+void *tensorPooling(void *input_ptr, int poolFunction, int window_height,
+                    int window_width, int vertical_pad, int horizontal_pad,
+                    int vertical_stride, int horizontal_stride) {
 
   INFO("*** TensorPooling \n");
   profileEvent("Pool");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   cudnnPoolingDescriptor_t poolDesc;
   // FIXIT: Need to be more aware of the implications of alpha and beta
@@ -251,65 +226,57 @@ void* tensorPooling(void* input_ptr,
 
   convertToFP32(input);
 
-  
-  checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc));            
+  checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc));
 
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
-  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride;
+  int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) /
+          vertical_stride;
   h = h + 1;
-  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride;
+  int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) /
+          horizontal_stride;
   w = w + 1;
 
-  DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n",
-	n, c, h, w, input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
-  
+  DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n", n, c, h, w,
+        input->dims.dim_sizes[2], input->dims.dim_sizes[3]);
 
-  Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w);
+  Tensor *output =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w);
   // Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
 
   // FIXIT: The output tensor is hardcoded to NCHW
-  checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc,
-					CUDNN_TENSOR_NCHW,
-					CUDNN_DATA_FLOAT,
-					n, c,
-					h, w));
+  checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc, CUDNN_TENSOR_NCHW,
+                                        CUDNN_DATA_FLOAT, n, c, h, w));
 
   // Select between Max-Pooling and Avg-Pooling
   cudnnPoolingMode_t pool_mode;
-  if(poolFunction == 0)
+  if (poolFunction == 0)
     pool_mode = CUDNN_POOLING_MAX;
-  else if(poolFunction == 1)
+  else if (poolFunction == 1)
     pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-  
-  checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
-					 pool_mode,
-					 CUDNN_PROPAGATE_NAN,
-					 window_height, window_width,
-					 vertical_pad, horizontal_pad,
-					 vertical_stride, horizontal_stride));
-     
-  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, input->tensor_desc,
-				 input->gpu_data, &beta, output->tensor_desc, output->gpu_data));
+
+  checkCUDNN(cudnnSetPooling2dDescriptor(
+      poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width,
+      vertical_pad, horizontal_pad, vertical_stride, horizontal_stride));
+
+  checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha,
+                                 input->tensor_desc, input->gpu_data, &beta,
+                                 output->tensor_desc, output->gpu_data));
 
   profileEvent("Pool_end", true);
   return output;
 }
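+// Usage sketch (comment only; not from the original sources): poolFunction 0
+// maps to CUDNN_POOLING_MAX and 1 to
+// CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; the output tensor is always
+// created in NCHW. Window and stride values below are assumptions:
+//
+//   void *pooled = tensorPooling(in, /*poolFunction=*/0, /*window=*/2, 2,
+//                                /*pad=*/0, 0, /*stride=*/2, 2);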
 
-
-
-
-
-/* Reference Implementation based on: https://gist.github.com/peterwittek/6303527 */
-void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){
+/* Reference Implementation based on:
+ * https://gist.github.com/peterwittek/6303527 */
+void *tensorGemmGPU(void *lhs_ptr, void *rhs_ptr) {
 
   INFO("*** TensorGemmGPU \n");
   profileEvent("Mul");
 
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
-
+  Tensor *lhs = (Tensor *)lhs_ptr;
+  Tensor *rhs = (Tensor *)rhs_ptr;
 
   DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims);
   DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims);
@@ -319,30 +286,30 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){
   // 'm' holds the batch dimension - assuming NCHW format Tensors
   int m = lhs->dims.dim_sizes[0];
   // The rhs last dimension must contain the neurons
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
+  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
   int k = 1;
-  
+
   // Flattening the dimensions after the batch dimension
   // NOTE: Allowing any number of dimensions > 2 for lhs
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
+  for (int j = 1; j < lhs->dims.num_dims; j++) {
     k = k * lhs->dims.dim_sizes[j]; // input neurons
   }
 
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
+  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
   // Dimension-note: Check if k is same across the two tensors
   DEBUG("m = %d, n = %d, k = %d \n", m, n, k);
-  if(rhs_k != k){
+  if (rhs_k != k) {
     ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
   }
 
-  Tensor* output = NULL;
+  Tensor *output = NULL;
   DEBUG("Creating new TENSOR * \n");
-  output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
+  output =
+      (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
 
-   
   DEBUG("Changing placement *\n");
   // Changing output tensor placement from host to device
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
 
   DEBUG("Changed Placement * \n\n");
 
@@ -352,122 +319,105 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){
   convertToFP32(lhs);
   convertToFP32(rhs);
 
-  
   DEBUG("CuBlasSgemm *\n");
-   
+
   // INFO: cuBlas uses column-major format
   // INFO: The leading dimension is just the FIRST Dimension
-  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects
-  checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
-			      n, m, k,
-			      &alpha,
-			      (float*) rhs->gpu_data, n,
-			      (float*) lhs->gpu_data, k,
-			      &beta,
-			      (float*) output->gpu_data, n));  
-
-  
+  // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN
+  // expects
+  checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
+                              &alpha, (float *)rhs->gpu_data, n,
+                              (float *)lhs->gpu_data, k, &beta,
+                              (float *)output->gpu_data, n));
+
   profileEvent("Mul_end", true);
   return output;
 }
 
-
-
-
-
-
-void* tensorRelu(void* input_ptr){
+void *tensorRelu(void *input_ptr) {
 
   DEBUG("*** TensorRelu \n");
   profileEvent("Relu");
 
-  Tensor* input = (Tensor*) input_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
 
   hostToDeviceCopy(input);
 
   convertToFP32(input);
-  
-  
+
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
   checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_desc, input->gpu_data, &beta,
-				    input->tensor_desc, input->gpu_data));
+                                    input->tensor_desc, input->gpu_data, &beta,
+                                    input->tensor_desc, input->gpu_data));
 
   profileEvent("Relu_end", true);
   return input;
 }
 
-
 // Think: Should Softmax be broken into multiple IR operations?
-void* tensorSoftmax(void* input_ptr){
+void *tensorSoftmax(void *input_ptr) {
 
   INFO("*** TensorSoftmax \n");
   profileEvent("Softmax");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
   float alpha = 1.0f, beta = 0.0f;
 
   hostToDeviceCopy(input);
-  convertToFP32(input); 
-     
-  checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL,
-				 &alpha, input->tensor_desc, input->gpu_data, &beta,
-				 input->tensor_desc, input->gpu_data));
+  convertToFP32(input);
+
+  checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE,
+                                 CUDNN_SOFTMAX_MODE_CHANNEL, &alpha,
+                                 input->tensor_desc, input->gpu_data, &beta,
+                                 input->tensor_desc, input->gpu_data));
 
-  deviceToHostCopy(input);  
+  deviceToHostCopy(input);
   profileEvent("Softmax_end", true);
-  
+
   return input;
 }
 
-
-
-
-void* tensorRelu2(void* input_ptr, float min, float max){
+void *tensorRelu2(void *input_ptr, float min, float max) {
 
   INFO("*** TensorClippedRelu *** \n");
   profileEvent("Relu");
 
   cudnnActivationDescriptor_t reluDesc;
   float alpha = 1.0f, beta = 0.0f;
-  
-  Tensor* input = (Tensor*) input_ptr;
+
+  Tensor *input = (Tensor *)input_ptr;
 
   hostToDeviceCopy(input);
 
   convertToFP32(input);
-  
 
   checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc));
 
-  checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU,
-					  CUDNN_PROPAGATE_NAN, max));
+  checkCUDNN(cudnnSetActivationDescriptor(
+      reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, max));
 
   checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha,
-				    input->tensor_desc, input->gpu_data, &beta,
-				    input->tensor_desc, input->gpu_data));
+                                    input->tensor_desc, input->gpu_data, &beta,
+                                    input->tensor_desc, input->gpu_data));
 
-  
-  
   profileEvent("Relu_end", true);
   return input;
 }
 
-
-void* tensorTanh(void* input_ptr){
+void *tensorTanh(void *input_ptr) {
 
   INFO("*** TensorTanh \n");
   profileEvent("Tanh");
 
-  Tensor* input = (Tensor*) input_ptr;
-  
+  Tensor *input = (Tensor *)input_ptr;
+
   cudnnActivationDescriptor_t tanhDesc;
   float alpha = 1.0f, beta = 0.0f;
 
@@ -475,39 +425,36 @@ void* tensorTanh(void* input_ptr){
 
   convertToFP32(input);
 
-  
   checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc));
 
   checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH,
-					  CUDNN_PROPAGATE_NAN, 0.0));
+                                          CUDNN_PROPAGATE_NAN, 0.0));
 
   checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha,
-				    input->tensor_desc, input->gpu_data, &beta,
-				    input->tensor_desc, input->gpu_data));
+                                    input->tensor_desc, input->gpu_data, &beta,
+                                    input->tensor_desc, input->gpu_data));
 
   profileEvent("Tanh_end", true);
   return input;
 }
 
-
-
-
-void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
-		      void* mean_ptr, void* variance_ptr, double epsilon){
+void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
+                      void *mean_ptr, void *variance_ptr, double epsilon) {
 
   INFO("*** TensorBatchNorm \n");
   profileEvent("BatchNorm");
 
-  Tensor* input = (Tensor*) input_ptr;
-  Tensor* gamma = (Tensor*) gamma_ptr;
-  Tensor* beta = (Tensor*) beta_ptr;
-  Tensor* mean = (Tensor*) mean_ptr;
-  Tensor* variance = (Tensor*) variance_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  Tensor *gamma = (Tensor *)gamma_ptr;
+  Tensor *beta = (Tensor *)beta_ptr;
+  Tensor *mean = (Tensor *)mean_ptr;
+  Tensor *variance = (Tensor *)variance_ptr;
 
-  if (input == NULL || gamma == NULL || beta == NULL || mean == NULL || variance == NULL){
+  if (input == NULL || gamma == NULL || beta == NULL || mean == NULL ||
+      variance == NULL) {
     ERROR("NULL Input Tensor");
   }
-  
+
   float alpha_val = 1.0f, beta_val = 0.0f;
   hostToDeviceCopy(input);
   hostToDeviceCopy(gamma);
@@ -517,133 +464,127 @@ void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
 
   convertToFP32(input);
 
- 
-  
-  checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL,
-						     &alpha_val, &beta_val,
-						     input->tensor_desc, input->gpu_data,
-						     input->tensor_desc, input->gpu_data,
-						     gamma->tensor_desc, gamma->gpu_data,
-						     beta->gpu_data, mean->gpu_data,
-						     variance->gpu_data,
-						     epsilon));
+  checkCUDNN(cudnnBatchNormalizationForwardInference(
+      cudnnHandle, CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val,
+      input->tensor_desc, input->gpu_data, input->tensor_desc, input->gpu_data,
+      gamma->tensor_desc, gamma->gpu_data, beta->gpu_data, mean->gpu_data,
+      variance->gpu_data, epsilon));
 
   profileEvent("BatchNorm_end", true);
   return input;
 }
 
-
-
-
 // TODO: benchmark performance of tensorSplit
-void** tensorSplit(void* tensor_ptr, int num_splits, int split_dim){
+void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) {
 
-  INFO("*** TensorSplit \n");  
+  INFO("*** TensorSplit \n");
   profileEvent("tensorSplit");
 
-  Tensor* tensor = (Tensor*) tensor_ptr;
-  
+  Tensor *tensor = (Tensor *)tensor_ptr;
+
   deviceToHostCopy(tensor); // Splitting done on the host
 
-  Tensor** splits = (Tensor**) malloc(sizeof(Tensor*) * num_splits);
-  size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensor->dims.num_dims);
-  for(unsigned int i = 0; i < tensor->dims.num_dims; i++){
+  Tensor **splits = (Tensor **)malloc(sizeof(Tensor *) * num_splits);
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * tensor->dims.num_dims);
+  for (unsigned int i = 0; i < tensor->dims.num_dims; i++) {
     dim_sizes[i] = tensor->dims.dim_sizes[i];
   }
 
-  
   dim_sizes[split_dim] = tensor->dims.dim_sizes[split_dim] / num_splits;
-  if(dim_sizes[split_dim] < 1)
+  if (dim_sizes[split_dim] < 1)
     ERROR("Split Dimension < 1 after splitting");
 
   size_t copy_size = getTypeSize(tensor->data_type);
-  for(unsigned int i = split_dim; i < tensor->dims.num_dims; i++){
+  for (unsigned int i = split_dim; i < tensor->dims.num_dims; i++) {
     copy_size = copy_size * dim_sizes[i];
   }
-  
-  for(unsigned int i = 0; i < num_splits; i++){
 
-    DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n",
-	 dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+  for (unsigned int i = 0; i < num_splits; i++) {
+
+    DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, "
+          "dim_sizes[3] = %d \n",
+          dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+
+    Tensor *split = (Tensor *)create4DTensor(
+        tensor->data_type, tensor->data_format, dim_sizes[0], dim_sizes[1],
+        dim_sizes[2], dim_sizes[3]);
 
-    Tensor* split = (Tensor*) create4DTensor(tensor->data_type, tensor->data_format,
-					  dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
-    
     size_t copy_start = i * copy_size;
     size_t copy_stride = num_splits * copy_size;
-    DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, tensor->size_in_bytes = %d \n",
-	 copy_size, copy_start, copy_stride, tensor->size_in_bytes);
+    DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, "
+          "tensor->size_in_bytes = %d \n",
+          copy_size, copy_start, copy_stride, tensor->size_in_bytes);
 
     int index = 0;
-    while(copy_start + copy_size <= tensor->size_in_bytes){
-      memcpy(((char*) split->host_data + (index * copy_size)),
-	     ((char*)tensor->host_data + copy_start),
-	     copy_size);
+    while (copy_start + copy_size <= tensor->size_in_bytes) {
+      memcpy(((char *)split->host_data + (index * copy_size)),
+             ((char *)tensor->host_data + copy_start), copy_size);
       copy_start += copy_stride;
       index++;
     }
-   	
-    splits[i] = split;     
+
+    splits[i] = split;
   }
 
   profileEvent("tensorSplit_end", true);
 
-  return (void**) splits;
+  return (void **)splits;
 }
 
+void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) {
 
-void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){
-
-  INFO("*** TensorConcat \n");  
+  INFO("*** TensorConcat \n");
   profileEvent("tensorConcat");
 
-  Tensor** tensors = (Tensor**) tensors_ptr;
+  Tensor **tensors = (Tensor **)tensors_ptr;
 
-  for(int i = 0; i < num_splits; i++){
+  for (int i = 0; i < num_splits; i++) {
     deviceToHostCopy(tensors[i]); // Concatenation done on the host
   }
-  
+
  // The concatenated tensor has the same number of dimensions as the inputs
-  size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensors[0]->dims.num_dims);
-  for(unsigned int i = 0; i < tensors[0]->dims.num_dims; i++){
+  size_t *dim_sizes =
+      (size_t *)malloc(sizeof(size_t) * tensors[0]->dims.num_dims);
+  for (unsigned int i = 0; i < tensors[0]->dims.num_dims; i++) {
     dim_sizes[i] = tensors[0]->dims.dim_sizes[i];
   }
-  
+
   size_t copy_size = getTypeSize(tensors[0]->data_type);
-  for(unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++){
+  for (unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++) {
     copy_size = copy_size * dim_sizes[i];
   }
 
   dim_sizes[split_dim] = dim_sizes[split_dim] * num_splits;
-  if(dim_sizes[split_dim] < 1)
+  if (dim_sizes[split_dim] < 1)
     ERROR("Split Dimension < 1 after concat");
 
-  Tensor* output = (Tensor*) create4DTensor(tensors[0]->data_type, tensors[0]->data_format,
-					 dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
-
-  DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n",
-       dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
+  Tensor *output = (Tensor *)create4DTensor(
+      tensors[0]->data_type, tensors[0]->data_format, dim_sizes[0],
+      dim_sizes[1], dim_sizes[2], dim_sizes[3]);
 
+  DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] "
+        "= %d \n",
+        dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]);
 
   int num_copies = 1;
-  for(unsigned int i = 0; i < split_dim; i++){
+  for (unsigned int i = 0; i < split_dim; i++) {
     num_copies = num_copies * dim_sizes[i];
   }
-  
+
   size_t copy_stride = num_splits * copy_size;
-  DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, output->size_in_bytes = %d \n",
-       copy_size, num_copies, copy_stride, output->size_in_bytes);
+  DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, "
+        "output->size_in_bytes = %d \n",
+        copy_size, num_copies, copy_stride, output->size_in_bytes);
 
-  for(unsigned int i = 0; i < num_copies; i++){
+  for (unsigned int i = 0; i < num_copies; i++) {
     // FIXIT: Don't be specific to 4D tensors
     size_t copy_start = i * copy_stride;
-   
-    for(int j = 0; j < num_splits; j++){
-      struct Tensor* split = tensors[j];
-      memcpy(((char*) output->host_data + copy_start + (j * copy_size)),
-	     ((char*) split->host_data + (i * copy_size)),
-	     copy_size);   
-    }      
+
+    for (int j = 0; j < num_splits; j++) {
+      struct Tensor *split = tensors[j];
+      memcpy(((char *)output->host_data + copy_start + (j * copy_size)),
+             ((char *)split->host_data + (i * copy_size)), copy_size);
+    }
   }
 
   profileEvent("tensorConcat_end", true);
@@ -651,15 +592,13 @@ void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){
   return output;
 }
 
+void *tensorLRN(void *input_ptr, unsigned int LRN_window, double LRN_alpha,
+                double LRN_beta, double LRN_k) {
 
-
-void* tensorLRN(void* input_ptr, unsigned int LRN_window,
-		double LRN_alpha, double LRN_beta, double LRN_k){
-
-  INFO("*** TensorLRN \n");  
+  INFO("*** TensorLRN \n");
   profileEvent("tensorLRN");
 
-  Tensor* input = (Tensor*) input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
 
   hostToDeviceCopy(input);
 
@@ -667,29 +606,28 @@ void* tensorLRN(void* input_ptr, unsigned int LRN_window,
   cudnnLRNDescriptor_t LRNDesc;
   checkCUDNN(cudnnCreateLRNDescriptor(&LRNDesc));
 
-  DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n",
-       LRN_window, LRN_alpha, LRN_beta, LRN_k);
- 
-  
-  checkCUDNN(cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k));
+  DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n", LRN_window,
+        LRN_alpha, LRN_beta, LRN_k);
+
+  checkCUDNN(
+      cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k));
 
-  size_t* dim_sizes = input->dims.dim_sizes;
-  Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
-			  CUDNN_TENSOR_NCHW, dim_sizes[0], dim_sizes[1],
-			  dim_sizes[2], dim_sizes[3]);
+  size_t *dim_sizes = input->dims.dim_sizes;
+  Tensor *output = (Tensor *)create4DTensor(
+      (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, dim_sizes[0],
+      dim_sizes[1], dim_sizes[2], dim_sizes[3]);
 
-  changeTensorPlacement(output, DEVICE); 
+  changeTensorPlacement(output, DEVICE);
 
   printTensorDescInfo(input);
   printTensorDescInfo(output);
-  
-  checkCUDNN(cudnnLRNCrossChannelForward(cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1,
-					 &alpha, input->tensor_desc, input->gpu_data,
-					 &beta, output->tensor_desc, output->gpu_data));
+
+  checkCUDNN(cudnnLRNCrossChannelForward(
+      cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha,
+      input->tensor_desc, input->gpu_data, &beta, output->tensor_desc,
+      output->gpu_data));
 
   profileEvent("tensorLRN_end", true);
-    
+
   return output;
 }
-
-
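
An illustrative, standalone sketch of the column-major trick that tensorGemmGPU above relies on: cuBLAS expects column-major operands, so for row-major A (m x k), B (k x n) and C (m x n) the buffers are reinterpreted as A^T, B^T and C^T, and since C^T = B^T * A^T the call simply swaps the operands and passes (n, m, k) with leading dimensions n, k, n -- the same cublasSgemm invocation made in the runtime. The 2x3 / 3x2 values below and the omission of error checking are arbitrary choices for this sketch, not part of the patch.

#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
  const int m = 2, k = 3, n = 2;
  // Row-major host matrices: A is m x k, B is k x n, C is m x n.
  float A[m * k] = {1, 2, 3, 4, 5, 6};
  float B[k * n] = {1, 0, 0, 1, 1, 1};
  float C[m * n] = {0};

  float *dA, *dB, *dC;
  cudaMalloc(&dA, sizeof(A));
  cudaMalloc(&dB, sizeof(B));
  cudaMalloc(&dC, sizeof(C));
  cudaMemcpy(dA, A, sizeof(A), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, B, sizeof(B), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);
  const float alpha = 1.0f, beta = 0.0f;
  // Row-major C = A * B, expressed as column-major C^T = B^T * A^T.
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, dB, n, dA, k,
              &beta, dC, n);

  cudaMemcpy(C, dC, sizeof(C), cudaMemcpyDeviceToHost);
  for (int i = 0; i < m; i++)
    std::printf("%6.1f %6.1f\n", C[i * n], C[i * n + 1]); // 4 5 / 10 11

  cublasDestroy(handle);
  cudaFree(dA);
  cudaFree(dB);
  cudaFree(dC);
  return 0;
}
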
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index 079a989829..f6bfe700b4 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -1,13 +1,12 @@
 //===--------------------------- tensor_utils.cu --------------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
+//
 //  This file  consists of the custom implementation of utility functions
 // useful for approximated and non-approximated versions of tensor operations.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -42,18 +41,15 @@
 #include "global_data.h"
 #include "fp16_gemm.h"
 
+extern "C" {
 
-
-extern "C"{
-
-
-void freeTensor(void* tensor_ptr){
-  Tensor* tensor = (Tensor*) tensor_ptr;
+void freeTensor(void *tensor_ptr) {
+  Tensor *tensor = (Tensor *)tensor_ptr;
 
   tensors_ptr.erase(tensor->gpu_data);
   tensors_ptr.erase(tensor->gpu_half_data);
   host_ptr.erase(tensor->host_data);
-  
+
   cudaFree(tensor->gpu_data);
   tensor->gpu_data = nullptr;
   cudaFree(tensor->gpu_half_data);
@@ -62,43 +58,42 @@ void freeTensor(void* tensor_ptr){
   tensor->host_data = nullptr;
 }
 
-
 // Returns the size of the target datatype
-int getTypeSize(int data_type){
+int getTypeSize(int data_type) {
   // TODO: Add support for more data types
   switch (data_type) {
-    case float_type:
-      return 4;
-    case double_type:
-      return 8;
-    case half_type:
-      return 2;
-    case int_type:
-      return 1;
-    case float2_type:
-      return 8;
-    case half2_type:
-      return 4;
-    default:
-      ERROR("Unknown type %d\n", data_type);
+  case float_type:
+    return 4;
+  case double_type:
+    return 8;
+  case half_type:
+    return 2;
+  case int_type:
+    return 1;
+  case float2_type:
+    return 8;
+  case half2_type:
+    return 4;
+  default:
+    ERROR("Unknown type %d\n", data_type);
   }
   return 0;
 }
 
-static int getFullPrecTypeSize(int data_type){
+static int getFullPrecTypeSize(int data_type) {
   switch (data_type) {
-    case float_type:
-    case half_type:
-      return 4;
-    case double_type:
-      return 8;
-    case int_type:
-      return 1;
-    case float2_type:
-    case half2_type:
-      return 8;
-    default:
-      ERROR("Unknown type %d\n", data_type);
+  case float_type:
+  case half_type:
+    return 4;
+  case double_type:
+    return 8;
+  case int_type:
+    return 1;
+  case float2_type:
+  case half2_type:
+    return 8;
+  default:
+    ERROR("Unknown type %d\n", data_type);
   }
   return 0;
 }
@@ -107,7 +102,7 @@ static bool isFP16Compound(int data_type) {
   return data_type == half_type || data_type == half2_type;
 }
 
-void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){
+void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) {
   int type_size = getTypeSize(data_type);
   size_t size_in_bytes = type_size * num_elems;
   tensor->size_in_bytes = size_in_bytes;
@@ -115,18 +110,20 @@ void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){
   DEBUG("***--- size_in_bytes = %d \n", size_in_bytes);
 }
 
-
 // NOTE: Always allocates FP32 on Host, FP32/FP16 for Device (GPU)
-void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
+void allocateMem(struct Tensor *tensor, int data_type, size_t num_elems) {
   setSizeInBytes(tensor, data_type, num_elems);
   tensor->data_type = data_type;
-  tensor->cur_type = data_type; // type maintained for hanlding FP32 <-> FP16 conversions
+  tensor->cur_type =
+      data_type; // type maintained for handling FP32 <-> FP16 conversions
   tensor->num_elems = num_elems;
-  
-  size_t size_on_host = num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32
-  tensor->host_data = (void*) malloc(size_on_host); // Allocate memory on the host
-  tensor->data_placement = HOST; // By defaut data is on the host
-  
+
+  size_t size_on_host =
+      num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32
+  tensor->host_data =
+      (void *)malloc(size_on_host); // Allocate memory on the host
+  tensor->data_placement = HOST;    // By default data is on the host
+
   DEBUG("Attempting to Allocate = %lu \n\n\n", tensor->size_in_bytes);
 
   if (isFP16Compound(data_type)) {
@@ -142,23 +139,25 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
   }
 
   tracked_tensors[tensor] = 1; // For FP16-FP32 data handling
-  
+
   host_ptr.insert(tensor->host_data);
   obj_ptr.insert(tensor);
-  //host_ptr.push_back(tensor->host_data); 
+  // host_ptr.push_back(tensor->host_data);
 }
 
 /// Two tensor formats are supported: NCHW and NHWC.
 /// TODO: Make this more general in the future.
 ///
-void setCudnnDataFormat(struct Tensor* tensor, int data_format){
+void setCudnnDataFormat(struct Tensor *tensor, int data_format) {
 
-  switch(data_format){
+  switch (data_format) {
   case 0:
-    data_format = CUDNN_TENSOR_NCHW; break;
+    data_format = CUDNN_TENSOR_NCHW;
+    break;
   case 1:
-    data_format = CUDNN_TENSOR_NHWC; break;
-  
+    data_format = CUDNN_TENSOR_NHWC;
+    break;
+
   default:
     break;
   }
@@ -167,39 +166,31 @@ void setCudnnDataFormat(struct Tensor* tensor, int data_format){
   DEBUG("tensor->data_format = %d \n", tensor->data_format);
 }
 
-
-void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size,
-			   size_t dim2_size, size_t dim3_size, size_t dim4_size){
+void set4DFilterDescriptor(struct Tensor *tensor, int data_format,
+                           size_t dim1_size, size_t dim2_size, size_t dim3_size,
+                           size_t dim4_size) {
 
   setCudnnDataFormat(tensor, data_format);
-  
+
   checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_desc));
 
   checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_half_desc));
 
-  
-  checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_desc,
-					(cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type,
-					(cudnnTensorFormat_t) tensor->data_format,
-					dim1_size,
-					dim2_size, 
-					dim3_size,
-					dim4_size));
-
-  checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_half_desc,
-					(cudnnDataType_t) CUDNN_DATA_HALF,
-					(cudnnTensorFormat_t) tensor->data_format,
-					dim1_size,
-					dim2_size, 
-					dim3_size,
-					dim4_size));  
+  checkCUDNN(cudnnSetFilter4dDescriptor(
+      tensor->filter_desc,
+      (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type,
+      (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size,
+      dim4_size));
 
+  checkCUDNN(cudnnSetFilter4dDescriptor(
+      tensor->filter_half_desc, (cudnnDataType_t)CUDNN_DATA_HALF,
+      (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size,
+      dim4_size));
 }
 
-
-
-void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size,
-			   size_t dim2_size, size_t dim3_size, size_t dim4_size){
+void set4DTensorDescriptor(struct Tensor *tensor, int data_format,
+                           size_t dim1_size, size_t dim2_size, size_t dim3_size,
+                           size_t dim4_size) {
 
   setCudnnDataFormat(tensor, data_format);
 
@@ -207,292 +198,270 @@ void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s
 
   checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_half_desc));
 
-  // For certain operations, the strides may need to change - in which case the descriptor
-  // needs to be reinitialized
-  cudnnSetTensor4dDescriptor(tensor->tensor_desc,
-			     (cudnnTensorFormat_t) tensor->data_format, // Data format
-			     (cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type, // Data type
-			     dim1_size, dim2_size, 
-			     dim3_size, dim4_size);
-
+  // For certain operations, the strides may need to change - in which case the
+  // descriptor needs to be reinitialized
+  cudnnSetTensor4dDescriptor(
+      tensor->tensor_desc,
+      (cudnnTensorFormat_t)tensor->data_format, // Data format
+      (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type, // Data type
+      dim1_size, dim2_size, dim3_size, dim4_size);
 
-  cudnnSetTensor4dDescriptor(tensor->tensor_half_desc,
-			     (cudnnTensorFormat_t) tensor->data_format, // Data format
-			     (cudnnDataType_t) CUDNN_DATA_HALF, // Data type
-			     dim1_size, dim2_size, 
-			     dim3_size, dim4_size);
+  cudnnSetTensor4dDescriptor(
+      tensor->tensor_half_desc,
+      (cudnnTensorFormat_t)tensor->data_format, // Data format
+      (cudnnDataType_t)CUDNN_DATA_HALF,         // Data type
+      dim1_size, dim2_size, dim3_size, dim4_size);
 
-  
   cudnnDataType_t dType;
   int nStride, cStride, hStride, wStride;
   int size1, size2, size3, size4;
-  cudnnGetTensor4dDescriptor(tensor->tensor_desc,
-  			     &dType,
-  			     &size1, &size2, &size3, &size4,
-  			     &nStride, &cStride, &hStride, &wStride);
-			   
-  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n",
-  	 nStride, cStride, hStride, wStride);
-}
+  cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2,
+                             &size3, &size4, &nStride, &cStride, &hStride,
+                             &wStride);
 
+  DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride,
+        cStride, hStride, wStride);
+}
 
 // FIXIT: Striding still not working - hence 2D and 3D tensor support is missing
-void setTensorDescriptor(struct Tensor* tensor, int num_dims,
-			 size_t* dim_sizes){
+void setTensorDescriptor(struct Tensor *tensor, int num_dims,
+                         size_t *dim_sizes) {
 
   checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_desc));
 
-  int* strides = (int*) malloc(sizeof(int) * num_dims);
+  int *strides = (int *)malloc(sizeof(int) * num_dims);
   strides[num_dims - 1] = 1;
-  for(int i = num_dims - 2; i >= 0; i--){
-    strides[i] = strides[i+1] * dim_sizes[i+1];
+  for (int i = num_dims - 2; i >= 0; i--) {
+    strides[i] = strides[i + 1] * dim_sizes[i + 1];
   }
 
-  for(int i = 0; i < num_dims; i++){
+  for (int i = 0; i < num_dims; i++) {
     DEBUG("strides[%d] = %d \n", i, strides[i]);
   }
 
-  int* const_dims = (int*) malloc(sizeof(int) * num_dims);
-  for(int j = 0 ; j < num_dims; j++){
-    const_dims[j] = (int) dim_sizes[j];
+  int *const_dims = (int *)malloc(sizeof(int) * num_dims);
+  for (int j = 0; j < num_dims; j++) {
+    const_dims[j] = (int)dim_sizes[j];
     DEBUG("const_dim = %d \n", const_dims[j]);
   }
-  
-  DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type, CUDNN_DATA_FLOAT); 
-  // For certain operations, the strides may need to change - in which case the descriptor
-  // needs to be reinitialized
-  checkCUDNN(cudnnSetTensorNdDescriptor(tensor->tensor_desc,
-					(cudnnDataType_t) tensor->data_type, // Data type
-					num_dims,
-					(const int*) const_dims,
-					(const int*) strides));
+
+  DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type,
+        CUDNN_DATA_FLOAT);
+  // For certain operations, the strides may need to change - in which case the
+  // descriptor needs to be reinitialized
+  checkCUDNN(cudnnSetTensorNdDescriptor(
+      tensor->tensor_desc,
+      (cudnnDataType_t)tensor->data_type, // Data type
+      num_dims, (const int *)const_dims, (const int *)strides));
 }
 
+/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors.
 
-/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors. 
+void *create2DTensor(int data_type, size_t dim1_size, size_t dim2_size) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size;
+  allocateMem(tensor, data_type, num_elems);
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 2);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 2;
 
+  return tensor;
+}
 
-  void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size){
-    struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size;
-    allocateMem(tensor, data_type, num_elems);
-    // Setting the tensor dimensions  
-    size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 2);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 2;
-  
-    return tensor;
-  }
+void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
+                     size_t dim3_size) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size;
+  allocateMem(tensor, data_type, num_elems);
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 3);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 3;
+
+  return tensor;
+}
 
+void *create4DTensor(int data_type, int data_format, size_t dim1_size,
+                     size_t dim2_size, size_t dim3_size, size_t dim4_size) {
+  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  allocateMem(tensor, data_type, num_elems);
+  // Setting the tensor dimensions
+  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
+  dim_sizes[0] = dim1_size;
+  dim_sizes[1] = dim2_size;
+  dim_sizes[2] = dim3_size;
+  dim_sizes[3] = dim4_size;
+  tensor->dims.dim_sizes = dim_sizes;
+  tensor->dims.num_dims = 4;
+  // Done setting tensor dimensions
+  // setTensorDescriptor(tensor, 4, dim_sizes);
+  set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
+                        dim4_size);
+  // FIXIT: filter descriptor should be invoked only for filters
+  set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
+                        dim4_size);
+
+  return tensor;
+}
 
-  void* create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size){
-    struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size * dim3_size;
-    allocateMem(tensor, data_type, num_elems);
-    // Setting the tensor dimensions  
-    size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 3);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    dim_sizes[2] = dim3_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 3;
-
-    return tensor;
-  }
+void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
+  Tensor *tensor = (Tensor *)tensor_ptr;
 
-  void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
-		       size_t dim3_size, size_t dim4_size){
-    struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor));
-    size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-    allocateMem(tensor, data_type, num_elems);
-    // Setting the tensor dimensions  
-    size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 4);
-    dim_sizes[0] = dim1_size;
-    dim_sizes[1] = dim2_size;
-    dim_sizes[2] = dim3_size;
-    dim_sizes[3] = dim4_size;
-    tensor->dims.dim_sizes = dim_sizes;
-    tensor->dims.num_dims = 4;
-    // Done setting tensor dimensions  
-    //setTensorDescriptor(tensor, 4, dim_sizes);
-    set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size);
-    // FIXIT: filter descriptor should be invoked only for filters
-    set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size);
-  
-    return tensor;
+  size_t host_size_in_bytes = tensor->num_elems * 4;
+  // if(tensor->size_in_bytes != size_in_bytes){
+  if (host_size_in_bytes != size_in_bytes) {
+    ERROR("The destination and source sizes don't match");
   }
 
+  std::memcpy(tensor->host_data, data_ptr, size_in_bytes);
 
-  void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){
+  changeTensorPlacement(tensor, HOST);
+
+  tensor->cur_type = float_type;
+}
 
-    Tensor* tensor = (Tensor*) tensor_ptr;
+void hostToDeviceCopy(struct Tensor *tensor) {
 
-    size_t host_size_in_bytes = tensor->num_elems * 4;
-    //if(tensor->size_in_bytes != size_in_bytes){
-    if(host_size_in_bytes != size_in_bytes){
-      ERROR("The destination and source sizes don't match");
-    }
-  
-    std::memcpy(tensor->host_data, data_ptr, size_in_bytes);
+  if (tensor->data_placement != DEVICE) {
+    cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
+               cudaMemcpyHostToDevice);
+    DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
+    tensor->data_placement = DEVICE;
+  } else {
+    DEBUG("No data movement required - Data on Device \n");
+  }
+}
 
-    changeTensorPlacement(tensor, HOST);
+void deviceToHostCopy(struct Tensor *tensor) {
 
-    tensor->cur_type = float_type;
+  if (tensor->data_placement != HOST) {
+    cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
+               cudaMemcpyDeviceToHost);
+    DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
+    tensor->data_placement = HOST;
+  } else {
+    DEBUG("No data movement required - Data on Host \n");
   }
+}
 
-		      
+// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
 
-  void hostToDeviceCopy(struct Tensor* tensor){
+void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
 
-    if(tensor->data_placement != DEVICE){
-      cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
-		 cudaMemcpyHostToDevice);
-      DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
-      tensor->data_placement = DEVICE;
-    }
-    else{
-      DEBUG("No data movement required - Data on Device \n");    
-    }
-  
-  }
+  struct Tensor *srcTensor = (struct Tensor *)srcTensor_ptr;
+  struct Tensor *dstTensor = (struct Tensor *)dstTensor_ptr;
 
+  if (srcTensor->data_placement == HOST) {
+    memcpy(dstTensor->host_data, srcTensor->host_data,
+           srcTensor->size_in_bytes);
+    DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
+    dstTensor->data_placement = HOST;
+  } else if (srcTensor->data_placement == DEVICE) {
+    cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data,
+               srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice);
+    DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
+    dstTensor->data_placement = DEVICE;
+  }
+}
 
-  void deviceToHostCopy(struct Tensor* tensor){
+void hpvm_request_tensor(void *tensor_ptr, int destination) {
 
-    if(tensor->data_placement != HOST){
+  Tensor *tensor = (Tensor *)tensor_ptr;
+  // If destination is the host
+  if (destination == 0) {
+    if (tensor->data_placement != HOST) {
       cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
-		 cudaMemcpyDeviceToHost);  
+                 cudaMemcpyDeviceToHost);
       DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
       tensor->data_placement = HOST;
+    } else {
+      DEBUG("No data movement required - Data on Host \n");
     }
-    else{
-      DEBUG("No data movement required - Data on Host \n");    
-    }
-    
-  }
-
-
-  //void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
-
-  void tensorCopy(void* srcTensor_ptr, void* dstTensor_ptr){
-
-    struct Tensor* srcTensor = (struct Tensor*) srcTensor_ptr;
-    struct Tensor* dstTensor = (struct Tensor*) dstTensor_ptr;
-
-    
-    if(srcTensor->data_placement == HOST){
-      memcpy(dstTensor->host_data, srcTensor->host_data, srcTensor->size_in_bytes);  
-      DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
-      dstTensor->data_placement = HOST;
-    }
-    else if (srcTensor->data_placement == DEVICE){
-      cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data, srcTensor->size_in_bytes,
-		 cudaMemcpyDeviceToDevice);
-      DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
-      dstTensor->data_placement = DEVICE;
-    }
-    
   }
+  // If destination is the GPU
+  else if (destination == 1) {
 
-
-  void hpvm_request_tensor(void* tensor_ptr, int destination){
-
-    Tensor* tensor = (Tensor*) tensor_ptr;
-    // If destination is the host
-    if(destination == 0){  
-      if(tensor->data_placement != HOST){
-	cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
-		   cudaMemcpyDeviceToHost);  
-	DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
-	tensor->data_placement = HOST;
-      }
-      else{
-	DEBUG("No data movement required - Data on Host \n");    
-      }
-    }
-    // If destination is the GPU
-    else if(destination == 1){
-
-      if(tensor->data_placement != DEVICE){
-	cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
-		   cudaMemcpyHostToDevice);
-	DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
-	tensor->data_placement = DEVICE;
-      }
-      else{
-	DEBUG("No data movement required - Data on Device \n");    
-      }    
+    if (tensor->data_placement != DEVICE) {
+      cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
+                 cudaMemcpyHostToDevice);
+      DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
+      tensor->data_placement = DEVICE;
+    } else {
+      DEBUG("No data movement required - Data on Device \n");
     }
-  
   }
+}
 
+void convertToFP16(struct Tensor *tensor) {
 
-
- void convertToFP16(struct Tensor* tensor){
-
-  if(tensor == NULL)
+  if (tensor == NULL)
     return;
-  
+
   if (tensor->cur_type == half_type)
     return;
-    
+
   DEBUG("ConvertoFP16 \n");
 
   setSizeInBytes(tensor, half_type, tensor->num_elems);
   size_t size_in_bytes = tensor->size_in_bytes;
   DEBUG("size_in_bytes = %d \n", size_in_bytes);
-  
-  if(tensor->gpu_half_data == NULL)
-     checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU
-  // If Tensor is one of Tracked (has to free per batch) then track all data types
-  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+
+  if (tensor->gpu_half_data == NULL)
+    checkCudaErrors(cudaMalloc(&tensor->gpu_half_data,
+                               size_in_bytes)); // Allocate memory on GPU
+  // If Tensor is one of Tracked (has to free per batch) then track all data
+  // types
+  if (tracked_tensors.find(tensor) != tracked_tensors.end())
     tensors_ptr.insert(tensor->gpu_half_data);
 
-  f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data);
+  f2h((float *)tensor->gpu_data, tensor->num_elems,
+      (half *)tensor->gpu_half_data);
 
-  tensor->cur_type = half_type;  
+  tensor->cur_type = half_type;
 }
 
+void convertToFP32(struct Tensor *tensor) {
 
-
-void convertToFP32(struct Tensor* tensor){
-
-  if(tensor == NULL)
+  if (tensor == NULL)
     return;
-  
+
   // Need this check for both offline and online profiling path
   if (tensor->cur_type == float_type)
     return;
-    
+
   DEBUG("ConvertoFP32 \n");
-  
+
   setSizeInBytes(tensor, float_type, tensor->num_elems);
   size_t size_in_bytes = tensor->size_in_bytes;
-  
+
   // If FP32 data array doesn't exist, allocate
-  if(tensor->gpu_data == NULL){
-    checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
+  if (tensor->gpu_data == NULL) {
+    checkCudaErrors(
+        cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
     DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes);
   }
-  // If Tensor is one of Tracked (has to free per batch) then track all data types
-  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+  // If Tensor is one of Tracked (has to free per batch) then track all data
+  // types
+  if (tracked_tensors.find(tensor) != tracked_tensors.end())
     tensors_ptr.insert(tensor->gpu_data);
 
-  h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);
+  h2f((half *)tensor->gpu_half_data, tensor->num_elems,
+      (float *)tensor->gpu_data);
 
   tensor->cur_type = float_type;
-
 }
 
+void convertToFP32_offline(struct Tensor *tensor) {
 
-
-void convertToFP32_offline(struct Tensor* tensor){
-
-  if(tensor == NULL)
+  if (tensor == NULL)
     return;
 
   if (tensor->cur_type == half_type)
@@ -504,36 +473,36 @@ void convertToFP32_offline(struct Tensor* tensor){
   size_t size_in_bytes = tensor->size_in_bytes;
 
   // If FP32 data array doesn't exist, allocate
-  if(tensor->gpu_data == NULL){
-    checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
+  if (tensor->gpu_data == NULL) {
+    checkCudaErrors(
+        cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU
     DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes);
   }
 
-  // If Tensor is one of Tracked (has to free per batch) then track all data types
-  if(tracked_tensors.find(tensor) != tracked_tensors.end())
+  // If Tensor is one of Tracked (has to free per batch) then track all data
+  // types
+  if (tracked_tensors.find(tensor) != tracked_tensors.end())
     tensors_ptr.insert(tensor->gpu_data);
 
-  h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data);
+  h2f((half *)tensor->gpu_half_data, tensor->num_elems,
+      (float *)tensor->gpu_data);
 
   tensor->cur_type = float_type;
-  
+
   cudaFree(tensor->gpu_half_data);
   tensors_ptr.erase(tensor->gpu_half_data);
   tensor->gpu_half_data = NULL;
 }
 
-
-
-
-
 // Called from within the runtime to change the data placement
-// This routine is required to change the output data placements from host to device
-void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement){
+// This routine is required to change the output data placements from host to
+// device
+void changeTensorPlacement(struct Tensor *tensor,
+                           data_location_t data_placement) {
 
-  if(tensor == NULL)
+  if (tensor == NULL)
     ERROR("Tensor == NULL");
   tensor->data_placement = data_placement;
 }
 
-
 } // end of Extern"C"
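
A minimal host-only sketch of the stride computation that setTensorDescriptor above performs before cudnnSetTensorNdDescriptor: the innermost dimension gets stride 1, and each outer stride is the product of all inner dimension sizes. The 2x3x4x5 shape is an arbitrary example for this sketch, not taken from the patch.

#include <cstdio>
#include <vector>

static std::vector<int> rowMajorStrides(const std::vector<int> &dims) {
  std::vector<int> strides(dims.size());
  strides.back() = 1;
  for (int i = (int)dims.size() - 2; i >= 0; --i)
    strides[i] = strides[i + 1] * dims[i + 1];
  return strides;
}

int main() {
  // NCHW-style shape 2 x 3 x 4 x 5.
  for (int s : rowMajorStrides({2, 3, 4, 5}))
    std::printf("%d ", s); // prints: 60 20 5 1
  std::printf("\n");
  return 0;
}
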
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
index 5cdfdf5a55..8c77234e24 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
@@ -1,13 +1,13 @@
 //===--------------------------- wrapper_runtime.cu -----------------------===//
 //
 //===----------------------------------------------------------------------===//
-//   
-// This file contains the implementation of some of the core API to tensor runtime
-// so that runtime tuning of approximations can be done on different targets.
+//
+// This file contains the implementation of some of the core API to tensor
+// runtime so that runtime tuning of approximations can be done on different
+// targets.
 //
 //===----------------------------------------------------------------------===//
 
-
 #include <stdio.h>
 #include <cstdio>
 #include <cstdlib>
@@ -24,7 +24,6 @@
 #include <cuda_fp16.h>
 #include <driver_types.h>
 
-
 // Tensor runtime header files
 #include "tensor_utils.h"
 #include "debug.h"
@@ -37,641 +36,580 @@
 #include "half_precision_api.h"
 
 #include "hpvm-rt-controller.h"
-#include "approxhpvm_runtime_utils.h" 
+#include "approxhpvm_runtime_utils.h"
 #include "approx_api.h"
 
-
-extern "C"{
-
-  /**** Wrapper Runtime API ***/
-
-
-  void* wrapper_ConvLayer(const char* hpvm_node_id,
-			  void* input, 
-			  void* filter, 
-			  void* bias, 
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id, int pool_size,
-			  int activation_id,
-			  // NOTE: out_min, out_max are only relevant for ClippedRelu
-			  float out_min, float out_max){
-
-    NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-
-    if (NodeConf->isGPUNodeConfiguration()) {
-	DEBUG("GPU Configuration for ConvLayer\n");
-	// Mapped to GPU - get a GPU node configuration
-	GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
-
-	std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-				std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						       int> > > > &ApproxChoices =
-	  GPUConf->getApproxChoices();
-
-	// Check for convolution as first operation
-	CUSTOM_ASSERT((ApproxChoices.size() >= 1) &&
-		      (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
-		      "Incorrect number/type of operations in provided Conv layer configuration");
-
-	void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second,
-							     input, filter, conv_pad_h, conv_pad_w,
-							     conv_stride_h, conv_stride_w);
-	void* add_out;
-	if (bias != NULL) {
-	  // Check for add as second operation
-	  CUSTOM_ASSERT((ApproxChoices.size() >= 2) &&
-			(ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
-			"Incorrect number/type of operations in provided Conv layer configuration");
-	  add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
-						       conv_out, bias);
-	} else {
-	  add_out = conv_out;
-	}
-
-	void* activation_out;
-	switch (activation_id) {
-	case -1:
-	  { // No activation
-	    //INFO("No activation Function\n");
-	    activation_out = add_out;
-	  }
-	  break;
-	case 0:
-	  { // TanH activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 1:
-	  { // ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 2:
-	  { // Clipped ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out =
-	      handleTensorClippedReluApproximationTuples(ApproxChoices[2].second,
-							 add_out, out_min, out_max);
-	  }
-	  break;
-	default:
-	  {
-	    ERROR("Activation id %d NOT supported \n", activation_id);
-	  }
-	  break;
-	}
-
-	void* pool_out;
-
-	if (pool_size > 0) {
-	  switch (pool_id) {
-	  case 0:
-	    {
-	      // If we remove the asserts, we can have all cases handled by a single call
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
-			    "Expected POOL_MAX in provided Conv layer configuration");
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size, pool_size, 0, 0,
-						       pool_size, pool_size);
-	    }
-	    break;
-	  case 1:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
-			    "Expected POOL_MEAN in provided Conv layer configuration");
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size, pool_size, 0, 0,
-						       pool_size, pool_size);
-	    }
-	    break;
-	  case 2:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
-			    "Expected POOL_MIN in provided Conv layer configuration");
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size, pool_size, 0, 0,
-						       pool_size, pool_size);
-	    }
-	    break;
-	  default:
-	    {
-	      ERROR("Pool id %d NOT supported \n", pool_id);
-	    }
-	    break;
-	  }
-	} else {
-	  pool_out = activation_out;
-	}
-	return pool_out;
+extern "C" {
+
+/**** Wrapper Runtime API ***/
+
+void *
+wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
+                  void *bias, int conv_pad_h, int conv_pad_w, int conv_stride_h,
+                  int conv_stride_w, int pool_id, int pool_size,
+                  int activation_id,
+                  // NOTE: out_min, out_max are only relevant for ClippedRelu
+                  float out_min, float out_max) {
+
+  NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
+
+  if (NodeConf->isGPUNodeConfiguration()) {
+    DEBUG("GPU Configuration for ConvLayer\n");
+    // Mapped to GPU - get a GPU node configuration
+    GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
+
+    std::vector<
+        std::pair<GPUNodeConfiguration::TENSOR_OP,
+                  std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+        &ApproxChoices = GPUConf->getApproxChoices();
+
+    // Check for convolution as first operation
+    CUSTOM_ASSERT(
+        (ApproxChoices.size() >= 1) &&
+        (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
+        "Incorrect number/type of operations in provided Conv layer "
+        "configuration");
+
+    void *conv_out = handleTensorConvApproximationTuples(
+        ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w,
+        conv_stride_h, conv_stride_w);
+    void *add_out;
+    if (bias != NULL) {
+      // Check for add as second operation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 2) &&
+          (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
+                                                   conv_out, bias);
+    } else {
+      add_out = conv_out;
+    }
+
+    void *activation_out;
+    switch (activation_id) {
+    case -1: { // No activation
+      // INFO("No activation Function\n");
+      activation_out = add_out;
+    } break;
+    case 0: { // TanH activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 1: { // ReLU activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 2: { // Clipped ReLU activation
+      CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
+                    (ApproxChoices[2].first ==
+                     GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
+                    "Incorrect number/type of operations in provided Conv "
+                    "layer configuration");
+      activation_out = handleTensorClippedReluApproximationTuples(
+          ApproxChoices[2].second, add_out, out_min, out_max);
+    } break;
+    default: {
+      ERROR("Activation id %d NOT supported \n", activation_id);
+    } break;
+    }
+
+    void *pool_out;
+
+    if (pool_size > 0) {
+      switch (pool_id) {
+      case 0: {
+        // If we remove the asserts, we can have all cases handled by a single
+        // call
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
+                      "Expected POOL_MAX in provided Conv layer configuration");
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size,
+            pool_size, 0, 0, pool_size, pool_size);
+      } break;
+      case 1: {
+        CUSTOM_ASSERT(
+            (ApproxChoices.back().first ==
+             GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
+            "Expected POOL_MEAN in provided Conv layer configuration");
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size,
+            pool_size, 0, 0, pool_size, pool_size);
+      } break;
+      case 2: {
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
+                      "Expected POOL_MIN in provided Conv layer configuration");
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size,
+            pool_size, 0, 0, pool_size, pool_size);
+      } break;
+      default: {
+        ERROR("Pool id %d NOT supported \n", pool_id);
+      } break;
       }
-      else {
-	ERROR("Unsupported Configuration");
-	abort();
-      }
-
-    return NULL;
+    } else {
+      pool_out = activation_out;
+    }
+    return pool_out;
+  } else {
+    ERROR("Unsupported Configuration");
+    abort();
   }
 
+  return NULL;
+}
 
-
-
-  
-  void* wrapper_ConvLayer2(const char* hpvm_node_id,
-			  void* input, 
-			  void* filter, 
-			  void* bias, 
-			  int conv_pad_h, int conv_pad_w,
-			  int conv_stride_h, int conv_stride_w,
-			  int pool_id,
-			  int pool_size_v, int pool_size_h,			 
-			  int pool_pad_v, int pool_pad_h,
-			  int pool_stride_v, int pool_stride_h,
-			  int activation_id,
-			  // NOTE: out_min, out_max are only relevant for ClippedRelu
-			  float out_min, float out_max){
-
-    INFO ("*** Conv Layer \n");
-    
-    NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-	if (NodeConf->isGPUNodeConfiguration()) {
-	DEBUG("GPU Configuration for ConvLayer\n");
-	// Mapped to GPU - get a GPU node configuration
-	GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
-
-	std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-				std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						       int> > > > &ApproxChoices =
-	GPUConf->getApproxChoices();
-
-	
-	//printf("*** Convolution \n ApproxChoice = %d \n  BatchNorm = %d \n CONV = %d \n", ApproxChoices[0].first,
-	//	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-	//       GPUNodeConfiguration::TENSOR_OP::CONV);
-
-	// Check for convolution as first operation
-	CUSTOM_ASSERT((ApproxChoices.size() >= 1) &&
-		      (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
-		      "Incorrect number/type of operations in provided Conv layer configuration");
-
-
-	
-	void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second,
-							     input, filter, conv_pad_h, conv_pad_w,
-							     conv_stride_h, conv_stride_w);
-	void* add_out;
-	if (bias != NULL) {
-	  // Check for add as second operation
-	  CUSTOM_ASSERT((ApproxChoices.size() >= 2) &&
-			(ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
-			"Incorrect number/type of operations in provided Conv layer configuration");
-	  add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
-						       conv_out, bias);
-	} else {
-	  add_out = conv_out;
-	}
-
-	void* activation_out;
-	switch (activation_id) {
-	case -1:
-	  { // No activation
-	    //INFO("No activation Function\n");
-	    activation_out = add_out;
-	  }
-	  break;
-	case 0:
-	  { // TanH activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 1:
-	  { // ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second,
-								 add_out);
-	  }
-	  break;
-	case 2:
-	  { // Clipped ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
-			  "Incorrect number/type of operations in provided Conv layer configuration");
-	    activation_out =
-	      handleTensorClippedReluApproximationTuples(ApproxChoices[2].second,
-							 add_out, out_min, out_max);
-	  }
-	  break;
-	default:
-	  {
-	    ERROR("Activation id %d NOT supported \n", activation_id);
-	  }
-	  break;
-	}
-
-	void* pool_out;
-
-	if (pool_size_v > 0) {
-	  switch (pool_id) {
-	  case 0:
-	    {
-	      // If we remove the asserts, we can have all cases handled by a single call
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
-			    "Expected POOL_MAX in provided Conv layer configuration");
-	      
-	      pool_out = handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-								activation_out, pool_id,
-								pool_size_v, pool_size_h,
-								pool_pad_v, pool_pad_h,
-								pool_stride_v, pool_stride_h);
-	      
-
-	    }
-	    break;
-	  case 1:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
-			    "Expected POOL_MEAN in provided Conv layer configuration");
-
-	      // FIXIT: POOL_MEAN still needs fixing
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size_v, pool_size_h,
-						       0, 0,
-						       pool_size_v, pool_size_h);
-	    
-	    }
-	    break;
-	  case 2:
-	    {
-	      CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
-			    "Expected POOL_MIN in provided Conv layer configuration");
-
-	      // FIXIT: Pool_MEAN needs fixing
-	      pool_out =
-		handleTensorPoolingApproximationTuples(ApproxChoices.back().second,
-						       activation_out, pool_id,
-						       pool_size_v, pool_size_h, 0, 0,
-						       pool_size_v, pool_size_h);
-	    }
-	    break;
-	  default:
-	    {
-	      ERROR("Pool id %d NOT supported \n", pool_id);
-	    }
-	    break;
-	  }
-	} else {
-	  pool_out = activation_out;
-	}
-	return pool_out;
-      }
-      else {
-	ERROR("Unsupported Configuration");
-	abort();
+void *wrapper_ConvLayer2(
+    const char *hpvm_node_id, void *input, void *filter, void *bias,
+    int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
+    int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
+    int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
+    // NOTE: out_min, out_max are only relevant for ClippedRelu
+    float out_min, float out_max) {
+
+  INFO("*** Conv Layer \n");
+
+  NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
+  if (NodeConf->isGPUNodeConfiguration()) {
+    DEBUG("GPU Configuration for ConvLayer\n");
+    // Mapped to GPU - get a GPU node configuration
+    GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
+
+    std::vector<
+        std::pair<GPUNodeConfiguration::TENSOR_OP,
+                  std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+        &ApproxChoices = GPUConf->getApproxChoices();
+
+    // printf("*** Convolution \n ApproxChoice = %d \n  BatchNorm = %d \n CONV =
+    // %d \n", ApproxChoices[0].first,
+    //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
+    //       GPUNodeConfiguration::TENSOR_OP::CONV);
+
+    // Check for convolution as first operation
+    CUSTOM_ASSERT(
+        (ApproxChoices.size() >= 1) &&
+        (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) &&
+        "Incorrect number/type of operations in provided Conv layer "
+        "configuration");
+
+    void *conv_out = handleTensorConvApproximationTuples(
+        ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w,
+        conv_stride_h, conv_stride_w);
+    void *add_out;
+    if (bias != NULL) {
+      // Check for add as second operation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 2) &&
+          (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
+                                                   conv_out, bias);
+    } else {
+      add_out = conv_out;
+    }
+
+    void *activation_out;
+    switch (activation_id) {
+    case -1: { // No activation
+      // INFO("No activation Function\n");
+      activation_out = add_out;
+    } break;
+    case 0: { // TanH activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 1: { // ReLU activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() >= 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
+          "Incorrect number/type of operations in provided Conv layer "
+          "configuration");
+      activation_out =
+          handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 2: { // Clipped ReLU activation
+      CUSTOM_ASSERT((ApproxChoices.size() >= 3) &&
+                    (ApproxChoices[2].first ==
+                     GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
+                    "Incorrect number/type of operations in provided Conv "
+                    "layer configuration");
+      activation_out = handleTensorClippedReluApproximationTuples(
+          ApproxChoices[2].second, add_out, out_min, out_max);
+    } break;
+    default: {
+      ERROR("Activation id %d NOT supported \n", activation_id);
+    } break;
+    }
+
+    void *pool_out;
+
+    if (pool_size_v > 0) {
+      switch (pool_id) {
+      case 0: {
+        // If we remove the asserts, we can have all cases handled by a single
+        // call
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MAX) &&
+                      "Expected POOL_MAX in provided Conv layer configuration");
+
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size_v,
+            pool_size_h, pool_pad_v, pool_pad_h, pool_stride_v, pool_stride_h);
+
+      } break;
+      case 1: {
+        CUSTOM_ASSERT(
+            (ApproxChoices.back().first ==
+             GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) &&
+            "Expected POOL_MEAN in provided Conv layer configuration");
+
+        // FIXIT: POOL_MEAN still needs fixing
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size_v,
+            pool_size_h, 0, 0, pool_size_v, pool_size_h);
+
+      } break;
+      case 2: {
+        CUSTOM_ASSERT((ApproxChoices.back().first ==
+                       GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
+                      "Expected POOL_MIN in provided Conv layer configuration");
+
+        // FIXIT: POOL_MIN still needs fixing (same workaround as POOL_MEAN)
+        pool_out = handleTensorPoolingApproximationTuples(
+            ApproxChoices.back().second, activation_out, pool_id, pool_size_v,
+            pool_size_h, 0, 0, pool_size_v, pool_size_h);
+      } break;
+      default: {
+        ERROR("Pool id %d NOT supported \n", pool_id);
+      } break;
       }
-
-    return NULL;
+    } else {
+      pool_out = activation_out;
+    }
+    return pool_out;
+  } else {
+    ERROR("Unsupported Configuration");
+    abort();
   }
 
+  return NULL;
+}
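+
+// Illustrative sketch only (not part of the original sources): a hedged
+// example of how a compiler-generated caller might invoke wrapper_ConvLayer2
+// for a conv -> bias-add -> ReLU -> 2x2 max-pool layer. The node id "conv1"
+// and the tensor handles are hypothetical and must match a node in the
+// loaded approximation configuration.
+//
+//   void *out = wrapper_ConvLayer2(
+//       "conv1", input, filter, bias,
+//       /*conv_pad_h=*/1, /*conv_pad_w=*/1,
+//       /*conv_stride_h=*/1, /*conv_stride_w=*/1,
+//       /*pool_id=*/0 /*max*/, /*pool_size_v=*/2, /*pool_size_h=*/2,
+//       /*pool_pad_v=*/0, /*pool_pad_h=*/0,
+//       /*pool_stride_v=*/2, /*pool_stride_h=*/2,
+//       /*activation_id=*/1 /*relu*/, /*out_min=*/0.0f, /*out_max=*/0.0f);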
 
-
-
-  
-
-  void* wrapper_FCLayer(const char* hpvm_node_id,
-			void* input, 
-			void* weights, 
-			void* bias, 
-			int activation_id,
-			// NOTE: out_min and out_max are only relevant for ClippedRelu
-			float out_min, float out_max){ 
-
-    INFO ("*** Dense Layer \n");
-    
-    NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-	if (NodeConf->isGPUNodeConfiguration()) {
-	DEBUG("GPU Configuration for FCLayer\n");
-	// Mapped to GPU - get a GPU node configuration
-	GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
-
-	std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-				std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						       int> > > > &ApproxChoices =
-	GPUConf->getApproxChoices();
-
-	// Approximation choices must be for a FC wrapper operation
-	CUSTOM_ASSERT((ApproxChoices.size() == 2 || ApproxChoices.size() == 3) &&
-		      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL &&
-		      ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
-		      "Invalid configuration generated for FC layer wrapper operation");
-
-	void* gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second,
-							    input, weights);
-	void* add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
-							   gemm_out, bias);
-
-	void* activation_out;
-	switch (activation_id) {
-	case -1:
-	  { // No activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 2) &&
-			  "Incorrect number of operations in provided FC layer configuration");
-	    //INFO("No activation Function\n");
-	    activation_out = add_out;
-	  }
-	  break;
-	case 0:
-	  { // TanH activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
-			  "Incorrect number/type of operations in provided FC layer configuration");
-	    activation_out = handleTensorTanhApproximationTuples(ApproxChoices[1].second,
-								 add_out);
-	  }
-	  break;
-	case 1:
-	  { // ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
-			  "Incorrect number/type of operations in provided FC layer configuration");
-	    activation_out = handleTensorReluApproximationTuples(ApproxChoices[1].second,
-								 add_out);
-	  }
-	  break;
-	case 2:
-	  { // Clipped ReLU activation
-	    CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
-			  (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
-			  "Incorrect number/type of operations in provided FC layer configuration");
-	    activation_out =
-	      handleTensorClippedReluApproximationTuples(ApproxChoices[1].second,
-							 add_out, out_min, out_max);
-	  }
-	  break;
-	default:
-	  {
-	    ERROR("Activation id %d NOT supported \n", activation_id);
-	  }
-	  break;
-	}
-	return activation_out;
-      }
-      else {
-	ERROR("Unsupported Configuration");
-	abort();
-      }
-
-    return NULL;
+void *
+wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
+                void *bias, int activation_id,
+                // NOTE: out_min and out_max are only relevant for ClippedRelu
+                float out_min, float out_max) {
+
+  INFO("*** Dense Layer \n");
+
+  NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
+  if (NodeConf->isGPUNodeConfiguration()) {
+    DEBUG("GPU Configuration for FCLayer\n");
+    // Mapped to GPU - get a GPU node configuration
+    GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf;
+
+    std::vector<
+        std::pair<GPUNodeConfiguration::TENSOR_OP,
+                  std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+        &ApproxChoices = GPUConf->getApproxChoices();
+
+    // Approximation choices must be for a FC wrapper operation
+    CUSTOM_ASSERT(
+        (ApproxChoices.size() == 2 || ApproxChoices.size() == 3) &&
+        ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL &&
+        ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
+        "Invalid configuration generated for FC layer wrapper operation");
+
+    void *gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second,
+                                                        input, weights);
+    void *add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second,
+                                                       gemm_out, bias);
+
+    void *activation_out;
+    switch (activation_id) {
+    case -1: { // No activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() == 2) &&
+          "Incorrect number of operations in provided FC layer configuration");
+      // INFO("No activation Function\n");
+      activation_out = add_out;
+    } break;
+    case 0: { // TanH activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() == 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) &&
+          "Incorrect number/type of operations in provided FC layer "
+          "configuration");
+      activation_out =
+          handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 1: { // ReLU activation
+      CUSTOM_ASSERT(
+          (ApproxChoices.size() == 3) &&
+          (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) &&
+          "Incorrect number/type of operations in provided FC layer "
+          "configuration");
+      activation_out =
+          handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out);
+    } break;
+    case 2: { // Clipped ReLU activation
+      CUSTOM_ASSERT((ApproxChoices.size() == 3) &&
+                    (ApproxChoices[2].first ==
+                     GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) &&
+                    "Incorrect number/type of operations in provided FC layer "
+                    "configuration");
+      activation_out = handleTensorClippedReluApproximationTuples(
+          ApproxChoices[2].second, add_out, out_min, out_max);
+    } break;
+    default: {
+      ERROR("Activation id %d NOT supported \n", activation_id);
+    } break;
+    }
+    return activation_out;
+  } else {
+    ERROR("Unsupported Configuration");
+    abort();
   }
 
+  return NULL;
+}
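+
+// Illustrative sketch only: a hypothetical call for a fully-connected layer
+// with a bias add and no activation (activation_id = -1). The node id "fc1"
+// and the tensor handles are assumptions; out_min/out_max are ignored here
+// and matter only for the clipped-ReLU path (activation_id = 2).
+//
+//   void *fc_out = wrapper_FCLayer("fc1", input, weights, bias,
+//                                  /*activation_id=*/-1,
+//                                  /*out_min=*/0.0f, /*out_max=*/0.0f);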
 
+void *wrapper_tensorRelu(const char *hpvm_node_id, void *input_ptr) {
 
+  INFO("*** Relu Operation \n");
 
-  void* wrapper_tensorRelu(const char* hpvm_node_id, void* input_ptr){
-
-    INFO("*** Relu Operation \n");
-    
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
-    // Approximation choices must be for a relu operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU &&
-		  "Invalid configuration generated for tensor relu wrapper operation");
+  // Approximation choices must be for a relu operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU &&
+      "Invalid configuration generated for tensor relu wrapper operation");
 
-    return handleTensorReluApproximationTuples(ApproxChoices[0].second,
-					       input_ptr);
-
-  }
+  return handleTensorReluApproximationTuples(ApproxChoices[0].second,
+                                             input_ptr);
+}
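+
+// Illustrative sketch only: the single-op wrappers (relu, clipped relu, tanh,
+// add, softmax) all follow the same pattern -- look up the node configuration,
+// assert that it contains exactly one matching tensor op, and forward its
+// approximation tuples to the corresponding handler. A hypothetical call,
+// assuming "relu1" is a node id in the loaded configuration:
+//
+//   void *relu_out = wrapper_tensorRelu("relu1", conv_out);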
 
-  void* wrapper_tensorClippedRelu(const char* hpvm_node_id,
-				  void* input_ptr,
-				  float out_min, float out_max){
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+void *wrapper_tensorClippedRelu(const char *hpvm_node_id, void *input_ptr,
+                                float out_min, float out_max) {
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a relu operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU &&
-		  "Invalid configuration generated for tensor clipped relu wrapper operation");
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
-    return handleTensorClippedReluApproximationTuples(ApproxChoices[0].second,
-						      input_ptr, out_min, out_max);
+  // Approximation choices must be for a clipped relu operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+                ApproxChoices[0].first ==
+                    GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU &&
+                "Invalid configuration generated for tensor clipped relu "
+                "wrapper operation");
 
-  }
+  return handleTensorClippedReluApproximationTuples(
+      ApproxChoices[0].second, input_ptr, out_min, out_max);
+}
 
-  void* wrapper_tensorTanh(const char* hpvm_node_id, void* input_ptr){
-    //  return tensorTanh(input_ptr);
+void *wrapper_tensorTanh(const char *hpvm_node_id, void *input_ptr) {
+  //  return tensorTanh(input_ptr);
 
-    GPUNodeConfiguration *GPUConf =
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
-    // Approximation choices must be for a tanh operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH &&
-		  "Invalid configuration generated for tensor tanh wrapper operation");
-
-    return handleTensorTanhApproximationTuples(ApproxChoices[0].second,
-					       input_ptr);
-
-  }
+  // Approximation choices must be for a tanh operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH &&
+      "Invalid configuration generated for tensor tanh wrapper operation");
 
+  return handleTensorTanhApproximationTuples(ApproxChoices[0].second,
+                                             input_ptr);
+}
 
-  void* wrapper_tensorBatchNorm(const char* hpvm_node_id,
-				void* input_ptr, void* gamma_ptr, void* beta_ptr,
-				void* mean_ptr, void* variance_ptr, double epsilon){
+void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
+                              void *gamma_ptr, void *beta_ptr, void *mean_ptr,
+                              void *variance_ptr, double epsilon) {
 
-    INFO("*** BatchNorm Operation \n");
+  INFO("*** BatchNorm Operation \n");
 
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();

-    GPUConf->getApproxChoices();
 
-    // printf("*** BatchNorm \n ApproxChoice = %d \n  BatchNorm = %d \n CONV = %d \n", ApproxChoices[0].first,
-    //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-    //	       GPUNodeConfiguration::TENSOR_OP::CONV);
+  // printf("*** BatchNorm \n ApproxChoice = %d \n  BatchNorm = %d \n CONV = %d
+  // \n", ApproxChoices[0].first,
+  //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
+  //	       GPUNodeConfiguration::TENSOR_OP::CONV);
 
-    // Approximation choices must be for a batchnorm operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM &&
-		  "Invalid configuration generated for tensor batchnorm wrapper operation");
-
-    return handleTensorBatchNormApproximationTuples(ApproxChoices[0].second,
-						    input_ptr, gamma_ptr, beta_ptr,
-						    mean_ptr, variance_ptr, epsilon);
-
-  }
+  // Approximation choices must be for a batchnorm operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM &&
+      "Invalid configuration generated for tensor batchnorm wrapper operation");
 
+  return handleTensorBatchNormApproximationTuples(
+      ApproxChoices[0].second, input_ptr, gamma_ptr, beta_ptr, mean_ptr,
+      variance_ptr, epsilon);
+}
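+
+// Illustrative sketch only: a hypothetical batch-norm call. The node id "bn1"
+// is an assumption; gamma/beta and the running mean/variance are assumed to be
+// loaded as weight tensors, and epsilon is the usual small stabilizer added to
+// the variance.
+//
+//   void *bn_out = wrapper_tensorBatchNorm("bn1", input, gamma, beta,
+//                                          mean, variance, /*epsilon=*/1e-5);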
 
-  void* wrapper_tensorAdd(const char* hpvm_node_id, void* input_ptr, void* bias_ptr){
+void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
+                        void *bias_ptr) {
 
-   
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();

-    GPUConf->getApproxChoices();
 
-    // Approximation choices must be for an add operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
-		  "Invalid configuration generated for tensor add wrapper operation");
+  // Approximation choices must be for an add operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD &&
+      "Invalid configuration generated for tensor add wrapper operation");
 
-    return handleTensorAddApproximationTuples(ApproxChoices[0].second,
-					      input_ptr, bias_ptr);
-
-  }
+  return handleTensorAddApproximationTuples(ApproxChoices[0].second, input_ptr,
+                                            bias_ptr);
+}
 
+void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr,
+                            int poolFunction, int window_height,
+                            int window_width, int vertical_pad,
+                            int horizontal_pad, int vertical_stride,
+                            int horizontal_stride) {
 
-  void* wrapper_tensorPooling(const char* hpvm_node_id,
-			      void* input_ptr,
-			      int poolFunction,
-			      int window_height, int window_width,
-			      int vertical_pad, int horizontal_pad,
-			      int vertical_stride, int horizontal_stride){
+  INFO("*** TensorPooling Operation \n");
 
-    INFO("*** TensorPooling Operation \n");
-    
-    //  return tensorPooling(input_ptr, poolFunction, window_height, window_width,
-    //		       vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+  //  return tensorPooling(input_ptr, poolFunction, window_height, window_width,
+  //		       vertical_pad, horizontal_pad, vertical_stride,
+  // horizontal_stride);
 
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a single operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  "Invalid configuration generated for tensor pool wrapper operation");
-    enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first;
-    // Approximation choices must be for a pool operation
-    CUSTOM_ASSERT((top == GPUNodeConfiguration::TENSOR_OP::POOL_MAX  ||
-		   top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN ||
-		   top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
-		  "Invalid configuration generated for tensor pool wrapper operation");
-
-    return handleTensorPoolingApproximationTuples(ApproxChoices[0].second,
-						  input_ptr, poolFunction,
-						  window_height, window_width,
-						  vertical_pad, horizontal_pad,
-						  vertical_stride, horizontal_stride);
-
-  }
-
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
+
+  // Approximation choices must be for a single operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      "Invalid configuration generated for tensor pool wrapper operation");
+  enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first;
+  // Approximation choices must be for a pool operation
+  CUSTOM_ASSERT(
+      (top == GPUNodeConfiguration::TENSOR_OP::POOL_MAX ||
+       top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN ||
+       top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) &&
+      "Invalid configuration generated for tensor pool wrapper operation");
+
+  return handleTensorPoolingApproximationTuples(
+      ApproxChoices[0].second, input_ptr, poolFunction, window_height,
+      window_width, vertical_pad, horizontal_pad, vertical_stride,
+      horizontal_stride);
+}
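+
+// Illustrative sketch only: a hypothetical standalone 2x2 max-pool call.
+// poolFunction 0 = max, 1 = mean, 2 = min, matching the POOL_* tensor ops
+// accepted by the assertion above; the node id "pool1" is an assumption.
+//
+//   void *pool_out = wrapper_tensorPooling(
+//       "pool1", input, /*poolFunction=*/0,
+//       /*window_height=*/2, /*window_width=*/2,
+//       /*vertical_pad=*/0, /*horizontal_pad=*/0,
+//       /*vertical_stride=*/2, /*horizontal_stride=*/2);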
 
-  void* wrapper_tensorGroupConvolution(const char* hpvm_node_id,
-				       void* input, void* filter,
-				       int vertical_pad, int horizontal_pad,
-				       int vertical_stride, int horizontal_stride,
-				       int conv_mode, int conv_groups){
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input,
+                                     void *filter, int vertical_pad,
+                                     int horizontal_pad, int vertical_stride,
+                                     int horizontal_stride, int conv_mode,
+                                     int conv_groups) {
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a group_conv operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::GROUP_CONV &&
-		  "Invalid configuration generated for tensor group_conv wrapper operation");
-
-    return handleTensorGroupConvApproximationTuples(ApproxChoices[0].second,
-						    input, filter,
-						    vertical_pad, horizontal_pad,
-						    vertical_stride, horizontal_stride,
-						    conv_mode, conv_groups);
-
-  }
-
-
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
+
+  // Approximation choices must be for a group_conv operation
+  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
+                ApproxChoices[0].first ==
+                    GPUNodeConfiguration::TENSOR_OP::GROUP_CONV &&
+                "Invalid configuration generated for tensor group_conv wrapper "
+                "operation");
+
+  return handleTensorGroupConvApproximationTuples(
+      ApproxChoices[0].second, input, filter, vertical_pad, horizontal_pad,
+      vertical_stride, horizontal_stride, conv_mode, conv_groups);
+}
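+
+// Illustrative sketch only: a hypothetical depthwise-convolution call, where
+// conv_groups equals the number of input channels (e.g. a MobileNet-style
+// block); conv_mode and conv_groups are forwarded unchanged to the group-conv
+// handler. The node id "dw_conv1" and the channel count are assumptions.
+//
+//   void *dw_out = wrapper_tensorGroupConvolution(
+//       "dw_conv1", input, filter,
+//       /*vertical_pad=*/1, /*horizontal_pad=*/1,
+//       /*vertical_stride=*/1, /*horizontal_stride=*/1,
+//       /*conv_mode=*/1, /*conv_groups=*/32);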
 
-  void* wrapper_tensorSoftmax(const char* hpvm_node_id, void* input_ptr){
-    //  return tensorSoftmax(input_ptr);
+void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr) {
+  //  return tensorSoftmax(input_ptr);
 
-    // Only mapped to GPU - get a GPU configuration
-    GPUNodeConfiguration *GPUConf =
+  // Only mapped to GPU - get a GPU configuration
+  GPUNodeConfiguration *GPUConf =
       (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);
 
-    std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP,
-			    std::vector< std::pair<GPUNodeConfiguration::APPROX,
-						   int> > > > &ApproxChoices =
-    GPUConf->getApproxChoices();
-
-    // Approximation choices must be for a softmax operation
-    CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
-		  ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX &&
-		  "Invalid configuration generated for tensor softmax wrapper operation");
-
-    return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second, input_ptr);
+  std::vector<
+      std::pair<GPUNodeConfiguration::TENSOR_OP,
+                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
+      &ApproxChoices = GPUConf->getApproxChoices();
 
+  // Approximation choices must be for a softmax operation
+  CUSTOM_ASSERT(
+      ApproxChoices.size() == 1 &&
+      ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX &&
+      "Invalid configuration generated for tensor softmax wrapper operation");
 
-  }
-
-
-
-  void* tensor_set_node_id(unsigned int node_id){
+  return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second,
+                                                input_ptr);
+}
 
-    currentTensorID = node_id;
+void *tensor_set_node_id(unsigned int node_id) {
 
-    return NULL;
-  }
+  currentTensorID = node_id;
 
+  return NULL;
+}
 }
-- 
GitLab