diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
index 820d41b9745e9893d4052ae500b7940aa05f0f7d..4f24d164c1a3a3a8081b9d98cd70977e81055dca 100644
--- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -129,11 +129,11 @@ target_link_libraries(fc2_cpu  tensor_cpu_runtime ${GPU_PROFILER_LIB} ${SOC_SIMU
 add_executable(lenet_keras  dnn_sources/src/lenet_keras.cc)
 target_link_libraries(lenet_keras  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(alexnet_cifar10  dnn_sources/src/alexnet_cifar10_front.cc)
+add_executable(alexnet_cifar10  dnn_sources/src/alexnet_cifar10.cc)
 target_link_libraries(alexnet_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(alexnet_cifar10_tuner  dnn_sources/src/alexnet_cifar10_tuner.cc)
-target_link_libraries(alexnet_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#-- add_executable(alexnet_cifar10_tuner  dnn_sources/src/alexnet_cifar10_tuner.cc)
+#-- target_link_libraries(alexnet_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 #add_executable(alexnet_cifar10_approx  dnn_sources/src/alexnet_cifar10_approx.cc)
 #target_link_libraries(alexnet_cifar10_approx  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
@@ -141,14 +141,14 @@ target_link_libraries(alexnet_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB}
 add_executable(alexnet2_cifar10  dnn_sources/src/alexnet2_cifar10.cc)
 target_link_libraries(alexnet2_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(alexnet2_cifar10_tuner  dnn_sources/src/alexnet2_cifar10_tuner.cc)
-target_link_libraries(alexnet2_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#-- add_executable(alexnet2_cifar10_tuner  dnn_sources/src/alexnet2_cifar10_tuner.cc)
+#-- target_link_libraries(alexnet2_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar10  dnn_sources/src/vgg16_cifar10.cc)
 target_link_libraries(vgg16_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(vgg16_cifar10_tuner  dnn_sources/src/vgg16_cifar10_tuner.cc)
-target_link_libraries(vgg16_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#-- add_executable(vgg16_cifar10_tuner  dnn_sources/src/vgg16_cifar10_tuner.cc)
+#-- target_link_libraries(vgg16_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(resnet18_cifar10  dnn_sources/src/resnet18_cifar10.cc)
 target_link_libraries(resnet18_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
@@ -159,29 +159,26 @@ target_link_libraries(resnet18_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC
 #add_executable(resnet18_cifar10_inputapprox  dnn_sources/src/resnet18_cifar10_inputapprox.cc)
 #target_link_libraries(resnet18_cifar10_inputapprox  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(resnet18_cifar10_tuner  dnn_sources/src/resnet18_cifar10_tuner.cc)
-target_link_libraries(resnet18_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#-- add_executable(resnet18_cifar10_tuner  dnn_sources/src/resnet18_cifar10_tuner.cc)
+#-- target_link_libraries(resnet18_cifar10_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(vgg16_cifar100  dnn_sources/src/vgg16_cifar100.cc)
 target_link_libraries(vgg16_cifar100  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(vgg16_cifar100_tuner  dnn_sources/src/vgg16_cifar100_tuner.cc)
-target_link_libraries(vgg16_cifar100_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#-- add_executable(vgg16_cifar100_tuner  dnn_sources/src/vgg16_cifar100_tuner.cc)
+#-- target_link_libraries(vgg16_cifar100_tuner  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(vgg16_cifar100_top5  dnn_sources/src/vgg16_cifar100_5.cc)
-target_link_libraries(vgg16_cifar100_top5  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#-- add_executable(vgg16_cifar100_top5  dnn_sources/src/vgg16_cifar100_5.cc)
+#-- target_link_libraries(vgg16_cifar100_top5  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 # REF binaries
-add_executable(mobilenet_cifar10  dnn_sources/src/mobilenet_cifar10.cc)
-target_link_libraries(mobilenet_cifar10  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(mobilenet_depthwise  dnn_sources/src/mobilenet_depthwise.cc)
+add_executable(mobilenet_depthwise  dnn_sources/src/mobilenet.cc)
 target_link_libraries(mobilenet_depthwise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-
-add_executable(mobilenet_cifar10_shallow  dnn_sources/src/mobilenet_cifar10_shallow.cc)
-target_link_libraries(mobilenet_cifar10_shallow  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#-- add_executable(mobilenet_cifar10_shallow  dnn_sources/src/mobilenet_cifar10_shallow.cc)
+#-- target_link_libraries(mobilenet_cifar10_shallow  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 add_executable(mobilenet_shallow_depthwise  dnn_sources/src/mobilenet_shallow_depthwise.cc)
 target_link_libraries(mobilenet_shallow_depthwise  tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
@@ -189,8 +186,8 @@ target_link_libraries(mobilenet_shallow_depthwise  tensor_runtime ${GPU_PROFILER
 add_executable(resnet_imagenet  dnn_sources/src/resnet_imagenet.cc)
 target_link_libraries(resnet_imagenet  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
-add_executable(mobilenet_imagenet  dnn_sources/src/mobilenet_imagenet.cc)
-target_link_libraries(mobilenet_imagenet  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+#add_executable(mobilenet_imagenet  dnn_sources/src/mobilenet_imagenet.cc)
+#target_link_libraries(mobilenet_imagenet  tensor_runtime_online  ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
 
 
 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_cifar10.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_cifar10.cc
index fe71eb14caedba8d5813bbb0fa7feadcf0c72950..ee7f50bed8dd2dfccf00489f5fcca6b2aa941595 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_cifar10.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet2_cifar10.cc
@@ -59,8 +59,8 @@ void testCifarNet(){
 
   startMemTracking();
 
-  int test_input_size = 10000;
-  int batch_size = 2500;
+  int test_input_size = 1000;
+  int batch_size = 1000;
   int batch_count = test_input_size / batch_size;
   float final_accuracy = 0.0;
 
@@ -121,6 +121,7 @@ void testCifarNet(){
 
     float accuracy = computeAccuracy2(labels, batch_size, result); 
     final_accuracy += accuracy;
+
     
     freeBatchMemory();
   }
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10.cc
index 3e5cec7d0760252ebff1b31293a51bdf570415f4..7d2da0ce7fdef3b76d26c1d9d4a2050f3a16a692 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10.cc
@@ -1,196 +1,106 @@
 
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/types.h> 
+#include <sys/stat.h> 
+#include <string.h> 
+#include "../../tensor_runtime/include/tensor_runtime.h" 
+#include "../include/utils.h" 
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string.h>
+int main(){ 
 
+  llvm_hpvm_initTensorRt(0); 
 
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
 
-
-bool Opentuner_run = false;
-
-
-/* NOTE: Reference Architecture to use for profiling */
-void testCifarNet(){
-
-  int total_runs = 100;
-  if(Opentuner_run){
-    total_runs = 1000000;
-  }
-
-  
-  printf("********* CIFAR-10 DNN ********** \n");
-  // FIXIT: Extend this to batch of images - currently 5 images
-
-  int test_batch_size = 5000;
-
-  //uint8_t* labels = readLabels("../model_params/cifar_keras/labels.bin", test_batch_size);
-  uint8_t* labels = readLabels("../model_params/alexnet_cifar10/test_labels.bin", test_batch_size);
-    
-  void* input = readTrainedWeights("../model_params/alexnet_cifar10/norm_cifar_input.bin",
-			  	   float_type,
-				   test_batch_size, 3, 32, 32);
-
-  void* conv1_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv1.bin",
-					  float_type, 64, 3, 11, 11);  
-  void* conv1_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv1_bias.bin",
-					float_type, 1, 64, 1, 1);  
-  void* conv2_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv2.bin",
-					  float_type, 192, 64, 5, 5);  
-  void* conv2_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv2_bias.bin",
-					float_type, 1, 192, 1, 1);
-
-  void* conv3_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv3.bin",
-					  float_type, 384, 192, 3, 3);  
-  void* conv3_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv3_bias.bin",
-					float_type, 1, 384, 1, 1);  
-  void* conv4_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv4.bin",
-					  float_type, 256, 384, 3, 3);  
-  void* conv4_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv4_bias.bin",
-					float_type, 1, 256, 1, 1);
-  void* conv5_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv5.bin",
-					  float_type, 256, 256, 3, 3);  
-  void* conv5_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv5_bias.bin",
-					float_type, 1, 256, 1, 1);
-  
-  void* fc1_weights = readTrainedWeights("../model_params/alexnet_cifar10/fc1.bin",
-					 float_type, 1, 1, 4096, 10);  
-  void* fc1_bias = readTrainedWeights("../model_params/alexnet_cifar10/fc1_bias.bin",
-				      float_type, 1, 10, 1, 1);  
+  // Directory that holds the input, label and weight files loaded below.
+  std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); 
  
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv0.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv_bias0.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv3.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv_bias3.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv6.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv_bias6.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv7.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv_bias7.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv8.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv_bias8.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("fc12.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("fc_bias12.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+
+
+  startMemTracking();
+
+  int test_input_size = 1000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
   
-  clearTensorMap();
-  
-  for(int i = 0; i < total_runs; i++){
-
-    if(Opentuner_run){
-
-      const char* myfifo = "/tmp/myfifo";
-      int fd = open(myfifo, O_RDONLY);
-
-      int ret_val = fcntl(fd, F_GETFD);
-      if(ret_val == -1){
-	printf("Invalid descriptor \n");
-	abort();
-      }
-
-      char str[100];
-      read(fd, str, 80);
-      if(strcmp(str, "stop_run") == 0){
-	abort();
-      }
-
-      close(fd);
-    }
-
-    
-    readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
-
-    // Start power and performnce profiling 
-    startProfiling();
-  
-    int conv_mode = 1; // NOTE: using CROSS_CORRELATION
-    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
-
-    // NOTE: 'SAME' convolution
-    void* conv1out = tensorConvolution(input, conv1_filter, 5, 5, 1, 1,
-				       conv_mode, conv_precision);
-
-    tensorAdd(conv1out, conv1_bias); // NOTE: In place operation
-
-    void* conv1_tanh = tensorTanh(conv1out);
-
-    void* pool1out = tensorPooling(conv1_tanh, 0, 2, 2, 0, 0, 2, 2);
-
-    // 2nd Layer
-    void* conv2out = tensorConvolution(pool1out, conv2_filter, 2, 2, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv2out, conv2_bias); // NOTE: In place operation
-
-    void* conv2_tanh = tensorTanh(conv2out);
-
-    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
-      
-
-    // 3rd Layer
-    void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv3out, conv3_bias); // NOTE: In place operation
-  
-    void* conv3_tanh = tensorTanh(conv3out);
-
-    // 4th Layer
-    void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv4out, conv4_bias); // NOTE: In place operation
-  
-    void* conv4_tanh = tensorTanh(conv4out);
-    
-    // 5th Layer
-    void* conv5out = tensorConvolution(conv4_tanh, conv5_filter, 1, 1, 1, 1,
-				       conv_mode, conv_precision);
-    tensorAdd(conv5out, conv5_bias); // NOTE: In place operation
-  
-    void* conv5_tanh = tensorTanh(conv5out);
-
-    void* pool5out = tensorPooling(conv5_tanh, 0, 2, 2, 0, 0, 2, 2);
-
-    // final FC Layer
-    void* gemm1out = tensorGemmGPU(pool5out, fc1_weights);  
-
-    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
-
-    void* result = tensorSoftmax(gemm1biasout);
-
-    printTensorDims(result);
-    
-    // End profiling and dump output to profile.txt
-    stopProfiling();
-  
-    computeAccuracy2(labels, test_batch_size, result);
-    
-    dumpAccuracyNorms();
-    freeOutputTensors();  
-
-    if(Opentuner_run){
-
-      const char* myfifo = "/tmp/myfifo";
-      int fd_out = open(myfifo, O_WRONLY);
-      int ret_val = fcntl(fd_out, F_GETFD);
-      if(ret_val == -1){
-	printf("Invalid descriptor \n");
-	abort();
-      }
-      
-      const char* str = "completed***!\n\0";
-      write(fd_out, str, 80);
-      close(fd_out);
-    }
+  for(int i = 0; i < batch_count; i++){
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
+
+    void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
+    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
+    void* var_2 = tensorTanh(var_1); 
+    void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
+    void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
+    void* var_6 = tensorAdd(var_5, conv2d_2_b); 
+    void* var_7 = tensorTanh(var_6); 
+    void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
+    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
+    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
+    void* var_12 = tensorTanh(var_11); 
+    void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
+    void* var_14 = tensorAdd(var_13, conv2d_4_b); 
+    void* var_15 = tensorTanh(var_14); 
+    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
+    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
+    void* var_18 = tensorTanh(var_17); 
+    void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
+    void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
+    void* var_23 = tensorAdd(var_22, dense_1_b); 
+    void* var_24 = tensorSoftmax(var_23); 
+
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    final_accuracy += accuracy;
     
+    freeBatchMemory();
   }
 
+  stopProfiling();
 
-  
-}
-
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-int main(int argc, char* argv[]){
 
-  if(argc > 1)
-    Opentuner_run = true;
+  llvm_hpvm_cleanupTensorRt(); 
 
-  llvm_hpvm_initTensorRt(1);
+  return 0; 
 
-  testCifarNet();
-
-  llvm_hpvm_cleanupTensorRt();
-
-  return 0;
 }
-
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10_front.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10_front.cc
deleted file mode 100644
index 84510c5342811eb20c8c7e834f4fcf34d5561ccb..0000000000000000000000000000000000000000
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10_front.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <unistd.h> 
-#include <fcntl.h> 
-#include <sys/types.h> 
-#include <sys/stat.h> 
-#include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
-#include "../include/utils.h" 
-
-int main(){ 
-
-  llvm_hpvm_initTensorRt(0); 
-
-
-  std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); 
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv0.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv_bias0.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv3.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv_bias3.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv6.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv_bias6.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv7.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv_bias7.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv8.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv_bias8.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("fc12.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("fc_bias12.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
-
-  startMemTracking();
-
-  int test_input_size = 10000;
-  int batch_size = 2500;
-  int batch_count = test_input_size / batch_size;
-  float final_accuracy = 0.0;
-
-  // NOTE: Starting time profiling
-  startProfiling();
-  
-  for(int i = 0; i < batch_count; i++){
-
-    int start = i * batch_size;
-    int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorTanh(var_1); 
-    void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
-    void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
-    void* var_6 = tensorAdd(var_5, conv2d_2_b); 
-    void* var_7 = tensorTanh(var_6); 
-    void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
-    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorTanh(var_11); 
-    void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_14 = tensorAdd(var_13, conv2d_4_b); 
-    void* var_15 = tensorTanh(var_14); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorTanh(var_17); 
-    void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
-    void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
-    void* var_23 = tensorAdd(var_22, dense_1_b); 
-    void* var_24 = tensorSoftmax(var_23); 
-
-    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
-
-    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
-    final_accuracy += accuracy;
-    
-    freeBatchMemory();
-  }
-
-  stopProfiling();
-
-  final_accuracy = final_accuracy / batch_count;
-  dumpFinalAccuracy(final_accuracy);
-
-
-  llvm_hpvm_cleanupTensorRt(); 
-
-  return 0; 
-
-}
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10_old.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10_old.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e5cec7d0760252ebff1b31293a51bdf570415f4
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_cifar10_old.cc
@@ -0,0 +1,196 @@
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+
+
+#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../include/utils.h"
+
+
+bool Opentuner_run = false;
+
+
+/* NOTE: Reference Architecture to use for profiling */
+void testCifarNet(){
+  // Runs the CIFAR-10 network (weights from model_params/alexnet_cifar10) total_runs times on one fixed batch, profiling every run.
+  int total_runs = 100;
+  if(Opentuner_run){
+    total_runs = 1000000;
+  }
+
+
+  printf("********* CIFAR-10 DNN ********** \n");
+  // NOTE: all test_batch_size images are loaded once and processed as a single batch
+
+  int test_batch_size = 5000;
+
+  // Labels (consumed by computeAccuracy2 below) and the input tensor for the batch
+  uint8_t* labels = readLabels("../model_params/alexnet_cifar10/test_labels.bin", test_batch_size);
+
+  void* input = readTrainedWeights("../model_params/alexnet_cifar10/norm_cifar_input.bin",
+                                   float_type,
+                                   test_batch_size, 3, 32, 32);
+
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv1.bin",
+                                          float_type, 64, 3, 11, 11);
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv1_bias.bin",
+                                        float_type, 1, 64, 1, 1);
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv2.bin",
+                                          float_type, 192, 64, 5, 5);
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv2_bias.bin",
+                                        float_type, 1, 192, 1, 1);
+
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv3.bin",
+                                          float_type, 384, 192, 3, 3);
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv3_bias.bin",
+                                        float_type, 1, 384, 1, 1);
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv4.bin",
+                                          float_type, 256, 384, 3, 3);
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv4_bias.bin",
+                                        float_type, 1, 256, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet_cifar10/conv5.bin",
+                                          float_type, 256, 256, 3, 3);
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet_cifar10/conv5_bias.bin",
+                                        float_type, 1, 256, 1, 1);
+
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet_cifar10/fc1.bin",
+                                         float_type, 1, 1, 4096, 10);
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet_cifar10/fc1_bias.bin",
+                                      float_type, 1, 10, 1, 1);
+
+
+  clearTensorMap();
+
+  for(int i = 0; i < total_runs; i++){
+
+    if(Opentuner_run){
+      // Read the next command from /tmp/myfifo (presumably a named pipe fed by OpenTuner - confirm).
+      const char* myfifo = "/tmp/myfifo";
+      int fd = open(myfifo, O_RDONLY);
+      // fcntl() returns -1 on an invalid descriptor, so this also catches a failed open().
+      int ret_val = fcntl(fd, F_GETFD);
+      if(ret_val == -1){
+        printf("Invalid descriptor \n");
+        abort();
+      }
+      // Zero-filled so the strcmp() below always sees a NUL-terminated string (read() adds no terminator).
+      char str[100] = {0};
+      read(fd, str, 80);
+      if(strcmp(str, "stop_run") == 0){
+        abort();
+      }
+
+      close(fd);
+    }
+
+
+    readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters
+
+    // Start power and performance profiling
+    startProfiling();
+
+    int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+    int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
+    // NOTE: 'SAME' convolution
+    void* conv1out = tensorConvolution(input, conv1_filter, 5, 5, 1, 1,
+                                       conv_mode, conv_precision);
+
+    tensorAdd(conv1out, conv1_bias); // NOTE: In place operation
+
+    void* conv1_tanh = tensorTanh(conv1out);
+
+    void* pool1out = tensorPooling(conv1_tanh, 0, 2, 2, 0, 0, 2, 2);
+
+    // 2nd Layer
+    void* conv2out = tensorConvolution(pool1out, conv2_filter, 2, 2, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv2out, conv2_bias); // NOTE: In place operation
+
+    void* conv2_tanh = tensorTanh(conv2out);
+
+    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+
+
+    // 3rd Layer
+    void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv3out, conv3_bias); // NOTE: In place operation
+
+    void* conv3_tanh = tensorTanh(conv3out);
+
+    // 4th Layer
+    void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv4out, conv4_bias); // NOTE: In place operation
+
+    void* conv4_tanh = tensorTanh(conv4out);
+
+    // 5th Layer
+    void* conv5out = tensorConvolution(conv4_tanh, conv5_filter, 1, 1, 1, 1,
+                                       conv_mode, conv_precision);
+    tensorAdd(conv5out, conv5_bias); // NOTE: In place operation
+
+    void* conv5_tanh = tensorTanh(conv5out);
+
+    void* pool5out = tensorPooling(conv5_tanh, 0, 2, 2, 0, 0, 2, 2);
+
+    // final FC Layer
+    void* gemm1out = tensorGemmGPU(pool5out, fc1_weights);
+
+    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+
+    void* result = tensorSoftmax(gemm1biasout);
+
+    printTensorDims(result);
+
+    // End profiling and dump output to profile.txt
+    stopProfiling();
+
+    computeAccuracy2(labels, test_batch_size, result);
+
+    dumpAccuracyNorms();
+    freeOutputTensors();
+
+    if(Opentuner_run){
+
+      const char* myfifo = "/tmp/myfifo";
+      int fd_out = open(myfifo, O_WRONLY);
+      int ret_val = fcntl(fd_out, F_GETFD);
+      if(ret_val == -1){
+        printf("Invalid descriptor \n");
+        abort();
+      }
+      // 80-byte zero-padded buffer: the 80-byte write() below must not read past the message.
+      const char str[80] = "completed***!\n\0";
+      write(fd_out, str, sizeof(str));
+      close(fd_out);
+    }
+
+  }
+
+
+
+}
+
+
+int main(int argc, char* argv[]){
+  // Any command-line argument (its value is ignored) switches the run into OpenTuner mode.
+  if(argc > 1)
+    Opentuner_run = true;
+  // NOTE(review): the meaning of the argument 1 is not visible here - confirm against tensor_runtime.h.
+  llvm_hpvm_initTensorRt(1);
+
+  testCifarNet();
+
+  llvm_hpvm_cleanupTensorRt();
+
+  return 0;
+}
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet.cc
index ba7af9846916057fedc05757bdad77fefb01590e..107024c81a7d8124a46528f7a59fac5af340bcac 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet.cc
@@ -1,4 +1,5 @@
 
+
 #include <stdio.h> 
 #include <stdlib.h> 
 #include <unistd.h> 
@@ -6,15 +7,15 @@
 #include <sys/types.h> 
 #include <sys/stat.h> 
 #include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h" 
+#include "../../tensor_runtime/include/tensor_runtime.h"
 #include "../include/utils.h" 
 
 int main(){ 
 
-  llvm_hpvm_initTensorRt(1); 
+  llvm_hpvm_initTensorRt(0); 
 
 
-  std::string dir_prefix = std::string("../model_params/mobilenet_hpvm_3/"); 
+  std::string dir_prefix = std::string("../model_params/mobilenet/"); 
   std::string input_path =  dir_prefix + std::string("input.bin"); 
   std::string labels_path =  dir_prefix + std::string("labels.bin"); 
   std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
@@ -296,8 +297,8 @@ int main(){
 
   startMemTracking(); 
 
-  int test_input_size = 3000; 
-  int batch_size = 1000; 
+  int test_input_size = 5000; 
+  int batch_size = 2500;  
   int batch_count = test_input_size / batch_size; 
   float final_accuracy = 0.0; 
 
@@ -311,95 +312,95 @@ int main(){
     void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
     void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
     void* var_2 = tensorRelu(var_1); 
-    void* var_3 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-    void* var_4 = tensorBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-    void* var_5 = tensorRelu(var_4); 
-    void* var_6 = tensorConvolution(var_5, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-    void* var_7 = tensorBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-    void* var_8 = tensorRelu(var_7); 
-    void* var_9 = tensorConvolution(var_8, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-    void* var_10 = tensorBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-    void* var_11 = tensorRelu(var_10); 
-    void* var_12 = tensorConvolution(var_11, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
-    void* var_13 = tensorBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-    void* var_14 = tensorRelu(var_13); 
-    void* var_15 = tensorConvolution(var_14, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-    void* var_16 = tensorBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
-    void* var_17 = tensorRelu(var_16); 
-    void* var_18 = tensorConvolution(var_17, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-    void* var_19 = tensorBatchNorm(var_18, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
+    void* var_4 = tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
+    void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
+    void* var_6 = tensorRelu(var_5); 
+    void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
+    void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
+    void* var_9 = tensorRelu(var_8); 
+    void* var_11 = tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
+    void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
+    void* var_13 = tensorRelu(var_12); 
+    void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
+    void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
+    void* var_16 = tensorRelu(var_15); 
+    void* var_18 = tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
+    void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
     void* var_20 = tensorRelu(var_19); 
-    void* var_22 = tensorConvolution(var_20, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-    void* var_23 = tensorBatchNorm(var_22, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-    void* var_24 = tensorRelu(var_23); 
-    void* var_25 = tensorConvolution(var_24, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-    void* var_26 = tensorBatchNorm(var_25, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-    void* var_27 = tensorRelu(var_26); 
-    void* var_28 = tensorConvolution(var_27, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-    void* var_29 = tensorBatchNorm(var_28, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-    void* var_30 = tensorRelu(var_29); 
-    void* var_31 = tensorConvolution(var_30, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-    void* var_32 = tensorBatchNorm(var_31, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-    void* var_33 = tensorRelu(var_32); 
-    void* var_35 = tensorConvolution(var_33, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-    void* var_36 = tensorBatchNorm(var_35, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
-    void* var_37 = tensorRelu(var_36); 
-    void* var_38 = tensorConvolution(var_37, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
-    void* var_39 = tensorBatchNorm(var_38, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
-    void* var_40 = tensorRelu(var_39); 
-    void* var_41 = tensorConvolution(var_40, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-    void* var_42 = tensorBatchNorm(var_41, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
+    void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
+    void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
+    void* var_23 = tensorRelu(var_22); 
+    void* var_26 = tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
+    void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
+    void* var_28 = tensorRelu(var_27); 
+    void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
+    void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
+    void* var_31 = tensorRelu(var_30); 
+    void* var_33 = tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
+    void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
+    void* var_35 = tensorRelu(var_34); 
+    void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
+    void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
+    void* var_38 = tensorRelu(var_37); 
+    void* var_41 = tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
+    void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
     void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorConvolution(var_43, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-    void* var_45 = tensorBatchNorm(var_44, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
+    void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
+    void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
     void* var_46 = tensorRelu(var_45); 
-    void* var_47 = tensorConvolution(var_46, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-    void* var_48 = tensorBatchNorm(var_47, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-    void* var_49 = tensorRelu(var_48); 
-    void* var_50 = tensorConvolution(var_49, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-    void* var_51 = tensorBatchNorm(var_50, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-    void* var_52 = tensorRelu(var_51); 
-    void* var_54 = tensorConvolution(var_52, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-    void* var_55 = tensorBatchNorm(var_54, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
-    void* var_56 = tensorRelu(var_55); 
-    void* var_57 = tensorConvolution(var_56, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
-    void* var_58 = tensorBatchNorm(var_57, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
-    void* var_59 = tensorRelu(var_58); 
-    void* var_60 = tensorConvolution(var_59, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-    void* var_61 = tensorBatchNorm(var_60, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-    void* var_62 = tensorRelu(var_61); 
-    void* var_63 = tensorConvolution(var_62, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-    void* var_64 = tensorBatchNorm(var_63, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
+    void* var_48 = tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
+    void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
+    void* var_50 = tensorRelu(var_49); 
+    void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
+    void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
+    void* var_53 = tensorRelu(var_52); 
+    void* var_55 = tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
+    void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
+    void* var_57 = tensorRelu(var_56); 
+    void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
+    void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
+    void* var_60 = tensorRelu(var_59); 
+    void* var_63 = tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
+    void* var_64 = tensorBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
     void* var_65 = tensorRelu(var_64); 
-    void* var_66 = tensorConvolution(var_65, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-    void* var_67 = tensorBatchNorm(var_66, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
+    void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
+    void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
     void* var_68 = tensorRelu(var_67); 
-    void* var_69 = tensorConvolution(var_68, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
-    void* var_70 = tensorBatchNorm(var_69, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-    void* var_71 = tensorRelu(var_70); 
-    void* var_73 = tensorConvolution(var_71, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-    void* var_74 = tensorBatchNorm(var_73, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
+    void* var_70 = tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
+    void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
+    void* var_72 = tensorRelu(var_71); 
+    void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
+    void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
     void* var_75 = tensorRelu(var_74); 
-    void* var_76 = tensorConvolution(var_75, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
-    void* var_77 = tensorBatchNorm(var_76, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-    void* var_78 = tensorRelu(var_77); 
-    void* var_79 = tensorConvolution(var_78, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-    void* var_80 = tensorBatchNorm(var_79, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-    void* var_81 = tensorRelu(var_80); 
-    void* var_82 = tensorConvolution(var_81, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-    void* var_83 = tensorBatchNorm(var_82, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-    void* var_84 = tensorRelu(var_83); 
-    void* var_86 = tensorPooling(var_84,1,2,2,0,0,2,2); 
-    void* var_88 = tensorGemmGPU(var_86, dense_1_w); 
-    void* var_89 = tensorAdd(var_88, dense_1_b); 
-    void* var_90 = tensorSoftmax(var_89); 
+    void* var_77 = tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
+    void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
+    void* var_79 = tensorRelu(var_78); 
+    void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
+    void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
+    void* var_82 = tensorRelu(var_81); 
+    void* var_85 = tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
+    void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
+    void* var_87 = tensorRelu(var_86); 
+    void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
+    void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
+    void* var_90 = tensorRelu(var_89); 
+    void* var_92 = tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
+    void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
+    void* var_94 = tensorRelu(var_93); 
+    void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
+    void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
+    void* var_97 = tensorRelu(var_96); 
+    void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); 
+    void* var_101 = tensorGemmGPU(var_99, dense_1_w); 
+    void* var_102 = tensorAdd(var_101, dense_1_b); 
+    void* var_103 = tensorSoftmax(var_102); 
 
     uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
 
-    float accuracy = computeAccuracy2(labels, batch_size, var_90); 
+    float accuracy = computeAccuracy2(labels, batch_size, var_103); 
     final_accuracy += accuracy; 
     freeBatchMemory(); 
- 
+
   }
 
   final_accuracy = final_accuracy / batch_count; 
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet_depthwise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet_old.cc
similarity index 88%
rename from llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet_depthwise.cc
rename to llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet_old.cc
index 107024c81a7d8124a46528f7a59fac5af340bcac..ba7af9846916057fedc05757bdad77fefb01590e 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet_depthwise.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/mobilenet_old.cc
@@ -1,5 +1,4 @@
 
-
 #include <stdio.h> 
 #include <stdlib.h> 
 #include <unistd.h> 
@@ -7,15 +6,15 @@
 #include <sys/types.h> 
 #include <sys/stat.h> 
 #include <string.h> 
-#include "../../tensor_runtime/include/tensor_runtime.h"
+#include "../../tensor_runtime/include/tensor_runtime.h" 
 #include "../include/utils.h" 
 
 int main(){ 
 
-  llvm_hpvm_initTensorRt(0); 
+  llvm_hpvm_initTensorRt(1); 
 
 
-  std::string dir_prefix = std::string("../model_params/mobilenet/"); 
+  std::string dir_prefix = std::string("../model_params/mobilenet_hpvm_3/"); 
   std::string input_path =  dir_prefix + std::string("input.bin"); 
   std::string labels_path =  dir_prefix + std::string("labels.bin"); 
   std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
@@ -297,8 +296,8 @@ int main(){
 
   startMemTracking(); 
 
-  int test_input_size = 5000; 
-  int batch_size = 2500;  
+  int test_input_size = 3000; 
+  int batch_size = 1000; 
   int batch_count = test_input_size / batch_size; 
   float final_accuracy = 0.0; 
 
@@ -312,95 +311,95 @@ int main(){
     void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); 
     void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); 
     void* var_2 = tensorRelu(var_1); 
-    void* var_4 = tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
-    void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
-    void* var_6 = tensorRelu(var_5); 
-    void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
-    void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
-    void* var_9 = tensorRelu(var_8); 
-    void* var_11 = tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
-    void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
-    void* var_13 = tensorRelu(var_12); 
-    void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
-    void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
-    void* var_16 = tensorRelu(var_15); 
-    void* var_18 = tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
-    void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
+    void* var_3 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); 
+    void* var_4 = tensorBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); 
+    void* var_5 = tensorRelu(var_4); 
+    void* var_6 = tensorConvolution(var_5, conv2d_2_w, 0, 0, 1, 1, 1, 1); 
+    void* var_7 = tensorBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); 
+    void* var_8 = tensorRelu(var_7); 
+    void* var_9 = tensorConvolution(var_8, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); 
+    void* var_10 = tensorBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); 
+    void* var_11 = tensorRelu(var_10); 
+    void* var_12 = tensorConvolution(var_11, conv2d_3_w, 0, 0, 1, 1, 1, 1); 
+    void* var_13 = tensorBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); 
+    void* var_14 = tensorRelu(var_13); 
+    void* var_15 = tensorConvolution(var_14, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); 
+    void* var_16 = tensorBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); 
+    void* var_17 = tensorRelu(var_16); 
+    void* var_18 = tensorConvolution(var_17, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
+    void* var_19 = tensorBatchNorm(var_18, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
     void* var_20 = tensorRelu(var_19); 
-    void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); 
-    void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); 
-    void* var_23 = tensorRelu(var_22); 
-    void* var_26 = tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
-    void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
-    void* var_28 = tensorRelu(var_27); 
-    void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
-    void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
-    void* var_31 = tensorRelu(var_30); 
-    void* var_33 = tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
-    void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
-    void* var_35 = tensorRelu(var_34); 
-    void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
-    void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
-    void* var_38 = tensorRelu(var_37); 
-    void* var_41 = tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
-    void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
+    void* var_22 = tensorConvolution(var_20, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); 
+    void* var_23 = tensorBatchNorm(var_22, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); 
+    void* var_24 = tensorRelu(var_23); 
+    void* var_25 = tensorConvolution(var_24, conv2d_5_w, 0, 0, 1, 1, 1, 1); 
+    void* var_26 = tensorBatchNorm(var_25, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); 
+    void* var_27 = tensorRelu(var_26); 
+    void* var_28 = tensorConvolution(var_27, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); 
+    void* var_29 = tensorBatchNorm(var_28, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); 
+    void* var_30 = tensorRelu(var_29); 
+    void* var_31 = tensorConvolution(var_30, conv2d_6_w, 0, 0, 1, 1, 1, 1); 
+    void* var_32 = tensorBatchNorm(var_31, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); 
+    void* var_33 = tensorRelu(var_32); 
+    void* var_35 = tensorConvolution(var_33, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); 
+    void* var_36 = tensorBatchNorm(var_35, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); 
+    void* var_37 = tensorRelu(var_36); 
+    void* var_38 = tensorConvolution(var_37, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
+    void* var_39 = tensorBatchNorm(var_38, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
+    void* var_40 = tensorRelu(var_39); 
+    void* var_41 = tensorConvolution(var_40, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
+    void* var_42 = tensorBatchNorm(var_41, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
     void* var_43 = tensorRelu(var_42); 
-    void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); 
-    void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); 
+    void* var_44 = tensorConvolution(var_43, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
+    void* var_45 = tensorBatchNorm(var_44, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
     void* var_46 = tensorRelu(var_45); 
-    void* var_48 = tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); 
-    void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); 
-    void* var_50 = tensorRelu(var_49); 
-    void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); 
-    void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); 
-    void* var_53 = tensorRelu(var_52); 
-    void* var_55 = tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
-    void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
-    void* var_57 = tensorRelu(var_56); 
-    void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
-    void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
-    void* var_60 = tensorRelu(var_59); 
-    void* var_63 = tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
-    void* var_64 = tensorBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
+    void* var_47 = tensorConvolution(var_46, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); 
+    void* var_48 = tensorBatchNorm(var_47, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); 
+    void* var_49 = tensorRelu(var_48); 
+    void* var_50 = tensorConvolution(var_49, conv2d_9_w, 0, 0, 1, 1, 1, 1); 
+    void* var_51 = tensorBatchNorm(var_50, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); 
+    void* var_52 = tensorRelu(var_51); 
+    void* var_54 = tensorConvolution(var_52, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); 
+    void* var_55 = tensorBatchNorm(var_54, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); 
+    void* var_56 = tensorRelu(var_55); 
+    void* var_57 = tensorConvolution(var_56, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
+    void* var_58 = tensorBatchNorm(var_57, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
+    void* var_59 = tensorRelu(var_58); 
+    void* var_60 = tensorConvolution(var_59, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
+    void* var_61 = tensorBatchNorm(var_60, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
+    void* var_62 = tensorRelu(var_61); 
+    void* var_63 = tensorConvolution(var_62, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
+    void* var_64 = tensorBatchNorm(var_63, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
     void* var_65 = tensorRelu(var_64); 
-    void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); 
-    void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); 
+    void* var_66 = tensorConvolution(var_65, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
+    void* var_67 = tensorBatchNorm(var_66, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
     void* var_68 = tensorRelu(var_67); 
-    void* var_70 = tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); 
-    void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); 
-    void* var_72 = tensorRelu(var_71); 
-    void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); 
-    void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); 
+    void* var_69 = tensorConvolution(var_68, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
+    void* var_70 = tensorBatchNorm(var_69, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
+    void* var_71 = tensorRelu(var_70); 
+    void* var_73 = tensorConvolution(var_71, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
+    void* var_74 = tensorBatchNorm(var_73, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
     void* var_75 = tensorRelu(var_74); 
-    void* var_77 = tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); 
-    void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); 
-    void* var_79 = tensorRelu(var_78); 
-    void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); 
-    void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); 
-    void* var_82 = tensorRelu(var_81); 
-    void* var_85 = tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); 
-    void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); 
-    void* var_87 = tensorRelu(var_86); 
-    void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
-    void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
-    void* var_90 = tensorRelu(var_89); 
-    void* var_92 = tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
-    void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
-    void* var_94 = tensorRelu(var_93); 
-    void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
-    void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
-    void* var_97 = tensorRelu(var_96); 
-    void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); 
-    void* var_101 = tensorGemmGPU(var_99, dense_1_w); 
-    void* var_102 = tensorAdd(var_101, dense_1_b); 
-    void* var_103 = tensorSoftmax(var_102); 
+    void* var_76 = tensorConvolution(var_75, conv2d_13_w, 0, 0, 1, 1, 1, 1); 
+    void* var_77 = tensorBatchNorm(var_76, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); 
+    void* var_78 = tensorRelu(var_77); 
+    void* var_79 = tensorConvolution(var_78, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); 
+    void* var_80 = tensorBatchNorm(var_79, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); 
+    void* var_81 = tensorRelu(var_80); 
+    void* var_82 = tensorConvolution(var_81, conv2d_14_w, 0, 0, 1, 1, 1, 1); 
+    void* var_83 = tensorBatchNorm(var_82, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); 
+    void* var_84 = tensorRelu(var_83); 
+    void* var_86 = tensorPooling(var_84,1,2,2,0,0,2,2); 
+    void* var_88 = tensorGemmGPU(var_86, dense_1_w); 
+    void* var_89 = tensorAdd(var_88, dense_1_b); 
+    void* var_90 = tensorSoftmax(var_89); 
 
     uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); 
 
-    float accuracy = computeAccuracy2(labels, batch_size, var_103); 
+    float accuracy = computeAccuracy2(labels, batch_size, var_90); 
     final_accuracy += accuracy; 
     freeBatchMemory(); 
-
+ 
   }
 
   final_accuracy = final_accuracy / batch_count;