diff --git a/llvm/projects/hpvm-tensor-rt/bin/compute_install_times.py b/llvm/projects/hpvm-tensor-rt/bin/compute_install_times.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e59b72f023a7869e721ba62f923f5e4ca791113
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/bin/compute_install_times.py
@@ -0,0 +1,116 @@
+
+
+class TuningParameters:
+  def __init__(self):
+    self.iterations_measured = 150   # OpenTuner iterations actually profiled
+    self.total_iterations = 30000    # iterations in a full autotuning run
+
+    
+tunerParams = TuningParameters()
+
+
+class Benchmark:
+  def __init__(self):
+    self.tuner_time = 0
+
+
+### All times are real profiled times on the Jetson Board
+### Times are for 150 OpenTuner iterations on Jetson
+
+ResNet50 = Benchmark()
+ResNet50.tuner_time = 3.85 * 100 * 150  # 3.85 s per batch (50 images) * 100 batches * 150 iterations
+
+VGG16_ImageNet = Benchmark()
+VGG16_ImageNet.tuner_time = 4.55 * 100 * 150  # 4.55 s per batch (50 images) * 100 batches * 150 iterations
+
+AlexNet_ImageNet = Benchmark()
+AlexNet_ImageNet.tuner_time = 0.7 * 100 * 150  # 0.7 s per batch (50 images) * 100 batches * 150 iterations
+
+
+VGG16_CIFAR10 = Benchmark()
+VGG16_CIFAR10.tuner_time = 1.54 * 60 * 60  # 1.54 hrs converted to seconds
+
+VGG16_CIFAR100 = Benchmark()
+VGG16_CIFAR100.tuner_time = 1.57 * 60 * 60  # 1.57 hrs converted to seconds
+
+ResNet18 = Benchmark()
+ResNet18.tuner_time = 0.52 * 60 * 60  # 0.52 hrs converted to seconds (12.9 measured for 1000 images)
+
+MobileNet = Benchmark()
+MobileNet.tuner_time = 0.72 * 60 * 60  # 0.72 hrs converted to seconds
+
+AlexNet_CIFAR10 = Benchmark()
+AlexNet_CIFAR10.tuner_time = 0.67 * 60 * 60  # 0.67 hrs converted to seconds
+
+AlexNet2_CIFAR10 = Benchmark()
+AlexNet2_CIFAR10.tuner_time = 0.19 * 60 * 60  # 0.19 hrs converted to seconds
+
+LeNet_CIFAR10 = Benchmark()
+LeNet_CIFAR10.tuner_time = 0.11 * 60 * 60  # 0.11 hrs converted to seconds
+
+
+
+
+
+def getInstallTime(Bench):
+
+  ## We limit pareto configs to 50 after the tuning iterations complete
+
+  # Number of measured tuning windows needed to cover the full run
+  tuner_invocations = tunerParams.total_iterations / tunerParams.iterations_measured
+
+  extrapolated_time = tuner_invocations * Bench.tuner_time  # seconds
+
+  time_hours = extrapolated_time / (60 * 60)
+
+  return time_hours
+
+  
+
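+# Worked example (editorial sketch, derived from the numbers above): for
+# ResNet50, tuner_time = 3.85 * 100 * 150 = 57,750 s per 150-iteration tuning
+# run. With total_iterations = 30000, tuner_invocations = 30000 / 150 = 200,
+# so extrapolated_time = 200 * 57,750 s and the reported install time is
+# 200 * 57,750 / 3600 ~= 3208 hours.
+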
+# Routine to compute extrapolated tuning times
+def computeExtrapolatedInstallTime():
+
+
+    resnet50_time = getInstallTime(ResNet50)
+    print ("*** ResNet50 time (hrs) = ", resnet50_time)
+
+    resnet18_time = getInstallTime(ResNet18)
+    print ("*** ResNet18 time (hrs) = ", resnet18_time)
+
+    mobilenet_time = getInstallTime(MobileNet)
+    print ("*** MobileNet time (hrs) = ", mobilenet_time)
+    
+    vgg16_img_time = getInstallTime(VGG16_ImageNet)
+    print ("*** VGG16-Imagenet time (hrs) = ", vgg16_img_time)
+
+    vgg16_cifar10_time = getInstallTime(VGG16_CIFAR10)
+    print ("*** VGG16-CIFAR10 time (hrs) = ", vgg16_cifar10_time)
+
+    vgg16_cifar100_time = getInstallTime(VGG16_CIFAR100)
+    print ("*** VGG16-CIFAR100 time (hrs) = ", vgg16_cifar100_time)
+
+    alexnet_img_time = getInstallTime(AlexNet_ImageNet)
+    print ("*** AlexNet-Imagenet time (hrs) = ", alexnet_img_time)
+
+    alexnet_cifar10_time = getInstallTime(AlexNet_CIFAR10)
+    print ("*** AlexNet-CIFAR10 time (hrs) = ", alexnet_cifar10_time)
+
+    alexnet2_cifar10_time = getInstallTime(AlexNet2_CIFAR10)
+    print ("*** AlexNet2-CIFAR10 time (hrs) = ", alexnet2_cifar10_time)
+
+    lenet_cifar10_time = getInstallTime(LeNet_CIFAR10)
+    print ("*** LeNet-CIFAR10 time (hrs) = ", lenet_cifar10_time)
+
+
+
+  
+
+if __name__ == "__main__":
+
+    computeExtrapolatedInstallTime()
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
index 0aa33bc43dace1f847f44ed7ad6dcfc0082d014a..65d6335f75fb5f3e9469e42507e063a2b526aee8 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
@@ -4,15 +4,18 @@
 #include <unistd.h> 
 #include <fcntl.h> 
 #include <sys/types.h> 
-#include <sys/stat.h> 
+#include <sys/stat.h>
+#include <vector>
 #include <string.h> 
 #include "tensor_runtime.h" 
 #include "utils.h" 
 
+#include "tensor_custom_ops_cpu.h"
 
 
 
-Tensor *gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) {
+
+Tensor* gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) {
   int64_t m = (w - 1) / 2, n = (h - 1) / 2;
   auto *data = new float[w * h];
   float sum = 0.0f;
@@ -57,64 +60,50 @@ TODOs:
 ****/
 
 void* canny_filter(void* dataset) {
-  Tensor *gaussian = gaussianFilter(1.4, 5, 5, 1);
-  Tensor *kernel_x, *kernel_y;
+
+  Tensor* gaussian = gaussianFilter(1.4, 5, 5, 1);
+  Tensor* kernel_x, *kernel_y;
   std::tie(kernel_x, kernel_y) = getSobelKernels();
 
   // 0. Grayscale
-  auto *summed_image = autotuner_tensorReduce(dataset, 1, MathOp::Add);
-  auto *grayscale_image = autotuner_tensorMap1(MathOp::Avg3, summed_image);
+  auto* summed_image = tensorReduce(dataset, 1, MathOp::Add);
+  auto* grayscale_image = tensorMap1(MathOp::Avg3, summed_image);
   // 1. Denoise
-  auto *image2 = ConvLayer_PROMISE(grayscale_image, 0.0, 0.0, gaussian,
-				   0.0, 0.0, nullptr, 0.0, 0.0, 2, 2, 1,
-				   1, 0, 0, -1, 0.0, 0.0, 0);
+
+  auto* image2 = tensorConvolution(grayscale_image, gaussian,
+				   2, 2, // padding
+				   1, 1, // strides
+				   1, 0); // conv_mode, conv_groups
+				    
   // 2. Get edge gradient / direction
-  auto *grad_x = ConvLayer_PROMISE(
-      image2, 0.0, 0.0, kernel_x, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0,
-      -1, 0.0, 0.0, 0);
-  auto *grad_y = ConvLayer_PROMISE(
-      image2, 0.0, 0.0, kernel_y, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0,
-      -1, 0.0, 0.0, 0);
-  auto *grad_mag = autotuner_tensorMap2(MathOp::Hypot, grad_x, grad_y);
+  auto *grad_x = tensorConvolution(image2, kernel_x,
+				   1, 1,
+				   1, 1,
+				   1, 0);
+   
+  auto *grad_y = tensorConvolution(image2, kernel_y,
+				   1, 1,
+				   1, 1,
+				   1, 0);
+ 
+  auto *grad_mag = tensorMap2(MathOp::Hypot, grad_x, grad_y);
   // 2.5. Normalize grad magnitude
-  auto *grad_max_1D = autotuner_tensorReduce(grad_mag, 2, MathOp::Max);
-  auto *grad_max = autotuner_tensorReduce(grad_max_1D, 3, MathOp::Max);
-  auto *grad_mag_norm = autotuner_tensorMap2(MathOp::Div, grad_mag, grad_max);
+  auto *grad_max_1D = tensorReduce(grad_mag, 2, MathOp::Max);
+  auto *grad_max = tensorReduce(grad_max_1D, 3, MathOp::Max);
+  auto *grad_mag_norm = tensorMap2(MathOp::Div, grad_mag, grad_max);
   return grad_mag_norm;
 }
 
-const size_t batch_size = 500, total_max = 3000;
-const float psnr_threshold = 30.0;
 
 
 
+void* invoke_canny(void* input) {
+  
+  auto* result = canny_filter(input);
 
-int main() {
-  const char *input_path = "../model_params/image_processing_5k";
-  const char *ref_output_path = "../model_params/canny_ref_output";
-  std::vector<float> psnr;
-  llvm_hpvm_initTensorRt(1);
-  size_t bstart = 0;
-  startMemTracking();
-  while (true) {
-    Tensor *batch = readDataSet(input_path, bstart, batch_size);
-    if (batch == nullptr)
-      break;
-
-    auto *result = main_procedure(batch);
-    auto *ref_output = readDataSet(ref_output_path, bstart, batch_size, 1);
-    std::vector<float> psnr_batch = PSNR(ref_output, result);
-    std::copy(psnr_batch.begin(), psnr_batch.end(), std::back_inserter(psnr));
-    bstart += batch_size;
-    if (bstart >= total_max)
-      break;
-    freeBatchMemory();
-  }
-  float violation = violationRate(psnr, psnr_threshold);
-  float mean_psnr = mean(psnr);
-  std::ofstream of("final_accuracy");
-  of << violation * 100 << ", " << mean_psnr << '\n';
-  return 0;
+  printf("Done with Canny \n");
+  
+  return result;
 }
 
 
@@ -128,84 +117,128 @@ int main(){
 
   llvm_hpvm_initTensorRt(0); 
 
-
-  //std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/");
-  std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); 
+  std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); 
+  std::string input_path =  dir_prefix + std::string("norm_cifar_input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin"); 
+
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin",
+					  float_type, 32, 3, 3, 3);  
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin",
+					  float_type, 32, 32, 3, 3);  
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin",
+					float_type, 1, 32, 1, 1);
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin",
+					  float_type, 64, 32, 3, 3);  
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin",
+					  float_type, 64, 64, 3, 3);  
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin",
+					float_type, 1, 64, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin",
+					  float_type, 128, 64, 3, 3);  
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin",
+					float_type, 1, 128, 1, 1);
+  void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin",
+					  float_type, 128, 128, 3, 3);  
+  void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin",
+					float_type, 1, 128, 1, 1);
+  
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin",
+					 float_type, 1, 1, 2048, 10);  
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin",
+				      float_type, 1, 10, 1, 1);  
  
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv0.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv_bias0.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv3.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv_bias3.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv6.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv_bias6.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv7.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv_bias7.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv8.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv_bias8.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("fc12.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("fc_bias12.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
-
-  startMemTracking();
 
-  int test_input_size = 2000;
-  int batch_size = 2000;
+  int test_input_size = 5000;
+  int batch_size = 500;
   int batch_count = test_input_size / batch_size;
   float final_accuracy = 0.0;
 
   // NOTE: Starting time profiling
-  startProfiling();
-  
+  startProfiling();  
+  startMemTracking();
+
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision = 0; // NOTE: using Float as compute precision. FIXME: use an enum
+
   for(int i = 0; i < batch_count; i++){
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorTanh(var_1); 
-    void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
-    void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
-    void* var_6 = tensorAdd(var_5, conv2d_2_b); 
-    void* var_7 = tensorTanh(var_6); 
-    void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
-    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorTanh(var_11); 
-    void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_14 = tensorAdd(var_13, conv2d_4_b); 
-    void* var_15 = tensorTanh(var_14); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorTanh(var_17); 
-    void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
-    void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
-    void* var_23 = tensorAdd(var_22, dense_1_b); 
-    void* var_24 = tensorSoftmax(var_23); 
+
+
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+
+    void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv1out, conv1_bias); 
+    void* conv1_tanh = tensorTanh(conv1out);
+    
+    // 2nd Layer
+    void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv2out, conv2_bias); 
+    void* conv2_tanh = tensorTanh(conv2out);
+    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+     
+    // 3rd Layer
+    void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv3out, conv3_bias); 
+    void* conv3_tanh = tensorTanh(conv3out);
+
+    // 4th Layer
+    void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv4out, conv4_bias); 
+    void* conv4_tanh = tensorTanh(conv4out);
+    void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // 5th Layer
+    void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv5out, conv5_bias); 
+    void* conv5_tanh = tensorTanh(conv5out);
+
+    // 6th Layer
+    void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv6out, conv6_bias); 
+  
+    void* conv6_tanh = tensorTanh(conv6out);
+    void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // final FC Layer
+    void* gemm1out = tensorGemmGPU(pool6out, fc1_weights);  
+    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+    void* result = tensorSoftmax(gemm1biasout);
 
     uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
 
-    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    float accuracy = computeAccuracy2(labels, batch_size, result); 
     final_accuracy += accuracy;
+
+
+    // Keep only the images predicted as one of classes 1-5; these are
+    // the inputs forwarded to the Canny edge-detection stage below
+    std::vector<int> index_vector;
+    index_vector.push_back(1);
+    index_vector.push_back(2);
+    index_vector.push_back(3);
+    index_vector.push_back(4);
+    index_vector.push_back(5);
     
-    freeBatchMemory();
+
+    void* argmax_out = tensorArgMax(result);                     // per-image predicted class index
+    void* select_out = tensorSelect2(argmax_out, index_vector);  // bitvector marking selected images
+    void* reduced_input = tensorContract(input, select_out);     // batch shrunk to the selected images
+
+    invoke_canny(reduced_input);
+
+    freeBatchMemory();
   }
 
   stopProfiling();
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
index 5c48b3b01f2641576e6ac725ae0a81f03d6a5dbb..95b571e5a0d710cf71f0bb714e658420751abf53 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
@@ -6,6 +6,7 @@
 #include "tensor_runtime.h"
 #include "utils.h"
 
+#include "tensor_custom_ops_cpu.h"
 
 
 void testTensorGemm(){
@@ -1098,6 +1099,105 @@ void testSampling_1_1(){
 
 
 
+void* testTensorArgMax(){
+
+  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1);
+ 
+  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+
+  // Input 0
+  host_ptr[0] = 1;
+  host_ptr[1] = 7; // highest - max index = 1
+  host_ptr[2] = 3;
+
+  // Input 1
+  host_ptr[3] = 3;
+  host_ptr[4] = 3;
+  host_ptr[5] = 8; // highest - max index = 2
+
+  // Input 2
+  host_ptr[6] = 2;
+  host_ptr[7] = 5;
+  host_ptr[8] = 9; // highest - max index = 2
+
+  // Input 3
+  host_ptr[9] = 11; // highest - max index = 0
+  host_ptr[10] = 2;
+  host_ptr[11] = 8;
+
+  void* argmax_out = tensorArgMax(input);
+  
+  // Expect Output of call below to be:  
+  //   1    2    2    0
+  printTensorValues(argmax_out);
+
+  return argmax_out; 
+}
+
+
+
+void* testTensorSelect(void* argmax_out){
+
+  void* select_out = tensorSelect(argmax_out, 2);
+  printf ("***** tensorSelect output \n");
+
+  // For argmax_out = 1 2 2 0 above, expect:
+  //   0    1    1    0
+  printTensorValues(select_out);
+
+  return select_out; 
+}
+
+
+void testTensorContract(void* select_out){
+
+  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1);
+  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+
+  // Input 0
+  host_ptr[0] = 1;
+  host_ptr[1] = 1; 
+  host_ptr[2] = 1;
+  host_ptr[3] = 1;
+
+  // Input 1
+  host_ptr[4] = 2;
+  host_ptr[5] = 2;
+  host_ptr[6] = 2;
+  host_ptr[7] = 2;
+  
+  // Input 2
+  host_ptr[8] = 3;
+  host_ptr[9] = 3;
+  host_ptr[10] = 3; 
+  host_ptr[11] = 3; 
+
+  // Input 3
+  host_ptr[12] = 4; 
+  host_ptr[13] = 4;
+  host_ptr[14] = 4;
+  host_ptr[15] = 4;
+
+
+  void* contract_out = tensorContract(input, select_out);
+  printf ("***** tensorContract output \n");
+
+  // With select_out = 0 1 1 0 (from testTensorSelect), expect:
+  //   2 2 2 2    3 3 3 3
+  printTensorValues(contract_out);
+}
+
+
+
+void testNewTensorOps(){
+
+  void* argmax_out = testTensorArgMax();
+  void* select_out = testTensorSelect(argmax_out);
+  testTensorContract(select_out);
+  
+}
+
+
+
+
 
 
 
@@ -1137,21 +1237,22 @@ int main(){
 
   // testPerforation2();
 
-
-  //testSampling();
   
 
-  //testSampling2();
 
-
-  //testSampling3();
-  
+  /********* SAMPLING TESTS ****
 
   testSampling_3_3();
 
   
   testSampling_1_1();
 
+  *************/
+
+
+  testNewTensorOps();
+
+  
 
 
   //testQuantization();
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9128c1a24ca5bd95a7e6fb9e962d56501558f8f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
@@ -0,0 +1,178 @@
+#ifndef TENSOR_CUSTOM_OPS_CPU_H
+#define TENSOR_CUSTOM_OPS_CPU_H
+
+#include "tensor.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+
+void* tensorArgMax(void* input_ptr){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);
+    
+  float* out_ptr = (float*) output->host_data;
+  
+  for(int i = 0; i < batch_size; i++){
+
+    int start = i * channels;
+    float max_index = 0;  // index kept as float: the output tensor stores floats
+    float max_val = host_ptr[start];
+    for(int j = 0; j < channels; j++){
+      
+      int index = start + j;
+      //printf ("index = %d \n", index);
+      float val = host_ptr[index];
+      if (val > max_val){
+	max_val = val;
+	max_index = j;
+      }	
+    }
+
+    out_ptr[i] = max_index;
+  }
+  
+
+  return output;
+  
+}
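+
+// Example (editorial sketch): for a 4x3x1x1 input, tensorArgMax reduces over
+// the channel dimension and returns a 4x1x1x1 tensor holding, for each image,
+// the (float-encoded) index of its largest channel value.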
+
+
+
+
+
+void* tensorSelect(void* input_ptr, float target_value){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+
+  if (channels != 1){
+    printf("* Channels dimension must be 1 \n");
+    abort();
+  }
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);    
+  float* out_ptr = (float*) output->host_data;
+
+  for(int i = 0; i < batch_size; i++){
+    if (host_ptr[i] == target_value){
+      out_ptr[i] = 1;
+    }
+    else{
+      out_ptr[i] = 0;
+    }	   
+  }
+      
+  return output;
+}
+
+
+
+
+void* tensorSelect2(void* input_ptr, std::vector<int> index_vector){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+
+  if (channels != 1){
+    printf("* Channels dimension must be 1 \n");
+    abort();
+  }
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);    
+  float* out_ptr = (float*) output->host_data;
+
+  for(int i = 0; i < batch_size; i++){
+
+    // Mark entry i if its value matches any of the requested indices
+    out_ptr[i] = 0;
+    for(size_t j = 0; j < index_vector.size(); j++){
+      if (host_ptr[i] == index_vector[j]){
+	out_ptr[i] = 1;
+	break;
+      }
+    }
+  }
+      
+  return output;
+}
+
+
+
+
+
+
+long getOnesInVector(float* vector_host_ptr, long vector_length){
+
+  long ones_count = 0;
+  for(long i = 0; i < vector_length; i++){
+
+    if(vector_host_ptr[i] == 1)
+      ones_count += 1;
+  }
+
+  return ones_count;
+}
+
+
+void* tensorContract(void* input_ptr, void* bitvector_ptr){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  Tensor* bitvector = (Tensor*) bitvector_ptr;
+  float* vector_host_ptr = (float*) bitvector->host_data;  
+  long vector_length = bitvector->dims.dim_sizes[0];
+  
+  long reduced_batch_size = getOnesInVector(vector_host_ptr, vector_length); 
+  
+  long batch_size = input->dims.dim_sizes[0]; 
+  long channels = input->dims.dim_sizes[1];
+  long height = input->dims.dim_sizes[2];
+  long width = input->dims.dim_sizes[3];
+
+  long image_size = channels * height * width; // Computing size of each image
+  
+  if (batch_size != vector_length){
+    printf("ERROR: bitvector length has to match input batch size \n");
+    abort();
+  }
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, reduced_batch_size, channels, height, width);
+  changeTensorPlacement(output, HOST);    
+  float* out_ptr = (float*) output->host_data;
+
+  for(long i = 0; i < batch_size; i++){
+
+    // Include image i if the corresponding bitvector entry is 1
+    if (vector_host_ptr[i] == 1){
+
+      for(long j = 0; j < image_size; j++){
+	out_ptr[j] = host_ptr[i * image_size + j];
+      }
+
+      out_ptr += image_size; // Advance the output pointer to the next image boundary
+    }
+  }
+      
+  return output;
+}
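+
+// Example (editorial sketch): with a batch of 4 images and bitvector
+// [0, 1, 1, 0], tensorContract returns a tensor of reduced_batch_size 2
+// holding copies of images 1 and 2 (in original batch order); images 0
+// and 3 are dropped.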
+
+#endif // TENSOR_CUSTOM_OPS_CPU_H
diff --git a/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt b/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt
index 57f128d0e58bdad252b4b93cae526b8323d8779a..da42b2ad85397c72f2385724f4af52f3da6c0c78 100644
--- a/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt
+++ b/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt
@@ -1,282 +1,282 @@
 #Conv1,4
-Conv
-Add
-Relu
-Pool
-#NML1,1
+Conv1
+Add1
+Relu1
+Pool1
+BatchNorm1
 #Conv2,2
-Conv
-Add
+Conv2
+Add2
+BatchNorm2
+#NML1,1
+#Conv3,2
+Conv3
+Add3
+BatchNorm3
 #NML2,1
+#Conv4,2
+Conv4
+Add4
+BatchNorm4
+#Conv5,2
+Conv5
+Add5
+BatchNorm5
 #NML3,1
-#Conv3,2
-Conv
-Add
 #NML4,1
+#Conv6,2
+Conv6
+Add6
+BatchNorm6
 #NML5,1
-#Conv4,2
-Conv
-Add
+#Conv7,2
+Conv7
+Add7
+BatchNorm7
 #NML6,1
-#Conv5,2
-Conv
-Add
+#Conv8,2
+Conv8
+Add8
+BatchNorm8
 #NML7,1
 #NML8,1
+#Conv9,2
+Conv9
+Add9
+BatchNorm9
 #NML9,1
-#Conv6,2
-Conv
-Add
+#Conv10,2
+Conv10
+Add10
+BatchNorm10
 #NML10,1
+#Conv11,2
+Conv11
+Add11
+BatchNorm11
 #NML11,1
-#Conv7,2
-Conv
-Add
 #NML12,1
+#Conv12,2
+Conv12
+Add12
+BatchNorm12
 #NML13,1
-#Conv8,2
-Conv
-Add
+#Conv13,2
+Conv13
+Add13
+BatchNorm13
 #NML14,1
+#Conv14,2
+Conv14
+Add14
+BatchNorm14
+#Conv15,2
+Conv15
+Add15
+BatchNorm15
 #NML15,1
 #NML16,1
-#Conv9,2
-Conv
-Add
+#Conv16,2
+Conv16
+Add16
+BatchNorm16
 #NML17,1
+#Conv17,2
+Conv17
+Add17
+BatchNorm17
 #NML18,1
-#Conv10,2
-Conv
-Add
+#Conv18,2
+Conv18
+Add18
+BatchNorm18
 #NML19,1
 #NML20,1
-#Conv11,2
-Conv
-Add
+#Conv19,2
+Conv19
+Add19
+BatchNorm19
 #NML21,1
+#Conv20,2
+Conv20
+Add20
+BatchNorm20
 #NML22,1
+#Conv21,2
+Conv21
+Add21
+BatchNorm21
 #NML23,1
-#Conv12,2
-Conv
-Add
 #NML24,1
+#Conv22,2
+Conv22
+Add22
+BatchNorm22
 #NML25,1
-#Conv13,2
-Conv
-Add
+#Conv23,2
+Conv23
+Add23
+BatchNorm23
 #NML26,1
+#Conv24,2
+Conv24
+Add24
+BatchNorm24
 #NML27,1
-#Conv14,2
-Conv
-Add
 #NML28,1
-#Conv15,2
-Conv
-Add
+#Conv25,2
+Conv25
+Add25
+BatchNorm25
 #NML29,1
+#Conv26,2
+Conv26
+Add26
+BatchNorm26
 #NML30,1
+#Conv27,2
+Conv27
+Add27
+BatchNorm27
+#Conv28,2
+Conv28
+Add28
+BatchNorm28
 #NML31,1
-#Conv16,2
-Conv
-Add
 #NML32,1
+#Conv29,2
+Conv29
+Add29
+BatchNorm29
 #NML33,1
-#Conv17,2
-Conv
-Add
+#Conv30,2
+Conv30
+Add30
+BatchNorm30
 #NML34,1
+#Conv31,2
+Conv31
+Add31
+BatchNorm31
 #NML35,1
-#Conv18,2
-Conv
-Add
 #NML36,1
+#Conv32,2
+Conv32
+Add32
+BatchNorm32
 #NML37,1
+#Conv33,2
+Conv33
+Add33
+BatchNorm33
 #NML38,1
-#Conv19,2
-Conv
-Add
+#Conv34,2
+Conv34
+Add34
+BatchNorm34
 #NML39,1
 #NML40,1
-#Conv20,2
-Conv
-Add
+#Conv35,2
+Conv35
+Add35
+BatchNorm35
 #NML41,1
+#Conv36,2
+Conv36
+Add36
+BatchNorm36
 #NML42,1
-#Conv21,2
-Conv
-Add
+#Conv37,2
+Conv37
+Add37
+BatchNorm37
 #NML43,1
 #NML44,1
+#Conv38,2
+Conv38
+Add38
+BatchNorm38
 #NML45,1
-#Conv22,2
-Conv
-Add
+#Conv39,2
+Conv39
+Add39
+BatchNorm39
 #NML46,1
+#Conv40,2
+Conv40
+Add40
+BatchNorm40
 #NML47,1
-#Conv23,2
-Conv
-Add
 #NML48,1
+#Conv41,2
+Conv41
+Add41
+BatchNorm41
 #NML49,1
-#Conv24,2
-Conv
-Add
+#Conv42,2
+Conv42
+Add42
+BatchNorm42
 #NML50,1
+#Conv43,2
+Conv43
+Add43
+BatchNorm43
 #NML51,1
 #NML52,1
-#Conv25,2
-Conv
-Add
+#Conv44,2
+Conv44
+Add44
+BatchNorm44
 #NML53,1
+#Conv45,2
+Conv45
+Add45
+BatchNorm45
 #NML54,1
-#Conv26,2
-Conv
-Add
+#Conv46,2
+Conv46
+Add46
+BatchNorm46
+#Conv47,2
+Conv47
+Add47
+BatchNorm47
 #NML55,1
 #NML56,1
-#Conv27,2
-Conv
-Add
+#Conv48,2
+Conv48
+Add48
+BatchNorm48
 #NML57,1
-#Conv28,2
-Conv
-Add
+#Conv49,2
+Conv49
+Add49
+BatchNorm49
 #NML58,1
+#Conv50,2
+Conv50
+Add50
+BatchNorm50
 #NML59,1
 #NML60,1
-#Conv29,2
-Conv
-Add
+#Conv51,2
+Conv51
+Add51
+BatchNorm51
 #NML61,1
+#Conv52,2
+Conv52
+Add52
+BatchNorm52
 #NML62,1
-#Conv30,2
-Conv
-Add
+#Conv53,2
+Conv53
+Add53
+BatchNorm53
 #NML63,1
 #NML64,1
-#Conv31,2
-Conv
-Add
 #NML65,1
-#NML66,1
-#NML67,1
-#Conv32,2
-Conv
-Add
-#NML68,1
-#NML69,1
-#Conv33,2
-Conv
-Add
-#NML70,1
-#NML71,1
-#Conv34,2
-Conv
-Add
-#NML72,1
-#NML73,1
-#NML74,1
-#Conv35,2
-Conv
-Add
-#NML75,1
-#NML76,1
-#Conv36,2
-Conv
-Add
-#NML77,1
-#NML78,1
-#Conv37,2
-Conv
-Add
-#NML79,1
-#NML80,1
-#NML81,1
-#Conv38,2
-Conv
-Add
-#NML82,1
-#NML83,1
-#Conv39,2
-Conv
-Add
-#NML84,1
-#NML85,1
-#Conv40,2
-Conv
-Add
-#NML86,1
-#NML87,1
-#NML88,1
-#Conv41,2
-Conv
-Add
-#NML89,1
-#NML90,1
-#Conv42,2
-Conv
-Add
-#NML91,1
-#NML92,1
-#Conv43,2
-Conv
-Add
-#NML93,1
-#NML94,1
-#NML95,1
-#Conv44,2
-Conv
-Add
-#NML96,1
-#NML97,1
-#Conv45,2
-Conv
-Add
-#NML98,1
-#NML99,1
-#Conv46,2
-Conv
-Add
-#NML100,1
-#Conv47,2
-Conv
-Add
-#NML101,1
-#NML102,1
-#NML103,1
-#Conv48,2
-Conv
-Add
-#NML104,1
-#NML105,1
-#Conv49,2
-Conv
-Add
-#NML106,1
-#NML107,1
-#Conv50,2
-Conv
-Add
-#NML108,1
-#NML109,1
-#NML110,1
-#Conv51,2
-Conv
-Add
-#NML111,1
-#NML112,1
-#Conv52,2
-Conv
-Add
-#NML113,1
-#NML114,1
-#Conv53,2
-Conv
-Add
-#NML115,1
-#NML116,1
-#NML117,1
-#NML118,1
 #FC1,2
-Mul
-Add
+Mul1
+Add54
diff --git a/llvm/projects/soc_simulator/scripts/construct_ops.py b/llvm/projects/soc_simulator/scripts/construct_ops.py
index 3b655f2f5fb7ccb3eb4ac8db2e105cb74d71f986..3bcb2af9c345b19d86acaf92e3771c60370b4678 100644
--- a/llvm/projects/soc_simulator/scripts/construct_ops.py
+++ b/llvm/projects/soc_simulator/scripts/construct_ops.py
@@ -11,7 +11,7 @@ op_map["pool"] = "Pool"
 op_map["relu"] = "Relu"
 op_map["activation"] = "Relu"
 op_map["tanh"] = "Tanh"
-op_map["batchnorm"] = "NML"
+op_map["batchnorm"] = "BatchNorm"
 
 
 unique_op_map = {}
@@ -45,19 +45,29 @@ if __name__ == "__main__":
   f2 = open(out_path, "w+")
 
 
-
   nml_id = 1
   conv_id = 1
   fc_id = 1
+  batchnorm_id = 1
   
   for x in f:
       toks = x.split()
 
       layer_len = len(toks)
 
+      #if "batchnorm" in toks:
+      #    f2.write("BatchNorm" + str(batchnorm_id) + "\n")
+      #    batchnorm_id += 1
+      #    continue
+      
       if layer_len == 1 and "conv" not in toks and "dense" not in toks:
           f2.write("#NML" + str(nml_id) + ",1\n")
           nml_id += 1
+
+          # Also emit the op name for this single-op layer
+          layer_str = getLayerStr(toks)
+          f2.write(layer_str)
+
       if "conv" in toks:
           f2.write("#Conv" + str(conv_id) + "," + str(layer_len) + "\n")
 
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..fec9712539bcb79fc880293e90d4864ba5bf0e4f
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile
@@ -0,0 +1,83 @@
+DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks
+# NOTE: can configure build directory
+#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/
+HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT)
+
+CC = $(HPVM_BUILD_DIR)/bin/clang++
+OPT = $(HPVM_BUILD_DIR)/bin/opt
+LLVM_DIS = $(HPVM_BUILD_DIR)/bin/llvm-dis
+LLVM_LINK = $(HPVM_BUILD_DIR)/bin/llvm-link
+LLVM_INCLUDE_DIR = $(LLVM_SRC_ROOT)/include
+
+SRC_DIR = src
+BUILD_DIR = build
+APP = alexnet
+
+TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
+TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
+TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a
+PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a
+SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a
+TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a
+
+CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH)  -fno-exceptions -ffast-math -std=c++11 -O3
+CC_FLAGS += -DDEVICE=CUDNN_TARGET
+LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs
+
+HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib
+
+
+VISC_OPTFLAGS = -load  $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load  $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load  $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load  $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG
+
+
+PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges.txt
+
+VISC_OPTFLAGS2 = -load  $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load  $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so  -load  $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so    -load  $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load  $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so  -load  $(HPVM_LIB_DIR)/LLVMClearDFG.so   -inplace -hpvm-fuse -dfg2llvm-promise  -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn  -dfg2llvm-x86 -clearDFG
+
+WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt
+CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt
+
+VISC_OPTFLAGS3 = -load  $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load  $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so    -load  $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load  $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so  -load  $(HPVM_LIB_DIR)/LLVMClearDFG.so   -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG
+
+
+TARGET = $(BUILD_DIR)/$(APP).opt.bc
+SOURCES = $(SRC_DIR)/$(APP).cpp
+VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll
+
+#OBJS = $(BUILD_DIR)/$(wildcard *.ll)
+.PRECIOUS: $(BUILD_DIR)/$(APP).ll $(BUILD_DIR)/$(APP).visc.ll
+default: $(BUILD_DIR) $(TARGET)
+
+
+$(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp
+	$(CC) $(CC_FLAGS) -emit-llvm src/$(APP).cpp -S -o  $(BUILD_DIR)/$(APP).ll  
+	#---- $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll
+	#---- $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_loop.cpp -S -o  $(BUILD_DIR)/$(APP)_loop.ll  
+
+$(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll
+	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP).ll -S -o  $(BUILD_DIR)/$(APP).visc.ll
+	#----- $(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_promise.ll -S -o  $(BUILD_DIR)/$(APP)_promise.visc.ll
+	#----- $(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_loop.ll -S -o  $(BUILD_DIR)/$(APP)_loop.visc.ll
+	$(OPT) $(VISC_OPTFLAGS)  $(BUILD_DIR)/$(APP).visc.ll  -o  $(BUILD_DIR)/$(APP)_cudnn.bc
+	#$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_promise.bc
+
+	#--- $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_wrapperapi.bc
+	#--- $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll  -o  $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc
+
+	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc
+	#$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
+
+	#---- $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc
+	#--- $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc
+	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
+	#$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
+
+	#--- $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS)
+	#--- $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS)
+
+
+$(BUILD_DIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(BUILD_DIR)
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/src/alexnet.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/src/alexnet.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5aae61e5f0673d260c32ad923bb1038f91a39a3b
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/src/alexnet.cpp
@@ -0,0 +1,562 @@
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <unistd.h> 
+#include <fcntl.h> 
+#include <sys/stat.h> 
+#include <cstring> 
+#include <string> 
+#include <visc.h> 
+#include <tensorTypes.h> 
+#include <tensorUtils.h> 
+
+
+void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 2, 2, 4, 4); 
+  __visc__return(2, r, (size_t) 0); 
+}
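+
+// NOTE (editorial): every var_N_node below follows the same leaf-node pattern:
+// __visc__hint picks the execution target (CUDNN here), __visc__attributes
+// declares the tensor arguments, and __visc__return hands the result tensor
+// (with a placeholder size) to the next node. Only the tensor op and its
+// parameters differ from node to node.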
+
+void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_2_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_3_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_6_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_7_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_10_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_13_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_16_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_17_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_mul(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_20_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_mul(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_23_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_relu(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_24_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_mul(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(2, t1, t2, 0); 
+
+  void *r = __visc__tensor_add(t1, t2); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void var_26_node(void* t1, size_t bytes_t1) { 
+  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__attributes(1, t1, 0); 
+
+  void* r = __visc__tensor_softmax(t1); 
+  __visc__return(2, r, (size_t) 0); 
+}
+
+void root(void* input, size_t input_bytes, 
+	  void* conv2d_1_w, size_t conv2d_1_w_bytes, 
+	  void* conv2d_1_b, size_t conv2d_1_b_bytes, 
+	  void* conv2d_2_w, size_t conv2d_2_w_bytes, 
+	  void* conv2d_2_b, size_t conv2d_2_b_bytes, 
+	  void* conv2d_3_w, size_t conv2d_3_w_bytes, 
+	  void* conv2d_3_b, size_t conv2d_3_b_bytes, 
+	  void* conv2d_4_w, size_t conv2d_4_w_bytes, 
+	  void* conv2d_4_b, size_t conv2d_4_b_bytes, 
+	  void* conv2d_5_w, size_t conv2d_5_w_bytes, 
+	  void* conv2d_5_b, size_t conv2d_5_b_bytes, 
+	  void* dense_1_w, size_t dense_1_w_bytes, 
+	  void* dense_1_b, size_t dense_1_b_bytes, 
+	  void* dense_2_w, size_t dense_2_w_bytes, 
+	  void* dense_2_b, size_t dense_2_b_bytes, 
+	  void* dense_3_w, size_t dense_3_w_bytes, 
+	  void* dense_3_b, size_t dense_3_b_bytes){ 
+
+
+  __visc__hint(visc::CPU_TARGET); 
+  __visc__attributes(17, input, conv2d_1_w, conv2d_1_b, conv2d_2_w, conv2d_2_b, conv2d_3_w, conv2d_3_b, conv2d_4_w, conv2d_4_b, conv2d_5_w, conv2d_5_b, dense_1_w, dense_1_b, dense_2_w, dense_2_b, dense_3_w, dense_3_b, 0); 
+
+
+  void* var_0 = __visc__createNodeND(0, var_0_node); 
+
+  __visc__bindIn(var_0, 0, 0, 0); 
+  __visc__bindIn(var_0, 1, 1, 0); 
+  __visc__bindIn(var_0, 2, 2, 0); 
+  __visc__bindIn(var_0, 3, 3, 0); 
+
+  void* var_1 = __visc__createNodeND(0, var_1_node); 
+
+  __visc__edge(var_0, var_1, 1, 0, 0, 0); 
+  __visc__edge(var_0, var_1, 1, 1, 1, 0); 
+  __visc__bindIn(var_1, 4, 2, 0); 
+  __visc__bindIn(var_1, 5, 3, 0); 
+
+  void* var_2 = __visc__createNodeND(0, var_2_node); 
+
+  __visc__edge(var_1, var_2, 1, 0, 0, 0); 
+  __visc__edge(var_1, var_2, 1, 1, 1, 0); 
+
+  void* var_3 = __visc__createNodeND(0, var_3_node); 
+
+  __visc__edge(var_2, var_3, 1, 0, 0, 0); 
+  __visc__edge(var_2, var_3, 1, 1, 1, 0); 
+
+  void* var_4 = __visc__createNodeND(0, var_4_node); 
+
+  __visc__edge(var_3, var_4, 1, 0, 0, 0); 
+  __visc__edge(var_3, var_4, 1, 1, 1, 0); 
+  __visc__bindIn(var_4, 6, 2, 0); 
+  __visc__bindIn(var_4, 7, 3, 0); 
+
+  void* var_5 = __visc__createNodeND(0, var_5_node); 
+
+  __visc__edge(var_4, var_5, 1, 0, 0, 0); 
+  __visc__edge(var_4, var_5, 1, 1, 1, 0); 
+  __visc__bindIn(var_5, 8, 2, 0); 
+  __visc__bindIn(var_5, 9, 3, 0); 
+
+  void* var_6 = __visc__createNodeND(0, var_6_node); 
+
+  __visc__edge(var_5, var_6, 1, 0, 0, 0); 
+  __visc__edge(var_5, var_6, 1, 1, 1, 0); 
+
+  void* var_7 = __visc__createNodeND(0, var_7_node); 
+
+  __visc__edge(var_6, var_7, 1, 0, 0, 0); 
+  __visc__edge(var_6, var_7, 1, 1, 1, 0); 
+
+  void* var_8 = __visc__createNodeND(0, var_8_node); 
+
+  __visc__edge(var_7, var_8, 1, 0, 0, 0); 
+  __visc__edge(var_7, var_8, 1, 1, 1, 0); 
+  __visc__bindIn(var_8, 10, 2, 0); 
+  __visc__bindIn(var_8, 11, 3, 0); 
+
+  void* var_9 = __visc__createNodeND(0, var_9_node); 
+
+  __visc__edge(var_8, var_9, 1, 0, 0, 0); 
+  __visc__edge(var_8, var_9, 1, 1, 1, 0); 
+  __visc__bindIn(var_9, 12, 2, 0); 
+  __visc__bindIn(var_9, 13, 3, 0); 
+
+  void* var_10 = __visc__createNodeND(0, var_10_node); 
+
+  __visc__edge(var_9, var_10, 1, 0, 0, 0); 
+  __visc__edge(var_9, var_10, 1, 1, 1, 0); 
+
+  void* var_11 = __visc__createNodeND(0, var_11_node); 
+
+  __visc__edge(var_10, var_11, 1, 0, 0, 0); 
+  __visc__edge(var_10, var_11, 1, 1, 1, 0); 
+  __visc__bindIn(var_11, 14, 2, 0); 
+  __visc__bindIn(var_11, 15, 3, 0); 
+
+  void* var_12 = __visc__createNodeND(0, var_12_node); 
+
+  __visc__edge(var_11, var_12, 1, 0, 0, 0); 
+  __visc__edge(var_11, var_12, 1, 1, 1, 0); 
+  __visc__bindIn(var_12, 16, 2, 0); 
+  __visc__bindIn(var_12, 17, 3, 0); 
+
+  void* var_13 = __visc__createNodeND(0, var_13_node); 
+
+  __visc__edge(var_12, var_13, 1, 0, 0, 0); 
+  __visc__edge(var_12, var_13, 1, 1, 1, 0); 
+
+  void* var_14 = __visc__createNodeND(0, var_14_node); 
+
+  __visc__edge(var_13, var_14, 1, 0, 0, 0); 
+  __visc__edge(var_13, var_14, 1, 1, 1, 0); 
+  __visc__bindIn(var_14, 18, 2, 0); 
+  __visc__bindIn(var_14, 19, 3, 0); 
+
+  void* var_15 = __visc__createNodeND(0, var_15_node); 
+
+  __visc__edge(var_14, var_15, 1, 0, 0, 0); 
+  __visc__edge(var_14, var_15, 1, 1, 1, 0); 
+  __visc__bindIn(var_15, 20, 2, 0); 
+  __visc__bindIn(var_15, 21, 3, 0); 
+
+  void* var_16 = __visc__createNodeND(0, var_16_node); 
+
+  __visc__edge(var_15, var_16, 1, 0, 0, 0); 
+  __visc__edge(var_15, var_16, 1, 1, 1, 0); 
+
+  void* var_17 = __visc__createNodeND(0, var_17_node); 
+
+  __visc__edge(var_16, var_17, 1, 0, 0, 0); 
+  __visc__edge(var_16, var_17, 1, 1, 1, 0); 
+
+  void* var_18 = __visc__createNodeND(0, var_18_node); 
+
+  __visc__edge(var_17, var_18, 1, 0, 0, 0); 
+  __visc__edge(var_17, var_18, 1, 1, 1, 0); 
+  __visc__bindIn(var_18, 22, 2, 0); 
+  __visc__bindIn(var_18, 23, 3, 0); 
+
+  void* var_19 = __visc__createNodeND(0, var_19_node); 
+
+  __visc__edge(var_18, var_19, 1, 0, 0, 0); 
+  __visc__edge(var_18, var_19, 1, 1, 1, 0); 
+  __visc__bindIn(var_19, 24, 2, 0); 
+  __visc__bindIn(var_19, 25, 3, 0); 
+
+  void* var_20 = __visc__createNodeND(0, var_20_node); 
+
+  __visc__edge(var_19, var_20, 1, 0, 0, 0); 
+  __visc__edge(var_19, var_20, 1, 1, 1, 0); 
+
+  void* var_21 = __visc__createNodeND(0, var_21_node); 
+
+  __visc__edge(var_20, var_21, 1, 0, 0, 0); 
+  __visc__edge(var_20, var_21, 1, 1, 1, 0); 
+  __visc__bindIn(var_21, 26, 2, 0); 
+  __visc__bindIn(var_21, 27, 3, 0); 
+
+  void* var_22 = __visc__createNodeND(0, var_22_node); 
+
+  __visc__edge(var_21, var_22, 1, 0, 0, 0); 
+  __visc__edge(var_21, var_22, 1, 1, 1, 0); 
+  __visc__bindIn(var_22, 28, 2, 0); 
+  __visc__bindIn(var_22, 29, 3, 0); 
+
+  void* var_23 = __visc__createNodeND(0, var_23_node); 
+
+  __visc__edge(var_22, var_23, 1, 0, 0, 0); 
+  __visc__edge(var_22, var_23, 1, 1, 1, 0); 
+
+  void* var_24 = __visc__createNodeND(0, var_24_node); 
+
+  __visc__edge(var_23, var_24, 1, 0, 0, 0); 
+  __visc__edge(var_23, var_24, 1, 1, 1, 0); 
+  __visc__bindIn(var_24, 30, 2, 0); 
+  __visc__bindIn(var_24, 31, 3, 0); 
+
+  void* var_25 = __visc__createNodeND(0, var_25_node); 
+
+  __visc__edge(var_24, var_25, 1, 0, 0, 0); 
+  __visc__edge(var_24, var_25, 1, 1, 1, 0); 
+  __visc__bindIn(var_25, 32, 2, 0); 
+  __visc__bindIn(var_25, 33, 3, 0); 
+
+  void* var_26 = __visc__createNodeND(0, var_26_node); 
+
+  __visc__edge(var_25, var_26, 1, 0, 0, 0); 
+  __visc__edge(var_25, var_26, 1, 1, 1, 0); 
+
+  __visc__bindOut(var_26, 0, 0, 0); 
+  __visc__bindOut(var_26, 1, 1, 0); 
+
+}
+
+struct ret_t {
+  void* tensor; 
+  size_t bytes; 
+}; 
+
+typedef struct __attribute__((__packed__)) {
+  void* input; 
+  size_t input_bytes; 
+  void* conv2d_1_w; 
+  size_t conv2d_1_w_bytes; 
+  void* conv2d_1_b; 
+  size_t conv2d_1_b_bytes; 
+  void* conv2d_2_w; 
+  size_t conv2d_2_w_bytes; 
+  void* conv2d_2_b; 
+  size_t conv2d_2_b_bytes; 
+  void* conv2d_3_w; 
+  size_t conv2d_3_w_bytes; 
+  void* conv2d_3_b; 
+  size_t conv2d_3_b_bytes; 
+  void* conv2d_4_w; 
+  size_t conv2d_4_w_bytes; 
+  void* conv2d_4_b; 
+  size_t conv2d_4_b_bytes; 
+  void* conv2d_5_w; 
+  size_t conv2d_5_w_bytes; 
+  void* conv2d_5_b; 
+  size_t conv2d_5_b_bytes; 
+  void* dense_1_w; 
+  size_t dense_1_w_bytes; 
+  void* dense_1_b; 
+  size_t dense_1_b_bytes; 
+  void* dense_2_w; 
+  size_t dense_2_w_bytes; 
+  void* dense_2_b; 
+  size_t dense_2_b_bytes; 
+  void* dense_3_w; 
+  size_t dense_3_w_bytes; 
+  void* dense_3_b; 
+  size_t dense_3_b_bytes; 
+
+  struct ret_t r; 
+}
+RootIn;
+
+
+int main(){ 
+
+  std::string dir_prefix = std::string("/shared/hsharif3/alexnet_imagenet_tune/"); 
+  std::string input_path =  dir_prefix + std::string("test_input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin"); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv2d_2_w.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv2d_2_b.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv2d_3_w.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv2d_3_b.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv2d_4_w.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv2d_4_b.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv2d_5_w.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv2d_5_b.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("dense_1_w.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,9216,4096); 
+  std::string dense_1_b_path =  dir_prefix + std::string("dense_1_b.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); 
+  std::string dense_2_w_path =  dir_prefix + std::string("dense_2_w.bin"); 
+  void* dense_2_w =  readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); 
+  std::string dense_2_b_path =  dir_prefix + std::string("dense_2_b.bin"); 
+  void* dense_2_b =  readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); 
+  std::string dense_3_w_path =  dir_prefix + std::string("dense_3_w.bin"); 
+  void* dense_3_w =  readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); 
+  std::string dense_3_b_path =  dir_prefix + std::string("dense_3_b.bin"); 
+  void* dense_3_b =  readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); 
+  void* input = readTrainedWeights(input_path.c_str(), 0, 1000,3,224,224); 
+  //uint32_t* labels = readLabels2(labels_path.c_str(),6000); 
+
+  uint32_t* labels = readLabels3(labels_path.c_str(), 1000); 
+
+    
+  __visc__init(); 
+  RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); 
+
+  args->input = input; 
+  args->input_bytes = 0; 
+  args->conv2d_1_w = conv2d_1_w; 
+  args->conv2d_1_w_bytes = 0; 
+  args->conv2d_1_b = conv2d_1_b; 
+  args->conv2d_1_b_bytes = 0; 
+  args->conv2d_2_w = conv2d_2_w; 
+  args->conv2d_2_w_bytes = 0; 
+  args->conv2d_2_b = conv2d_2_b; 
+  args->conv2d_2_b_bytes = 0; 
+  args->conv2d_3_w = conv2d_3_w; 
+  args->conv2d_3_w_bytes = 0; 
+  args->conv2d_3_b = conv2d_3_b; 
+  args->conv2d_3_b_bytes = 0; 
+  args->conv2d_4_w = conv2d_4_w; 
+  args->conv2d_4_w_bytes = 0; 
+  args->conv2d_4_b = conv2d_4_b; 
+  args->conv2d_4_b_bytes = 0; 
+  args->conv2d_5_w = conv2d_5_w; 
+  args->conv2d_5_w_bytes = 0; 
+  args->conv2d_5_b = conv2d_5_b; 
+  args->conv2d_5_b_bytes = 0; 
+  args->dense_1_w = dense_1_w; 
+  args->dense_1_w_bytes = 0; 
+  args->dense_1_b = dense_1_b; 
+  args->dense_1_b_bytes = 0; 
+  args->dense_2_w = dense_2_w; 
+  args->dense_2_w_bytes = 0; 
+  args->dense_2_b = dense_2_b; 
+  args->dense_2_b_bytes = 0; 
+  args->dense_3_w = dense_3_w; 
+  args->dense_3_w_bytes = 0; 
+  args->dense_3_b = dense_3_b; 
+  args->dense_3_b_bytes = 0; 
+
+  void* dfg = __visc__launch(0, root, (void*) args); 
+
+  __visc__wait(dfg); 
+
+  void *result = static_cast<RootIn*>(args)->input; 
+  hpvm_request_tensor(result, 0); 
+
+  __visc__cleanup(); 
+  computeAccuracy3(labels, result); 
+  return 0; 
+
+}