diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
index 0aa33bc43dace1f847f44ed7ad6dcfc0082d014a..65d6335f75fb5f3e9469e42507e063a2b526aee8 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
@@ -4,15 +4,18 @@
 #include <unistd.h> 
 #include <fcntl.h> 
 #include <sys/types.h> 
-#include <sys/stat.h> 
+#include <sys/stat.h>
+#include <vector>
 #include <string.h> 
 #include "tensor_runtime.h" 
 #include "utils.h" 
 
+#include "tensor_custom_ops_cpu.h"
 
 
 
-Tensor *gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) {
+
+Tensor* gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) {
   int64_t m = (w - 1) / 2, n = (h - 1) / 2;
   auto *data = new float[w * h];
   float sum = 0.0f;
@@ -57,64 +60,50 @@ TODOs:
 ****/
 
 void* canny_filter(void* dataset) {
-  Tensor *gaussian = gaussianFilter(1.4, 5, 5, 1);
-  Tensor *kernel_x, *kernel_y;
+  // Pipeline: grayscale -> Gaussian denoise -> Sobel gradients -> normalized gradient magnitude.
+  Tensor* gaussian = gaussianFilter(1.4, 5, 5, 1);
+  Tensor* kernel_x, *kernel_y;
   std::tie(kernel_x, kernel_y) = getSobelKernels();
 
   // 0. Grayscale
-  auto *summed_image = autotuner_tensorReduce(dataset, 1, MathOp::Add);
-  auto *grayscale_image = autotuner_tensorMap1(MathOp::Avg3, summed_image);
+  auto* summed_image = tensorReduce(dataset, 1, MathOp::Add);
+  auto* grayscale_image = tensorMap1(MathOp::Avg3, summed_image);
   // 1. Denoise
-  auto *image2 = ConvLayer_PROMISE(grayscale_image, 0.0, 0.0, gaussian,
-				   0.0, 0.0, nullptr, 0.0, 0.0, 2, 2, 1,
-				   1, 0, 0, -1, 0.0, 0.0, 0);
+  // Convolve the grayscale image with the Gaussian kernel built above.
+  auto* image2 = tensorConvolution(grayscale_image, gaussian,
+				   2, 2, // padding
+				   1, 1, // strides
+				   1, 0); // conv_mode, conv_groups
+  // NOTE(review): main() in this file passes its last tensorConvolution argument as conv_precision; confirm the "conv_groups" label above.
   // 2. Get edge gradient / direction
-  auto *grad_x = ConvLayer_PROMISE(
-      image2, 0.0, 0.0, kernel_x, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0,
-      -1, 0.0, 0.0, 0);
-  auto *grad_y = ConvLayer_PROMISE(
-      image2, 0.0, 0.0, kernel_y, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0,
-      -1, 0.0, 0.0, 0);
-  auto *grad_mag = autotuner_tensorMap2(MathOp::Hypot, grad_x, grad_y);
+  auto *grad_x = tensorConvolution(image2, kernel_x,
+				   1, 1,
+				   1, 1,
+				   1, 0);
+  // Same padding/stride/mode arguments as grad_x, but with kernel_y.
+  auto *grad_y = tensorConvolution(image2, kernel_y,
+				   1, 1,
+				   1, 1,
+				   1, 0);
+  // Combine grad_x and grad_y with MathOp::Hypot.
+  auto *grad_mag = tensorMap2(MathOp::Hypot, grad_x, grad_y);
   // 2.5. Normalize grad magnitude
-  auto *grad_max_1D = autotuner_tensorReduce(grad_mag, 2, MathOp::Max);
-  auto *grad_max = autotuner_tensorReduce(grad_max_1D, 3, MathOp::Max);
-  auto *grad_mag_norm = autotuner_tensorMap2(MathOp::Div, grad_mag, grad_max);
+  auto *grad_max_1D = tensorReduce(grad_mag, 2, MathOp::Max);
+  auto *grad_max = tensorReduce(grad_max_1D, 3, MathOp::Max);
+  auto *grad_mag_norm = tensorMap2(MathOp::Div, grad_mag, grad_max);
   return grad_mag_norm;
 }
 
-const size_t batch_size = 500, total_max = 3000;
-const float psnr_threshold = 30.0;
 
 
 
+void* invoke_canny(void* input) {
+  // Thin wrapper: run canny_filter on the given tensor and log completion.
+  auto* result = canny_filter(input);
 
-int main() {
-  const char *input_path = "../model_params/image_processing_5k";
-  const char *ref_output_path = "../model_params/canny_ref_output";
-  std::vector<float> psnr;
-  llvm_hpvm_initTensorRt(1);
-  size_t bstart = 0;
-  startMemTracking();
-  while (true) {
-    Tensor *batch = readDataSet(input_path, bstart, batch_size);
-    if (batch == nullptr)
-      break;
-
-    auto *result = main_procedure(batch);
-    auto *ref_output = readDataSet(ref_output_path, bstart, batch_size, 1);
-    std::vector<float> psnr_batch = PSNR(ref_output, result);
-    std::copy(psnr_batch.begin(), psnr_batch.end(), std::back_inserter(psnr));
-    bstart += batch_size;
-    if (bstart >= total_max)
-      break;
-    freeBatchMemory();
-  }
-  float violation = violationRate(psnr, psnr_threshold);
-  float mean_psnr = mean(psnr);
-  std::ofstream of("final_accuracy");
-  of << violation * 100 << ", " << mean_psnr << '\n';
-  return 0;
+  printf("Done with Canny \n");
+  // Return the canny_filter output unchanged.
+  return result;
 }
 
 
@@ -128,84 +117,128 @@ int main(){
 
   llvm_hpvm_initTensorRt(0); 
 
-
-  //std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/");
-  std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); 
+  std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); 
+  std::string input_path =  dir_prefix + std::string("norm_cifar_input.bin"); 
+  std::string labels_path =  dir_prefix + std::string("test_labels.bin"); 
+
+  void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin",
+					  float_type, 32, 3, 3, 3);  
+  void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin",
+					float_type, 1, 32, 1, 1);  
+  void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin",
+					  float_type, 32, 32, 3, 3);  
+  void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin",
+					float_type, 1, 32, 1, 1);
+  void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin",
+					  float_type, 64, 32, 3, 3);  
+  void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin",
+					float_type, 1, 64, 1, 1);  
+  void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin",
+					  float_type, 64, 64, 3, 3);  
+  void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin",
+					float_type, 1, 64, 1, 1);
+  void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin",
+					  float_type, 128, 64, 3, 3);  
+  void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin",
+					float_type, 1, 128, 1, 1);
+  void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin",
+					  float_type, 128, 128, 3, 3);  
+  void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin",
+					float_type, 1, 128, 1, 1);
+  
+  void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin",
+					 float_type, 1, 1, 2048, 10);  
+  void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin",
+				      float_type, 1, 10, 1, 1);  
  
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
-  //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); 
-  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
-  std::string conv2d_1_w_path =  dir_prefix + std::string("conv0.bin"); 
-  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
-  std::string conv2d_1_b_path =  dir_prefix + std::string("conv_bias0.bin"); 
-  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
-  std::string conv2d_2_w_path =  dir_prefix + std::string("conv3.bin"); 
-  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
-  std::string conv2d_2_b_path =  dir_prefix + std::string("conv_bias3.bin"); 
-  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
-  std::string conv2d_3_w_path =  dir_prefix + std::string("conv6.bin"); 
-  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
-  std::string conv2d_3_b_path =  dir_prefix + std::string("conv_bias6.bin"); 
-  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
-  std::string conv2d_4_w_path =  dir_prefix + std::string("conv7.bin"); 
-  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
-  std::string conv2d_4_b_path =  dir_prefix + std::string("conv_bias7.bin"); 
-  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
-  std::string conv2d_5_w_path =  dir_prefix + std::string("conv8.bin"); 
-  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
-  std::string conv2d_5_b_path =  dir_prefix + std::string("conv_bias8.bin"); 
-  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
-  std::string dense_1_w_path =  dir_prefix + std::string("fc12.bin"); 
-  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
-  std::string dense_1_b_path =  dir_prefix + std::string("fc_bias12.bin"); 
-  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
-
-
-  startMemTracking();
 
-  int test_input_size = 2000;
-  int batch_size = 2000;
+  int test_input_size = 5000;
+  int batch_size = 500;
   int batch_count = test_input_size / batch_size;
   float final_accuracy = 0.0;
 
   // NOTE: Starting time profiling
-  startProfiling();
-  
+  startProfiling();  
+  startMemTracking();
+
+  int conv_mode = 1; // NOTE: using CROSS_CORRELATION
+  int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum
+
   for(int i = 0; i < batch_count; i++){
 
     int start = i * batch_size;
     int end = (i + 1) * batch_size;
-    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
-
-    void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
-    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
-    void* var_2 = tensorTanh(var_1); 
-    void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
-    void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
-    void* var_6 = tensorAdd(var_5, conv2d_2_b); 
-    void* var_7 = tensorTanh(var_6); 
-    void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
-    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
-    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
-    void* var_12 = tensorTanh(var_11); 
-    void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
-    void* var_14 = tensorAdd(var_13, conv2d_4_b); 
-    void* var_15 = tensorTanh(var_14); 
-    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
-    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
-    void* var_18 = tensorTanh(var_17); 
-    void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
-    void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
-    void* var_23 = tensorAdd(var_22, dense_1_b); 
-    void* var_24 = tensorSoftmax(var_23); 
+
+
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+
+    void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv1out, conv1_bias); 
+    void* conv1_tanh = tensorTanh(conv1out);
+    
+    // 2nd Layer
+    void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv2out, conv2_bias); 
+    void* conv2_tanh = tensorTanh(conv2out);
+    void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2);
+     
+    // 3rd Layer
+    void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv3out, conv3_bias); 
+    void* conv3_tanh = tensorTanh(conv3out);
+
+    // 4th Layer
+    void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv4out, conv4_bias); 
+    void* conv4_tanh = tensorTanh(conv4out);
+    void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // 5th Layer
+    void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv5out, conv5_bias); 
+    void* conv5_tanh = tensorTanh(conv5out);
+
+    // 6th Layer
+    void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1,
+				       conv_mode, conv_precision);
+    tensorAdd(conv6out, conv6_bias); 
+  
+    void* conv6_tanh = tensorTanh(conv6out);
+    void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2);
+    
+    // final FC Layer
+    void* gemm1out = tensorGemmGPU(pool6out, fc1_weights);  
+    void* gemm1biasout = tensorAdd(gemm1out, fc1_bias);
+    void* result = tensorSoftmax(gemm1biasout);
 
     uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
 
-    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    float accuracy = computeAccuracy2(labels, batch_size, result); 
     final_accuracy += accuracy;
+
+
+    std::vector<int> index_vector;
+    index_vector.push_back(1);
+    index_vector.push_back(2);
+    index_vector.push_back(3);
+    index_vector.push_back(4);
+    index_vector.push_back(5);
     
-    freeBatchMemory();
+    
+    void* argmax_out = tensorArgMax(result);
+    void* select_out = tensorSelect2(argmax_out, index_vector);
+    void* reduced_input = tensorContract(input, select_out);
+
+    
+    invoke_canny(reduced_input);
+    
+
+    freeBatchMemory();    
   }
 
   stopProfiling();
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
index fe8927f289deecb3a00b39bcc86377d122f7ef2a..b9128c1a24ca5bd95a7e6fb9e962d56501558f8f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
@@ -2,6 +2,7 @@
 
 #include "tensor.h"
 #include <stdlib.h>
+#include <vector>
 
 
 void* tensorArgMax(void* input_ptr){
@@ -76,6 +77,47 @@ void* tensorSelect(void* input_ptr, float target_value){
 
 
 
+
+// Returns a new tensor holding one float per batch element of the input:
+// 1 when that element equals any entry of index_vector, 0 otherwise.
+// Aborts when the input's dim_sizes[1] (channels) is not 1.
+void* tensorSelect2(void* input_ptr, std::vector<int> index_vector){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+
+  if (channels != 1){
+    printf("* Channels dimension must be 1 \n");
+    abort();
+  }
+
+  // One output slot per batch element, written through host_data below.
+  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);
+  float* out_ptr = (float*) output->host_data;
+
+  for(int i = 0; i < batch_size; i++){
+    // Start from 0 so an empty index_vector cannot leave the slot unwritten.
+    out_ptr[i] = 0;
+    for(size_t j = 0; j < index_vector.size(); j++){
+      if (host_ptr[i] == index_vector[j]){
+	out_ptr[i] = 1;
+	break;
+      }
+    }
+  }
+
+  return output;
+}
+
+
+
+
+
+
 long getOnesInVector(float* vector_host_ptr, long vector_length){
 
   long ones_count = 0;