diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0aa33bc43dace1f847f44ed7ad6dcfc0082d014a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
@@ -0,0 +1,221 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <cmath>
+#include <algorithm>
+#include <fstream>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
+#include "tensor_runtime.h"
+#include "utils.h"
+
+
+
+
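+// Builds a w x h Gaussian smoothing kernel with the given sigma, normalized to
+// sum to 1, and packs it as a convolution filter with n_chan channels.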
+Tensor *gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) {
+  int64_t m = (w - 1) / 2, n = (h - 1) / 2;
+  auto *data = new float[w * h];
+  float sum = 0.0f;
+  for (int64_t i = -m; i <= m; i++)
+    for (int64_t j = -n; j <= n; j++) {
+      size_t idx = (i + m) * h + (j + n);
+      float exponent = -(i * i + j * j) / (2.0 * sigma * sigma);
+      data[idx] = exp(exponent);
+      sum += data[idx];
+    }
+  if (sum != 0.0f)
+    for (size_t i = 0; i < w * h; i++)
+      data[i] /= sum;
+  return (Tensor *)createFilterFromData(CUDNN_DATA_FLOAT, data, w, h, n_chan);
+}
+
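+// Returns the 3x3 Sobel kernels for the horizontal (x) and vertical (y)
+// image gradients, packed as single-channel convolution filters.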
+std::pair<Tensor*, Tensor*> getSobelKernels() {
+  std::vector<float> k1({-1, 0, 1, -2, 0, 2, -1, 0, 1});
+  std::vector<float> k2({1, 2, 1, 0, 0, 0, -1, -2, -1});
+  auto *t1 =
+      (Tensor *)createFilterFromData(CUDNN_DATA_FLOAT, k1.data(), 3, 3, 1);
+  auto *t2 =
+      (Tensor *)createFilterFromData(CUDNN_DATA_FLOAT, k2.data(), 3, 3, 1);
+  return std::make_pair(t1, t2);
+}
+
+/*** 
+
+TODOs:
+
+* Precision calculation?
+* tensorArgMax?
+* tensorSelect?
+* tensorContract
+* autotuning support for these functions
+* FP32 vs F16 versions of sampling perforation?
+* Need tensorRT version and a PROMISE API version
+* How to Profile? are profileEvent calls added
+* Pytorch version
+
+
+****/
+
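+// Simplified Canny edge-detection pipeline built from the approximable tensor
+// ops: grayscale conversion, Gaussian denoising, Sobel gradients, and
+// normalized gradient magnitude.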
+void* canny_filter(void* dataset) {
+  Tensor *gaussian = gaussianFilter(1.4, 5, 5, 1);
+  Tensor *kernel_x, *kernel_y;
+  std::tie(kernel_x, kernel_y) = getSobelKernels();
+
+  // 0. Grayscale
+  auto *summed_image = autotuner_tensorReduce(dataset, 1, MathOp::Add);
+  auto *grayscale_image = autotuner_tensorMap1(MathOp::Avg3, summed_image);
+  // 1. Denoise
+  auto *image2 = ConvLayer_PROMISE(grayscale_image, 0.0, 0.0, gaussian,
+				   0.0, 0.0, nullptr, 0.0, 0.0, 2, 2, 1,
+				   1, 0, 0, -1, 0.0, 0.0, 0);
+  // 2. Get edge gradient / direction
+  auto *grad_x = ConvLayer_PROMISE(
+      image2, 0.0, 0.0, kernel_x, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0,
+      -1, 0.0, 0.0, 0);
+  auto *grad_y = ConvLayer_PROMISE(
+      image2, 0.0, 0.0, kernel_y, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0,
+      -1, 0.0, 0.0, 0);
+  auto *grad_mag = autotuner_tensorMap2(MathOp::Hypot, grad_x, grad_y);
+  // 2.5. Normalize grad magnitude
+  auto *grad_max_1D = autotuner_tensorReduce(grad_mag, 2, MathOp::Max);
+  auto *grad_max = autotuner_tensorReduce(grad_max_1D, 3, MathOp::Max);
+  auto *grad_mag_norm = autotuner_tensorMap2(MathOp::Div, grad_mag, grad_max);
+  return grad_mag_norm;
+}
+
+const size_t batch_size = 500, total_max = 3000;
+const float psnr_threshold = 30.0;
+
+
+
+
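+// Standalone driver for the Canny pipeline: runs batches of the
+// image_processing_5k inputs, compares the results against the canny_ref_output
+// reference with PSNR, and writes the violation rate and mean PSNR to
+// "final_accuracy". The AlexNet driver below provides this file's main().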
+int canny_psnr_main() {
+  const char *input_path = "../model_params/image_processing_5k";
+  const char *ref_output_path = "../model_params/canny_ref_output";
+  std::vector<float> psnr;
+  llvm_hpvm_initTensorRt(1);
+  size_t bstart = 0;
+  startMemTracking();
+  while (true) {
+    Tensor *batch = readDataSet(input_path, bstart, batch_size);
+    if (batch == nullptr)
+      break;
+
+    auto *result = canny_filter(batch);
+    auto *ref_output = readDataSet(ref_output_path, bstart, batch_size, 1);
+    std::vector<float> psnr_batch = PSNR(ref_output, result);
+    std::copy(psnr_batch.begin(), psnr_batch.end(), std::back_inserter(psnr));
+    bstart += batch_size;
+    if (bstart >= total_max)
+      break;
+    freeBatchMemory();
+  }
+  float violation = violationRate(psnr, psnr_threshold);
+  float mean_psnr = mean(psnr);
+  std::ofstream of("final_accuracy");
+  of << violation * 100 << ", " << mean_psnr << '\n';
+  return 0;
+}
+
+
+
+
+
+
+
+
+int main() {
+
+  llvm_hpvm_initTensorRt(0); 
+
+
+  std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/");
+ 
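+  // Input, label, and pretrained weight paths for the AlexNet-style CIFAR-10
+  // network (5 convolution layers + 1 fully-connected layer).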
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); 
+  std::string labels_path =  dir_prefix + std::string("labels.bin"); 
+  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
+  std::string conv2d_1_w_path =  dir_prefix + std::string("conv0.bin"); 
+  void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
+  std::string conv2d_1_b_path =  dir_prefix + std::string("conv_bias0.bin"); 
+  void* conv2d_1_b =  readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); 
+  std::string conv2d_2_w_path =  dir_prefix + std::string("conv3.bin"); 
+  void* conv2d_2_w =  readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); 
+  std::string conv2d_2_b_path =  dir_prefix + std::string("conv_bias3.bin"); 
+  void* conv2d_2_b =  readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); 
+  std::string conv2d_3_w_path =  dir_prefix + std::string("conv6.bin"); 
+  void* conv2d_3_w =  readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); 
+  std::string conv2d_3_b_path =  dir_prefix + std::string("conv_bias6.bin"); 
+  void* conv2d_3_b =  readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); 
+  std::string conv2d_4_w_path =  dir_prefix + std::string("conv7.bin"); 
+  void* conv2d_4_w =  readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); 
+  std::string conv2d_4_b_path =  dir_prefix + std::string("conv_bias7.bin"); 
+  void* conv2d_4_b =  readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); 
+  std::string conv2d_5_w_path =  dir_prefix + std::string("conv8.bin"); 
+  void* conv2d_5_w =  readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); 
+  std::string conv2d_5_b_path =  dir_prefix + std::string("conv_bias8.bin"); 
+  void* conv2d_5_b =  readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); 
+  std::string dense_1_w_path =  dir_prefix + std::string("fc12.bin"); 
+  void* dense_1_w =  readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); 
+  std::string dense_1_b_path =  dir_prefix + std::string("fc_bias12.bin"); 
+  void* dense_1_b =  readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+
+
+  startMemTracking();
+
+  int test_input_size = 2000;
+  int batch_size = 2000;
+  int batch_count = test_input_size / batch_size;
+  float final_accuracy = 0.0;
+
+  // NOTE: Starting time profiling
+  startProfiling();
+  
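+  // Run inference batch by batch and accumulate the per-batch accuracy.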
+  for(int i = 0; i < batch_count; i++){
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);    
+
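+    // Conv block 1: 64 filters of 11x11 (pad 5, stride 1), bias, tanh, 2x2 pooling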
+    void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); 
+    void* var_1 = tensorAdd(var_0, conv2d_1_b); 
+    void* var_2 = tensorTanh(var_1); 
+    void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); 
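+    // Conv block 2: 192 filters of 5x5 (pad 2, stride 1), bias, tanh, 2x2 pooling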
+    void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); 
+    void* var_6 = tensorAdd(var_5, conv2d_2_b); 
+    void* var_7 = tensorTanh(var_6); 
+    void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); 
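+    // Conv block 3: 384 filters of 3x3 (pad 1, stride 1), bias, tanh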
+    void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); 
+    void* var_11 = tensorAdd(var_10, conv2d_3_b); 
+    void* var_12 = tensorTanh(var_11); 
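+    // Conv block 4: 256 filters of 3x3 (pad 1, stride 1), bias, tanh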
+    void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); 
+    void* var_14 = tensorAdd(var_13, conv2d_4_b); 
+    void* var_15 = tensorTanh(var_14); 
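+    // Conv block 5: 256 filters of 3x3 (pad 1, stride 1), bias, tanh, 2x2 pooling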
+    void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); 
+    void* var_17 = tensorAdd(var_16, conv2d_5_b); 
+    void* var_18 = tensorTanh(var_17); 
+    void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); 
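+    // Classifier: fully-connected layer (4096 -> 10), bias, softmax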
+    void* var_22 = tensorGemmGPU(var_19, dense_1_w); 
+    void* var_23 = tensorAdd(var_22, dense_1_b); 
+    void* var_24 = tensorSoftmax(var_23); 
+
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+    float accuracy = computeAccuracy2(labels,batch_size,var_24); 
+    final_accuracy += accuracy;
+    
+    freeBatchMemory();
+  }
+
+  stopProfiling();
+
+  final_accuracy = final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
+
+
+  llvm_hpvm_cleanupTensorRt(); 
+
+  return 0; 
+
+}