diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td
index c6ce86c504efc6a56b3f6888977265335d5cc31e..131f1384cbdd35d0949056a9d3b083fefdc90a6c 100644
--- a/llvm/include/llvm/IR/IntrinsicsVISC.td
+++ b/llvm/include/llvm/IR/IntrinsicsVISC.td
@@ -245,11 +245,21 @@ let TargetPrefix = "visc" in {
    */
   def int_visc_tensor_relu : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
 
+  /* Tensor clipped relu intrinsic
+   * i8* llvm.visc.tensor.clipped.relu(i8*);
+   */
+  def int_visc_tensor_clipped_relu : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
+
   /* Tensor tanh intrinsic
    * i8* llvm.visc.tensor.tanh(i8*);
    */
   def int_visc_tensor_tanh : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
 
+  /* Tensor sigmoid intrinsic
+   * i8* llvm.visc.tensor.sigmoid(i8*);
+   */
+  def int_visc_tensor_sigmoid : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], []>;
+
   /* Tensor softmax intrinsic
    * i8* llvm.visc.tensor.softmax(i8*);
    */
diff --git a/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp b/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
index a6374406d2b0da0b488798838a6175882eea1a3d..229a48a529ebb46d2d982278f81ff08abf3a317a 100644
--- a/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
@@ -270,6 +270,43 @@ void CGT_CUDNN::codeGen(DFLeafNode* N) {
       /********************* Handle VISC Tensor intrinsics ********************/
       switch (II->getIntrinsicID()) {
 
+      case Intrinsic::visc_tensor_convolution:
+      { /* llvm.hpvm.tensor.convolution */
+        // Tensor convolution is not in place.
+        DEBUG(errs() << F_cudnn->getName() << "\t: Handling tensor convolution \n");
+
+        // Argument list for the runtime call
+        std::vector<Value*> Args;
+        Args.push_back(II->getOperand(0));
+        Args.push_back(II->getOperand(1));
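+        // Operands 2-5 are the four integer convolution parameters
+        // (cf. the __visc__tensor_convolution declaration in visc.h)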
+        Args.push_back(II->getOperand(2));
+        Args.push_back(II->getOperand(3));
+        Args.push_back(II->getOperand(4));
+        Args.push_back(II->getOperand(5));
+
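+        // Beyond the six intrinsic operands, the runtime call also expects a
+        // convolution mode and a precision flag, appended here as constants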
+        Constant* conv_mode = ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
+        Constant* conv_precision = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+
+        Args.push_back(conv_mode);
+        Args.push_back(conv_precision);
+
+        // Create cudnn runtime function call
+        Constant* tensorConvolution;
+        DECLARE(tensorConvolution);
+
+        CallInst* CI = CallInst::Create(tensorConvolution,
+                                        Args, "", II);
+        // We can replace the call to hpvm.tensor.convolution with the runtime call
+        II->replaceAllUsesWith(CI);
+
+        // Mark to remove at the end
+        IItoRemove.push_back(II);
+      }
+      break;
+
       case Intrinsic::visc_tensor_mul:
       { /* llvm.hpvm.tensor.mul */
         // Tensor mul is not in place.
@@ -388,12 +425,12 @@ void CGT_CUDNN::codeGen(DFLeafNode* N) {
     }
   }
 
   // We need to do this explicitly: DCE pass may not remove them.
   // Traverse the vector backwards, otherwise definitions are deleted while
   // their subsequent uses are still around.
   for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
        re = IItoRemove.rend(); ri != re; ++ri) {
     DEBUG(errs() << "Erasing: " << **ri << "\n");
     (*ri)->eraseFromParent();
   }
 
diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
index 2f6282deac47a193abf5711d04cdf809466a9187..01effd433eb6e0d27f6d2f891909b11b51b0257b 100644
--- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -165,8 +165,11 @@ IS_VISC_CALL(hint)
 
 // Tensor Operators
 IS_VISC_CALL(tensor_mul)
+IS_VISC_CALL(tensor_convolution)
 IS_VISC_CALL(tensor_add)
 IS_VISC_CALL(tensor_relu)
+IS_VISC_CALL(tensor_tanh)
+IS_VISC_CALL(tensor_sigmoid)
 IS_VISC_CALL(tensor_softmax)
 
 // Return the constant integer represented by value V
@@ -1263,6 +1266,17 @@ bool GenVISC::runOnModule(Module &M) {
       if (isVISCCall_cos(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::cos, &toBeErased);
       }
+      if (isVISCCall_tensor_convolution(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_convolution, &toBeErased);
+      }
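+      // Assumed wiring for the new tanh and sigmoid ops, mirroring the
+      // surrounding tensor-op cases (their IS_VISC_CALL predicates are otherwise unused)
+      if (isVISCCall_tensor_tanh(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_tanh, &toBeErased);
+      }
+      if (isVISCCall_tensor_sigmoid(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_sigmoid, &toBeErased);
+      }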
       if (isVISCCall_tensor_add(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_tensor_add, &toBeErased);
       }
diff --git a/llvm/test/VISC/DNN_Benchmarks/common/include/tensorUtils.h b/llvm/test/VISC/DNN_Benchmarks/common/include/tensorUtils.h
index b75d5520fe7ac731ce69e3fe17b24238fa19440a..4200ad1569acce4f42a23e49fda3eec64409f980 100644
--- a/llvm/test/VISC/DNN_Benchmarks/common/include/tensorUtils.h
+++ b/llvm/test/VISC/DNN_Benchmarks/common/include/tensorUtils.h
@@ -16,10 +16,10 @@ void printTensorInfo(void* tensor_ptr){
     printf("Successful cudaMalloc \n");
   }
 
-  printf("tensor dims = %zu \n", tensor->dims.num_dims);
+  printf("tensor dims = %d \n", tensor->dims.num_dims);
   printf("dim1_size = %zu \n", tensor->dims.dim_sizes[0]);
   printf("dim2_size = %zu \n", tensor->dims.dim_sizes[1]);
-  printf("num_elems = %d \n", tensor->num_elems);
+  printf("num_elems = %zu \n", tensor->num_elems);
 }
 
 
diff --git a/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h b/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
index 30e15359bcfcca793717025e96655bd90f09f80f..cf6180e40600469aee158b0c9881d57135d60d9e 100644
--- a/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
+++ b/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
@@ -94,7 +94,11 @@ float __visc__cos(float);
 
 void* __visc__tensor_add(void*, void*);
 void* __visc__tensor_mul(void*, void*);
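+// convolution args: input tensor, filter tensor, then four integer
+// parameters (presumably padding and stride for each spatial dimension)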
+void* __visc__tensor_convolution(void*, void*, int, int, int, int);
 void* __visc__tensor_relu(void*);
+void* __visc__tensor_tanh(void*);
+void* __visc__tensor_sigmoid(void*);
 void* __visc__tensor_softmax(void*);
 
 #include <unistd.h>