diff --git a/.gitignore b/.gitignore
index 64ff9e1e7fd06833e3cac1e52496b3850728c4a5..a39f1eeab21b98c0c9b1bced43e6b09b0b17ba06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,5 @@ llvm/test/VISC/parboil/benchmarks/*/run
 llvm/test/VISC/parboil/benchmarks/*/build
 llvm/build
 llvm/install
+build/
+install/
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/Makefile
index b022f2dc7356e717ff1ff875f6b51e23811a3037..3287830462d17ab791b54dd4cfd90172bd3aa802 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/Makefile
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/Makefile
@@ -1,6 +1,6 @@
-# NOTE: $LLVM_SRC_ROOT and $HPVM_BUILD_ROOT have to be set
+# NOTE: LLVM_SRC_ROOT and CUDA_TOOLKIT_ROOT_DIR must be set (in the environment or on the make command line)
 DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks
-HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/build
+HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build
 
 CC = $(HPVM_BUILD_DIR)/bin/clang++
 OPT = $(HPVM_BUILD_DIR)/bin/opt
@@ -15,11 +15,11 @@ APP = canny_test
 
 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
 TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
-TENSOR_LIB_PATH = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a
+CUSTOM_LIB_PATHS = $(addprefix $(HPVM_BUILD_DIR)/lib/,libtensor_runtime.a libgpu_profiler.a libpromise_profiler.a)
 
-CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH)  -fno-exceptions -ffast-math -std=c++11 -O3
+CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_TOOLKIT_ROOT_DIR)/include  -fno-exceptions -ffast-math -std=c++11 -O3
 CCFLAGS += -DDEVICE=CUDNN_TARGET
-LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lOpenCL
+LINKER_FLAGS = -L $(CUDA_TOOLKIT_ROOT_DIR)/lib64 -lpthread -lcudart -lcurand -lcudnn -lcublas -lOpenCL -lcufft
 
 HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib
 
@@ -44,13 +44,13 @@ default: $(BUILD_DIR) $(TARGET)
 
 
 $(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp
-	$(CC) $(CC_FLAGS) -emit-llvm src/$(APP).cpp -c -o  $(BUILD_DIR)/$(APP).ll  
+	$(CC) $(CC_FLAGS) -emit-llvm $< -S -o $@
 
 $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP).ll -S -o  $(BUILD_DIR)/$(APP).visc.ll
 	$(OPT) $(VISC_OPTFLAGS)  $(BUILD_DIR)/$(APP).visc.ll  -o  $(BUILD_DIR)/$(APP)_wrapper.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapper.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapper_linked.bc
-	$(CC) $(BUILD_DIR)/$(APP)_wrapper_linked.bc $(TENSOR_LIB_PATH) -o $(BUILD_DIR)/$(APP)_final $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_wrapper_linked.bc $(CUSTOM_LIB_PATHS) -o $(BUILD_DIR)/$(APP)_final $(LINKER_FLAGS)
 
 $(BUILD_DIR):
 	mkdir -p $@
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/src/canny_test.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/src/canny_test.cpp
index b4e3db13da5d1fa0d21e0fbd6defdb7b33a651c1..5f254f5e953446f767e45f6856f04a308883f0c1 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/src/canny_test.cpp
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/canny_test/src/canny_test.cpp
@@ -39,7 +39,7 @@ void var_2_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
     __visc__hint(visc::PROMISE_TARGET);
     __visc__attributes(2, t1, t2, 0);
 
-    void* r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1, 1, 0);
+    void* r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1);
     __visc__return(2, r, (size_t) 0);
 }
 
@@ -49,7 +49,7 @@ void var_3_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
     __visc__hint(visc::PROMISE_TARGET);
     __visc__attributes(2, t1, t2, 0);
 
-    void* r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1, 1, 0);
+    void* r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1);
     __visc__return(2, r, (size_t) 0);
 }
 
@@ -57,7 +57,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
     __visc__hint(visc::PROMISE_TARGET);
     __visc__attributes(2, t1, t2, 0);
 
-    void* r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1, 1, 0);
+    void* r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1);
     __visc__return(2, r, (size_t) 0);
 }