diff --git a/.gitignore b/.gitignore
index ed521a76d8d20f96025eaf7304abc13fe4bc0344..70639e367bc68fedc132d3b39f4c606e4f8a532f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,10 +28,10 @@ lit.site.cfg
 *.dot
 JITTests.exports
 
-hpvm/build
-hpvm/install
-llvm/
-llvm-*.src/
-llvm-*.src.tar.xz
+hpvm/build/
+hpvm/install/
+hpvm/llvm/
+hpvm/llvm-*.src.tar.xz
+hpvm/llvm-*.src/
 hpvm/projects/visc-rt/visc-rt.ll
 hpvm/test/parboil/benchmarks/*/build/
diff --git a/hpvm/Makefile b/hpvm/Makefile
deleted file mode 100644
index a4dab2b2426ab196333b921457eb530608cf9d5d..0000000000000000000000000000000000000000
--- a/hpvm/Makefile
+++ /dev/null
@@ -1,2 +0,0 @@
-all:
-	/bin/bash llvm_installer/llvm_installer.sh
diff --git a/hpvm/install.h b/hpvm/install.h
new file mode 100644
index 0000000000000000000000000000000000000000..359f3d8837515659def90969bf6a6e4a1bca4579
--- /dev/null
+++ b/hpvm/install.h
@@ -0,0 +1,3 @@
+
+# Run installer script
+/bin/bash llvm_installer/llvm_installer.sh
diff --git a/hpvm/llvm_installer/llvm_installer.sh b/hpvm/llvm_installer/llvm_installer.sh
index 1740e84f8df06f526dd65719a57abc5845e72e52..6912c51f1137fc7ffc49912899691ec89d0441eb 100755
--- a/hpvm/llvm_installer/llvm_installer.sh
+++ b/hpvm/llvm_installer/llvm_installer.sh
@@ -11,7 +11,7 @@ CURRENT_DIR=`pwd`
 INSTALL_DIR=`pwd`/install
 BUILD_DIR=$CURRENT_DIR/$LLVM_SRC/build
 
-NUM_THREADS=12
+NUM_THREADS=16
 
 SUFFIX=".tar.xz"
 CLANG_SRC="cfe-$VERSION.src"
@@ -178,7 +178,5 @@ else
 fi
 
 cd $CURRENT_DIR
-echo "Add $INSTALL_DIR/bin to PATH variable."
-export PATH="$INSTALL_DIR/bin:$PATH"
 
 
diff --git a/hpvm/projects/visc-rt/makefile b/hpvm/projects/visc-rt/makefile
index 01cc2b7b3fd548f918ea8f43ad5c5bbd4642edc6..e0f7e3b2c0311362f49b7be1915bb54a4f0adf71 100644
--- a/hpvm/projects/visc-rt/makefile
+++ b/hpvm/projects/visc-rt/makefile
@@ -2,12 +2,14 @@
 LLVM_BUILD_ROOT = ${LLVM_SRC_ROOT}/../build/
 
 OPENCL_INC_PATH = /opt/intel/opencl-sdk/include
+CUDA_INC_PATH = /software/cuda-9.1/include/CL/
+
 
 ifeq ($(NUM_CORES),)
   NUM_CORES=8
 endif
 
-CPP_FLAGS = -I $(LLVM_SRC_ROOT)/include -I $(LLVM_BUILD_ROOT)/include -I $(OPENCL_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+CPP_FLAGS = -I $(LLVM_SRC_ROOT)/include -I $(LLVM_BUILD_ROOT)/include -I $(OPENCL_INC_PATH) -I $(CUDA_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
 TARGET:=visc-rt
 
 LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang
diff --git a/hpvm/test/parboil/benchmarks/sgemm/Makefile b/hpvm/test/parboil/benchmarks/sgemm/Makefile
index 9069d99772909f1a7414fd9364bddbe9439988d5..ace9ded22b6ef365c9cd0f6262245dd2e086643d 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/Makefile
@@ -19,7 +19,7 @@ BIN = $(addsuffix -$(VERSION), $(APP))
 
 SRCDIR = src/$(VERSION)
 BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
+DATASET_DIR ?= $(PARBOIL_ROOT)/datasets/$(APP)
 
 MATRIX1 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
 MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix2.txt
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
index 22d9098646f3ca1e8582a319e93e68e9a58e42d8..af9ee76e0fed3ced9e2666193afbd7c0631f1ce8 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
@@ -279,4 +279,3 @@ int main (int argc, char *argv[]) {
 
     return 0;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
index 74d2f44495290de30a624a3606d64b56ddc92b63..9ecba96aed5642a4babaea8667576c25c1e4fb1f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
@@ -33,14 +33,41 @@ static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
     return 0;
 }
 
-void naive_kernel(float c0,float c1, float* A0, float *Anext,int nx,int ny,int nz)
+typedef struct __attribute__((__packed__)) {
+    float c0, c1;
+    float* A0; size_t bytes_A0;
+    float* Anext; size_t bytes_Anext;
+    int nx, ny, nz;
+    size_t dim_X1, dim_Y1, dim_Z1;
+    size_t dim_X2, dim_Y2, dim_Z2;
+} RootIn;
+
+void naive_kernel(float c0, float c1, 
+                  float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                  int nx, int ny, int nz) 
 {
     __visc__hint(visc::DEVICE);
     __visc__attributes(2, A0, Anext, 1, Anext);
-    int i = get_global_id(0)+1;
-    int j = get_global_id(1)+1;
-    int k = get_global_id(2)+1;
 
+    void* thisNode = __visc__getNode();
+    void* parentNode = __visc__getParentNode(thisNode);
+
+    int lx = __visc__getNodeInstanceID_x(thisNode);
+    int ly = __visc__getNodeInstanceID_y(thisNode);
+    int lz = __visc__getNodeInstanceID_z(thisNode);
+
+    int gx = __visc__getNodeInstanceID_x(parentNode);
+    int gy = __visc__getNodeInstanceID_y(parentNode);
+    int gz = __visc__getNodeInstanceID_z(parentNode);
+    
+    int gridx = __visc__getNumNodeInstances_x(thisNode);
+    int gridy = __visc__getNumNodeInstances_y(thisNode);
+    int gridz = __visc__getNumNodeInstances_z(thisNode);
+
+    int i = gx * gridx + lx + 1;
+    int j = gy * gridy + ly + 1;
+    int k = gz * gridz + lz + 1;
+    
     if(i<nx-1)
     {
         Anext[Index3D (nx, ny, i, j, k)] = c1 *
@@ -54,6 +81,74 @@ void naive_kernel(float c0,float c1, float* A0, float *Anext,int nx,int ny,int n
     }
 }
 
+void stencilLvl1(float c0, float c1, 
+                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                 int nx, int ny, int nz,
+                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1) 
+{
+    __visc__hint(visc::DEVICE);
+    __visc__attributes(2, A0, Anext, 1, Anext);
+    void* stencil_node = __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
+    __visc__bindIn(stencil_node, 0, 0, 0);
+    __visc__bindIn(stencil_node, 1, 1, 0);
+    __visc__bindIn(stencil_node, 2, 2, 0);
+    __visc__bindIn(stencil_node, 3, 3, 0);
+    __visc__bindIn(stencil_node, 4, 4, 0);
+    __visc__bindIn(stencil_node, 5, 5, 0);
+    __visc__bindIn(stencil_node, 6, 6, 0);
+    __visc__bindIn(stencil_node, 7, 7, 0);
+    __visc__bindIn(stencil_node, 8, 8, 0);
+}
+
+void stencilLvl2(float c0, float c1, 
+                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                 int nx, int ny, int nz,
+                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1,
+                 size_t dim_X2, size_t dim_Y2, size_t dim_Z2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(2, A0, Anext, 1, Anext);
+    void* stencil_node = __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
+    __visc__bindIn(stencil_node, 0, 0, 0);
+    __visc__bindIn(stencil_node, 1, 1, 0);
+    __visc__bindIn(stencil_node, 2, 2, 0);
+    __visc__bindIn(stencil_node, 3, 3, 0);
+    __visc__bindIn(stencil_node, 4, 4, 0);
+    __visc__bindIn(stencil_node, 5, 5, 0);
+    __visc__bindIn(stencil_node, 6, 6, 0);
+    __visc__bindIn(stencil_node, 7, 7, 0);
+    __visc__bindIn(stencil_node, 8, 8, 0);
+    __visc__bindIn(stencil_node, 9, 9, 0);
+    __visc__bindIn(stencil_node, 10, 10, 0);
+    __visc__bindIn(stencil_node, 11, 11, 0);
+}
+
+void stencilLvl3(float c0, float c1, 
+                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                 int nx, int ny, int nz,
+                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1,
+                 size_t dim_X2, size_t dim_Y2, size_t dim_Z2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(2, A0, Anext, 1, Anext);
+    void* stencil_node = __visc__createNodeND(0, stencilLvl2);
+    __visc__bindIn(stencil_node, 0, 0, 0);
+    __visc__bindIn(stencil_node, 1, 1, 0);
+    __visc__bindIn(stencil_node, 2, 2, 0);
+    __visc__bindIn(stencil_node, 3, 3, 0);
+    __visc__bindIn(stencil_node, 4, 4, 0);
+    __visc__bindIn(stencil_node, 5, 5, 0);
+    __visc__bindIn(stencil_node, 6, 6, 0);
+    __visc__bindIn(stencil_node, 7, 7, 0);
+    __visc__bindIn(stencil_node, 8, 8, 0);
+    __visc__bindIn(stencil_node, 9, 9, 0);
+    __visc__bindIn(stencil_node, 10, 10, 0);
+    __visc__bindIn(stencil_node, 11, 11, 0);
+    __visc__bindIn(stencil_node, 12, 12, 0);
+    __visc__bindIn(stencil_node, 13, 13, 0);
+    __visc__bindIn(stencil_node, 14, 14, 0);
+}
+
 int main(int argc, char** argv) {
     struct pb_TimerSet timers;
     struct pb_Parameters *parameters;
@@ -124,13 +219,13 @@ int main(int argc, char** argv) {
 
 
     //only use 1D thread block
-    int tx =256;
-    long block[3] = {tx,1,1};
-    long grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
+    size_t tx = 256;
+    size_t block[3] = {tx,1,1};
+    size_t grid[3] = {((unsigned)nx-2+tx-1)/tx*tx,(unsigned)ny-2,(unsigned)nz-2};
     //size_t grid[3] = {nx-2,ny-2,nz-2};
     size_t offset[3] = {1,1,1};
 
-    printf("grid(%d, %d, %d), block(%d, %d, %d)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
+    printf("grid(%ld, %ld, %ld), block(%ld, %ld, %ld)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
     //main execution
 
     int t;
@@ -140,14 +235,25 @@ int main(int argc, char** argv) {
     for(t=0; t<iteration; t++)
     {
         pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        void* stencilDFG = __visc__node(naive_kernel, 2, 3, block[0], block[1], block[2], grid[0]/block[0], grid[1]/block[1], grid[2]/block[2], 9, (float)c0, (float)c1, h_A0, bytes, h_Anext, bytes, nx, ny, nz, 0);
+        
+        void* root_in = malloc(sizeof(RootIn));
+        RootIn root_in_local = {
+            c0, c1,
+            h_A0, bytes,
+            h_Anext, bytes,
+            nx, ny, nz,
+            block[0], block[1], block[2],
+            grid[0]/block[0], grid[1]/block[1], grid[2]/block[2]
+        };
+        *(RootIn*)root_in = root_in_local;
+        void* stencilDFG = __visc__launch(0, stencilLvl3, root_in);
+
         __visc__wait(stencilDFG);
         //printf("iteration %d\n",t);
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
         float* h_temp = h_A0;
         h_A0 = h_Anext;
         h_Anext = h_temp;
-
     }