diff --git a/hpvm/test/parboil/benchmarks/sgemm/Makefile b/hpvm/test/parboil/benchmarks/sgemm/Makefile
index 9069d99772909f1a7414fd9364bddbe9439988d5..ace9ded22b6ef365c9cd0f6262245dd2e086643d 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/Makefile
+++ b/hpvm/test/parboil/benchmarks/sgemm/Makefile
@@ -19,7 +19,7 @@ BIN = $(addsuffix -$(VERSION), $(APP))
 
 SRCDIR = src/$(VERSION)
 BUILDDIR = build/$(VERSION)_$(PLATFORM)
-DATASET_DIR = $(PARBOIL_ROOT)/datasets/$(APP)
+DATASET_DIR ?= $(PARBOIL_ROOT)/datasets/$(APP)
 
 MATRIX1 = $(DATASET_DIR)/$(TEST)/input/matrix1.txt
 MATRIX2 = $(DATASET_DIR)/$(TEST)/input/matrix2.txt
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
index 22d9098646f3ca1e8582a319e93e68e9a58e42d8..af9ee76e0fed3ced9e2666193afbd7c0631f1ce8 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
@@ -279,4 +279,3 @@ int main (int argc, char *argv[]) {
 
     return 0;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
index 74d2f44495290de30a624a3606d64b56ddc92b63..11a02fdd488acbaf9c080cab63e968b3310e180e 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
@@ -33,14 +33,41 @@ static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
     return 0;
 }
 
-void naive_kernel(float c0,float c1, float* A0, float *Anext,int nx,int ny,int nz)
+typedef struct __attribute__((__packed__)) {
+    float c0, c1;
+    float* A0; size_t bytes_A0;
+    float* Anext; size_t bytes_Anext;
+    int nx, ny, nz;
+    size_t dim_X1, dim_Y1, dim_Z1;
+    size_t dim_X2, dim_Y2, dim_Z2;
+} RootIn;
+
+void naive_kernel(float c0, float c1, 
+                  float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                  int nx, int ny, int nz) 
 {
     __visc__hint(visc::DEVICE);
     __visc__attributes(2, A0, Anext, 1, Anext);
-    int i = get_global_id(0)+1;
-    int j = get_global_id(1)+1;
-    int k = get_global_id(2)+1;
 
+    void* thisNode = __visc__getNode();
+    void* parentNode = __visc__getParentNode(thisNode);
+
+    int lx = __visc__getNodeInstanceID_x(thisNode);
+    int ly = __visc__getNodeInstanceID_y(thisNode);
+    int lz = __visc__getNodeInstanceID_z(thisNode);
+
+    int gx = __visc__getNodeInstanceID_x(parentNode);
+    int gy = __visc__getNodeInstanceID_y(parentNode);
+    int gz = __visc__getNodeInstanceID_z(parentNode);
+    
+    int gridx = __visc__getNumNodeInstances_x(thisNode);
+    int gridy = __visc__getNumNodeInstances_y(thisNode);
+    int gridz = __visc__getNumNodeInstances_z(thisNode);
+
+    int i = gx * gridx + lx + 1;
+    int j = gy * gridy + ly + 1;
+    int k = gz * gridz + lz + 1;
+    
     if(i<nx-1)
     {
         Anext[Index3D (nx, ny, i, j, k)] = c1 *
@@ -54,6 +81,74 @@ void naive_kernel(float c0,float c1, float* A0, float *Anext,int nx,int ny,int n
     }
 }
 
+void stencilLvl1(float c0, float c1, 
+                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                 int nx, int ny, int nz,
+                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1) 
+{
+    __visc__hint(visc::DEVICE);
+    __visc__attributes(2, A0, Anext, 1, Anext);
+    void* stencil_node = __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
+    __visc__bindIn(stencil_node, 0, 0, 0);
+    __visc__bindIn(stencil_node, 1, 1, 0);
+    __visc__bindIn(stencil_node, 2, 2, 0);
+    __visc__bindIn(stencil_node, 3, 3, 0);
+    __visc__bindIn(stencil_node, 4, 4, 0);
+    __visc__bindIn(stencil_node, 5, 5, 0);
+    __visc__bindIn(stencil_node, 6, 6, 0);
+    __visc__bindIn(stencil_node, 7, 7, 0);
+    __visc__bindIn(stencil_node, 8, 8, 0);
+}
+
+void stencilLvl2(float c0, float c1, 
+                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                 int nx, int ny, int nz,
+                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1,
+                 size_t dim_X2, size_t dim_Y2, size_t dim_Z2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(2, A0, Anext, 1, Anext);
+    void* stencil_node = __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
+    __visc__bindIn(stencil_node, 0, 0, 0);
+    __visc__bindIn(stencil_node, 1, 1, 0);
+    __visc__bindIn(stencil_node, 2, 2, 0);
+    __visc__bindIn(stencil_node, 3, 3, 0);
+    __visc__bindIn(stencil_node, 4, 4, 0);
+    __visc__bindIn(stencil_node, 5, 5, 0);
+    __visc__bindIn(stencil_node, 6, 6, 0);
+    __visc__bindIn(stencil_node, 7, 7, 0);
+    __visc__bindIn(stencil_node, 8, 8, 0);
+    __visc__bindIn(stencil_node, 9, 9, 0);
+    __visc__bindIn(stencil_node, 10, 10, 0);
+    __visc__bindIn(stencil_node, 11, 11, 0);
+}
+
+void stencilLvl3(float c0, float c1, 
+                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
+                 int nx, int ny, int nz,
+                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1,
+                 size_t dim_X2, size_t dim_Y2, size_t dim_Z2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(2, A0, Anext, 1, Anext);
+    void* stencil_node = __visc__createNodeND(0, stencilLvl2);
+    __visc__bindIn(stencil_node, 0, 0, 0);
+    __visc__bindIn(stencil_node, 1, 1, 0);
+    __visc__bindIn(stencil_node, 2, 2, 0);
+    __visc__bindIn(stencil_node, 3, 3, 0);
+    __visc__bindIn(stencil_node, 4, 4, 0);
+    __visc__bindIn(stencil_node, 5, 5, 0);
+    __visc__bindIn(stencil_node, 6, 6, 0);
+    __visc__bindIn(stencil_node, 7, 7, 0);
+    __visc__bindIn(stencil_node, 8, 8, 0);
+    __visc__bindIn(stencil_node, 9, 9, 0);
+    __visc__bindIn(stencil_node, 10, 10, 0);
+    __visc__bindIn(stencil_node, 11, 11, 0);
+    __visc__bindIn(stencil_node, 12, 12, 0);
+    __visc__bindIn(stencil_node, 13, 13, 0);
+    __visc__bindIn(stencil_node, 14, 14, 0);
+}
+
 int main(int argc, char** argv) {
     struct pb_TimerSet timers;
     struct pb_Parameters *parameters;
@@ -130,7 +225,7 @@ int main(int argc, char** argv) {
     //size_t grid[3] = {nx-2,ny-2,nz-2};
     size_t offset[3] = {1,1,1};
 
-    printf("grid(%d, %d, %d), block(%d, %d, %d)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
+    printf("grid(%ld, %ld, %ld), block(%ld, %ld, %ld)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
     //main execution
 
     int t;
@@ -140,14 +235,25 @@ int main(int argc, char** argv) {
     for(t=0; t<iteration; t++)
     {
         pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        void* stencilDFG = __visc__node(naive_kernel, 2, 3, block[0], block[1], block[2], grid[0]/block[0], grid[1]/block[1], grid[2]/block[2], 9, (float)c0, (float)c1, h_A0, bytes, h_Anext, bytes, nx, ny, nz, 0);
+        
+        void* root_in = malloc(sizeof(RootIn));
+        RootIn root_in_local = {
+            c0, c1,
+            h_A0, bytes,
+            h_Anext, bytes,
+            nx, ny, nz,
+            block[0], block[1], block[2],
+            grid[0]/block[0], grid[1]/block[1], grid[2]/block[2]
+        };
+        *(RootIn*)root_in = root_in_local;
+        void* stencilDFG = __visc__launch(0, stencilLvl3, root_in);
+
         __visc__wait(stencilDFG);
         //printf("iteration %d\n",t);
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
         float* h_temp = h_A0;
         h_A0 = h_Anext;
         h_Anext = h_temp;
-
     }