diff --git a/hpvm/test/parboil/benchmarks/lbm/Makefile b/hpvm/test/parboil/benchmarks/lbm/Makefile
index 5644a7d42c2cb64961f27b914ced9b24b172d273..ee7bcb33b4c22f2d47bada51a9d89b0cddf270b2 100644
--- a/hpvm/test/parboil/benchmarks/lbm/Makefile
+++ b/hpvm/test/parboil/benchmarks/lbm/Makefile
@@ -1,4 +1,4 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
+PARBOIL_ROOT = $(LLVM_SRC_ROOT)/../test/parboil
 APP = lbm
 
 ifeq ($(NUM_CORES),)
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d8ceb373dfe0e7a02d374c78a9c3fd68cc8f3085 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
@@ -6,7 +6,7 @@
  *cr
  ***************************************************************************/
 
-#ifndef _LBM_MARCOS_H
+#ifndef _LBM_MACROS_H_
 #define _LBM_MACROS_H_
 
 #define OMEGA (1.95f)
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
index 255036539c25d9da9cb46329b69ec9ce14bd57ac..605513847af6665d2d20b01cb659ecbd393ddb26 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
@@ -82,21 +82,35 @@ void MAIN_printInfo( const MAIN_Param* param ) {
 }
 
 /*############################################################################*/
-void performStreamCollide_kernel( float* srcG, float* dstG )
+
+typedef struct __attribute__((__packed__)) {
+    float* srcG; size_t bytes_srcG;
+    float* dstG; size_t bytes_dstG;
+    size_t dim_X1, dim_X2, dim_Y2;
+} RootIn;
+
+void performStreamCollide_kernel( float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG )
 {
     __visc__hint(visc::DEVICE);
     __visc__attributes(2, srcG, dstG, 1, dstG);
+
+    void* thisNode = __visc__getNode();
+    void* parentNode = __visc__getParentNode(thisNode);
+
     srcG += MARGIN;
     dstG += MARGIN;
 
+    int lx = __visc__getNodeInstanceID_x(thisNode);
+    int gx = __visc__getNodeInstanceID_x(parentNode);
+    int gy = __visc__getNodeInstanceID_y(parentNode);
 
     //Using some predefined macros here.  Consider this the declaration
     //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
 
     SWEEP_VAR
-    SWEEP_X = get_local_id(0);
-    SWEEP_Y = get_group_id(0);
-    SWEEP_Z = get_group_id(1);
+    SWEEP_X = lx; // get_local_id(0)
+    SWEEP_Y = gx; // get_group_id(0)
+    SWEEP_Z = gy; // get_group_id(1)
 
     float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
     float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ;
@@ -262,13 +276,58 @@ void performStreamCollide_kernel( float* srcG, float* dstG )
     DST_WB( dstG ) = tempWB;
 }
 
+void lbmLvl1(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1)
+{
+    __visc__hint(visc::DEVICE);
+    __visc__attributes(2, srcG, dstG, 1, dstG);
+    void* lbm_node = __visc__createNodeND(1, performStreamCollide_kernel, dim_X1);
+    __visc__bindIn(lbm_node, 0, 0, 0);
+    __visc__bindIn(lbm_node, 1, 1, 0);
+    __visc__bindIn(lbm_node, 2, 2, 0);
+    __visc__bindIn(lbm_node, 3, 3, 0);
+}
+
+void lbmLvl2(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(2, srcG, dstG, 1, dstG);
+    void* lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2);
+    __visc__bindIn(lbm_node, 0, 0, 0);
+    __visc__bindIn(lbm_node, 1, 1, 0);
+    __visc__bindIn(lbm_node, 2, 2, 0);
+    __visc__bindIn(lbm_node, 3, 3, 0);
+    __visc__bindIn(lbm_node, 4, 4, 0);
+}
+
+void lbmLvl3(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(2, srcG, dstG, 1, dstG);
+    void* lbm_node = __visc__createNodeND(0, lbmLvl2);
+    __visc__bindIn(lbm_node, 0, 0, 0);
+    __visc__bindIn(lbm_node, 1, 1, 0);
+    __visc__bindIn(lbm_node, 2, 2, 0);
+    __visc__bindIn(lbm_node, 3, 3, 0);
+    __visc__bindIn(lbm_node, 4, 4, 0);
+    __visc__bindIn(lbm_node, 5, 5, 0);
+    __visc__bindIn(lbm_node, 6, 6, 0);
+}
 
 __attribute__((noinline)) void MAIN_performStreamCollide( LBM_Grid src, LBM_Grid dst ) {
 
     long dimBlock[3] = {SIZE_X,1,1};
     long dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
     size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-    void* lbmDFG = __visc__node(performStreamCollide_kernel, 2, 3, dimBlock[0], dimBlock[1], dimBlock[2], dimGrid[0]/dimBlock[0], dimGrid[1]/dimBlock[1], dimGrid[2]/dimBlock[2], 4, src-MARGIN, size, dst-MARGIN, size, 0);
+    
+    void* root_in = malloc(sizeof(RootIn));
+    RootIn root_in_local = {
+        src - MARGIN, size,
+        dst - MARGIN, size,
+        SIZE_X, SIZE_Y, SIZE_Z
+    };
+    *(RootIn*)root_in = root_in_local;
+    void* lbmDFG = __visc__launch(0, lbmLvl3, root_in);
+    
     __visc__wait(lbmDFG);
 
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/Makefile b/hpvm/test/parboil/benchmarks/spmv/Makefile
index e7ef8ca39c065ebe08d82675cd68ecd35c2af43b..71e2246343264f0fc04ad5041d588730a6924751 100644
--- a/hpvm/test/parboil/benchmarks/spmv/Makefile
+++ b/hpvm/test/parboil/benchmarks/spmv/Makefile
@@ -1,4 +1,4 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
+PARBOIL_ROOT = $(LLVM_SRC_ROOT)/../test/parboil
 APP = spmv
 
 # Default compile visc
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
index bd457e6e1348631ae4ced3b25955ee68a1014bb4..eb4d4d7dbed65c13340578f8023afab1473fe962 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
@@ -31,17 +31,38 @@ static int generate_vector(float *x_vector, int dim)
     return 0;
 }
 
-void spmv_jds(float *dst_vector, float *d_data,
-              int *d_index, int *d_perm,
-              float *x_vec, int dim,
-              int *jds_ptr_int,
-              int *sh_zcnt_int)
+typedef struct __attribute__((__packed__)) {
+    float* dst_vector; size_t bytes_dst_vector;
+    float* d_data; size_t bytes_d_data;
+    int* d_index; size_t bytes_d_index;
+    int* d_perm; size_t bytes_d_perm;
+    float* x_vec; size_t bytes_x_vec;
+    int dim;
+    int* jds_ptr_int; size_t bytes_jds_ptr_int;
+    int* sh_zcnt_int; size_t bytes_sh_zcnt_int;
+    size_t dim_X1, dim_X2;
+} RootIn;
+
+void spmv_jds(float* dst_vector, size_t bytes_dst_vector,
+              float* d_data, size_t bytes_d_data,
+              int* d_index, size_t bytes_d_index,
+              int* d_perm, size_t bytes_d_perm,
+              float* x_vec, size_t bytes_x_vec,
+              int dim,
+              int* jds_ptr_int, size_t bytes_jds_ptr_int,
+              int* sh_zcnt_int, size_t bytes_sh_zcnt_int)
 {
     __visc__hint(visc::DEVICE);
     __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
                        1, dst_vector);
 
-    int ix = get_global_id(0);
+    void* thisNode = __visc__getNode();
+    void* parentNode = __visc__getParentNode(thisNode);
+    int lx = __visc__getNodeInstanceID_x(thisNode);
+    int gx = __visc__getNodeInstanceID_x(parentNode);
+    int gridx = __visc__getNumNodeInstances_x(thisNode);
+
+    int ix = gx * gridx + lx;
     int warp_id=ix>>WARP_BITS;
 
     if(ix<dim)
@@ -99,6 +120,102 @@ void spmv_jds(float *dst_vector, float *d_data,
     }
 }
 
+void spmvLvl1(float* dst_vector, size_t bytes_dst_vector,
+              float* d_data, size_t bytes_d_data,
+              int* d_index, size_t bytes_d_index,
+              int* d_perm, size_t bytes_d_perm,
+              float* x_vec, size_t bytes_x_vec,
+              int dim,
+              int* jds_ptr_int, size_t bytes_jds_ptr_int,
+              int* sh_zcnt_int, size_t bytes_sh_zcnt_int,
+              size_t dim_X1)
+{
+    __visc__hint(visc::DEVICE);
+    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
+                       1, dst_vector);
+    void* spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1);
+    __visc__bindIn(spmv_node, 0, 0, 0);
+    __visc__bindIn(spmv_node, 1, 1, 0);
+    __visc__bindIn(spmv_node, 2, 2, 0);
+    __visc__bindIn(spmv_node, 3, 3, 0);
+    __visc__bindIn(spmv_node, 4, 4, 0);
+    __visc__bindIn(spmv_node, 5, 5, 0);
+    __visc__bindIn(spmv_node, 6, 6, 0);
+    __visc__bindIn(spmv_node, 7, 7, 0);
+    __visc__bindIn(spmv_node, 8, 8, 0);
+    __visc__bindIn(spmv_node, 9, 9, 0);
+    __visc__bindIn(spmv_node, 10, 10, 0);
+    __visc__bindIn(spmv_node, 11, 11, 0);
+    __visc__bindIn(spmv_node, 12, 12, 0);
+    __visc__bindIn(spmv_node, 13, 13, 0);
+    __visc__bindIn(spmv_node, 14, 14, 0);
+}
+
+void spmvLvl2(float* dst_vector, size_t bytes_dst_vector,
+              float* d_data, size_t bytes_d_data,
+              int* d_index, size_t bytes_d_index,
+              int* d_perm, size_t bytes_d_perm,
+              float* x_vec, size_t bytes_x_vec,
+              int dim,
+              int* jds_ptr_int, size_t bytes_jds_ptr_int,
+              int* sh_zcnt_int, size_t bytes_sh_zcnt_int,
+              size_t dim_X1, size_t dim_X2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
+                       1, dst_vector);
+    void* spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2);
+    __visc__bindIn(spmv_node, 0, 0, 0);
+    __visc__bindIn(spmv_node, 1, 1, 0);
+    __visc__bindIn(spmv_node, 2, 2, 0);
+    __visc__bindIn(spmv_node, 3, 3, 0);
+    __visc__bindIn(spmv_node, 4, 4, 0);
+    __visc__bindIn(spmv_node, 5, 5, 0);
+    __visc__bindIn(spmv_node, 6, 6, 0);
+    __visc__bindIn(spmv_node, 7, 7, 0);
+    __visc__bindIn(spmv_node, 8, 8, 0);
+    __visc__bindIn(spmv_node, 9, 9, 0);
+    __visc__bindIn(spmv_node, 10, 10, 0);
+    __visc__bindIn(spmv_node, 11, 11, 0);
+    __visc__bindIn(spmv_node, 12, 12, 0);
+    __visc__bindIn(spmv_node, 13, 13, 0);
+    __visc__bindIn(spmv_node, 14, 14, 0);
+    __visc__bindIn(spmv_node, 15, 15, 0);
+}
+
+void spmvLvl3(float* dst_vector, size_t bytes_dst_vector,
+              float* d_data, size_t bytes_d_data,
+              int* d_index, size_t bytes_d_index,
+              int* d_perm, size_t bytes_d_perm,
+              float* x_vec, size_t bytes_x_vec,
+              int dim,
+              int* jds_ptr_int, size_t bytes_jds_ptr_int,
+              int* sh_zcnt_int, size_t bytes_sh_zcnt_int,
+              size_t dim_X1, size_t dim_X2)
+{
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
+                       1, dst_vector);
+    void* spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2);
+    __visc__bindIn(spmv_node, 0, 0, 0);
+    __visc__bindIn(spmv_node, 1, 1, 0);
+    __visc__bindIn(spmv_node, 2, 2, 0);
+    __visc__bindIn(spmv_node, 3, 3, 0);
+    __visc__bindIn(spmv_node, 4, 4, 0);
+    __visc__bindIn(spmv_node, 5, 5, 0);
+    __visc__bindIn(spmv_node, 6, 6, 0);
+    __visc__bindIn(spmv_node, 7, 7, 0);
+    __visc__bindIn(spmv_node, 8, 8, 0);
+    __visc__bindIn(spmv_node, 9, 9, 0);
+    __visc__bindIn(spmv_node, 10, 10, 0);
+    __visc__bindIn(spmv_node, 11, 11, 0);
+    __visc__bindIn(spmv_node, 12, 12, 0);
+    __visc__bindIn(spmv_node, 13, 13, 0);
+    __visc__bindIn(spmv_node, 14, 14, 0);
+    __visc__bindIn(spmv_node, 15, 15, 0);
+    __visc__bindIn(spmv_node, 16, 16, 0);
+}
+
 int main(int argc, char** argv) {
     struct pb_TimerSet timers;
     struct pb_Parameters *parameters;
@@ -185,24 +302,22 @@ int main(int argc, char** argv) {
     for(i=0; i<50; i++)
     {
         pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        void* spmvDFG = __visc__node(spmv_jds, 2, 1, block, (grid/block),
-                                        15,
-                                        h_Ax_vector,
-                                        dim*sizeof(float),
-                                        h_data,
-                                        len*sizeof(float),
-                                        h_indices,
-                                        len*sizeof(int),
-                                        h_perm,
-                                        dim*sizeof(int),
-                                        h_x_vector,
-                                        dim*sizeof(float),
-                                        dim,
-                                        h_ptr,
-                                        depth*sizeof(int),
-                                        h_nzcnt,
-                                        nzcnt_len*sizeof(int),
-                                        0);
+
+        void* root_in = malloc(sizeof(RootIn));
+        RootIn root_in_local = {
+            h_Ax_vector, dim * sizeof(float),
+            h_data, len * sizeof(float),
+            h_indices, len * sizeof(int),
+            h_perm, dim * sizeof(int),
+            h_x_vector, dim * sizeof(float),
+            dim,
+            h_ptr, depth * sizeof(int),
+            h_nzcnt, nzcnt_len * sizeof(int),
+            block, (grid/block)
+        };
+        *(RootIn*)root_in = root_in_local;
+        void* spmvDFG = __visc__launch(0, spmvLvl3, root_in);
+       
         __visc__wait(spmvDFG);
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -224,7 +339,7 @@ int main(int argc, char** argv) {
     llvm_visc_request_mem(h_Ax_vector, dim*sizeof(float));
 
 
-   pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+    pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
 
     llvm_visc_untrack_mem(h_Ax_vector);
     llvm_visc_untrack_mem(h_data);