diff --git a/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile b/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile
index 671c4f7c0a154dc50ee18e0704eb993f190b793a..d51743070ca51aaed137c730b99045e94c378c36 100644
--- a/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile
+++ b/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile
@@ -87,7 +87,6 @@ APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3
 APP_CXXFLAGS=-ffast-math -O0 -I/opt/opencv/include
 APP_LDFLAGS=$(LFLAGS)
-OPT_FLAGS = -tti -targetlibinfo -tbaa -scoped-noalias -assumption-cache-tracker -profile-summary-info -forceattrs -inferattrs -ipsccp -globalopt -domtree -mem2reg -deadargelim -domtree -basicaa -aa -simplifycfg -pgo-icall-prom -basiccg -globals-aa -prune-eh -always-inline -functionattrs -domtree -sroa -early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -domtree -basicaa -aa -libcalls-shrinkwrap -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -licm -loop-unswitch -simplifycfg -domtree -basicaa -aa -loops -loop-simplify -lcssa-verification -lcssa -scalar-evolution -indvars -loop-idiom -loop-deletion -memdep -memcpyopt -sccp -domtree -demanded-bits -bdce -basicaa -aa -lazy-value-info -jump-threading -correlated-propagation -domtree -basicaa -aa -memdep -dse -loops -loop-simplify -lcssa-verification -lcssa -aa -scalar-evolution -licm -postdomtree -adce -simplifycfg -domtree -basicaa -aa -barrier -basiccg -rpo-functionattrs -globals-aa -float2int -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -loop-accesses -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-distribute -loop-simplify -lcssa-verification -lcssa -branch-prob -block-freq -scalar-evolution -basicaa -aa -loop-accesses -demanded-bits -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-vectorize -loop-simplify -scalar-evolution -aa -loop-accesses -loop-load-elim -basicaa -aa -simplifycfg -domtree -basicaa -aa -loops -scalar-evolution -alignment-from-assumptions -strip-dead-prototypes -domtree -loops -branch-prob -block-freq -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -branch-prob -block-freq -loop-sink -instsimplify 
 
 CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
 OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
@@ -188,7 +187,7 @@ else ifeq ($(TARGET),fpga)
 else
   KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll
   #KERNEL = $(TEST_OBJS).kernels.ll
-  PTX_ASSEMBLY = $(TEST_OBJS).nvptx.s
+  KERNEL_OCL = $(TEST_OBJS).kernels.cl
 endif
 
 HOST_LINKED = $(BUILD_DIR)/$(APP).linked.ll
@@ -201,14 +200,11 @@ FAILSAFE=
 endif
 
 # Targets
-default: $(FAILSAFE) $(BUILD_DIR) $(EXE)
-#default: $(FAILSAFE) $(BUILD_DIR) $(PTX_ASSEMBLY) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE)
+default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(EXE)
+#default: $(FAILSAFE) $(BUILD_DIR) $(KERNEL_OCL) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE)
 
-$(PTX_ASSEMBLY) : $(KERNEL_LINKED)
-	$(CC) $(KERNEL_GEN_FLAGS) -S $< -o $@
-
-$(KERNEL_LINKED) : $(KERNEL)
-	$(LLVM_LINK) $(LIBCLC_NVPTX_LIB) -S $< -o $@
+$(KERNEL_OCL) : $(KERNEL)
+	$(OCLBE) --debug $< -o $@
 
 $(SPIR_ASSEMBLY) : $(KERNEL)
 	python $(PYTHON_LLVM_40_34) $< $(BUILD_DIR)/kernel_34.ll
diff --git a/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile.config b/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile.config
index f724aaee7e5a11972ceabcc2c3be3df8937f7048..9ece12fe0eade697640611f2810afd42d17727cf 100644
--- a/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile.config
+++ b/hpvm/test/parboil/benchmarks/hpvm-cava/Makefile.config
@@ -5,13 +5,14 @@ CUDA_LIB_PATH=/usr/local/cuda/lib64
 OPENCL_PATH=/opt/intel/opencl-sdk/
 OPENCL_LIB_PATH=$(OPENCL_PATH)/lib64
 
-#LLVM_SRC_ROOT=/home/kotsifa2/HPVM/Gitlab/hpvm/llvm
+LLVM_SRC_ROOT ?= /home/aejjeh/work_dir/hpvm-reorg-9/hpvm/llvm/
 # NOTE: You may need to configure this based on your root path.
 VISC_SRC_ROOT=$(LLVM_SRC_ROOT)
 
 VISC_BUILD_DIR =$(VISC_SRC_ROOT)/../build
 CC = $(VISC_BUILD_DIR)/bin/clang
 PLATFORM_CFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
+OCLBE = $(VISC_BUILD_DIR)/bin/llvm-cbe
 
 CXX = $(VISC_BUILD_DIR)/bin/clang++
 PLATFORM_CXXFLAGS = -I$(LLVM_SRC_ROOT)/include -I$(VISC_BUILD_DIR)/include
diff --git a/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c b/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c
index ea42ad0bf87fd8e0b337ea1e7d0ad803025e849e..e43bbb4f25c4c97c9907ebae37251c854860c3b5 100644
--- a/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c
+++ b/hpvm/test/parboil/benchmarks/hpvm-cava/src/main.c
@@ -34,7 +34,7 @@ typedef struct __attribute__((__packed__)) {
     float*coefs; size_t bytes_coefs;
     float *l2_dist; size_t bytes_l2_dist;
     float *tone_map; size_t bytes_tone_map;
-    int row_size; int col_size;
+    size_t row_size; size_t col_size;
 } 
 RootIn;
 
@@ -112,7 +112,7 @@ static struct argp parser = { options, parse_opt, args_doc, prog_doc };
 // Helper function for printing intermediate results
 void descale_cpu(float *input, size_t bytes_input, 
                  uint8_t *output, size_t bytes_result,
-                 int row_size, int col_size) {
+                 size_t row_size, size_t col_size) {
   
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
@@ -142,18 +142,19 @@ static void sort(float arr[], int n) {
 // Leaf HPVM node function for scale
 void scale_fxp(uint8_t *input, size_t bytes_input, 
                float *output, size_t bytes_output,
-               int row_size, int col_size) {
+               size_t row_size, size_t col_size) {
 
   //Specifies compilation target for current node
-  __visc__hint(DEVICE);
+  __visc__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
   __visc__attributes(2, input, output, 1, output);
-  
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    for (int row = 0; row < row_size; row++)
+    // No row loop: each dynamic instance handles the single row given by its x ID.
       for (int col = 0; col < col_size; col++){
         int index = (chan*row_size + row) * col_size + col;
         output[index] = input[index] * 1.0 / 255;
@@ -164,8 +165,8 @@ void scale_fxp(uint8_t *input, size_t bytes_input,
 // Leaf HPVM node function for descale
 void descale_fxp(float *input, size_t bytes_input, 
                  uint8_t *output, size_t bytes_result,
-                 int row_size, int col_size) {
-  __visc__hint(DEVICE);
+                 size_t row_size, size_t col_size) {
+  __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, output, 1, output);
   
   for (int chan = 0; chan < CHAN_SIZE; chan++)
@@ -180,11 +181,13 @@ void descale_fxp(float *input, size_t bytes_input,
 // Leaf HPVM node function for demosaicing
 void demosaic_fxp(float *input, size_t bytes_input, 
                   float *result, size_t bytes_result,
-                  int row_size, int col_size) {
+                  size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, result, 1, result);
   
-  for (int row = 1; row < row_size - 1; row++)
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
+  // NOTE(review): the removed loop ran row = 1 .. row_size - 2; no such bound on the instance ID is visible here -- confirm border rows are handled.
     for (int col = 1; col < col_size - 1; col++) {
         int index_0 = (0 * row_size + row) * col_size + col;
         int index_1 = (1 * row_size + row) * col_size + col;
@@ -261,12 +264,14 @@ void demosaic_fxp(float *input, size_t bytes_input,
 // Leaf HPVM node function for denoise
 void denoise_fxp(float *input, size_t bytes_input, 
                  float *result, size_t bytes_result,
-                 int row_size, int col_size) {
-  __visc__hint(DEVICE);
+                 size_t row_size, size_t col_size) {
+  __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
   
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    for (int row = 0; row < row_size; row++)
+    // No row loop: each dynamic instance handles the single row given by its x ID.
       for (int col = 0; col < col_size; col++)
         if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
           float filter[9];
@@ -287,12 +292,14 @@ void denoise_fxp(float *input, size_t bytes_input,
 void transform_fxp(float *input, size_t bytes_input, 
                    float *result, size_t bytes_result,
                    float *TsTw_tran, size_t bytes_TsTw,
-                   int row_size, int col_size) {
+                   size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
   
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    for (int row = 0; row < row_size; row++)
+    // No row loop: each dynamic instance handles the single row given by its x ID.
       for (int col = 0; col < col_size; col++) {
         int index = (chan * row_size + row) * col_size + col;
         int index_0 = (0 * row_size + row) * col_size + col;
@@ -317,15 +324,19 @@ void gamut_map_fxp(float *input, size_t bytes_input,
                    float *weights, size_t bytes_weights,
                    float *coefs, size_t bytes_coefs,
                    float *l2_dist, size_t bytes_l2_dist,
-                   int row_size, int col_size) {
+                   size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist);
 
  // First, get the L2 norm from every pixel to the control points,
  // Then, sum it and weight it. Finally, add the bias.
-
-  for (int row = 0; row < row_size; row++)
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
+  // No row loop: each dynamic instance handles the single row given by its x ID.
     for (int col = 0; col < col_size; col++) {
+      float chan_val_0 = 0.0;
+      float chan_val_1 = 0.0;
+      float chan_val_2 = 0.0;
       for (int cp = 0; cp < 3702; cp++) {
         int index_0 = (0 * row_size + row) * col_size + col;
         int index_1 = (1 * row_size + row) * col_size + col;
@@ -338,19 +349,25 @@ void gamut_map_fxp(float *input, size_t bytes_input,
         float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
         float val = val1 * val2 + val3 * val4 + val5 * val6;
         float sqrt_val = sqrt(val);
-        l2_dist[cp] = sqrt_val;
-      }
-      for (int chan = 0; chan < CHAN_SIZE; chan++) {
-        float chan_val = 0.0;
-        for (int cp = 0; cp < 3702; cp++) {
-          chan_val += l2_dist[cp] * weights[cp * CHAN_SIZE + chan];
-        }
-        chan_val += coefs[0 * CHAN_SIZE + chan] + 
-                    coefs[1 * CHAN_SIZE + chan] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + chan] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + chan] * input[(2 * row_size + row) * col_size + col];
-        result[(chan * row_size + row) * col_size + col] = max(chan_val, 0);
+        chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
+        chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
+        chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
       }
+        chan_val_0 += coefs[0 * CHAN_SIZE + 0] + 
+                    coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] +
+                    coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] +
+                    coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
+        chan_val_1 += coefs[0 * CHAN_SIZE + 1] + 
+                    coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] +
+                    coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] +
+                    coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
+        chan_val_2 += coefs[0 * CHAN_SIZE + 2] + 
+                    coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] +
+                    coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] +
+                    coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
+        result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
+        result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
+        result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
     }
   __visc__return(1, bytes_result);
 }
@@ -359,12 +376,14 @@ void gamut_map_fxp(float *input, size_t bytes_input,
 void tone_map_fxp(float *input, size_t bytes_input, 
                   float *result, size_t bytes_result,
                   float *tone_map, size_t bytes_tone_map,
-                  int row_size, int col_size) {
+                  size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, tone_map, 1, result);
   
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    for (int row = 0; row < row_size; row++)
+    // No row loop: each dynamic instance handles the single row given by its x ID.
       for (int col = 0; col < col_size; col++) {
         int index = (chan * row_size + row) * col_size + col;
         uint8_t x = input[index] * 255;
@@ -383,13 +402,13 @@ void tone_map_fxp(float *input, size_t bytes_input,
 
 void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, 
                        float *result, size_t bytes_result,
-                       int row_size, int col_size) {
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
 
   // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic
   // instance (last argument) associated with node function scale_fxp
-  void *ScaleNode = __visc__createNodeND(1, scale_fxp, (size_t)1);
+  void *ScaleNode = __visc__createNodeND(1, scale_fxp, row_size);
 
   // Binds inputs of current node with specified node
   // - destination node
@@ -410,10 +429,10 @@ void scale_fxp_wrapper(uint8_t *input, size_t bytes_input,
 
 void descale_fxp_wrapper(float *input, size_t bytes_input, 
                        uint8_t *result, size_t bytes_result,
-                       int row_size, int col_size) {
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
-  void *DescaleNode = __visc__createNodeND(1, descale_fxp, (size_t)1);
+  void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size);
   __visc__bindIn(DescaleNode, 0, 0, 0); // bind input
   __visc__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(DescaleNode, 2, 2, 0); // bind result
@@ -426,10 +445,10 @@ void descale_fxp_wrapper(float *input, size_t bytes_input,
 
 void demosaic_fxp_wrapper(float *input, size_t bytes_input, 
                        float *result, size_t bytes_result,
-                       int row_size, int col_size) {
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
-  void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, (size_t)1);
+  void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size);
   __visc__bindIn(DemosaicNode, 0, 0, 0); // bind input
   __visc__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(DemosaicNode, 2, 2, 0); // bind result
@@ -442,10 +461,10 @@ void demosaic_fxp_wrapper(float *input, size_t bytes_input,
 
 void denoise_fxp_wrapper(float *input, size_t bytes_input, 
                        float *result, size_t bytes_result,
-                       int row_size, int col_size) {
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
-  void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, (size_t)1);
+  void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size);
   __visc__bindIn(DenoiseNode, 0, 0, 0); // bind input
   __visc__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(DenoiseNode, 2, 2, 0); // bind result
@@ -459,10 +478,10 @@ void denoise_fxp_wrapper(float *input, size_t bytes_input,
 void transform_fxp_wrapper(float *input, size_t bytes_input, 
                        float *result, size_t bytes_result,
                        float *TsTw_tran, size_t bytes_TsTw,
-                       int row_size, int col_size) {
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  void *TransformNode = __visc__createNodeND(1, transform_fxp, (size_t)1);
+  void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size);
   __visc__bindIn(TransformNode, 0, 0, 0); // bind input
   __visc__bindIn(TransformNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(TransformNode, 2, 2, 0); // bind result
@@ -481,10 +500,10 @@ void gamut_fxp_wrapper(float *input, size_t bytes_input,
                        float *weights, size_t bytes_weights,
                        float *coefs, size_t bytes_coefs,
                        float *l2_dist, size_t bytes_l2_dist,
-                       int row_size, int col_size) {
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
-  void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, (size_t)1);
+  void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size);
   __visc__bindIn(GamutNode, 0, 0, 0); // bind input
   __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(GamutNode, 2, 2, 0); // bind result
@@ -505,11 +524,11 @@ void gamut_fxp_wrapper(float *input, size_t bytes_input,
 void tone_map_fxp_wrapper(float *input, size_t bytes_input, 
                        float *result, size_t bytes_result,
                        float *tone_map, size_t bytes_tone_map,
-                       int row_size, int col_size) {
+                       size_t row_size, size_t col_size) {
 
   __visc__hint(CPU_TARGET);
   __visc__attributes(3, input, result, tone_map, 1, result);
-  void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, (size_t)1);
+  void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, row_size);
   __visc__bindIn(ToneMapNode, 0, 0, 0); // bind input
   __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result
@@ -538,7 +557,7 @@ void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input,
                  /*22*/ float*coefs,           /*23*/ size_t bytes_coefs,
                  /*24*/ float *l2_dist,        /*25*/ size_t bytes_l2_dist,
                  /*26*/ float *tone_map,       /*27*/ size_t bytes_tone_map,
-                 /*28*/ int row_size,          /*29*/ int col_size) {
+                 /*28*/ size_t row_size,          /*29*/ size_t col_size) {
 
   //Specifies compilation target for current node
     __visc__hint(CPU_TARGET);
@@ -656,7 +675,7 @@ int main(int argc, char* argv[]) {
     // Read a raw image.
     // NOTE: We deliberately perform this file I/O outside of the kernel.
     printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
-    int row_size, col_size;
+    size_t row_size, col_size;
     uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
 
     printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
diff --git a/hpvm/test/parboil/benchmarks/hpvm-cava/src/pipe_stages.h b/hpvm/test/parboil/benchmarks/hpvm-cava/src/pipe_stages.h
index eae4347b991fe948173fc85334c65f084d40b745..8d98cb65cc8af7353cc1faf08988f3b1a6758046 100644
--- a/hpvm/test/parboil/benchmarks/hpvm-cava/src/pipe_stages.h
+++ b/hpvm/test/parboil/benchmarks/hpvm-cava/src/pipe_stages.h
@@ -25,24 +25,24 @@ extern int num_ctrl_pts;
 
 void scale_fxp(uint8_t *input, size_t bytes_input, 
                float *output, size_t bytes_output,
-               int row_size, int col_size);
+               size_t row_size, size_t col_size);
 
 void descale_fxp(float *input, size_t bytes_input, 
                  uint8_t *output, size_t bytes_result,
-                 int row_size, int col_size);
+                 size_t row_size, size_t col_size);
 
 void demosaic_fxp(float *input, size_t bytes_input, 
                   float *result, size_t bytes_result,
-                  int row_size, int col_size);
+                  size_t row_size, size_t col_size);
 
 void denoise_fxp(float *input, size_t bytes_input, 
                  float *result, size_t bytes_result,
-                 int row_size, int col_size);
+                 size_t row_size, size_t col_size);
 
 void transform_fxp(float *input, size_t bytes_input, 
                    float *result, size_t bytes_result,
                    float *TsTw_tran, size_t bytes_TsTw,
-                   int row_size, int col_size);
+                   size_t row_size, size_t col_size);
 
 void gamut_map_fxp(float *input, size_t bytes_input, 
                    float *result, size_t bytes_result,
@@ -50,14 +50,14 @@ void gamut_map_fxp(float *input, size_t bytes_input,
                    float *weights, size_t bytes_weights,
                    float *coefs, size_t bytes_coefs,
                    float *l2_dist, size_t bytes_l2_dist,
-                   int row_size, int col_size);
+                   size_t row_size, size_t col_size);
 
 void tone_map_fxp(float *input, size_t bytes_input, 
                   float *result, size_t bytes_result,
                   float *tone_map, size_t bytes_tone_map,
-                  int row_size, int col_size);
+                  size_t row_size, size_t col_size);
 
-void tone_map_approx_fxp(float *input, int row_size, int col_size,
+void tone_map_approx_fxp(float *input, size_t row_size, size_t col_size,
                          float *result);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/Makefile b/hpvm/test/parboil/benchmarks/spmv/Makefile
index 71e2246343264f0fc04ad5041d588730a6924751..23e1d4990031404b8e365d9430499b5fddb2af01 100644
--- a/hpvm/test/parboil/benchmarks/spmv/Makefile
+++ b/hpvm/test/parboil/benchmarks/spmv/Makefile
@@ -1,4 +1,4 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/../test/parboil
+PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = spmv
 
 # Default compile visc
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile b/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile
index efed901e6e008c78ed51d682bf4aaf60603ae193..a289d68f342ba488f8ce4d90faf26816d4d00829 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile
+++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/Makefile
@@ -5,7 +5,7 @@ TOOLS_SRC=common_src/convert-dataset
 SRCDIR_OBJS=gpu_info.ll file.ll
 VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm
-APP_CFLAGS=-ffast-math -O3 -I$(TOOLS_SRC)
-APP_CXXFLAGS=-ffast-math -O3 -I$(TOOLS_SRC)
+APP_CFLAGS=-ffast-math -O1 -I$(TOOLS_SRC)
+APP_CXXFLAGS=-ffast-math -O1 -I$(TOOLS_SRC)
 
 include $(TOOLS_SRC)/commontools.mk
diff --git a/hpvm/test/parboil/benchmarks/stencil/Makefile b/hpvm/test/parboil/benchmarks/stencil/Makefile
index eeac0b5f7eabda2dd4c5892f7e10cf7c3b0e413c..a44dd0dbf0d678c7e8417345854254a1c2676653 100644
--- a/hpvm/test/parboil/benchmarks/stencil/Makefile
+++ b/hpvm/test/parboil/benchmarks/stencil/Makefile
@@ -1,4 +1,4 @@
-PARBOIL_ROOT = $(LLVM_SRC_ROOT)/test/VISC/parboil
+PARBOIL_ROOT = $(LLVM_SRC_ROOT)/tools/hpvm/test/parboil
 APP = stencil
 
 # Default compile visc