From 1fa97ee84c62e70116fdaa57b3b1b1117c2e653f Mon Sep 17 00:00:00 2001
From: Adel Ejjeh <aejjeh@tyler.cs.illinois.edu>
Date: Wed, 22 Jan 2020 00:59:07 -0600
Subject: [PATCH] Restore older (working) version of hpvm-cava sources

---
 hpvm/test/hpvm-cava/src/cam_pipe.c         |   45 +-
 hpvm/test/hpvm-cava/src/cam_pipe_utility.c |   13 +-
 hpvm/test/hpvm-cava/src/cam_pipe_utility.h |    2 +-
 hpvm/test/hpvm-cava/src/defs.h             |  214 ++--
 hpvm/test/hpvm-cava/src/dma_interface.c    |   32 +-
 hpvm/test/hpvm-cava/src/dma_interface.h    |   17 +-
 hpvm/test/hpvm-cava/src/load_cam_model.c   |   39 +-
 hpvm/test/hpvm-cava/src/main.c             | 1268 ++++++++++----------
 hpvm/test/hpvm-cava/src/pipe_stages.c      |  300 ++---
 hpvm/test/hpvm-cava/src/pipe_stages.h      |   73 +-
 hpvm/test/hpvm-cava/src/utility.c          |    6 +-
 11 files changed, 988 insertions(+), 1021 deletions(-)

diff --git a/hpvm/test/hpvm-cava/src/cam_pipe.c b/hpvm/test/hpvm-cava/src/cam_pipe.c
index cdeaf39332..7874ff9d52 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe.c
+++ b/hpvm/test/hpvm-cava/src/cam_pipe.c
@@ -1,11 +1,11 @@
-#include "cam_pipe_utility.h"
-#include "dma_interface.h"
-#include "load_cam_model.h"
-#include "pipe_stages.h"
-#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
+#include "pipe_stages.h"
+#include "load_cam_model.h"
+#include "cam_pipe_utility.h"
+#include "dma_interface.h"
 #ifdef DMA_MODE
 #include "gem5_harness.h"
 #endif
@@ -13,7 +13,7 @@
 // FIXME: Include gem5/dma_interface.cc/h separately
 #ifndef DMA_INTERFACE_V3
 #define DMA_INTERFACE_V3
-#endif // DMA_INTERFACE_V3
+#endif//DMA_INTERFACE_V3
 
 ///////////////////////////////////////////////////////////////
 // Camera Model Parameters
@@ -71,8 +71,7 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   uint8_t *acc_input, *acc_result;
   float *acc_input_scaled, *acc_result_scaled;
   float *host_TsTw, *host_ctrl_pts, *host_weights, *host_coefs, *host_tone_map;
-  float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map,
-      *acc_l2_dist;
+  float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map, *acc_l2_dist;
 
   strcat(cam_model_path, "cam_models/NikonD7000/");
 
@@ -85,25 +84,20 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   host_coefs = get_coefs(cam_model_path, num_ctrl_pts);
   host_tone_map = get_tone_map(cam_model_path);
 
-  acc_input = (uint8_t *)malloc_aligned(sizeof(uint8_t) * row_size * col_size *
-                                        CHAN_SIZE);
-  acc_result = (uint8_t *)malloc_aligned(sizeof(uint8_t) * row_size * col_size *
-                                         CHAN_SIZE);
-  acc_input_scaled =
-      (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
-  acc_result_scaled =
-      (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
-  acc_TsTw = (float *)malloc_aligned(sizeof(float) * 9);
-  acc_ctrl_pts =
-      (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
-  acc_weights =
-      (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
-  acc_coefs = (float *)malloc_aligned(sizeof(float) * 12);
-  acc_tone_map = (float *)malloc_aligned(sizeof(float) * 256 * CHAN_SIZE);
-  acc_l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);
+  acc_input = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
+  acc_result = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
+  acc_input_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_result_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_TsTw = (float*) malloc_aligned(sizeof(float) * 9);
+  acc_ctrl_pts = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_weights = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_coefs = (float*) malloc_aligned(sizeof(float) * 12);
+  acc_tone_map = (float*) malloc_aligned(sizeof(float) * 256 * CHAN_SIZE);
+  acc_l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);
 
   // Load camera model parameters for the ISP
-  MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw, sizeof(float) * 9);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw,
+                     sizeof(float) * 9);
   MAP_ARRAY_TO_ACCEL(ISP, "host_ctrl_pts", host_ctrl_pts,
                      sizeof(float) * num_ctrl_pts * CHAN_SIZE);
   MAP_ARRAY_TO_ACCEL(ISP, "host_weights", host_weights,
@@ -142,3 +136,4 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   free(acc_tone_map);
   free(acc_l2_dist);
 }
+
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
index 864f02d5b2..f806e9ee1a 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
+++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
@@ -1,6 +1,6 @@
-#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <assert.h>
 
 #include "cam_pipe_utility.h"
 //#include "pipe_stages.h"
@@ -26,11 +26,10 @@ uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size) {
   return image;
 }
 
-void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
-                           int col_size) {
+void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int col_size) {
   FILE *fp = fopen(file_path, "w");
 
-  int shape[3] = {row_size, col_size, CHAN_SIZE};
+  int shape[3] = { row_size, col_size, CHAN_SIZE };
   fwrite(shape, sizeof(int), 3, fp);
 
   int size = row_size * col_size * CHAN_SIZE;
@@ -41,8 +40,8 @@ void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
 float *transpose_mat(float *inmat, int width, int height) {
   // Define vectors
   float *outmat;
-  int err = posix_memalign((void **)&outmat, CACHELINE_SIZE,
-                           sizeof(float) * height * width);
+  int err =
+      posix_memalign((void **)&outmat, CACHELINE_SIZE, sizeof(float) * height * width);
   assert(err == 0 && "Failed to allocate memory!");
 
   // Transpose the matrix
@@ -72,7 +71,7 @@ void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size,
                         uint8_t **result) {
   if (*result == NULL) {
     *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE *
-                                        sizeof(uint8_t));
+                                      sizeof(uint8_t));
   }
   ARRAY_3D(uint8_t, _input, input, row_size, col_size);
   ARRAY_3D(uint8_t, _result, *result, col_size, CHAN_SIZE);
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
index b61b7cc9b5..b4fb6cde0c 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
+++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
@@ -1,8 +1,8 @@
 #ifndef _CAM_PIPE_UTILITY_H_
 #define _CAM_PIPE_UTILITY_H_
 
-#include "pipe_stages.h"
 #include "utility.h"
+#include "pipe_stages.h"
 
 uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size);
 void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
diff --git a/hpvm/test/hpvm-cava/src/defs.h b/hpvm/test/hpvm-cava/src/defs.h
index 0fa95ef3d2..ccc8acc857 100644
--- a/hpvm/test/hpvm-cava/src/defs.h
+++ b/hpvm/test/hpvm-cava/src/defs.h
@@ -10,46 +10,46 @@ typedef unsigned long uint64_t;
 
 // Debugging message macros.
 #if DEBUG_LEVEL >= 1
-#define INFO_MSG(args...) printf(args)
-
-#if DEBUG_LEVEL >= 2
-#define PRINT_MSG(args...) printf(args)
-#define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
-  print_debug(hid, rows, cols, num_cols)
-#define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
-  print_debug4d(hid, rows, cols, height)
-#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
-  print_debug4d_fp16(hid, num, height, rows, cols)
-
-#if DEBUG_LEVEL >= 3
-#define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
-  print_debug(hid, rows, cols, num_cols)
-#define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
-  print_debug4d(hid, rows, cols, height)
-#define PRINT_MSG_V(args...) printf(args)
+  #define INFO_MSG(args...) printf(args)
+
+  #if DEBUG_LEVEL >= 2
+    #define PRINT_MSG(args...) printf(args)
+    #define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
+        print_debug(hid, rows, cols, num_cols)
+    #define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
+        print_debug4d(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
+        print_debug4d_fp16(hid, num, height, rows, cols)
+
+    #if DEBUG_LEVEL >= 3
+      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
+          print_debug(hid, rows, cols, num_cols)
+      #define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
+          print_debug4d(hid, rows, cols, height)
+      #define PRINT_MSG_V(args...) printf(args)
+    #else
+      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)
+      #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+      #define PRINT_MSG_V(args...)
+    #endif
+  #else
+    #define PRINT_MSG(args...)
+    #define PRINT_DEBUG(hid, rows, cols, num_cols)
+    #define PRINT_DEBUG4D(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+    #define PRINT_DEBUG_V(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+    #define PRINT_MSG_V(args...)
+  #endif
 #else
-#define PRINT_DEBUG_V(hid, rows, cols, num_cols)
-#define PRINT_DEBUG4D_V(hid, rows, cols, height)
-#define PRINT_MSG_V(args...)
-#endif
-#else
-#define PRINT_MSG(args...)
-#define PRINT_DEBUG(hid, rows, cols, num_cols)
-#define PRINT_DEBUG4D(hid, rows, cols, height)
-#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-#define PRINT_DEBUG_V(hid, rows, cols, height)
-#define PRINT_DEBUG4D_V(hid, rows, cols, height)
-#define PRINT_MSG_V(args...)
-#endif
-#else
-#define INFO_MSG(args...)
-#define PRINT_DEBUG(hid, rows, cols, num_cols)
-#define PRINT_DEBUG4D(hid, rows, cols, height)
-#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-#define PRINT_MSG(args...)
-#define PRINT_DEBUG_V(hid, rows, cols, height)
-#define PRINT_DEBUG4D_V(hid, rows, cols, height)
-#define PRINT_MSG_V(args...)
+  #define INFO_MSG(args...)
+  #define PRINT_DEBUG(hid, rows, cols, num_cols)
+  #define PRINT_DEBUG4D(hid, rows, cols, height)
+  #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+  #define PRINT_MSG(args...)
+  #define PRINT_DEBUG_V(hid, rows, cols, height)
+  #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+  #define PRINT_MSG_V(args...)
 #endif
 
 #define STRING(arg) #arg
@@ -72,9 +72,9 @@ typedef unsigned long uint64_t;
 #define max3(e0, e1, e2) max2(max2(e0, e1), e2)
 #define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3))
 #define max8(e0, e1, e2, e3, e4, e5, e6, e7)                                   \
-  max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
+    max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
 #define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8)                               \
-  max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
+    max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
 
 #define min2(A, B) (((A) < (B)) ? (A) : (B))
 
@@ -92,8 +92,7 @@ typedef unsigned long uint64_t;
 //  If GEM5_HARNESS is defined:
 //
 //     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
-//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr,
-//        mySize)
+//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize)
 //
 //     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
 //        ===>   invokeAcceleratorAndBlock(myReqCode)
@@ -108,69 +107,69 @@ typedef unsigned long uint64_t;
 #ifdef GEM5_HARNESS
 
 #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-  mapArrayToAccelerator(req_code, name, base_addr, size)
+    mapArrayToAccelerator(req_code, name, base_addr, size)
 #define INVOKE_KERNEL(req_code, kernel_ptr, args...)                           \
-  do {                                                                         \
-    UNUSED(kernel_ptr);                                                        \
-    invokeAcceleratorAndBlock(req_code);                                       \
-  } while (0)
+    do {                                                                       \
+        UNUSED(kernel_ptr);                                                    \
+        invokeAcceleratorAndBlock(req_code);                                   \
+    } while (0)
 #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-  do {                                                                         \
-    UNUSED(kernel_ptr);                                                        \
-    invokeAcceleratorAndReturn2(req_code, finish_flag);                        \
-  } while (0)
+    do {                                                                       \
+        UNUSED(kernel_ptr);                                                    \
+        invokeAcceleratorAndReturn2(req_code, finish_flag);                    \
+    } while (0)
 
 #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-  do {                                                                         \
-    invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);           \
-  } while (0)
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);       \
+    } while (0)
 #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-  do {                                                                         \
-    invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);            \
-  } while (0)
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);        \
+    } while (0)
 #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-  do {                                                                         \
-    invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);            \
-  } while (0)
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);        \
+    } while (0)
 #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-  do {                                                                         \
-    invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);             \
-  } while (0)
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);         \
+    } while (0)
 
 #else
 
 #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-  do {                                                                         \
-    INFO_MSG("Mapping array %s @ %p, size %d.\n", name, (void *)base_addr,     \
-             (int)(size));                                                     \
-    UNUSED(req_code);                                                          \
-    UNUSED(name);                                                              \
-    UNUSED(base_addr);                                                         \
-    UNUSED(size);                                                              \
-  } while (0)
+    do {                                                                       \
+        INFO_MSG("Mapping array %s @ %p, size %d.\n",                          \
+                 name, (void*)base_addr, (int)(size));                         \
+        UNUSED(req_code);                                                      \
+        UNUSED(name);                                                          \
+        UNUSED(base_addr);                                                     \
+        UNUSED(size);                                                          \
+    } while (0)
 #define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args)
 #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-  kernel_ptr(args)
+    kernel_ptr(args)
 #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-  do {                                                                         \
-    UNUSED(start_addr);                                                        \
-    UNUSED(size);                                                              \
-  } while (0)
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
 #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-  do {                                                                         \
-    UNUSED(start_addr);                                                        \
-    UNUSED(size);                                                              \
-  } while (0)
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
 #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-  do {                                                                         \
-    UNUSED(start_addr);                                                        \
-    UNUSED(size);                                                              \
-  } while (0)
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
 #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-  do {                                                                         \
-    UNUSED(start_addr);                                                        \
-    UNUSED(size);                                                              \
-  } while (0)
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
 
 #endif
 
@@ -178,14 +177,14 @@ typedef unsigned long uint64_t;
 //
 // This assumes that the current name of the base pointer is also the name of
 // the array in the top level function of the dynamic trace. THIS IS VERY
-// IMPORTANT - if the argument passed to a top level function has been renamed
-// in the function, then this WILL NOT WORK!
+// IMPORTANT - if the argument passed to a top level function has been renamed in
+// the function, then this WILL NOT WORK!
 //
 // MAP_ARRAY(myReqCode, myArray, mySize)
 //    ===>   MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize)
 #define MAP_ARRAY(req_code, name_and_base_addr, size)                          \
-  MAP_ARRAY_TO_ACCEL(req_code, STRING(name_and_base_addr), name_and_base_addr, \
-                     size)
+    MAP_ARRAY_TO_ACCEL(                                                        \
+            req_code, STRING(name_and_base_addr), name_and_base_addr, size)
 
 // Use these convenience macros to cast a raw pointer into a multidimensional
 // variable-length array, which lets us use [] notation inside of the ugly
@@ -203,24 +202,23 @@ typedef unsigned long uint64_t;
 //
 //   And so on...
 #define ARRAY_1D(TYPE, output_array_name, input_array_name)                    \
-  TYPE *output_array_name = (TYPE *)input_array_name
+    TYPE* output_array_name = (TYPE*)input_array_name
 
 #define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1)             \
-  TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
+    TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
 
 #define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2)      \
-  TYPE(*output_array_name)                                                     \
-  [DIM_1][DIM_2] = (TYPE(*)[DIM_1][DIM_2])input_array_name
-
-#define ARRAY_4D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2,      \
-                 DIM_3)                                                        \
-  TYPE(*output_array_name)                                                     \
-  [DIM_1][DIM_2][DIM_3] = (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
-
-#define ARRAY_5D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2,      \
-                 DIM_3, DIM_4)                                                 \
-  TYPE(*output_array_name)                                                     \
-  [DIM_1][DIM_2][DIM_3][DIM_4] =                                               \
-      (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
+    TYPE(*output_array_name)[DIM_1][DIM_2] =                                   \
+        (TYPE(*)[DIM_1][DIM_2])input_array_name
+
+#define ARRAY_4D(                                                              \
+    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3)            \
+        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] =                        \
+            (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
+
+#define ARRAY_5D(                                                              \
+    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4)     \
+        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] =                 \
+            (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
 
 #endif
diff --git a/hpvm/test/hpvm-cava/src/dma_interface.c b/hpvm/test/hpvm-cava/src/dma_interface.c
index 68698635a4..81bce54469 100644
--- a/hpvm/test/hpvm-cava/src/dma_interface.c
+++ b/hpvm/test/hpvm-cava/src/dma_interface.c
@@ -1,6 +1,6 @@
-#include "dma_interface.h"
 #include <assert.h>
 #include <string.h>
+#include "dma_interface.h"
 
 // All _dmaImplN functions must be always inlined or we'll get extra functions
 // in the trace.
@@ -10,22 +10,22 @@
 // Starting with version 3, all versioning will be distinguished by the return
 // value of the DMA functions.
 
-__attribute__((__always_inline__)) int _dmaImpl3(void *dst_addr, void *src_addr,
-                                                 size_t size) {
+__attribute__((__always_inline__))
+int _dmaImpl3(void* dst_addr, void* src_addr, size_t size) {
   assert(size > 0);
   memmove(dst_addr, src_addr, size);
   return 3;
 }
 
-int dmaLoad(void *dst_addr, void *src_host_addr, size_t size) {
+int dmaLoad(void* dst_addr, void* src_host_addr, size_t size) {
   return _dmaImpl3(dst_addr, src_host_addr, size);
 }
 
-int dmaStore(void *dst_host_addr, void *src_addr, size_t size) {
+int dmaStore(void* dst_host_addr, void* src_addr, size_t size) {
   return _dmaImpl3(dst_host_addr, src_addr, size);
 }
 
-int setReadyBits(void *start_addr, size_t size, unsigned value) {
+int setReadyBits(void* start_addr, size_t size, unsigned value) {
   asm("");
   return 0;
 }
@@ -35,37 +35,39 @@ int setReadyBits(void *start_addr, size_t size, unsigned value) {
 // With version 2 and earlier, we return (void*)NULL and use the number of
 // function arguments to distinguish the DMA functions.
 
-__attribute__((__always_inline__)) void *
-_dmaImpl2(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
+__attribute__((__always_inline__))
+void* _dmaImpl2(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
   assert(size > 0);
   memmove(base_addr + dst_off, base_addr + src_off, size);
   return NULL;
 }
 
-void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
+void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
   return _dmaImpl2(base_addr, src_off, dst_off, size);
 }
 
-void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
+void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
   return _dmaImpl2(base_addr, src_off, dst_off, size);
 }
 
 #else
 
-__attribute__((__always_inline__)) void *_dmaImpl1(void *base_addr,
-                                                   size_t offset, size_t size) {
+__attribute__((__always_inline__))
+void* _dmaImpl1(void* base_addr, size_t offset, size_t size) {
   assert(size > 0);
   asm("");
   return NULL;
 }
 
-void *dmaLoad(void *addr, size_t offset, size_t size) {
+void* dmaLoad(void* addr, size_t offset, size_t size) {
   return _dmaImpl1(addr, offset, size);
 }
 
-void *dmaStore(void *addr, size_t offset, size_t size) {
+void* dmaStore(void* addr, size_t offset, size_t size) {
   return _dmaImpl1(addr, offset, size);
 }
 #endif
 
-void dmaFence() { asm(""); }
+void dmaFence() {
+  asm("");
+}
diff --git a/hpvm/test/hpvm-cava/src/dma_interface.h b/hpvm/test/hpvm-cava/src/dma_interface.h
index 771ece5238..f23234eede 100644
--- a/hpvm/test/hpvm-cava/src/dma_interface.h
+++ b/hpvm/test/hpvm-cava/src/dma_interface.h
@@ -10,12 +10,12 @@
 // Version 3 of the DMA interface enables memcpy operations from arbitrary
 // source and destination addresses.
 
-int dmaLoad(void *dst_addr, void *src_host_addr, size_t size);
-int dmaStore(void *dst_host_addr, void *src_addr, size_t size);
+int dmaLoad(void* dst_addr, void* src_host_addr, size_t size);
+int dmaStore(void* dst_host_addr, void* src_addr, size_t size);
 
 // The user can explicitly toggle the state of ready bits, if ready mode is
 // enabled. This requires support from DMA v3.
-int setReadyBits(void *start_addr, size_t size, unsigned value);
+int setReadyBits(void* start_addr, size_t size, unsigned value);
 
 #elif defined(DMA_INTERFACE_V2)
 
@@ -26,18 +26,17 @@ int setReadyBits(void *start_addr, size_t size, unsigned value);
 // actually copied from source to destination (the memory copy will not show up
 // in the trace).
 
-void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size);
-void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size);
+void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size);
+void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size);
 
 #else
 
 #warning "DMA interface v1 is deprecated!"
 
-// Version 1 of the DMA interface is now deprecated and will be removed
-// entirely.
+// Version 1 of the DMA interface is now deprecated and will be removed entirely.
 
-void *dmaLoad(void *addr, size_t offset, size_t size);
-void *dmaStore(void *addr, size_t offset, size_t size);
+void* dmaLoad(void* addr, size_t offset, size_t size);
+void* dmaStore(void* addr, size_t offset, size_t size);
 
 #endif
 void dmaFence();
diff --git a/hpvm/test/hpvm-cava/src/load_cam_model.c b/hpvm/test/hpvm-cava/src/load_cam_model.c
index baec19ad49..124fe0b7d1 100644
--- a/hpvm/test/hpvm-cava/src/load_cam_model.c
+++ b/hpvm/test/hpvm-cava/src/load_cam_model.c
@@ -1,14 +1,13 @@
-#include "load_cam_model.h"
-#include "utility.h"
-#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
-// clang-format: pipe_stages.h must come after stdlib.h
-#include "pipe_stages.h"
 #include <string.h>
+#include <assert.h>
+#include "utility.h"
+#include "pipe_stages.h"
+#include "load_cam_model.h"
 
 // Get color space transform
-float *get_Ts(char *cam_model_path) {
+float* get_Ts(char* cam_model_path) {
   float *Ts;
   int err = posix_memalign((void **)&Ts, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -33,7 +32,7 @@ float *get_Ts(char *cam_model_path) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str);
+      line_data[i] = atof(str); 
       str = strtok(NULL, " \n");
       i++;
     }
@@ -51,7 +50,7 @@ float *get_Ts(char *cam_model_path) {
 }
 
 // Get white balance transform
-float *get_Tw(char *cam_model_path, int wb_index) {
+float* get_Tw(char* cam_model_path, int wb_index) {
   float *Tw;
   int err = posix_memalign((void **)&Tw, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -63,7 +62,7 @@ float *get_Tw(char *cam_model_path, int wb_index) {
 
   // Calculate base for the white balance transform selected
   // For more details see the camera model readme
-  int wb_base = 8 + 5 * (wb_index - 1);
+  int wb_base  = 8 + 5*(wb_index-1);
 
   // Open file for reading
   // Open file for reading
@@ -82,15 +81,15 @@ float *get_Tw(char *cam_model_path, int wb_index) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str);
+      line_data[i] = atof(str); 
       str = strtok(NULL, " \n");
       i++;
     }
 
     if (line_idx == wb_base) {
       // Convert the white balance vector into a diagaonal matrix
-      for (int i = 0; i < 3; i++) {
-        for (int j = 0; j < 3; j++) {
+      for (int i=0; i<3; i++) {
+        for (int j=0; j<3; j++) {
           if (i == j) {
             Tw[i * 3 + j] = line_data[i];
           } else {
@@ -106,8 +105,9 @@ float *get_Tw(char *cam_model_path, int wb_index) {
   return Tw;
 }
 
+
 // Get combined transforms for checking
-float *get_TsTw(char *cam_model_path, int wb_index) {
+float* get_TsTw(char* cam_model_path, int wb_index) {
   float *TsTw;
   int err = posix_memalign((void **)&TsTw, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -119,7 +119,7 @@ float *get_TsTw(char *cam_model_path, int wb_index) {
 
   // Calculate base for the white balance transform selected
   // For more details see the camera model readme
-  int wb_base = 5 + 5 * (wb_index - 1);
+  int wb_base  = 5 + 5*(wb_index-1);
 
   // Open file for reading
   char file_name[] = "raw2jpg_transform.txt";
@@ -137,7 +137,7 @@ float *get_TsTw(char *cam_model_path, int wb_index) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str);
+      line_data[i] = atof(str); 
       str = strtok(NULL, " \n");
       i++;
     }
@@ -155,7 +155,7 @@ float *get_TsTw(char *cam_model_path, int wb_index) {
 }
 
 // Get control points
-float *get_ctrl_pts(char *cam_model_path, int num_cntrl_pts) {
+float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) {
   float *ctrl_pnts;
   int err = posix_memalign((void **)&ctrl_pnts, CACHELINE_SIZE,
                            sizeof(float) * num_cntrl_pts * 3);
@@ -200,7 +200,7 @@ float *get_ctrl_pts(char *cam_model_path, int num_cntrl_pts) {
 }
 
 // Get weights
-float *get_weights(char *cam_model_path, int num_cntrl_pts) {
+float* get_weights(char* cam_model_path, int num_cntrl_pts) {
   float *weights;
   int err = posix_memalign((void **)&weights, CACHELINE_SIZE,
                            sizeof(float) * num_cntrl_pts * 3);
@@ -245,7 +245,7 @@ float *get_weights(char *cam_model_path, int num_cntrl_pts) {
 }
 
 // Get coeficients
-float *get_coefs(char *cam_model_path, int num_cntrl_pts) {
+float* get_coefs(char* cam_model_path, int num_cntrl_pts) {
   float *coefs;
   int err = posix_memalign((void **)&coefs, CACHELINE_SIZE, sizeof(float) * 12);
   assert(err == 0 && "Failed to allocate memory!");
@@ -288,8 +288,9 @@ float *get_coefs(char *cam_model_path, int num_cntrl_pts) {
   return coefs;
 }
 
+
 // Get tone mapping table
-float *get_tone_map(char *cam_model_path) {
+float* get_tone_map(char* cam_model_path) {
   float *tone_map;
   int err = posix_memalign((void **)&tone_map, CACHELINE_SIZE,
                            sizeof(float) * 256 * CHAN_SIZE);
diff --git a/hpvm/test/hpvm-cava/src/main.c b/hpvm/test/hpvm-cava/src/main.c
index 8e7bd197d0..e43bbb4f25 100644
--- a/hpvm/test/hpvm-cava/src/main.c
+++ b/hpvm/test/hpvm-cava/src/main.c
@@ -1,14 +1,14 @@
-#include "utility.h"
 #include <argp.h>
-#include <assert.h>
-#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <assert.h>
 #include <string.h>
+#include <math.h>
+#include "utility.h"
 
 #include "cam_pipe_utility.h"
-#include "load_cam_model.h"
 #include "pipe_stages.h"
+#include "load_cam_model.h"
 
 #include "visc.h"
 
@@ -17,138 +17,120 @@ int NUM_CLASSES;
 int INPUT_DIM;
 int NUM_WORKER_THREADS;
 
-// Type of struct holding the return value from the last node.
-struct RetStruct {
-  size_t bytesRet;
-};
-
 // Type of struct that is used to pass arguments to the HPVM dataflow graph
 // using the hpvm launch operation
 typedef struct __attribute__((__packed__)) {
-  uint8_t *input;
-  size_t bytes_input;
-  uint8_t *result;
-  size_t bytes_result;
-  float *input_scaled;
-  size_t bytes_input_scaled;
-  float *result_scaled;
-  size_t bytes_result_scaled;
-  float *demosaic_out;
-  size_t bytes_demosaic_out;
-  float *denoise_out;
-  size_t bytes_denoise_out;
-  float *transform_out;
-  size_t bytes_transform_out;
-  float *gamut_out;
-  size_t bytes_gamut_out;
-  float *TsTw;
-  size_t bytes_TsTw;
-  float *ctrl_pts;
-  size_t bytes_ctrl_pts;
-  float *weights;
-  size_t bytes_weights;
-  float *coefs;
-  size_t bytes_coefs;
-  float *l2_dist;
-  size_t bytes_l2_dist;
-  float *tone_map;
-  size_t bytes_tone_map;
-  int row_size;
-  int col_size;
-  struct RetStruct ret; // Instance of RetStruct holding the return value.
-} RootIn;
+    // Every buffer pointer is immediately followed by its bytes_* size field.
+    uint8_t *input;         size_t bytes_input;
+    uint8_t *result;        size_t bytes_result;
+    float *input_scaled;    size_t bytes_input_scaled;
+    float *result_scaled;   size_t bytes_result_scaled;
+    float *demosaic_out;    size_t bytes_demosaic_out;
+    float *denoise_out;     size_t bytes_denoise_out;
+    float *transform_out;   size_t bytes_transform_out;
+    float *gamut_out;       size_t bytes_gamut_out;
+    float *TsTw;            size_t bytes_TsTw;
+    float *ctrl_pts;        size_t bytes_ctrl_pts;
+    float *weights;         size_t bytes_weights;
+    float *coefs;           size_t bytes_coefs;
+    float *l2_dist;         size_t bytes_l2_dist;
+    float *tone_map;        size_t bytes_tone_map;
+    size_t row_size;        size_t col_size;
+} RootIn;
 
 typedef enum _argnum {
-  RAW_IMAGE_BIN,
-  OUTPUT_IMAGE_BIN,
-  NUM_REQUIRED_ARGS,
-  DATA_FILE = NUM_REQUIRED_ARGS,
-  NUM_ARGS,
+    RAW_IMAGE_BIN,
+    OUTPUT_IMAGE_BIN,
+    NUM_REQUIRED_ARGS,
+    DATA_FILE = NUM_REQUIRED_ARGS,
+    NUM_ARGS,
 } argnum;
 
 typedef struct _arguments {
-  char *args[NUM_ARGS];
-  int num_inputs;
-  int num_threads;
+    char* args[NUM_ARGS];
+    int num_inputs;
+    int num_threads;
 } arguments;
 
 static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n";
 static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary";
 static struct argp_option options[] = {
-    {"num-inputs", 'n', "N", 0, "Number of input images"},
-    {0},
-    {"data-file", 'f', "F", 0,
-     "File to read data and weights from (if data-init-mode == READ_FILE or "
-     "save-params is true). *.txt files are decoded as text files, while "
-     "*.bin files are decoded as binary files."},
+    { "num-inputs", 'n', "N", 0, "Number of input images" },
+    { "data-file", 'f', "F", 0,
+      "File to read data and weights from (if data-init-mode == READ_FILE or "
+      "save-params is true). *.txt files are decoded as text files, while "
+      "*.bin files are decoded as binary files." }, { 0 },  // terminator last
 };
 
-static error_t parse_opt(int key, char *arg, struct argp_state *state) {
-  arguments *args = (arguments *)(state->input);
-  switch (key) {
-  case 'n': {
-    args->num_inputs = strtol(arg, NULL, 10);
-    break;
-  }
-  case 'f': {
-    args->args[DATA_FILE] = arg;
-    break;
-  }
-  case 't': {
-    args->num_threads = strtol(arg, NULL, 10);
-    break;
-  }
-  case ARGP_KEY_ARG: {
-    if (state->arg_num >= NUM_REQUIRED_ARGS)
-      argp_usage(state);
-    args->args[state->arg_num] = arg;
-    break;
-  }
-  case ARGP_KEY_END: {
-    if (state->arg_num < NUM_REQUIRED_ARGS) {
-      fprintf(stderr, "Not enough arguments! Got %d, require %d.\n",
-              state->arg_num, NUM_REQUIRED_ARGS);
-      argp_usage(state);
+static error_t parse_opt(int key, char* arg, struct argp_state* state) {
+    arguments* args = (arguments*)(state->input);  // struct handed to argp
+    switch (key) {
+        case 'n': {  // number of input images
+            args->num_inputs = strtol(arg, NULL, 10);
+            break;
+        }
+        case 'f': {  // data file path
+            args->args[DATA_FILE] = arg;
+            break;
+        }
+        case 't': {  // thread count
+            args->num_threads = strtol(arg, NULL, 10);
+            break;
+        }
+        case ARGP_KEY_ARG: {  // positional argument
+            if (state->arg_num >= NUM_REQUIRED_ARGS)
+                argp_usage(state);
+            args->args[state->arg_num] = arg;
+            break;
+        }
+        case ARGP_KEY_END: {  // no more arguments: check the required count
+            if (state->arg_num < NUM_REQUIRED_ARGS) {
+                fprintf(stderr,
+                        "Not enough arguments! Got %u, require %d.\n",
+                        state->arg_num,
+                        NUM_REQUIRED_ARGS);
+                argp_usage(state);
+            }
+            break;
+        }
+        default:
+            return ARGP_ERR_UNKNOWN;
     }
-    break;
-  }
-  default:
-    return ARGP_ERR_UNKNOWN;
-  }
-  return 0;
+    return 0;
 }
 
-void set_default_args(arguments *args) {
-  args->num_inputs = 1;
-  args->num_threads = 0;
-  for (int i = 0; i < NUM_ARGS; i++) {
-    args->args[i] = NULL;
-  }
+void set_default_args(arguments* args) {
+    // Defaults: one input image, zero threads, and every path slot unset.
+    for (int slot = NUM_ARGS - 1; slot >= 0; slot--)
+        args->args[slot] = NULL;
+    args->num_threads = 0;
+    args->num_inputs = 1;
 }
 
-static struct argp parser = {options, parse_opt, args_doc, prog_doc};
+static struct argp parser = { options, parse_opt, args_doc, prog_doc };
 
 // Helper function for printing intermediate results
-void descale_cpu(float *input, size_t bytes_input, uint8_t *output,
-                 size_t bytes_result, size_t row_size, size_t col_size) {
-
+void descale_cpu(float *input, size_t bytes_input, 
+                 uint8_t *output, size_t bytes_result,
+                 size_t row_size, size_t col_size) {
+  
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
+        int index = (chan*row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
 }
 
 static void sort(float arr[], int n) {
-  int i, j;
-  for (i = 0; i < n - 1; i++)
-    for (j = 0; j < n - i - 1; j++)
-      if (arr[j] > arr[j + 1]) {
-        float temp = arr[j];
-        arr[j] = arr[j + 1];
-        arr[j + 1] = temp;
-      }
+    // Bubble sort: each pass carries the largest unsorted value to the end.
+    for (int pass = 0; pass < n - 1; pass++)
+        for (int k = 0; k < n - pass - 1; k++) {
+            if (!(arr[k] > arr[k + 1])) continue;  // pair already in order
+            float held = arr[k + 1];
+            arr[k + 1] = arr[k];
+            arr[k] = held;
+        }
 }
 
 /**************************************************************/
@@ -158,258 +140,255 @@ static void sort(float arr[], int n) {
 // In this benchmark, no use of HPVM query intrinsics in the leaf node functions
 
 // Leaf HPVM node function for scale
-void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
-               size_t bytes_output, size_t row_size, size_t col_size) {
+void scale_fxp(uint8_t *input, size_t bytes_input, 
+               float *output, size_t bytes_output,
+               size_t row_size, size_t col_size) {
 
-  // Specifies compilation target for current node
+  //Specifies compilation target for current node
   __visc__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
   __visc__attributes(2, input, output, 1, output);
-  void *thisNode = __visc__getNode();
-  int row = __visc__getNodeInstanceID_x(thisNode);
+  void* thisNode = __visc__getNode();
+	int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    //    for (int row = 0; row < row_size; row++)
-    for (int col = 0; col < col_size; col++) {
-      int index = (chan * row_size + row) * col_size + col;
-      output[index] = input[index] * 1.0 / 255;
-    }
+//    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++){
+        int index = (chan*row_size + row) * col_size + col;
+        output[index] = input[index] * 1.0 / 255;
+      }
   __visc__return(1, bytes_output);
 }
 
 // Leaf HPVM node function for descale
-void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
-                 size_t bytes_result, size_t row_size, size_t col_size) {
+void descale_fxp(float *input, size_t bytes_input, 
+                 uint8_t *output, size_t bytes_result,
+                 size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, output, 1, output);
-
+  
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
+        int index = (chan*row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for demosaicing
-void demosaic_fxp(float *input, size_t bytes_input, float *result,
-                  size_t bytes_result, size_t row_size, size_t col_size) {
+void demosaic_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, result, 1, result);
-
-  void *thisNode = __visc__getNode();
-  int row = __visc__getNodeInstanceID_x(thisNode);
-  //  for (int row = 1; row < row_size - 1; row++)
-  for (int col = 1; col < col_size - 1; col++) {
-    int index_0 = (0 * row_size + row) * col_size + col;
-    int index_1 = (1 * row_size + row) * col_size + col;
-    int index_2 = (2 * row_size + row) * col_size + col;
-    if (row % 2 == 0 && col % 2 == 0) {
-      // Green pixel
-      // Getting the R values
-      float R1 = input[index_0 - 1];
-      float R2 = input[index_0 + 1];
-      // Getting the B values
-      float B1 = input[index_2 - col_size];
-      float B2 = input[index_2 + col_size];
-      // R
-      result[index_0] = (R1 + R2) / 2;
-      // G
-      result[index_1] = input[index_1] * 2;
-      // B
-      result[index_2] = (B1 + B2) / 2;
-    } else if (row % 2 == 0 && col % 2 == 1) {
-      // Red pixel
-      // Getting the G values
-      float G1 = input[index_1 - col_size];
-      float G2 = input[index_1 + col_size];
-      float G3 = input[index_1 - 1];
-      float G4 = input[index_1 + 1];
-      // Getting the B values
-      float B1 = input[index_2 - col_size - 1];
-      float B2 = input[index_2 - col_size + 1];
-      float B3 = input[index_2 + col_size - 1];
-      float B4 = input[index_2 + col_size + 1];
-      // R
-      result[index_0] = input[index_0];
-      // G
-      result[index_1] = (G1 + G2 + G3 + G4) / 2;
-      // B (center pixel)
-      result[index_2] = (B1 + B2 + B3 + B4) / 4;
-    } else if (row % 2 == 1 && col % 2 == 0) {
-      // Blue pixel
-      // Getting the R values
-      float R1 = input[index_0 - col_size - 1];
-      float R2 = input[index_0 + col_size - 1];
-      float R3 = input[index_0 - col_size + 1];
-      float R4 = input[index_0 + col_size + 1];
-      // Getting the G values
-      float G1 = input[index_1 - col_size];
-      float G2 = input[index_1 + col_size];
-      float G3 = input[index_1 - 1];
-      float G4 = input[index_1 + 1];
-      // R
-      result[index_0] = (R1 + R2 + R3 + R4) / 4;
-      // G
-      result[index_1] = (G1 + G2 + G3 + G4) / 2;
-      // B
-      result[index_2] = input[index_2];
-    } else {
-      // Bottom Green pixel
-      // Getting the R values
-      float R1 = input[index_0 - col_size];
-      float R2 = input[index_0 + col_size];
-      // Getting the B values
-      float B1 = input[index_2 - 1];
-      float B2 = input[index_2 + 1];
-      // R
-      result[index_0] = (R1 + R2) / 2;
-      // G
-      result[index_1] = input[index_1] * 2;
-      // B
-      result[index_2] = (B1 + B2) / 2;
-    }
-  }
+  
+  void* thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
+  if (row >= 1 && row < row_size - 1)  // border rows lack a row above/below
+    for (int col = 1; col < col_size - 1; col++) {
+        int index_0 = (0 * row_size + row) * col_size + col;
+        int index_1 = (1 * row_size + row) * col_size + col;
+        int index_2 = (2 * row_size + row) * col_size + col;
+        if (row % 2 == 0 && col % 2 == 0) {
+            // Green pixel
+            // Getting the R values
+            float R1 = input[index_0 - 1];
+            float R2 = input[index_0 + 1];
+            // Getting the B values
+            float B1 = input[index_2 - col_size];
+            float B2 = input[index_2 + col_size];
+            // R
+            result[index_0] = (R1 + R2) / 2;
+            // G
+            result[index_1] = input[index_1] * 2;
+            // B
+            result[index_2] = (B1 + B2) / 2;
+        } else if (row % 2 == 0 && col % 2 == 1) {
+            // Red pixel
+            // Getting the G values
+            float G1 = input[index_1 - col_size];
+            float G2 = input[index_1 + col_size];
+            float G3 = input[index_1 - 1];
+            float G4 = input[index_1 + 1];
+            // Getting the B values
+            float B1 = input[index_2 - col_size - 1];
+            float B2 = input[index_2 - col_size + 1];
+            float B3 = input[index_2 + col_size - 1];
+            float B4 = input[index_2 + col_size + 1];
+            // R
+            result[index_0] = input[index_0];
+            // G
+            result[index_1] = (G1 + G2 + G3 + G4) / 2;
+            // B (center pixel)
+            result[index_2] = (B1 + B2 + B3 + B4) / 4;
+        } else if (row % 2 == 1 && col % 2 == 0) {
+            // Blue pixel
+            // Getting the R values
+            float R1 = input[index_0 - col_size - 1];
+            float R2 = input[index_0 + col_size - 1];
+            float R3 = input[index_0 - col_size + 1];
+            float R4 = input[index_0 + col_size + 1];
+            // Getting the G values
+            float G1 = input[index_1 - col_size];
+            float G2 = input[index_1 + col_size];
+            float G3 = input[index_1 - 1];
+            float G4 = input[index_1 + 1];
+            // R
+            result[index_0] = (R1 + R2 + R3 + R4) / 4;
+            // G
+            result[index_1] = (G1 + G2 + G3 + G4) / 2;
+            // B
+            result[index_2] = input[index_2];
+        } else {
+            // Bottom Green pixel
+            // Getting the R values
+            float R1 = input[index_0 - col_size];
+            float R2 = input[index_0 + col_size];
+            // Getting the B values
+            float B1 = input[index_2 - 1];
+            float B2 = input[index_2 + 1];
+            // R
+            result[index_0] = (R1 + R2) / 2;
+            // G
+            result[index_1] = input[index_1] * 2;
+            // B
+            result[index_2] = (B1 + B2) / 2;
+        }
+      }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for denoise
-void denoise_fxp(float *input, size_t bytes_input, float *result,
-                 size_t bytes_result, size_t row_size, size_t col_size) {
+void denoise_fxp(float *input, size_t bytes_input, 
+                 float *result, size_t bytes_result,
+                 size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
-
-  void *thisNode = __visc__getNode();
-  int row = __visc__getNodeInstanceID_x(thisNode);
+  
+  void* thisNode = __visc__getNode();
+	int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    //    for (int row = 0; row < row_size; row++)
-    for (int col = 0; col < col_size; col++)
-      if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
-        float filter[9];
-        for (int i = -1; i < 2; i++)
-          for (int j = -1; j < 2; j++) {
-            int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1;
-            filter[index] =
-                input[(chan * row_size + (i + row)) * col_size + (j + col)];
-          }
-        sort(filter, 9);
-        result[(chan * row_size + row) * col_size + col] = filter[4];
-      } else {
-        result[(chan * row_size + row) * col_size + col] =
-            input[(chan * row_size + row) * col_size + col];
-      }
+      // Interior pixels get the 3x3 median; border pixels are copied through.
+      for (int col = 0; col < col_size; col++) {
+        size_t here = (chan * row_size + row) * col_size + col;
+        if (row < 1 || row >= row_size - 1 || col < 1 || col >= col_size - 1) {
+          result[here] = input[here];
+          continue;
+        }
+        float window[9];
+        for (int dr = -1; dr < 2; dr++)
+          for (int dc = -1; dc < 2; dc++)
+            window[(dr + 1) * 3 + dc + 1] = input[(chan * row_size + (dr + row)) * col_size + (dc + col)];
+        sort(window, 9);
+        result[here] = window[4];
+      }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for color map and white balance transform
-void transform_fxp(float *input, size_t bytes_input, float *result,
-                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *TsTw_tran, size_t bytes_TsTw,
                    size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
-
-  void *thisNode = __visc__getNode();
-  int row = __visc__getNodeInstanceID_x(thisNode);
+  
+  void* thisNode = __visc__getNode();
+	int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    //    for (int row = 0; row < row_size; row++)
-    for (int col = 0; col < col_size; col++) {
-      int index = (chan * row_size + row) * col_size + col;
-      int index_0 = (0 * row_size + row) * col_size + col;
-      int index_1 = (1 * row_size + row) * col_size + col;
-      int index_2 = (2 * row_size + row) * col_size + col;
-      int index_2d_0 = 0 * CHAN_SIZE + chan;
-      int index_2d_1 = 1 * CHAN_SIZE + chan;
-      int index_2d_2 = 2 * CHAN_SIZE + chan;
-      result[index] = max(input[index_0] * TsTw_tran[index_2d_0] +
-                              input[index_1] * TsTw_tran[index_2d_1] +
-                              input[index_2] * TsTw_tran[index_2d_2],
-                          0);
-    }
+//    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++) {
+        int index = (chan * row_size + row) * col_size + col;
+        int index_0 = (0 * row_size + row) * col_size + col;
+        int index_1 = (1 * row_size + row) * col_size + col;
+        int index_2 = (2 * row_size + row) * col_size + col;
+        int index_2d_0 = 0 * CHAN_SIZE + chan;
+        int index_2d_1 = 1 * CHAN_SIZE + chan;
+        int index_2d_2 = 2 * CHAN_SIZE + chan;
+        result[index] =
+            max(input[index_0] * TsTw_tran[index_2d_0] +
+                input[index_1] * TsTw_tran[index_2d_1] +
+                input[index_2] * TsTw_tran[index_2d_2],
+                0);
+      }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for gamut mapping
-void gamut_map_fxp(float *input, size_t bytes_input, float *result,
-                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights, float *coefs,
-                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
+void gamut_map_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights,
+                   float *coefs, size_t bytes_coefs,
+                   float *l2_dist, size_t bytes_l2_dist,
                    size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2,
-                     result, l2_dist);
-
-  // First, get the L2 norm from every pixel to the control points,
-  // Then, sum it and weight it. Finally, add the bias.
-  void *thisNode = __visc__getNode();
-  int row = __visc__getNodeInstanceID_x(thisNode);
-  //  for (int row = 0; row < row_size; row++)
-  for (int col = 0; col < col_size; col++) {
-    float chan_val_0 = 0.0;
-    float chan_val_1 = 0.0;
-    float chan_val_2 = 0.0;
-    for (int cp = 0; cp < 3702; cp++) {
-      int index_0 = (0 * row_size + row) * col_size + col;
-      int index_1 = (1 * row_size + row) * col_size + col;
-      int index_2 = (2 * row_size + row) * col_size + col;
-      float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
-      float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
-      float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
-      float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
-      float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
-      float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
-      float val = val1 * val2 + val3 * val4 + val5 * val6;
-      float sqrt_val = sqrt(val);
-      chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
-      chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
-      chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist);
+
+ // First, get the L2 norm from every pixel to the control points,
+ // Then, sum it and weight it. Finally, add the bias.
+  void* thisNode = __visc__getNode();
+	int row = __visc__getNodeInstanceID_x(thisNode);
+//  for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      float chan_val_0 = 0.0;
+      float chan_val_1 = 0.0;
+      float chan_val_2 = 0.0;
+      for (int cp = 0; cp < 3702; cp++) {
+        int index_0 = (0 * row_size + row) * col_size + col;
+        int index_1 = (1 * row_size + row) * col_size + col;
+        int index_2 = (2 * row_size + row) * col_size + col;
+        float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); 
+        float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+        float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
+        float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
+        float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); 
+        float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+        float val = val1 * val2 + val3 * val4 + val5 * val6;
+        float sqrt_val = sqrt(val);
+        chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
+        chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
+        chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
+      }
+        chan_val_0 += coefs[0 * CHAN_SIZE + 0] + 
+                    coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] +
+                    coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] +
+                    coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
+        chan_val_1 += coefs[0 * CHAN_SIZE + 1] + 
+                    coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] +
+                    coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] +
+                    coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
+        chan_val_2 += coefs[0 * CHAN_SIZE + 2] + 
+                    coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] +
+                    coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] +
+                    coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
+        result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
+        result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
+        result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
     }
-    chan_val_0 +=
-        coefs[0 * CHAN_SIZE + 0] +
-        coefs[1 * CHAN_SIZE + 0] *
-            input[(0 * row_size + row) * col_size + col] +
-        coefs[2 * CHAN_SIZE + 0] *
-            input[(1 * row_size + row) * col_size + col] +
-        coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
-    chan_val_1 +=
-        coefs[0 * CHAN_SIZE + 1] +
-        coefs[1 * CHAN_SIZE + 1] *
-            input[(0 * row_size + row) * col_size + col] +
-        coefs[2 * CHAN_SIZE + 1] *
-            input[(1 * row_size + row) * col_size + col] +
-        coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
-    chan_val_2 +=
-        coefs[0 * CHAN_SIZE + 2] +
-        coefs[1 * CHAN_SIZE + 2] *
-            input[(0 * row_size + row) * col_size + col] +
-        coefs[2 * CHAN_SIZE + 2] *
-            input[(1 * row_size + row) * col_size + col] +
-        coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
-    result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
-    result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
-    result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
-  }
   __visc__return(1, bytes_result);
 }
 
 // HPVM leaf node function, for tone mapping
-void tone_map_fxp(float *input, size_t bytes_input, float *result,
-                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  float *tone_map, size_t bytes_tone_map,
                   size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, tone_map, 1, result);
-
-  void *thisNode = __visc__getNode();
-  int row = __visc__getNodeInstanceID_x(thisNode);
+  
+  void* thisNode = __visc__getNode();
+	int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    //    for (int row = 0; row < row_size; row++)
-    for (int col = 0; col < col_size; col++) {
-      int index = (chan * row_size + row) * col_size + col;
-      uint8_t x = input[index] * 255;
-      result[index] = tone_map[x * CHAN_SIZE + chan];
-    }
+      // row is this instance's ID; clamp so the table index stays in 0..255.
+      for (int col = 0; col < col_size; col++) {
+        int index = (chan * row_size + row) * col_size + col;
+        uint8_t x = min(max(input[index] * 255, 0), 255);
+        result[index] = tone_map[x * CHAN_SIZE + chan];
+      }
   __visc__return(1, bytes_result);
 }
 
@@ -421,8 +400,9 @@ void tone_map_fxp(float *input, size_t bytes_input, float *result,
 // requirement for the FPGA backend . The CPU backend also supports this,
 // so it does not cause a portability issue.
 
-void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result,
-                       size_t bytes_result, size_t row_size, size_t col_size) {
+void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
 
@@ -447,9 +427,9 @@ void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result,
   __visc__bindOut(ScaleNode, 0, 0, 0);
 }
 
-void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result,
-                         size_t bytes_result, size_t row_size,
-                         size_t col_size) {
+void descale_fxp_wrapper(float *input, size_t bytes_input, 
+                       uint8_t *result, size_t bytes_result,
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
   void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size);
@@ -459,13 +439,13 @@ void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result,
   __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
   __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size
   __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size
-
+  
   __visc__bindOut(DescaleNode, 0, 0, 0);
 }
 
-void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result,
-                          size_t bytes_result, size_t row_size,
-                          size_t col_size) {
+void demosaic_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
   void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size);
@@ -475,13 +455,13 @@ void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result,
   __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
   __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
   __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size
-
+  
   __visc__bindOut(DemosaicNode, 0, 0, 0);
 }
 
-void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result,
-                         size_t bytes_result, size_t row_size,
-                         size_t col_size) {
+void denoise_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
   void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size);
@@ -491,14 +471,14 @@ void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result,
   __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
   __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
   __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size
-
+  
   __visc__bindOut(DenoiseNode, 0, 0, 0);
 }
 
-void transform_fxp_wrapper(float *input, size_t bytes_input, float *result,
-                           size_t bytes_result, float *TsTw_tran,
-                           size_t bytes_TsTw, size_t row_size,
-                           size_t col_size) {
+void transform_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       float *TsTw_tran, size_t bytes_TsTw,
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
   void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size);
@@ -510,41 +490,41 @@ void transform_fxp_wrapper(float *input, size_t bytes_input, float *result,
   __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
   __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size
   __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size
-
+  
   __visc__bindOut(TransformNode, 0, 0, 0);
 }
 
-void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result,
-                       size_t bytes_result, float *ctrl_pts,
-                       size_t bytes_ctrl_pts, float *weights,
-                       size_t bytes_weights, float *coefs, size_t bytes_coefs,
-                       float *l2_dist, size_t bytes_l2_dist, size_t row_size,
-                       size_t col_size) {
+void gamut_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       float *ctrl_pts, size_t bytes_ctrl_pts,
+                       float *weights, size_t bytes_weights,
+                       float *coefs, size_t bytes_coefs,
+                       float *l2_dist, size_t bytes_l2_dist,
+                       size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
-                     result);
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
   void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size);
-  __visc__bindIn(GamutNode, 0, 0, 0);   // bind input
-  __visc__bindIn(GamutNode, 1, 1, 0);   // bind bytes_input
-  __visc__bindIn(GamutNode, 2, 2, 0);   // bind result
-  __visc__bindIn(GamutNode, 3, 3, 0);   // bind bytes_result
-  __visc__bindIn(GamutNode, 4, 4, 0);   // bind ctrl_pts
-  __visc__bindIn(GamutNode, 5, 5, 0);   // bind bytes_ctrl_pts
-  __visc__bindIn(GamutNode, 6, 6, 0);   // bind weights
-  __visc__bindIn(GamutNode, 7, 7, 0);   // bind bytes_weights
-  __visc__bindIn(GamutNode, 8, 8, 0);   // bind coefs
-  __visc__bindIn(GamutNode, 9, 9, 0);   // bind bytes_coefs
+  __visc__bindIn(GamutNode, 0, 0, 0); // bind input
+  __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(GamutNode, 2, 2, 0); // bind result
+  __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts
+  __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts
+  __visc__bindIn(GamutNode, 6, 6, 0); // bind weights
+  __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights
+  __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs
+  __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs
   __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
   __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
   __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size
   __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size
-
+  
   __visc__bindOut(GamutNode, 0, 0, 0);
 }
-void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result,
-                          size_t bytes_result, float *tone_map,
-                          size_t bytes_tone_map, size_t row_size,
-                          size_t col_size) {
+void tone_map_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       float *tone_map, size_t bytes_tone_map,
+                       size_t row_size, size_t col_size) {
 
   __visc__hint(CPU_TARGET);
   __visc__attributes(3, input, result, tone_map, 1, result);
@@ -553,52 +533,52 @@ void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result,
   __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result
   __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map
+  __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map 
   __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
   __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
   __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size
-
+  
   __visc__bindOut(ToneMapNode, 0, 0, 0);
 }
 
+
 /*** ROOT Node - Top Level of the Graph Hierarchy ***/
-void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input,
-                 /*2*/ uint8_t *result, /*3*/ size_t bytes_result,
-                 /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled,
-                 /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled,
-                 /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out,
-                 /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out,
-                 /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out,
-                 /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out,
-                 /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw,
-                 /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts,
-                 /*20*/ float *weights, /*21*/ size_t bytes_weights,
-                 /*22*/ float *coefs, /*23*/ size_t bytes_coefs,
-                 /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist,
-                 /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map,
-                 /*28*/ size_t row_size, /*29*/ size_t col_size) {
-
-  // Specifies compilation target for current node
-  __visc__hint(CPU_TARGET);
+void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input, 
+                 /*2*/ uint8_t *result,        /*3*/ size_t bytes_result,
+                 /*4*/ float *input_scaled,    /*5*/ size_t bytes_input_scaled,
+                 /*6*/ float *result_scaled,   /*7*/ size_t bytes_result_scaled,
+                 /*8*/ float *demosaic_out,    /*9*/ size_t bytes_demosaic_out,
+                 /*10*/ float *denoise_out,    /*11*/ size_t bytes_denoise_out,
+                 /*12*/ float *transform_out,  /*13*/ size_t bytes_transform_out,
+                 /*14*/ float *gamut_out,      /*15*/ size_t bytes_gamut_out,
+                 /*16*/ float *TsTw,           /*17*/ size_t bytes_TsTw,
+                 /*18*/ float *ctrl_pts,       /*19*/ size_t bytes_ctrl_pts,
+                 /*20*/ float *weights,        /*21*/ size_t bytes_weights,
+                 /*22*/ float*coefs,           /*23*/ size_t bytes_coefs,
+                 /*24*/ float *l2_dist,        /*25*/ size_t bytes_l2_dist,
+                 /*26*/ float *tone_map,       /*27*/ size_t bytes_tone_map,
+                 /*28*/ size_t row_size,          /*29*/ size_t col_size) {
+
+  //Specifies compilation target for current node
+    __visc__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
-  __visc__attributes(14, input, result, input_scaled, result_scaled,
-                     demosaic_out, denoise_out, transform_out, gamut_out, TsTw,
-                     ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result,
-                     demosaic_out, denoise_out, transform_out, gamut_out);
+    __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, 
+                       transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, 
+                       5, result, demosaic_out, denoise_out, transform_out, gamut_out);
 
   // Create an 0D (specified by 1st argument) HPVM node - so a single node
   // associated with node function ---_fxp_wrapper
-  void *ScNode = __visc__createNodeND(0, scale_fxp_wrapper);
-  void *DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper);
-  void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper);
-  void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper);
-  void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper);
-  void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper);
-  void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper);
-
+    void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper);
+    void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper);
+    void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper);
+    void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper);
+    void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper);
+    void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper);
+    void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper);
+    
   // BindIn binds inputs of current node with specified node
   // - destination node
   // - argument position in argument list of function of source node
@@ -612,283 +592,268 @@ void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input,
   // - destination position (in argument list of destination node)
   // - streaming (1) or non-streaming (0)
 
-  // scale_fxp inputs
-  __visc__bindIn(ScNode, 0, 0, 0);  // input -> ScNode:input
-  __visc__bindIn(ScNode, 1, 1, 0);  // bytes_input -> ScNode:bytes_input
-  __visc__bindIn(ScNode, 4, 2, 0);  // input_scaled -> ScNode:result
-  __visc__bindIn(ScNode, 5, 3, 0);  // bytes_input_scaled -> ScNode:bytes_result
-  __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
-  __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
-
-  // demosaic_fxp inputs
-  __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
-  __visc__edge(ScNode, DmNode, 1, 0, 1,
-               0);                  // SCNode:bytes_result -> DmNode:bytes_input
-  __visc__bindIn(DmNode, 8, 2, 0);  // demosaic_out -> DmNode:result
-  __visc__bindIn(DmNode, 9, 3, 0);  // bytes_demosaic_out -> DmNode:bytes_result
-  __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size
-  __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
-
-  // denoise_fxp inputs
-  __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
-  __visc__edge(DmNode, DnNode, 1, 0, 1,
-               0);                  // DMNode:bytes_result -> DnNode:bytes_input
-  __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
-  __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
-  __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size
-  __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
-
-  // transform_fxp inputs
-  __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
-  __visc__edge(DnNode, TrNode, 1, 0, 1,
-               0);                  // DnNode:bytes_result -> TrNode:bytes_input
-  __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
-  __visc__bindIn(TrNode, 13, 3,
-                 0); // bytes_result_scaled -> TrNode:bytes_result
-  __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann
-  __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
-  __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size
-  __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
-
-  // gamut_fxp inputs
-  __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
-  __visc__edge(TrNode, GmNode, 1, 0, 1,
-               0);                  // TrNode:bytes_result -> GmNode:bytes_input
-  __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
-  __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
-  __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
-  __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
-  __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
-  __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
-  __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
-  __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
-  __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
-  __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
-  __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size
-  __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
-
-  // tone_map_fxp inputs
-  __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
-  __visc__edge(GmNode, TnNode, 1, 0, 1,
-               0);                 // GmNode:bytes_result -> TnNode:bytes_input
-  __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
-  __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
-  __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
-  __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
-  __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size
-  __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
-
-  // descale_fxp inputs
-  __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
-  __visc__edge(TnNode, DsNode, 1, 0, 1,
-               0);                  // TnNode:bytes_result -> DsNode:bytes_input
-  __visc__bindIn(DsNode, 2, 2, 0);  // result -> DsNode:result
-  __visc__bindIn(DsNode, 3, 3, 0);  // bytes_result -> DsNode:bytes_result
-  __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
-  __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
+    // scale_fxp inputs
+    __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input
+    __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
+    __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result
+    __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result
+    __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
+    __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
+
+    // demosaic_fxp inputs
+    __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
+    __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // ScNode:bytes_result -> DmNode:bytes_input
+    __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result
+    __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result
+    __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size 
+    __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
+
+    // denoise_fxp inputs
+    __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
+    __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DmNode:bytes_result -> DnNode:bytes_input
+    __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
+    __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
+    __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size 
+    __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
+    
+    // transform_fxp inputs
+    __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
+    __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input
+    __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
+    __visc__bindIn(TrNode, 13, 3, 0); // bytes_transform_out -> TrNode:bytes_result
+    __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_tran
+    __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
+    __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size 
+    __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
+    
+    // gamut_fxp inputs
+    __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
+    __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input
+    __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
+    __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
+    __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
+    __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
+    __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
+    __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
+    __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
+    __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
+    __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
+    __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
+    __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size 
+    __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
+    
+    // tone_map_fxp inputs
+    __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
+    __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input
+    __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
+    __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
+    __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
+    __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
+    __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size 
+    __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
+
+    // descale_fxp inputs
+    __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
+    __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input
+    __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result
+    __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result
+    __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
+    __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
 
   // Similar to bindIn, but for the output. Output of a node is a struct, and
   // we consider the fields in increasing ordering.
-  __visc__bindOut(DsNode, 0, 0, 0);
+    __visc__bindOut(DsNode, 0, 0, 0);
+    
 }
 
-int main(int argc, char *argv[]) {
-  // Parse the arguments.
-  arguments args;
-  set_default_args(&args);
-  argp_parse(&parser, argc, argv, 0, 0, &args);
-
-  // Read a raw image.
-  // NOTE: We deliberately perform this file I/O outside of the kernel.
-  printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
-  size_t row_size, col_size;
-  uint8_t *image_in =
-      read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
-
-  printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
-
-  // Allocate a buffer for storing the output image data.
-  // (This is currently the same size as the input image data.)
-  size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
-  size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
-  uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image);
-  uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image);
-  uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image);
-  uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image);
-  uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image);
-
-  __visc__init();
-
-  ///////////////////////////////////////////////////////////////
-  // Camera Model Parameters
-  ///////////////////////////////////////////////////////////////
-  // Path to the camera model to be used
-  //    char cam_model_path[100];
-  //    char cam_model_path = "cam_models/NikonD7000/";
-  // White balance index (select white balance from transform file)
-  // The first white balance in the file has a wb_index of 1
-  // For more information on model format see the readme
-  int wb_index = 6;
-
-  // Number of control points
-  int num_ctrl_pts = 3702;
-  uint8_t *input, *result;
-  float *input_scaled, *result_scaled, *demosaic_out, *denoise_out,
-      *transform_out, *gamut_out;
-  float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
-
-  TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
-  float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
-  free(TsTw);
-  TsTw = trans;
-  ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
-  weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
-  coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
-  tone_map = get_tone_map("cam_models/NikonD7000/");
-
-  input_scaled = (float *)malloc_aligned(bytes_fimage);
-  result_scaled = (float *)malloc_aligned(bytes_fimage);
-  demosaic_out = (float *)malloc_aligned(bytes_fimage);
-  denoise_out = (float *)malloc_aligned(bytes_fimage);
-  transform_out = (float *)malloc_aligned(bytes_fimage);
-  gamut_out = (float *)malloc_aligned(bytes_fimage);
-  l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);
-
-  // This is host_input in cam_pipe()
-  input = (uint8_t *)malloc_aligned(bytes_image);
-  convert_hwc_to_chw(image_in, row_size, col_size, &input);
-
-  // This is host_result in cam_pipe()
-  result = (uint8_t *)malloc_aligned(bytes_image);
-
-  // Allocate struct to pass DFG inputs
-  RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn));
-
-  // Set up HPVM DFG inputs in the rootArgs struct.
-  rootArgs->input = input;
-  rootArgs->bytes_input = bytes_image;
-
-  rootArgs->result = result;
-  rootArgs->bytes_result = bytes_image;
-
-  rootArgs->input_scaled = input_scaled;
-  rootArgs->bytes_input_scaled = bytes_fimage;
-
-  rootArgs->result_scaled = result_scaled;
-  rootArgs->bytes_result_scaled = bytes_fimage;
-
-  rootArgs->demosaic_out = demosaic_out;
-  rootArgs->bytes_demosaic_out = bytes_fimage;
-
-  rootArgs->denoise_out = denoise_out;
-  rootArgs->bytes_denoise_out = bytes_fimage;
-
-  rootArgs->transform_out = transform_out;
-  rootArgs->bytes_transform_out = bytes_fimage;
-
-  rootArgs->gamut_out = gamut_out;
-  rootArgs->bytes_gamut_out = bytes_fimage;
-
-  rootArgs->TsTw = TsTw;
-  rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
-
-  rootArgs->ctrl_pts = ctrl_pts;
-  rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-
-  rootArgs->weights = weights;
-  rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-
-  rootArgs->coefs = coefs;
-  rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
-
-  rootArgs->tone_map = tone_map;
-  rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
-
-  rootArgs->l2_dist = l2_dist;
-  rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
-
-  rootArgs->row_size = row_size;
-  rootArgs->col_size = col_size;
-
-  // Memory tracking is required for pointer arguments.
-  // Nodes can be scheduled on different targets, and
-  // dataflow edge implementation needs to request data.
-  // The pair (pointer, size) is inserted in memory tracker using this call
-  llvm_visc_track_mem(input, bytes_image);
-  llvm_visc_track_mem(result, bytes_image);
-  llvm_visc_track_mem(input_scaled, bytes_fimage);
-  llvm_visc_track_mem(result_scaled, bytes_fimage);
-  llvm_visc_track_mem(demosaic_out, bytes_fimage);
-  llvm_visc_track_mem(denoise_out, bytes_fimage);
-  llvm_visc_track_mem(transform_out, bytes_fimage);
-  llvm_visc_track_mem(gamut_out, bytes_fimage);
-  llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float));
-  llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-  llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-  llvm_visc_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float));
-  llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
-  llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
-
-  printf("\n\nLaunching CAVA pipeline!\n");
-
-  void *camPipeDFG = __visc__launch(0, CamPipeRoot, (void *)rootArgs);
-  __visc__wait(camPipeDFG);
-
-  printf("\n\nPipeline execution completed!\n");
-  printf("Pipeline final stage returned %lu; should be %lu\n",
-         rootArgs->ret.bytesRet, bytes_image);
-  printf("\n\nRequesting memory!\n");
-
-  // Request data from graph.
-  llvm_visc_request_mem(result, bytes_image);
-  llvm_visc_request_mem(demosaic_out, bytes_fimage);
-  llvm_visc_request_mem(denoise_out, bytes_fimage);
-  llvm_visc_request_mem(transform_out, bytes_fimage);
-  llvm_visc_request_mem(gamut_out, bytes_fimage);
-  printf("\n\nDone requesting memory!\n");
-
-  uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
-  uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
-  uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
-  uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
-
-  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image,
-              row_size, col_size);
-  descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image,
-              row_size, col_size);
-  descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image,
-              row_size, col_size);
-  descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image,
-              row_size, col_size);
-
-  convert_chw_to_hwc(result, row_size, col_size, &image_out);
-  convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
-  convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size,
-                     &image_out_demosaic);
-  convert_chw_to_hwc(denoise_out_descaled, row_size, col_size,
-                     &image_out_denoise);
-  convert_chw_to_hwc(transform_out_descaled, row_size, col_size,
-                     &image_out_transform);
-
-  // Remove tracked pointers.
-  llvm_visc_untrack_mem(input);
-  llvm_visc_untrack_mem(result);
-  llvm_visc_untrack_mem(input_scaled);
-  llvm_visc_untrack_mem(result_scaled);
-  llvm_visc_untrack_mem(demosaic_out);
-  llvm_visc_untrack_mem(denoise_out);
-  llvm_visc_untrack_mem(transform_out);
-  llvm_visc_untrack_mem(gamut_out);
-
-  llvm_visc_untrack_mem(TsTw);
-  llvm_visc_untrack_mem(ctrl_pts);
-  llvm_visc_untrack_mem(weights);
-  llvm_visc_untrack_mem(coefs);
-  llvm_visc_untrack_mem(tone_map);
-  llvm_visc_untrack_mem(l2_dist);
-
-  // Output the image.
-  // NOTE: We deliberately perform this file I/O outside of the kernel.
+int main(int argc, char* argv[]) {
+    // Parse the arguments.
+    arguments args;
+    set_default_args(&args);
+    argp_parse(&parser, argc, argv, 0, 0, &args);
+
+    // Read a raw image.
+    // NOTE: We deliberately perform this file I/O outside of the kernel.
+    printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
+    size_t row_size, col_size;
+    uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
+
+    printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
+
+    // Allocate a buffer for storing the output image data.
+    // (This is currently the same size as the input image data.)
+    size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
+    size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
+    uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image);
+
+    __visc__init();
+
+    ///////////////////////////////////////////////////////////////
+    // Camera Model Parameters
+    ///////////////////////////////////////////////////////////////
+    // Path to the camera model to be used
+//    char cam_model_path[100];
+//    char cam_model_path = "cam_models/NikonD7000/";
+    // White balance index (select white balance from transform file)
+    // The first white balance in the file has a wb_index of 1
+    // For more information on model format see the readme
+    int wb_index = 6;
+
+    // Number of control points
+    int num_ctrl_pts = 3702;
+    uint8_t *input, *result;
+    float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out;
+    float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
+
+    TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
+    float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
+    free(TsTw);
+    TsTw = trans;
+    ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
+    weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
+    coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
+    tone_map = get_tone_map("cam_models/NikonD7000/");
+    
+    input_scaled = (float*) malloc_aligned(bytes_fimage);
+    result_scaled = (float*) malloc_aligned(bytes_fimage);
+    demosaic_out = (float*) malloc_aligned(bytes_fimage);
+    denoise_out = (float*) malloc_aligned(bytes_fimage);
+    transform_out  = (float*) malloc_aligned(bytes_fimage);
+    gamut_out = (float*) malloc_aligned(bytes_fimage);
+    l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);    
+    
+    // This is host_input in cam_pipe()
+    input = (uint8_t*) malloc_aligned(bytes_image);
+    convert_hwc_to_chw(image_in, row_size, col_size, &input);
+    
+    // This is host_result in cam_pipe()
+    result = (uint8_t*) malloc_aligned(bytes_image);
+
+    // Allocate struct to pass DFG inputs
+    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
+
+    // Set up HPVM DFG inputs in the rootArgs struct.
+    rootArgs->input = input;
+    rootArgs->bytes_input = bytes_image;
+    
+    rootArgs->result = result;
+    rootArgs->bytes_result = bytes_image;
+    
+    rootArgs->input_scaled = input_scaled;
+    rootArgs->bytes_input_scaled = bytes_fimage;
+    
+    rootArgs->result_scaled = result_scaled;
+    rootArgs->bytes_result_scaled = bytes_fimage;
+    
+    rootArgs->demosaic_out = demosaic_out;
+    rootArgs->bytes_demosaic_out = bytes_fimage;
+    
+    rootArgs->denoise_out = denoise_out;
+    rootArgs->bytes_denoise_out = bytes_fimage;
+    
+    rootArgs->transform_out = transform_out;
+    rootArgs->bytes_transform_out = bytes_fimage;
+
+    rootArgs->gamut_out = gamut_out;
+    rootArgs->bytes_gamut_out = bytes_fimage;
+
+    rootArgs->TsTw = TsTw;
+    rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->ctrl_pts = ctrl_pts;
+    rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->weights = weights;
+    rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->coefs = coefs;
+    rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->tone_map = tone_map;
+    rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->l2_dist = l2_dist;
+    rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
+    
+    rootArgs->row_size = row_size;
+    rootArgs->col_size = col_size;
+
+    // Memory tracking is required for pointer arguments.
+    // Nodes can be scheduled on different targets, and 
+    // dataflow edge implementation needs to request data.
+    // The pair (pointer, size) is inserted in memory tracker using this call
+    llvm_visc_track_mem(input, bytes_image);
+    llvm_visc_track_mem(result, bytes_image);
+    llvm_visc_track_mem(input_scaled, bytes_fimage);
+    llvm_visc_track_mem(result_scaled, bytes_fimage);
+    llvm_visc_track_mem(demosaic_out, bytes_fimage);
+    llvm_visc_track_mem(denoise_out, bytes_fimage);
+    llvm_visc_track_mem(transform_out, bytes_fimage);
+    llvm_visc_track_mem(gamut_out, bytes_fimage);
+    llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); 
+    llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+    llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+    llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float));
+    llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
+    llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
+    
+    printf("\n\nLaunching CAVA pipeline!\n");
+
+    void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs);
+    __visc__wait(camPipeDFG);
+
+    printf("\n\nPipeline execution completed!\n");
+    printf("\n\nRequesting memory!\n");
+
+    // Request data from graph.    
+    llvm_visc_request_mem(result, bytes_image);
+    llvm_visc_request_mem(demosaic_out, bytes_fimage);
+    llvm_visc_request_mem(denoise_out, bytes_fimage);
+    llvm_visc_request_mem(transform_out, bytes_fimage);
+    llvm_visc_request_mem(gamut_out, bytes_fimage);
+    printf("\n\nDone requesting memory!\n");
+
+
+    uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+  uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+    
+  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size);
+    descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size);
+    descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size);
+    descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size);
+    
+    convert_chw_to_hwc(result, row_size, col_size, &image_out);
+   convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
+    convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic);
+    convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise);
+    convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform);
+
+    
+    // Remove tracked pointers.
+    llvm_visc_untrack_mem(input);
+    llvm_visc_untrack_mem(result);
+    llvm_visc_untrack_mem(input_scaled);
+    llvm_visc_untrack_mem(result_scaled);
+    llvm_visc_untrack_mem(demosaic_out);
+    llvm_visc_untrack_mem(denoise_out);
+    llvm_visc_untrack_mem(transform_out);
+    llvm_visc_untrack_mem(gamut_out);
+    
+    llvm_visc_untrack_mem(TsTw); 
+    llvm_visc_untrack_mem(ctrl_pts);
+    llvm_visc_untrack_mem(weights);
+    llvm_visc_untrack_mem(coefs);
+    llvm_visc_untrack_mem(tone_map);
+    llvm_visc_untrack_mem(l2_dist);
+
+    // Output the image.
+    // NOTE: We deliberately perform this file I/O outside of the kernel.
   char str[50], base_str[50];
   strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]);
   strcpy(str, base_str);
@@ -912,7 +877,8 @@ int main(int argc, char *argv[]) {
   printf("Writing output image to %s\n", str);
   write_image_to_binary(str, image_out_transform, row_size, col_size);
 
-  __visc__cleanup();
+    __visc__cleanup();
 
-  return 0;
+    return 0;
 }
+
diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.c b/hpvm/test/hpvm-cava/src/pipe_stages.c
index 253052af87..2ebedec936 100644
--- a/hpvm/test/hpvm-cava/src/pipe_stages.c
+++ b/hpvm/test/hpvm-cava/src/pipe_stages.c
@@ -1,43 +1,44 @@
+#include <stdio.h>
+#include <math.h>
 #include "pipe_stages.h"
 #include "cam_pipe_utility.h"
-#include <math.h>
-#include <stdio.h>
 
-// void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
-void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
-               size_t bytes_output, int row_size, int col_size) {
+// Convert every 8-bit sample of all CHAN_SIZE channels to float by dividing by 255.
+void scale_fxp(uint8_t *input, size_t bytes_input, 
+               float *output, size_t bytes_output,
+               int row_size, int col_size) {  // NOTE(review): pipe_stages.h declares these two as size_t — confirm which is intended
   __visc__hint(DEVICE);
   __visc__attributes(2, input, output, 1, output);
-
+  
   ARRAY_3D(uint8_t, _input, input, row_size, col_size);
   ARRAY_3D(float, _output, output, row_size, col_size);
-sl_chan:
+  sl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-  sl_row:
+    sl_row:
     for (int row = 0; row < row_size; row++)
-    sl_col:
+      sl_col:
       for (int col = 0; col < col_size; col++)
         _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255;
 
   __visc__return(1, bytes_output);
 }
 
-// void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
-void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
-                 size_t bytes_result, int row_size, int col_size) {
+// Convert float samples back to 8-bit: multiply by 255, clamp to [0, 255], store as uint8_t.
+void descale_fxp(float *input, size_t bytes_input, 
+                 uint8_t *output, size_t bytes_result,
+                 int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, output, 1, output);
-
+  
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(uint8_t, _output, output, row_size, col_size);
-dsl_chan:
+  dsl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-  dsl_row:
+    dsl_row:
     for (int row = 0; row < row_size; row++)
-    dsl_col:
+      dsl_col:
       for (int col = 0; col < col_size; col++)
-        _output[chan][row][col] =
-            min(max(_input[chan][row][col] * 255, 0), 255);
+        _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255);
 
-  __visc__return(1, bytes_output);
+  __visc__return(1, bytes_result);  // was bytes_output, which is not a parameter of this function
 }
@@ -45,125 +46,127 @@ dsl_chan:
 // Demosaicing stage
 // G R
 // B G
-// void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
-void demosaic_fxp(float *input, size_t bytes_input, float *result,
-                  size_t bytes_result, int row_size, int col_size) {
+// Writes R, G and B of each interior pixel from the pixel and its neighbours; the one-pixel border is not written.
+void demosaic_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, result, 1, result);
-
+  
   printf("Demosaicing.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-dm_row:
+  dm_row:
   for (int row = 1; row < row_size - 1; row++)
-  dm_col:
+    dm_col:
     for (int col = 1; col < col_size - 1; col++)
-      if (row % 2 == 0 && col % 2 == 0) {
-        // Green pixel
-        // Getting the R values
-        float R1 = _input[0][row][col - 1];
-        float R2 = _input[0][row][col + 1];
-        // Getting the B values
-        float B1 = _input[2][row - 1][col];
-        float B2 = _input[2][row + 1][col];
-        // R
-        _result[0][row][col] = (R1 + R2) / 2;
-        // G
-        _result[1][row][col] = _input[1][row][col] * 2;
-        // B
-        _result[2][row][col] = (B1 + B2) / 2;
-      } else if (row % 2 == 0 && col % 2 == 1) {
-        // Red pixel
-        // Getting the G values
-        float G1 = _input[1][row - 1][col];
-        float G2 = _input[1][row + 1][col];
-        float G3 = _input[1][row][col - 1];
-        float G4 = _input[1][row][col + 1];
-        // Getting the B values
-        float B1 = _input[2][row - 1][col - 1];
-        float B2 = _input[2][row - 1][col + 1];
-        float B3 = _input[2][row + 1][col - 1];
-        float B4 = _input[2][row + 1][col + 1];
-        // R
-        _result[0][row][col] = _input[0][row][col];
-        // G
-        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-        // B (center pixel)
-        _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
-      } else if (row % 2 == 1 && col % 2 == 0) {
-        // Blue pixel
-        // Getting the R values
-        float R1 = _input[0][row - 1][col - 1];
-        float R2 = _input[0][row + 1][col - 1];
-        float R3 = _input[0][row - 1][col + 1];
-        float R4 = _input[0][row + 1][col + 1];
-        // Getting the G values
-        float G1 = _input[1][row - 1][col];
-        float G2 = _input[1][row + 1][col];
-        float G3 = _input[1][row][col - 1];
-        float G4 = _input[1][row][col + 1];
-        // R
-        _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
-        // G
-        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-        // B
-        _result[2][row][col] = _input[2][row][col];
-      } else {
-        // Bottom Green pixel
-        // Getting the R values
-        float R1 = _input[0][row - 1][col];
-        float R2 = _input[0][row + 1][col];
-        // Getting the B values
-        float B1 = _input[2][row][col - 1];
-        float B2 = _input[2][row][col + 1];
-        // R
-        _result[0][row][col] = (R1 + R2) / 2;
-        // G
-        _result[1][row][col] = _input[1][row][col] * 2;
-        // B
-        _result[2][row][col] = (B1 + B2) / 2;
-      }
+        if (row % 2 == 0 && col % 2 == 0) {  // branch is selected by row/column parity
+            // Green pixel (even row, even column)
+            // Getting the R values (left and right neighbours)
+            float R1 = _input[0][row][col - 1];
+            float R2 = _input[0][row][col + 1];
+            // Getting the B values (upper and lower neighbours)
+            float B1 = _input[2][row - 1][col];
+            float B2 = _input[2][row + 1][col];
+            // R
+            _result[0][row][col] = (R1 + R2) / 2;
+            // G
+            _result[1][row][col] = _input[1][row][col] * 2;  // NOTE(review): own green sample is doubled — confirm this scaling is intended
+            // B
+            _result[2][row][col] = (B1 + B2) / 2;
+        } else if (row % 2 == 0 && col % 2 == 1) {
+            // Red pixel (even row, odd column)
+            // Getting the G values (four edge neighbours)
+            float G1 = _input[1][row - 1][col];
+            float G2 = _input[1][row + 1][col];
+            float G3 = _input[1][row][col - 1];
+            float G4 = _input[1][row][col + 1];
+            // Getting the B values (four diagonal neighbours)
+            float B1 = _input[2][row - 1][col - 1];
+            float B2 = _input[2][row - 1][col + 1];
+            float B3 = _input[2][row + 1][col - 1];
+            float B4 = _input[2][row + 1][col + 1];
+            // R
+            _result[0][row][col] = _input[0][row][col];
+            // G
+            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;  // NOTE(review): four samples divided by 2, not 4 — confirm
+            // B (average of the four diagonal neighbours)
+            _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
+        } else if (row % 2 == 1 && col % 2 == 0) {
+            // Blue pixel (odd row, even column)
+            // Getting the R values (four diagonal neighbours)
+            float R1 = _input[0][row - 1][col - 1];
+            float R2 = _input[0][row + 1][col - 1];
+            float R3 = _input[0][row - 1][col + 1];
+            float R4 = _input[0][row + 1][col + 1];
+            // Getting the G values (four edge neighbours)
+            float G1 = _input[1][row - 1][col];
+            float G2 = _input[1][row + 1][col];
+            float G3 = _input[1][row][col - 1];
+            float G4 = _input[1][row][col + 1];
+            // R
+            _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
+            // G
+            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+            // B
+            _result[2][row][col] = _input[2][row][col];
+        } else {
+            // Bottom Green pixel (odd row, odd column)
+            // Getting the R values (upper and lower neighbours)
+            float R1 = _input[0][row - 1][col];
+            float R2 = _input[0][row + 1][col];
+            // Getting the B values (left and right neighbours)
+            float B1 = _input[2][row][col - 1];
+            float B2 = _input[2][row][col + 1];
+            // R
+            _result[0][row][col] = (R1 + R2) / 2;
+            // G
+            _result[1][row][col] = _input[1][row][col] * 2;
+            // B
+            _result[2][row][col] = (B1 + B2) / 2;
+        }
 
   __visc__return(1, bytes_result);
 }
 
 static void sort(float arr[], int n) {
-  int i, j;
-dn_sort_i:
-  for (i = 0; i < n - 1; i++)
-  dn_sort_j:
-    for (j = 0; j < n - i - 1; j++)
-      if (arr[j] > arr[j + 1]) {
-        float temp = arr[j];
-        arr[j] = arr[j + 1];
-        arr[j + 1] = temp;
-      }
+    int i, j;  // sorts arr[0..n-1] into ascending order, in place
+    dn_sort_i:
+    for (i = 0; i < n - 1; i++)  // one pass per element; n <= 1 does nothing
+        dn_sort_j:
+        for (j = 0; j < n - i - 1; j++)  // the last i elements are already in their final position
+            if (arr[j] > arr[j + 1]) {  // swap an adjacent pair that is out of order
+                float temp = arr[j];
+                arr[j] = arr[j + 1];
+                arr[j + 1] = temp;
+            }
 }
 
 // Simple denoise
-// void denoise_fxp(float *input, int row_size, int col_size, float *result) {
-void denoise_fxp(float *input, size_t bytes_input, float *result,
-                 size_t bytes_result, int row_size, int col_size) {
+// Per channel, each interior pixel gathers its 3x3 neighbourhood into filter[9] in row-major order.
+void denoise_fxp(float *input, size_t bytes_input, 
+                 float *result, size_t bytes_result,
+                 int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, result, 1, result);
-
+  
   printf("Denoising.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-dn_chan:
+  dn_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-  dn_row:
+    dn_row:
     for (int row = 0; row < row_size; row++)
-    dn_col:
+      dn_col:
       for (int col = 0; col < col_size; col++)
         if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
           float filter[9];
-        dn_slide_row:
-          for (int i = row - 1; i < row + 2; i++)
-          dn_slide_col:
-            for (int j = col - 1; j < col + 2; j++) {
+          dn_slide_row:
+          for (int i = row-1; i < row+2; i++)  // window rows row-1 .. row+1
+            dn_slide_col:
+            for (int j = col-1; j < col+2; j++) {  // window columns col-1 .. col+1
               int index = (i - row + 1) * 3 + j - col + 1;
               filter[index] = _input[chan][i][j];
             }
@@ -176,24 +179,25 @@ dn_chan:
 }
 
 // Color map and white balance transform
-// void transform_fxp(float *input, int row_size, int col_size, float *result,
+//void transform_fxp(float *input, int row_size, int col_size, float *result,
 //                   float *TsTw_tran) {
-void transform_fxp(float *input, size_t bytes_input, float *result,
-                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *TsTw_tran, size_t bytes_TsTw,  // read below as _TsTw_tran[input channel][output channel]
                    int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
-
+  
   printf("Color mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3);
 
-tr_chan:
+  tr_chan:  // chan selects the output channel being written
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-  tr_row:
+    tr_row:
     for (int row = 0; row < row_size; row++)
-    tr_col:
+      tr_col:
       for (int col = 0; col < col_size; col++)
         _result[chan][row][col] =
             max(_input[0][row][col] * _TsTw_tran[0][chan] +
@@ -206,18 +210,18 @@ tr_chan:
 //
 // Weighted radial basis function for gamut mapping
 //
-// void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
-//                   float *ctrl_pts, float *weights, float *coefs, float
-//                   *l2_dist) {
-void gamut_map_fxp(float *input, size_t bytes_input, float *result,
-                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights, float *coefs,
-                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
+// For each pixel, l2_dist[cp] is the Euclidean distance from its three channel values to
+// control point cp; each channel then accumulates the sum of l2_dist[cp] * weights[cp][chan].
+void gamut_map_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights,
+                   float *coefs, size_t bytes_coefs,
+                   float *l2_dist, size_t bytes_l2_dist,
                    int row_size, int col_size) {
   __visc__hint(DEVICE);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
-                     result);
-
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
+  
   printf("Gamut mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
@@ -225,25 +229,26 @@ void gamut_map_fxp(float *input, size_t bytes_input, float *result,
   ARRAY_2D(float, _weights, weights, 3);
   ARRAY_2D(float, _coefs, coefs, 3);
 
-// First, get the L2 norm from every pixel to the control points,
-// Then, sum it and weight it. Finally, add the bias.
-gm_rbf_row:
+  // First, compute the L2 distance from each pixel to every control point;
+  // then form the weighted sum of those distances. Finally, add the bias.
+  gm_rbf_row:
   for (int row = 0; row < row_size; row++)
-  gm_rbf_col:
+    gm_rbf_col:
     for (int col = 0; col < col_size; col++) {
-    gm_rbf_cp0:
+      gm_rbf_cp0:
       for (int cp = 0; cp < num_ctrl_pts; cp++) {
-        l2_dist[cp] = sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
-                               (_input[0][row][col] - _ctrl_pts[cp][0]) +
-                           (_input[1][row][col] - _ctrl_pts[cp][1]) *
-                               (_input[1][row][col] - _ctrl_pts[cp][1]) +
-                           (_input[2][row][col] - _ctrl_pts[cp][2]) *
-                               (_input[2][row][col] - _ctrl_pts[cp][2]));
+        l2_dist[cp] =  // scratch entry, overwritten for every pixel
+            sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
+                     (_input[0][row][col] - _ctrl_pts[cp][0]) +
+                 (_input[1][row][col] - _ctrl_pts[cp][1]) *
+                     (_input[1][row][col] - _ctrl_pts[cp][1]) +
+                 (_input[2][row][col] - _ctrl_pts[cp][2]) *
+                     (_input[2][row][col] - _ctrl_pts[cp][2]));
       }
-    gm_rbf_chan:
+      gm_rbf_chan:
       for (int chan = 0; chan < CHAN_SIZE; chan++) {
         float chan_val = 0.0;
-      gm_rbf_cp1:
+        gm_rbf_cp1:
         for (int cp = 0; cp < num_ctrl_pts; cp++) {
           chan_val += l2_dist[cp] * _weights[cp][chan];
         }
@@ -258,24 +263,25 @@ gm_rbf_row:
 }
 
 // Tone mapping
-// void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
+//void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
 //                  float *result) {
-void tone_map_fxp(float *input, size_t bytes_input, float *result,
-                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  float *tone_map, size_t bytes_tone_map,  // lookup table, read below as _tone_map[level][channel]
                   int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, tone_map, 1, result);
-
+  
   printf("Tone mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _tone_map, tone_map, 3);
 
-tm_chan:
+  tm_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-  tm_row:
+    tm_row:
     for (int row = 0; row < row_size; row++)
-    tm_col:
+      tm_col:
       for (int col = 0; col < col_size; col++) {
-        uint8_t x = _input[chan][row][col] * 255;
+        uint8_t x = min(max(_input[chan][row][col] * 255, 0), 255);  // clamp first: converting an out-of-range float to uint8_t is undefined
         _result[chan][row][col] = _tone_map[x][chan];
diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.h b/hpvm/test/hpvm-cava/src/pipe_stages.h
index 4fa24354c7..8d98cb65cc 100644
--- a/hpvm/test/hpvm-cava/src/pipe_stages.h
+++ b/hpvm/test/hpvm-cava/src/pipe_stages.h
@@ -2,58 +2,59 @@
 #define _PIPE_STAGES_H_
 
 #include "defs.h"
-#include <stddef.h>
 
 #define CHAN_SIZE 3
 
 #define ISP 0x4
 
-#define max(a, b)                                                              \
-  ({                                                                           \
-    __typeof__(a) _a = (a);                                                    \
-    __typeof__(b) _b = (b);                                                    \
-    _a > _b ? _a : _b;                                                         \
-  })
-
-#define min(a, b)                                                              \
-  ({                                                                           \
-    __typeof__(a) _a = (a);                                                    \
-    __typeof__(b) _b = (b);                                                    \
-    _a < _b ? _a : _b;                                                         \
-  })
-
-#define abs(a)                                                                 \
-  ({                                                                           \
-    __typeof__(a) _a = (a);                                                    \
-    _a < 0 ? -_a : _a;                                                         \
-  })
+#define max(a,b) /* larger of a and b; each argument is evaluated once (GNU statement expression + __typeof__) */ \
+  ({ __typeof__ (a) _a = (a); \
+      __typeof__ (b) _b = (b); \
+    _a > _b ? _a : _b; })
+
+#define min(a,b) /* smaller of a and b; each argument is evaluated once */ \
+  ({ __typeof__ (a) _a = (a); \
+      __typeof__ (b) _b = (b); \
+    _a < _b ? _a : _b; })
+
+#define abs(a) /* magnitude of a; NOTE(review): same name as abs() in <stdlib.h> — confirm include order never clashes */ \
+  ({ __typeof__ (a) _a = (a); \
+    _a < 0 ? -_a : _a; })
 
 extern int num_ctrl_pts;
 
-void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
-               size_t bytes_output, size_t row_size, size_t col_size);
+void scale_fxp(uint8_t *input, size_t bytes_input,  // uint8_t samples -> float (value / 255)
+               float *output, size_t bytes_output,
+               size_t row_size, size_t col_size);  // NOTE(review): the definitions in pipe_stages.c take int row_size/col_size — confirm which is intended
 
-void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
-                 size_t bytes_result, size_t row_size, size_t col_size);
+void descale_fxp(float *input, size_t bytes_input,  // float samples -> uint8_t (value * 255, clamped to [0, 255])
+                 uint8_t *output, size_t bytes_result,
+                 size_t row_size, size_t col_size);
 
-void demosaic_fxp(float *input, size_t bytes_input, float *result,
-                  size_t bytes_result, size_t row_size, size_t col_size);
+void demosaic_fxp(float *input, size_t bytes_input,  // demosaicing stage; writes R, G, B of interior pixels
+                  float *result, size_t bytes_result,
+                  size_t row_size, size_t col_size);
 
-void denoise_fxp(float *input, size_t bytes_input, float *result,
-                 size_t bytes_result, size_t row_size, size_t col_size);
+void denoise_fxp(float *input, size_t bytes_input,  // denoise stage; works on a 3x3 window around interior pixels
+                 float *result, size_t bytes_result,
+                 size_t row_size, size_t col_size);
 
-void transform_fxp(float *input, size_t bytes_input, float *result,
-                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input,  // color map and white balance transform
+                   float *result, size_t bytes_result,
+                   float *TsTw_tran, size_t bytes_TsTw,
                    size_t row_size, size_t col_size);
 
-void gamut_map_fxp(float *input, size_t bytes_input, float *result,
-                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights, float *coefs,
-                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
+void gamut_map_fxp(float *input, size_t bytes_input,  // weighted radial basis function for gamut mapping
+                   float *result, size_t bytes_result,
+                   float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights,
+                   float *coefs, size_t bytes_coefs,
+                   float *l2_dist, size_t bytes_l2_dist,  // scratch buffer indexed by control point
                    size_t row_size, size_t col_size);
 
-void tone_map_fxp(float *input, size_t bytes_input, float *result,
-                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input,  // tone mapping by table lookup on input * 255
+                  float *result, size_t bytes_result,
+                  float *tone_map, size_t bytes_tone_map,
                   size_t row_size, size_t col_size);
 
 void tone_map_approx_fxp(float *input, size_t row_size, size_t col_size,
diff --git a/hpvm/test/hpvm-cava/src/utility.c b/hpvm/test/hpvm-cava/src/utility.c
index 86bd018183..c1eaee3333 100644
--- a/hpvm/test/hpvm-cava/src/utility.c
+++ b/hpvm/test/hpvm-cava/src/utility.c
@@ -1,7 +1,7 @@
-#include "utility.h"
-#include "defs.h"
-#include <assert.h>
 #include <stdlib.h>
+#include <assert.h>
+#include "defs.h"
+#include "utility.h"
 
 void *malloc_aligned(size_t size) {
   void *ptr = NULL;
-- 
GitLab