Skip to content
Snippets Groups Projects
main.c 36.40 KiB
#include "utility.h"
#include "defs.h"
#include <argp.h>
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "cam_pipe_utility.h"
#include "load_cam_model.h"

#include "hpvm.h"

int NUM_TEST_CASES;
int NUM_CLASSES;
int INPUT_DIM;
int NUM_WORKER_THREADS;

// Type of struct that is used to pass arguments to the HPVM dataflow graph
// using the hpvm launch operation
typedef struct __attribute__((__packed__)) {
  uint8_t *input;
  size_t bytes_input;
  uint8_t *result;
  size_t bytes_result;
  float *input_scaled;
  size_t bytes_input_scaled;
  float *result_scaled;
  size_t bytes_result_scaled;
  float *demosaic_out;
  size_t bytes_demosaic_out;
  float *denoise_out;
  size_t bytes_denoise_out;
  float *transform_out;
  size_t bytes_transform_out;
  float *gamut_out;
  size_t bytes_gamut_out;
  float *TsTw;
  size_t bytes_TsTw;
  float *ctrl_pts;
  size_t bytes_ctrl_pts;
  float *weights;
  size_t bytes_weights;
  float *coefs;
  size_t bytes_coefs;
  float *l2_dist;
  size_t bytes_l2_dist;
  float *tone_map;
  size_t bytes_tone_map;
  size_t row_size;
  size_t col_size;
} RootIn;

typedef enum _argnum {
  CAM_MODEL,
  RAW_IMAGE_BIN,
  OUTPUT_IMAGE_BIN,
  NUM_REQUIRED_ARGS,
  DATA_FILE = NUM_REQUIRED_ARGS,
  NUM_ARGS,
} argnum;

typedef struct _arguments {
  char *args[NUM_ARGS];
  int num_inputs;
  int num_threads;
} arguments;

static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n";
static char args_doc[] = "path/to/cam-model path/to/raw-image-binary path/to/output-image-binary";
static struct argp_option options[] = {
    {"num-inputs", 'n', "N", 0, "Number of input images"},
    {0},
    {"data-file", 'f', "F", 0,
     "File to read data and weights from (if data-init-mode == READ_FILE or "
     "save-params is true). *.txt files are decoded as text files, while "
     "*.bin files are decoded as binary files."},
};

static error_t parse_opt(int key, char *arg, struct argp_state *state) {
  arguments *args = (arguments *)(state->input);
  switch (key) {
  case 'n': {
    args->num_inputs = strtol(arg, NULL, 10);
    break;
  }
  case 'f': {
    args->args[DATA_FILE] = arg;
    break;
  }
  case 't': {
    args->num_threads = strtol(arg, NULL, 10);
    break;
  }
  case ARGP_KEY_ARG: {
    if (state->arg_num >= NUM_REQUIRED_ARGS)
      argp_usage(state);
    args->args[state->arg_num] = arg;
    break;
  }
  case ARGP_KEY_END: {
    if (state->arg_num < NUM_REQUIRED_ARGS) {
      fprintf(stderr, "Not enough arguments! Got %d, require %d.\n",
              state->arg_num, NUM_REQUIRED_ARGS);
      argp_usage(state);
    }
    break;
  }
  default:
    return ARGP_ERR_UNKNOWN;
  }
  return 0;
}

void set_default_args(arguments *args) {
  args->num_inputs = 1;
  args->num_threads = 0;
  for (int i = 0; i < NUM_ARGS; i++) {
    args->args[i] = NULL;
  }
}

static struct argp parser = {options, parse_opt, args_doc, prog_doc};

// Helper function for printing intermediate results
void descale_cpu(float *input, size_t bytes_input, uint8_t *output,
                 size_t bytes_result, size_t row_size, size_t col_size) {

  for (int chan = 0; chan < CHAN_SIZE; chan++)
    for (int row = 0; row < row_size; row++)
      for (int col = 0; col < col_size; col++) {
        int index = (chan * row_size + row) * col_size + col;
        output[index] = min(max(input[index] * 255, 0), 255);
      }
}

static void sort(float arr[], int n) {
  int i, j;
  for (i = 0; i < n - 1; i++)
    for (j = 0; j < n - i - 1; j++)
      if (arr[j] > arr[j + 1]) {
        float temp = arr[j];
        arr[j] = arr[j + 1];
        arr[j + 1] = temp;
      }
}

/**************************************************************/
/*** HPVM Leaf node Functions - Performing the computations ***/
/**************************************************************/

// In this benchmark, no use of HPVM query intrinsics in the leaf node functions

// Leaf HPVM node function for scale
void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
               size_t bytes_output, size_t row_size, size_t col_size) {

  // Specifies compilation target for current node
  __hpvm__hint(CPU_TARGET);

  // Specifies pointer arguments that will be used as "in" and "out" arguments
  // - count of "in" arguments
  // - list of "in" argument , and similar for "out"
  __hpvm__attributes(2, input, output, 1, output);
  void *thisNode = __hpvm__getNode();
  int row = __hpvm__getNodeInstanceID_x(thisNode);
  for (int chan = 0; chan < CHAN_SIZE; chan++)
    //    for (int row = 0; row < row_size; row++)
    for (int col = 0; col < col_size; col++) {
      int index = (chan * row_size + row) * col_size + col;
      output[index] = input[index] * 1.0 / 255;
    }
  __hpvm__return(1, bytes_output);
}

// Leaf HPVM node function for descale
void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
                 size_t bytes_result, size_t row_size, size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(2, input, output, 1, output);

  for (int chan = 0; chan < CHAN_SIZE; chan++)
    for (int row = 0; row < row_size; row++)
      for (int col = 0; col < col_size; col++) {
        int index = (chan * row_size + row) * col_size + col;
        output[index] = min(max(input[index] * 255, 0), 255);
      }
}

// Leaf HPVM node function for demosaicing
void demosaic_fxp(float *input, size_t bytes_input, float *result,
                  size_t bytes_result, size_t row_size, size_t col_size) {
  __hpvm__hint(DEVICE);
  __hpvm__attributes(2, input, result, 1, result);

  void *thisNode = __hpvm__getNode();
  int row = __hpvm__getNodeInstanceID_x(thisNode);
  //  for (int row = 1; row < row_size - 1; row++)
  for (int col = 1; col < col_size - 1; col++) {
    int index_0 = (0 * row_size + row) * col_size + col;
    int index_1 = (1 * row_size + row) * col_size + col;
    int index_2 = (2 * row_size + row) * col_size + col;
    if (row % 2 == 0 && col % 2 == 0) {
      // Green pixel
      // Getting the R values
      float R1 = input[index_0 - 1];
      float R2 = input[index_0 + 1];
      // Getting the B values
      float B1 = input[index_2 - col_size];
      float B2 = input[index_2 + col_size];
      // R
      result[index_0] = (R1 + R2) / 2;
      // G
      result[index_1] = input[index_1] * 2;
      // B
      result[index_2] = (B1 + B2) / 2;
    } else if (row % 2 == 0 && col % 2 == 1) {
      // Red pixel
      // Getting the G values
      float G1 = input[index_1 - col_size];
      float G2 = input[index_1 + col_size];
      float G3 = input[index_1 - 1];
      float G4 = input[index_1 + 1];
      // Getting the B values
      float B1 = input[index_2 - col_size - 1];
      float B2 = input[index_2 - col_size + 1];
      float B3 = input[index_2 + col_size - 1];
      float B4 = input[index_2 + col_size + 1];
      // R
      result[index_0] = input[index_0];
      // G
      result[index_1] = (G1 + G2 + G3 + G4) / 2;
      // B (center pixel)
      result[index_2] = (B1 + B2 + B3 + B4) / 4;
    } else if (row % 2 == 1 && col % 2 == 0) {
      // Blue pixel
      // Getting the R values
      float R1 = input[index_0 - col_size - 1];
      float R2 = input[index_0 + col_size - 1];
      float R3 = input[index_0 - col_size + 1];
      float R4 = input[index_0 + col_size + 1];
      // Getting the G values
      float G1 = input[index_1 - col_size];
      float G2 = input[index_1 + col_size];
      float G3 = input[index_1 - 1];
      float G4 = input[index_1 + 1];
      // R
      result[index_0] = (R1 + R2 + R3 + R4) / 4;
      // G
      result[index_1] = (G1 + G2 + G3 + G4) / 2;
      // B
      result[index_2] = input[index_2];
    } else {
      // Bottom Green pixel
      // Getting the R values
      float R1 = input[index_0 - col_size];
      float R2 = input[index_0 + col_size];
      // Getting the B values
      float B1 = input[index_2 - 1];
      float B2 = input[index_2 + 1];
      // R
      result[index_0] = (R1 + R2) / 2;
      // G
      result[index_1] = input[index_1] * 2;
      // B
      result[index_2] = (B1 + B2) / 2;
    }
  }
  __hpvm__return(1, bytes_result);
}

// Leaf HPVM node function for denoise
void denoise_fxp(float *input, size_t bytes_input, float *result,
                 size_t bytes_result, size_t row_size, size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(2, input, result, 1, result);

  void *thisNode = __hpvm__getNode();
  int row = __hpvm__getNodeInstanceID_x(thisNode);
  for (int chan = 0; chan < CHAN_SIZE; chan++)
    //    for (int row = 0; row < row_size; row++)
    for (int col = 0; col < col_size; col++)
      if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
        float filter[9];
        for (int i = -1; i < 2; i++)
          for (int j = -1; j < 2; j++) {
            int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1;
            filter[index] =
                input[(chan * row_size + (i + row)) * col_size + (j + col)];
          }
        sort(filter, 9);
        result[(chan * row_size + row) * col_size + col] = filter[4];
      } else {
        result[(chan * row_size + row) * col_size + col] =
            input[(chan * row_size + row) * col_size + col];
      }
  __hpvm__return(1, bytes_result);
}

// Leaf HPVM node function, for color map and white balance transform
void transform_fxp(float *input, size_t bytes_input, float *result,
                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                   size_t row_size, size_t col_size) {
  __hpvm__hint(DEVICE);
  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);

  void *thisNode = __hpvm__getNode();
  int row = __hpvm__getNodeInstanceID_x(thisNode);
  for (int chan = 0; chan < CHAN_SIZE; chan++)
    //    for (int row = 0; row < row_size; row++)
    for (int col = 0; col < col_size; col++) {
      int index = (chan * row_size + row) * col_size + col;
      int index_0 = (0 * row_size + row) * col_size + col;
      int index_1 = (1 * row_size + row) * col_size + col;
      int index_2 = (2 * row_size + row) * col_size + col;
      int index_2d_0 = 0 * CHAN_SIZE + chan;
      int index_2d_1 = 1 * CHAN_SIZE + chan;
      int index_2d_2 = 2 * CHAN_SIZE + chan;
      result[index] = max(input[index_0] * TsTw_tran[index_2d_0] +
                              input[index_1] * TsTw_tran[index_2d_1] +
                              input[index_2] * TsTw_tran[index_2d_2],
                          0);
    }
  __hpvm__return(1, bytes_result);
}

// Leaf HPVM node function, for gamut mapping
void gamut_map_fxp(float *input, size_t bytes_input, float *result,
                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
                   float *weights, size_t bytes_weights, float *coefs,
                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                   size_t row_size, size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2,
                     result, l2_dist);

  // First, get the L2 norm from every pixel to the control points,
  // Then, sum it and weight it. Finally, add the bias.
  void *thisNode = __hpvm__getNode();
  int row = __hpvm__getNodeInstanceID_x(thisNode);
  //  for (int row = 0; row < row_size; row++)
  for (int col = 0; col < col_size; col++) {
    float chan_val_0 = 0.0;
    float chan_val_1 = 0.0;
    float chan_val_2 = 0.0;
    for (int cp = 0; cp < 3702; cp++) {
      int index_0 = (0 * row_size + row) * col_size + col;
      int index_1 = (1 * row_size + row) * col_size + col;
      int index_2 = (2 * row_size + row) * col_size + col;
      float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
      float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
      float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
      float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
      float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
      float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
      float val = val1 * val2 + val3 * val4 + val5 * val6;
      float sqrt_val = sqrt(val);
      chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
      chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
      chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
    }
    chan_val_0 +=
        coefs[0 * CHAN_SIZE + 0] +
        coefs[1 * CHAN_SIZE + 0] *
            input[(0 * row_size + row) * col_size + col] +
        coefs[2 * CHAN_SIZE + 0] *
            input[(1 * row_size + row) * col_size + col] +
        coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
    chan_val_1 +=
        coefs[0 * CHAN_SIZE + 1] +
        coefs[1 * CHAN_SIZE + 1] *
            input[(0 * row_size + row) * col_size + col] +
        coefs[2 * CHAN_SIZE + 1] *
            input[(1 * row_size + row) * col_size + col] +
        coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
    chan_val_2 +=
        coefs[0 * CHAN_SIZE + 2] +
        coefs[1 * CHAN_SIZE + 2] *
            input[(0 * row_size + row) * col_size + col] +
        coefs[2 * CHAN_SIZE + 2] *
            input[(1 * row_size + row) * col_size + col] +
        coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
    result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
    result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
    result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
  }
  __hpvm__return(1, bytes_result);
}

// HPVM leaf node function, for tone mapping
void tone_map_fxp(float *input, size_t bytes_input, float *result,
                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                  size_t row_size, size_t col_size) {
  __hpvm__hint(DEVICE);
  __hpvm__attributes(3, input, result, tone_map, 1, result);

  void *thisNode = __hpvm__getNode();
  int row = __hpvm__getNodeInstanceID_x(thisNode);
  for (int chan = 0; chan < CHAN_SIZE; chan++)
    //    for (int row = 0; row < row_size; row++)
    for (int col = 0; col < col_size; col++) {
      int index = (chan * row_size + row) * col_size + col;
      uint8_t x = input[index] * 255;
      result[index] = tone_map[x * CHAN_SIZE + chan];
    }
  __hpvm__return(1, bytes_result);
}

/********************************************************************/
/*** HPVM Internal node Functions - Determine the graph structure ***/
/********************************************************************/

// We create a wrapper node per leaf node - this is an implementation
// requirement for the FPGA backend . The CPU backend also supports this,
// so it does not cause a portability issue.

void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result,
                       size_t bytes_result, size_t row_size, size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(2, input, result, 1, result);

  // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic
  // instance (last argument) associated with node function scale_fxp
  void *ScaleNode = __hpvm__createNodeND(1, scale_fxp, row_size);

  // Binds inputs of current node with specified node
  // - destination node
  // - argument position in argument list of function of source node
  // - argument position in argument list of function of destination node
  // - streaming (1) or non-streaming (0)
  __hpvm__bindIn(ScaleNode, 0, 0, 0); // bind input
  __hpvm__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input
  __hpvm__bindIn(ScaleNode, 2, 2, 0); // bind result
  __hpvm__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result
  __hpvm__bindIn(ScaleNode, 4, 4, 0); // bind row_size
  __hpvm__bindIn(ScaleNode, 5, 5, 0); // bind col_size

  // Similar to bindIn, but for the output. Output of a node is a struct, and
  // we consider the fields in increasing ordering.
  __hpvm__bindOut(ScaleNode, 0, 0, 0);
}

void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result,
                         size_t bytes_result, size_t row_size,
                         size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(2, input, result, 1, result);
  void *DescaleNode = __hpvm__createNodeND(1, descale_fxp, row_size);
  __hpvm__bindIn(DescaleNode, 0, 0, 0); // bind input
  __hpvm__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input
  __hpvm__bindIn(DescaleNode, 2, 2, 0); // bind result
  __hpvm__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
  __hpvm__bindIn(DescaleNode, 4, 4, 0); // bind row_size
  __hpvm__bindIn(DescaleNode, 5, 5, 0); // bind col_size
}

void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result,
                          size_t bytes_result, size_t row_size,
                          size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(2, input, result, 1, result);
  void *DemosaicNode = __hpvm__createNodeND(1, demosaic_fxp, row_size);
  __hpvm__bindIn(DemosaicNode, 0, 0, 0); // bind input
  __hpvm__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input
  __hpvm__bindIn(DemosaicNode, 2, 2, 0); // bind result
  __hpvm__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
  __hpvm__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
  __hpvm__bindIn(DemosaicNode, 5, 5, 0); // bind col_size

  __hpvm__bindOut(DemosaicNode, 0, 0, 0);
}

void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result,
                         size_t bytes_result, size_t row_size,
                         size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(2, input, result, 1, result);
  void *DenoiseNode = __hpvm__createNodeND(1, denoise_fxp, row_size);
  __hpvm__bindIn(DenoiseNode, 0, 0, 0); // bind input
  __hpvm__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input
  __hpvm__bindIn(DenoiseNode, 2, 2, 0); // bind result
  __hpvm__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
  __hpvm__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
  __hpvm__bindIn(DenoiseNode, 5, 5, 0); // bind col_size

  __hpvm__bindOut(DenoiseNode, 0, 0, 0);
}

void transform_fxp_wrapper(float *input, size_t bytes_input, float *result,
                           size_t bytes_result, float *TsTw_tran,
                           size_t bytes_TsTw, size_t row_size,
                           size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(3, input, result, TsTw_tran, 1, result);
  void *TransformNode = __hpvm__createNodeND(1, transform_fxp, row_size);
  __hpvm__bindIn(TransformNode, 0, 0, 0); // bind input
  __hpvm__bindIn(TransformNode, 1, 1, 0); // bind bytes_input
  __hpvm__bindIn(TransformNode, 2, 2, 0); // bind result
  __hpvm__bindIn(TransformNode, 3, 3, 0); // bind bytes_result
  __hpvm__bindIn(TransformNode, 4, 4, 0); // bind tstw
  __hpvm__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
  __hpvm__bindIn(TransformNode, 6, 6, 0); // bind row_size
  __hpvm__bindIn(TransformNode, 7, 7, 0); // bind col_size

  __hpvm__bindOut(TransformNode, 0, 0, 0);
}

void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result,
                       size_t bytes_result, float *ctrl_pts,
                       size_t bytes_ctrl_pts, float *weights,
                       size_t bytes_weights, float *coefs, size_t bytes_coefs,
                       float *l2_dist, size_t bytes_l2_dist, size_t row_size,
                       size_t col_size) {
  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
                     result);
  void *GamutNode = __hpvm__createNodeND(1, gamut_map_fxp, row_size);
  __hpvm__bindIn(GamutNode, 0, 0, 0);   // bind input
  __hpvm__bindIn(GamutNode, 1, 1, 0);   // bind bytes_input
  __hpvm__bindIn(GamutNode, 2, 2, 0);   // bind result
  __hpvm__bindIn(GamutNode, 3, 3, 0);   // bind bytes_result
  __hpvm__bindIn(GamutNode, 4, 4, 0);   // bind ctrl_pts
  __hpvm__bindIn(GamutNode, 5, 5, 0);   // bind bytes_ctrl_pts
  __hpvm__bindIn(GamutNode, 6, 6, 0);   // bind weights
  __hpvm__bindIn(GamutNode, 7, 7, 0);   // bind bytes_weights
  __hpvm__bindIn(GamutNode, 8, 8, 0);   // bind coefs
  __hpvm__bindIn(GamutNode, 9, 9, 0);   // bind bytes_coefs
  __hpvm__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
  __hpvm__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
  __hpvm__bindIn(GamutNode, 12, 12, 0); // bind row_size
  __hpvm__bindIn(GamutNode, 13, 13, 0); // bind col_size

  __hpvm__bindOut(GamutNode, 0, 0, 0);
}
void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result,
                          size_t bytes_result, float *tone_map,
                          size_t bytes_tone_map, size_t row_size,
                          size_t col_size) {

  __hpvm__hint(CPU_TARGET);
  __hpvm__attributes(3, input, result, tone_map, 1, result);
  void *ToneMapNode = __hpvm__createNodeND(1, tone_map_fxp, row_size);
  __hpvm__bindIn(ToneMapNode, 0, 0, 0); // bind input
  __hpvm__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
  __hpvm__bindIn(ToneMapNode, 2, 2, 0); // bind result
  __hpvm__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
  __hpvm__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map
  __hpvm__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
  __hpvm__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
  __hpvm__bindIn(ToneMapNode, 7, 7, 0); // bind col_size

  __hpvm__bindOut(ToneMapNode, 0, 0, 0);
}

/*** ROOT Node - Top Level of the Graph Hierarchy ***/
void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input,
                 /*2*/ uint8_t *result, /*3*/ size_t bytes_result,
                 /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled,
                 /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled,
                 /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out,
                 /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out,
                 /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out,
                 /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out,
                 /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw,
                 /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts,
                 /*20*/ float *weights, /*21*/ size_t bytes_weights,
                 /*22*/ float *coefs, /*23*/ size_t bytes_coefs,
                 /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist,
                 /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map,
                 /*28*/ size_t row_size, /*29*/ size_t col_size) {

  // Specifies compilation target for current node
  __hpvm__hint(CPU_TARGET);

  // Specifies pointer arguments that will be used as "in" and "out" arguments
  // - count of "in" arguments
  // - list of "in" argument , and similar for "out"
  __hpvm__attributes(14, input, result, input_scaled, result_scaled,
                     demosaic_out, denoise_out, transform_out, gamut_out, TsTw,
                     ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result,
                     demosaic_out, denoise_out, transform_out, gamut_out);

  // Create an 0D (specified by 1st argument) HPVM node - so a single node
  // associated with node function ---_fxp_wrapper
  void *ScNode = __hpvm__createNodeND(0, scale_fxp_wrapper);
  void *DmNode = __hpvm__createNodeND(0, demosaic_fxp_wrapper);
  void *DnNode = __hpvm__createNodeND(0, denoise_fxp_wrapper);
  void *TrNode = __hpvm__createNodeND(0, transform_fxp_wrapper);
  void *GmNode = __hpvm__createNodeND(0, gamut_fxp_wrapper);
  void *TnNode = __hpvm__createNodeND(0, tone_map_fxp_wrapper);
  void *DsNode = __hpvm__createNodeND(0, descale_fxp_wrapper);

  // BindIn binds inputs of current node with specified node
  // - destination node
  // - argument position in argument list of function of source node
  // - argument position in argument list of function of destination node
  // - streaming (1) or non-streaming (0)

  // Edge transfers data between nodes within the same level of hierarchy.
  // - source and destination dataflow nodes
  // - edge type, all-all (1) or one-one(0)
  // - source position (in output struct of source node)
  // - destination position (in argument list of destination node)
  // - streaming (1) or non-streaming (0)

  // scale_fxp inputs
  __hpvm__bindIn(ScNode, 0, 0, 0);  // input -> ScNode:input
  __hpvm__bindIn(ScNode, 1, 1, 0);  // bytes_input -> ScNode:bytes_input
  __hpvm__bindIn(ScNode, 4, 2, 0);  // input_scaled -> ScNode:result
  __hpvm__bindIn(ScNode, 5, 3, 0);  // bytes_input_scaled -> ScNode:bytes_result
  __hpvm__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
  __hpvm__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size

  // demosaic_fxp inputs
  __hpvm__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
  __hpvm__edge(ScNode, DmNode, 1, 0, 1,
               0);                  // SCNode:bytes_result -> DmNode:bytes_input
  __hpvm__bindIn(DmNode, 8, 2, 0);  // demosaic_out -> DmNode:result
  __hpvm__bindIn(DmNode, 9, 3, 0);  // bytes_demosaic_out -> DmNode:bytes_result
  __hpvm__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size
  __hpvm__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size

  // denoise_fxp inputs
  __hpvm__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
  __hpvm__edge(DmNode, DnNode, 1, 0, 1,
               0);                  // DMNode:bytes_result -> DnNode:bytes_input
  __hpvm__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
  __hpvm__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
  __hpvm__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size
  __hpvm__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size

  // transform_fxp inputs
  __hpvm__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
  __hpvm__edge(DnNode, TrNode, 1, 0, 1,
               0);                  // DnNode:bytes_result -> TrNode:bytes_input
  __hpvm__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
  __hpvm__bindIn(TrNode, 13, 3,
                 0); // bytes_result_scaled -> TrNode:bytes_result
  __hpvm__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann
  __hpvm__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
  __hpvm__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size
  __hpvm__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size

  // gamut_fxp inputs
  __hpvm__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
  __hpvm__edge(TrNode, GmNode, 1, 0, 1,
               0);                  // TrNode:bytes_result -> GmNode:bytes_input
  __hpvm__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
  __hpvm__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
  __hpvm__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
  __hpvm__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
  __hpvm__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
  __hpvm__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
  __hpvm__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
  __hpvm__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
  __hpvm__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
  __hpvm__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
  __hpvm__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size
  __hpvm__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size

  // tone_map_fxp inputs
  __hpvm__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
  __hpvm__edge(GmNode, TnNode, 1, 0, 1,
               0);                 // GmNode:bytes_result -> TnNode:bytes_input
  __hpvm__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
  __hpvm__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
  __hpvm__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
  __hpvm__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
  __hpvm__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size
  __hpvm__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size

  // descale_fxp inputs
  __hpvm__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
  __hpvm__edge(TnNode, DsNode, 1, 0, 1,
               0);                  // TnNode:bytes_result -> DsNode:bytes_input
  __hpvm__bindIn(DsNode, 2, 2, 0);  // result -> DsNode:result
  __hpvm__bindIn(DsNode, 3, 3, 0);  // bytes_result -> DsNode:bytes_result
  __hpvm__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
  __hpvm__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
}

int main(int argc, char *argv[]) {
  // Parse the arguments.
  arguments args;
  set_default_args(&args);
  argp_parse(&parser, argc, argv, 0, 0, &args);

  // Read a raw image.
  // NOTE: We deliberately perform this file I/O outside of the kernel.
  printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
  size_t row_size, col_size;
  uint8_t *image_in =
      read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);

  printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);

  // Allocate a buffer for storing the output image data.
  // (This is currently the same size as the input image data.)
  size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
  size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
  uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image);
  uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image);
  uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image);
  uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image);
  uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image);

  __hpvm__init();

  ///////////////////////////////////////////////////////////////
  // Camera Model Parameters
  ///////////////////////////////////////////////////////////////
  // Path to the camera model to be used
  //    char cam_model_path[100];
  //    char cam_model_path = "cam_models/NikonD7000/";
  // White balance index (select white balance from transform file)
  // The first white balance in the file has a wb_index of 1
  // For more information on model format see the readme
  int wb_index = 6;

  // Number of control points
  int num_ctrl_pts = 3702;
  uint8_t *input, *result;
  float *input_scaled, *result_scaled, *demosaic_out, *denoise_out,
      *transform_out, *gamut_out;
  float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;

  TsTw = get_TsTw(args.args[CAM_MODEL], wb_index);
  float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
  free(TsTw);
  TsTw = trans;
  ctrl_pts = get_ctrl_pts(args.args[CAM_MODEL], num_ctrl_pts);
  weights = get_weights(args.args[CAM_MODEL], num_ctrl_pts);
  coefs = get_coefs(args.args[CAM_MODEL], num_ctrl_pts);
  tone_map = get_tone_map(args.args[CAM_MODEL]);

  input_scaled = (float *)malloc_aligned(bytes_fimage);
  result_scaled = (float *)malloc_aligned(bytes_fimage);
  demosaic_out = (float *)malloc_aligned(bytes_fimage);
  denoise_out = (float *)malloc_aligned(bytes_fimage);
  transform_out = (float *)malloc_aligned(bytes_fimage);
  gamut_out = (float *)malloc_aligned(bytes_fimage);
  l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);

  // This is host_input in cam_pipe()
  input = (uint8_t *)malloc_aligned(bytes_image);
  convert_hwc_to_chw(image_in, row_size, col_size, &input);

  // This is host_result in cam_pipe()
  result = (uint8_t *)malloc_aligned(bytes_image);

  // Allocate struct to pass DFG inputs
  RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn));

  // Set up HPVM DFG inputs in the rootArgs struct.
  rootArgs->input = input;
  rootArgs->bytes_input = bytes_image;

  rootArgs->result = result;
  rootArgs->bytes_result = bytes_image;

  rootArgs->input_scaled = input_scaled;
  rootArgs->bytes_input_scaled = bytes_fimage;

  rootArgs->result_scaled = result_scaled;
  rootArgs->bytes_result_scaled = bytes_fimage;

  rootArgs->demosaic_out = demosaic_out;
  rootArgs->bytes_demosaic_out = bytes_fimage;
  rootArgs->denoise_out = denoise_out;
  rootArgs->bytes_denoise_out = bytes_fimage;

  rootArgs->transform_out = transform_out;
  rootArgs->bytes_transform_out = bytes_fimage;

  rootArgs->gamut_out = gamut_out;
  rootArgs->bytes_gamut_out = bytes_fimage;

  rootArgs->TsTw = TsTw;
  rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);

  rootArgs->ctrl_pts = ctrl_pts;
  rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);

  rootArgs->weights = weights;
  rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);

  rootArgs->coefs = coefs;
  rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);

  rootArgs->tone_map = tone_map;
  rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);

  rootArgs->l2_dist = l2_dist;
  rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);

  rootArgs->row_size = row_size;
  rootArgs->col_size = col_size;

  // Memory tracking is required for pointer arguments.
  // Nodes can be scheduled on different targets, and
  // dataflow edge implementation needs to request data.
  // The pair (pointer, size) is inserted in memory tracker using this call
  llvm_hpvm_track_mem(input, bytes_image);
  llvm_hpvm_track_mem(result, bytes_image);
  llvm_hpvm_track_mem(input_scaled, bytes_fimage);
  llvm_hpvm_track_mem(result_scaled, bytes_fimage);
  llvm_hpvm_track_mem(demosaic_out, bytes_fimage);
  llvm_hpvm_track_mem(denoise_out, bytes_fimage);
  llvm_hpvm_track_mem(transform_out, bytes_fimage);
  llvm_hpvm_track_mem(gamut_out, bytes_fimage);
  llvm_hpvm_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float));
  llvm_hpvm_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
  llvm_hpvm_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
  llvm_hpvm_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float));
  llvm_hpvm_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
  llvm_hpvm_track_mem(l2_dist, num_ctrl_pts * sizeof(float));

  printf("\n\nLaunching CAVA pipeline!\n");

  void *camPipeDFG = __hpvm__launch(0, CamPipeRoot, (void *)rootArgs);
  __hpvm__wait(camPipeDFG);

  printf("\n\nPipeline execution completed!\n");
  printf("\n\nRequesting memory!\n");

  // Request data from graph.
  llvm_hpvm_request_mem(result, bytes_image);
  llvm_hpvm_request_mem(demosaic_out, bytes_fimage);
  llvm_hpvm_request_mem(denoise_out, bytes_fimage);
  llvm_hpvm_request_mem(transform_out, bytes_fimage);
  llvm_hpvm_request_mem(gamut_out, bytes_fimage);
  printf("\n\nDone requesting memory!\n");

  uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
  uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
  uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
  uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image,
              row_size, col_size);
  descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image,
              row_size, col_size);
  descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image,
              row_size, col_size);
  descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image,
              row_size, col_size);

  convert_chw_to_hwc(result, row_size, col_size, &image_out);
  convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
  convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size,
                     &image_out_demosaic);
  convert_chw_to_hwc(denoise_out_descaled, row_size, col_size,
                     &image_out_denoise);
  convert_chw_to_hwc(transform_out_descaled, row_size, col_size,
                     &image_out_transform);

  // Remove tracked pointers.
  llvm_hpvm_untrack_mem(input);
  llvm_hpvm_untrack_mem(result);
  llvm_hpvm_untrack_mem(input_scaled);
  llvm_hpvm_untrack_mem(result_scaled);
  llvm_hpvm_untrack_mem(demosaic_out);
  llvm_hpvm_untrack_mem(denoise_out);
  llvm_hpvm_untrack_mem(transform_out);
  llvm_hpvm_untrack_mem(gamut_out);

  llvm_hpvm_untrack_mem(TsTw);
  llvm_hpvm_untrack_mem(ctrl_pts);
  llvm_hpvm_untrack_mem(weights);
  llvm_hpvm_untrack_mem(coefs);
  llvm_hpvm_untrack_mem(tone_map);
  llvm_hpvm_untrack_mem(l2_dist);

  // Output the image.
  // NOTE: We deliberately perform this file I/O outside of the kernel.
  char str[50], base_str[50];
  strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]);
  strcpy(str, base_str);
  strcat(str, ".bin");
  printf("Writing output image to %s\n", str);
  write_image_to_binary(str, image_out, row_size, col_size);
  strcpy(str, base_str);
  strcat(str, "_gamut.bin");
  printf("Writing output image to %s\n", str);
  write_image_to_binary(str, image_out_gamut, row_size, col_size);
  strcpy(str, base_str);
  strcat(str, "_demosaic.bin");
  printf("Writing output image to %s\n", str);
  write_image_to_binary(str, image_out_demosaic, row_size, col_size);
  strcpy(str, base_str);
  strcat(str, "_denoise.bin");
  printf("Writing output image to %s\n", str);
  write_image_to_binary(str, image_out_denoise, row_size, col_size);
  strcpy(str, base_str);
  strcat(str, "_transform.bin");
  printf("Writing output image to %s\n", str);
  write_image_to_binary(str, image_out_transform, row_size, col_size);

  __hpvm__cleanup();

  return 0;
}