diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..f6a82b3e3d84b98630f45bc1c95961ea6968d4e9
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/Makefile
@@ -0,0 +1,254 @@
+# This Makefile compiles the HPVM-CAVA pilot project. 
+# It builds HPVM-related dependencies, then the native camera pipeline ISP code.
+#
+# Paths to some dependencies (e.g., HPVM, LLVM) must exist in Makefile.config,
+# which can be copied from Makefile.config.example for a start.
+
+CONFIG_FILE := Makefile.config
+
+ifeq ($(wildcard $(CONFIG_FILE)),)
+    $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example)
+endif
+include $(CONFIG_FILE)
+
+# Compiler Flags
+
+DLEVEL ?= 0
+#CAM_CFLAGS?=-O0 \
+	#-Wno-psabi \
+        #-Wno-unused-label -Wno-unused-but-set-variable \
+        #-Wno-maybe-uninitialized -DARCHITECTURE=SMV -DTRANSPOSE_WEIGHTS=1 \
+        -DDEBUG_LEVEL=$(DLEVEL)
+LFLAGS += -lm -lrt
+
+# Build dirs
+ifeq ($(VERSION),)
+    VERSION = Default
+endif
+SRC_DIR = src/
+CAM_PIPE_SRC_DIR = $(SRC_DIR)
+BUILD_DIR = build/$(TARGET)_$(VERSION)
+CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+
+# Source files for the frontend camera pipeline
+COMMON_SRCS = main.c \
+	      utility.c
+
+
+CAM_PIPE_SRCS = load_cam_model.c \
+        cam_pipe_utility.c \
+	dma_interface.c # FIXME: This is a hack until external C++ files can be included in build.
+
+
+# NOTE: We have temporarily removed gem5 and other dependencies for simplicity.
+SRCS = $(COMMON_SRCS) $(CAM_PIPE_SRCS)
+#GEM5_DMA_SRC = gem5/dma_interface.c
+#GEM5_SYS_SRCS = gem5/aladdin_sys_connection.cpp gem5/aladdin_sys_constants.cpp
+#GEM5_UTIL_SRCS = ../../util/m5/m5op_x86.S
+
+# NATIVE_FULL_PATH_SRCS contains all the full path source files for the camera vision pipeline.
+NATIVE_FULL_PATH_SRCS = $(patsubst %, $(SRC_DIR)/%, $(COMMON_SRCS))
+NATIVE_FULL_PATH_SRCS += $(patsubst %, $(CAM_PIPE_SRC_DIR)/%, $(CAM_PIPE_SRCS))
+
+# GEM5_FULL_PATH_SRCS contains all the full path source files for the gem5 files.
+#GEM5_FULL_PATH_SRCS = $(patsubst %, $(ALADDIN_HOME)/%, $(GEM5_DMA_SRC) $(GEM5_SYS_SRCS) $(GEM5_UTIL_SRCS))
+
+#INCLUDES += -I$(ALADDIN_HOME) \
+#			-I$(ALADDIN_HOME)/../../include \
+#			-I$(ALADDIN_HOME)/gem5 
+INCLUDES +=  -I$(SRC_DIR) \
+            -I$(CAM_PIPE_SRC_DIR) \
+
+ifneq ($(CONFUSE_ROOT),)
+INCLUDES += -I$(CONFUSE_ROOT)/include
+LFLAGS += -L$(CONFUSE_ROOT)/lib
+endif
+
+EXE = cava-visc-$(VERSION)-$(TARGET)
+
+CAM_CFLAGS += -mf16c -flax-vector-conversions
+LFLAGS += -pthread
+
+#$(DEBUG): $(NATIVE_FULL_PATH_SRCS) $(GEM5_FULL_PATH_SRCS)
+#	@echo Building benchmark for native machine with debug support.
+#	#@mkdir -p $(BUILD_DIR)
+#	@$(CC) $(CAM_CFLAGS) -ggdb3 $(INCLUDES) -DGEM5 -DDMA_MODE -DDMA_INTERFACE_V3 -o $(DEBUG) $^ $(LFLAGS)
+#
+#clean-native:
+#	rm -f $(NATIVE) $(DEBUG)
+
+## BEGIN HPVM MAKEFILE
+LANGUAGE=visc
+SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll
+OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c
+VISC_OBJS=main.visc.ll
+APP = $(EXE)
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3
+APP_CXXFLAGS=-ffast-math -O0 -I/opt/opencv/include
+APP_LDFLAGS=$(LFLAGS)
+OPT_FLAGS = -tti -targetlibinfo -tbaa -scoped-noalias -assumption-cache-tracker -profile-summary-info -forceattrs -inferattrs -ipsccp -globalopt -domtree -mem2reg -deadargelim -domtree -basicaa -aa -simplifycfg -pgo-icall-prom -basiccg -globals-aa -prune-eh -always-inline -functionattrs -domtree -sroa -early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -domtree -basicaa -aa -libcalls-shrinkwrap -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -licm -loop-unswitch -simplifycfg -domtree -basicaa -aa -loops -loop-simplify -lcssa-verification -lcssa -scalar-evolution -indvars -loop-idiom -loop-deletion -memdep -memcpyopt -sccp -domtree -demanded-bits -bdce -basicaa -aa -lazy-value-info -jump-threading -correlated-propagation -domtree -basicaa -aa -memdep -dse -loops -loop-simplify -lcssa-verification -lcssa -aa -scalar-evolution -licm -postdomtree -adce -simplifycfg -domtree -basicaa -aa -barrier -basiccg -rpo-functionattrs -globals-aa -float2int -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -loop-accesses -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-distribute -loop-simplify -lcssa-verification -lcssa -branch-prob -block-freq -scalar-evolution -basicaa -aa -loop-accesses -demanded-bits -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-vectorize -loop-simplify -scalar-evolution -aa -loop-accesses -loop-load-elim -basicaa -aa -simplifycfg -domtree -basicaa -aa -loops -scalar-evolution -alignment-from-assumptions -strip-dead-prototypes -domtree -loops -branch-prob -block-freq -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -branch-prob -block-freq -loop-sink -instsimplify 
+
+CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
+OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
+CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
+LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
+
+LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs
+VISC_RT_PATH = $(LLVM_SRC_ROOT)/projects/visc-rt
+
+VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
+#LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc
+LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc
+#LIBCLC_NVPTX_LIB = nvptx64--nvidiacl.bc
+
+LLVM_34_AS = $(LLVM_34_ROOT)/build/bin/llvm-as
+
+TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl
+
+ifeq ($(TARGET),x86)
+  DEVICE = SPIR_TARGET
+  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG
+  CFLAGS += -DOPENCL_CPU
+  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir
+else ifeq ($(TARGET),seq)
+  DEVICE = CPU_TARGET
+  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+  VISC_OPTFLAGS += -visc-timers-x86
+else ifeq ($(TARGET),fpga)
+  DEVICE = FPGA_TARGET
+  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_FPGA.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-fpga -dfg2llvm-x86 -clearDFG
+  CFLAGS += -DOPENCL_CPU
+  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-fpga
+else
+  DEVICE = GPU_TARGET
+  VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG
+  VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+endif
+  TESTGEN_OPTFLAGS += -visc-timers-gen
+
+CFLAGS += -DDEVICE=$(DEVICE)
+CXXFLAGS += -DDEVICE=$(DEVICE)
+
+#ifeq ($(TIMER),x86)
+#  VISC_OPTFLAGS += -visc-timers-x86
+#else ifeq ($(TIMER),ptx)
+#  VISC_OPTFLAGS += -visc-timers-ptx
+#else ifeq ($(TIMER),gen)
+#  TESTGEN_OPTFLAGS += -visc-timers-gen
+#else ifeq ($(TIMER),spir)
+#  TESTGEN_OPTFLAGS += -visc-timers-spir
+#else ifeq ($(TIMER),fpga)
+#  TESTGEN_OPTFLAGS += -visc-timers-fpga
+#else ifeq ($(TIMER),no)
+#else
+#  ifeq ($(TARGET),x86)
+#    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir
+#  else ifeq ($(TARGET),seq)
+#    VISC_OPTFLAGS += -visc-timers-x86
+#  else ifeq ($(TARGET),fpga)
+#    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-fpga
+#  else ifeq ($(TARGET),seqx86)
+#    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir
+#  else ifeq ($(TARGET),seqgpu)
+#    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+#  else
+#    VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx
+#  endif
+#  TESTGEN_OPTFLAGS += -visc-timers-gen
+#endif
+
+# Add BUILDDIR as a prefix to each element of $1
+INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
+
+# Add SRCDIR as a prefix to each element of $1
+#INSRCDIR=$(addprefix $(SRCDIR)/,$(1))
+
+PYTHON_LLVM_40_34 = ../llvm-40-34.py
+
+.PRECIOUS: $(BUILD_DIR)/%.ll
+
+OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+KERNEL = $(TEST_OBJS).kernels.ll
+
+ifeq ($(TARGET),x86)
+  SPIR_ASSEMBLY = $(TEST_OBJS).kernels.bc
+else ifeq ($(TARGET),seq)
+else ifeq ($(TARGET),fpga)
+  AOC_CL = $(TEST_OBJS).kernels.cl
+  AOCL_ASSEMBLY = $(TEST_OBJS).kernels.aocx
+  BOARD = a10gx
+  ifeq ($(EMULATION),1)
+    EXE = cava-visc-emu
+    AOC_EMU = -march=emulator
+    BUILD_DIR = build/$(TARGET)-emu
+  endif
+else
+  KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll
+  #KERNEL = $(TEST_OBJS).kernels.ll
+  PTX_ASSEMBLY = $(TEST_OBJS).nvptx.s
+endif
+
+HOST_LINKED = $(BUILD_DIR)/$(APP).linked.ll
+HOST = $(BUILD_DIR)/$(APP).host.ll
+
+ifeq ($(OPENCL_PATH),)
+FAILSAFE=no_opencl
+else 
+FAILSAFE=
+endif
+
+# Targets
+default: $(FAILSAFE) $(BUILD_DIR) $(EXE)
+#default: $(FAILSAFE) $(BUILD_DIR) $(PTX_ASSEMBLY) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE)
+
+$(PTX_ASSEMBLY) : $(KERNEL_LINKED)
+	$(CC) $(KERNEL_GEN_FLAGS) -S $< -o $@
+
+$(KERNEL_LINKED) : $(KERNEL)
+	$(LLVM_LINK) $(LIBCLC_NVPTX_LIB) -S $< -o $@
+
+$(SPIR_ASSEMBLY) : $(KERNEL)
+	python $(PYTHON_LLVM_40_34) $< $(BUILD_DIR)/kernel_34.ll
+	$(LLVM_34_AS) $(BUILD_DIR)/kernel_34.ll -o $@
+
+$(AOCL_ASSEMBLY) : $(AOC_CL)
+	aoc --report $(AOC_EMU) $(AOC_CL) -o $(AOCL_ASSEMBLY) -board=$(BOARD)
+
+$(AOC_CL) : $(KERNEL)
+	llvm-cbe --debug $(KERNEL)
+
+$(EXE) : $(HOST_LINKED)
+	$(CXX) -O3 $(LDFLAGS) $< -o $@
+
+$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
+	$(LLVM_LINK) $^ -S -o $@
+
+$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
+	make -C $(LLVM_LIB_PATH)
+
+$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
+	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+#	mv *.ll $(BUILD_DIR) 
+#	$(OPT) -debug-only=DFG2LLVM_SPIR,DFG2LLVM_X86,DFG2LLVM_FPGA,GENVISC $(VISC_OPTFLAGS) -S $< -o $(HOST)
+#$(OBJS): $(OBJS_SRC)
+#	$(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $<
+
+$(BUILD_DIR):
+	mkdir -p $(BUILD_DIR)
+
+$(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c
+	$(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $<
+
+$(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c
+	$(CC) $(CFLAGS) -emit-llvm -S -o $@ $<
+
+#$(BUILD_DIR)/main.opt.ll : $(BUILD_DIR)/main.ll
+#	$(OPT) $(OPT_FLAGS) $< -S -o $@
+
+$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
+	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
+
+## END HPVM MAKEFILE
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam-vision-native b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam-vision-native
new file mode 100755
index 0000000000000000000000000000000000000000..938e1b1a15d86e71b9b044e594472dad10fd9bc2
Binary files /dev/null and b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam-vision-native differ
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.c
new file mode 100644
index 0000000000000000000000000000000000000000..7874ff9d529afebc40d1660637e85b3a1e00f23e
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.c
@@ -0,0 +1,139 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "pipe_stages.h"
+#include "load_cam_model.h"
+#include "cam_pipe_utility.h"
+#include "dma_interface.h"
+#ifdef DMA_MODE
+#include "gem5_harness.h"
+#endif
+
+// FIXME: Include gem5/dma_interface.cc/h separately
+#ifndef DMA_INTERFACE_V3
+#define DMA_INTERFACE_V3
+#endif//DMA_INTERFACE_V3
+
+///////////////////////////////////////////////////////////////
+// Camera Model Parameters
+///////////////////////////////////////////////////////////////
+
+// Path to the camera model to be used
+char cam_model_path[100];
+
+// White balance index (select white balance from transform file)
+// The first white balance in the file has a wb_index of 1
+// For more information on model format see the readme
+int wb_index = 6;
+
+// Number of control points
+int num_ctrl_pts = 3702;
+
+void load_cam_params_hw(float *host_TsTw, float *host_ctrl_pts,
+                        float *host_weights, float *host_coefs,
+                        float *host_tone_map, float *acc_TsTw,
+                        float *acc_ctrl_pts, float *acc_weights,
+                        float *acc_coefs, float *acc_tone_map) {
+  dmaLoad(acc_TsTw, host_TsTw, 9 * sizeof(float));
+  dmaLoad(acc_ctrl_pts, host_ctrl_pts,
+          num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  dmaLoad(acc_weights, host_weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  dmaLoad(acc_coefs, host_coefs, 4 * CHAN_SIZE * sizeof(float));
+  dmaLoad(acc_tone_map, host_tone_map, 256 * CHAN_SIZE * sizeof(float));
+}
+
+void isp_hw(uint8_t *host_input, uint8_t *host_result, int row_size,
+            int col_size, uint8_t *acc_input, uint8_t *acc_result,
+            float *acc_input_scaled, float *acc_result_scaled, float *acc_TsTw,
+            float *acc_ctrl_pts, float *acc_weights, float *acc_coefs,
+            float *acc_tone_map, float *acc_l2_dist) {
+  dmaLoad(acc_input, host_input,
+          row_size * col_size * CHAN_SIZE * sizeof(uint8_t));
+  scale_fxp(acc_input, row_size, col_size, acc_input_scaled);
+  demosaic_fxp(acc_input_scaled, row_size, col_size, acc_result_scaled);
+  denoise_fxp(acc_result_scaled, row_size, col_size, acc_input_scaled);
+  transform_fxp(acc_input_scaled, row_size, col_size, acc_result_scaled,
+                acc_TsTw);
+  gamut_map_fxp(acc_result_scaled, row_size, col_size, acc_input_scaled,
+                acc_ctrl_pts, acc_weights, acc_coefs, acc_l2_dist);
+  tone_map_fxp(acc_input_scaled, row_size, col_size, acc_tone_map,
+               acc_result_scaled);
+  // tone_map_approx_fxp(acc_input_scaled, row_size, col_size,
+  // acc_result_scaled);
+  descale_fxp(acc_result_scaled, row_size, col_size, acc_result);
+  dmaStore(host_result, acc_result,
+           row_size * col_size * CHAN_SIZE * sizeof(uint8_t));
+}
+
+void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
+              int col_size) {
+  uint8_t *acc_input, *acc_result;
+  float *acc_input_scaled, *acc_result_scaled;
+  float *host_TsTw, *host_ctrl_pts, *host_weights, *host_coefs, *host_tone_map;
+  float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map, *acc_l2_dist;
+
+  strcat(cam_model_path, "cam_models/NikonD7000/");
+
+  host_TsTw = get_TsTw(cam_model_path, wb_index);
+  float *trans = transpose_mat(host_TsTw, CHAN_SIZE, CHAN_SIZE);
+  free(host_TsTw);
+  host_TsTw = trans;
+  host_ctrl_pts = get_ctrl_pts(cam_model_path, num_ctrl_pts);
+  host_weights = get_weights(cam_model_path, num_ctrl_pts);
+  host_coefs = get_coefs(cam_model_path, num_ctrl_pts);
+  host_tone_map = get_tone_map(cam_model_path);
+
+  acc_input = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
+  acc_result = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
+  acc_input_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_result_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_TsTw = (float*) malloc_aligned(sizeof(float) * 9);
+  acc_ctrl_pts = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_weights = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_coefs = (float*) malloc_aligned(sizeof(float) * 12);
+  acc_tone_map = (float*) malloc_aligned(sizeof(float) * 256 * CHAN_SIZE);
+  acc_l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);
+
+  // Load camera model parameters for the ISP
+  MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw,
+                     sizeof(float) * 9);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_ctrl_pts", host_ctrl_pts,
+                     sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_weights", host_weights,
+                     sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_coefs", host_coefs,
+                     sizeof(float) * 4 * CHAN_SIZE);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_tone_map", host_tone_map,
+                     sizeof(float) * 256 * CHAN_SIZE);
+  INVOKE_KERNEL(ISP, load_cam_params_hw, host_TsTw, host_ctrl_pts, host_weights,
+                host_coefs, host_tone_map, acc_TsTw, acc_ctrl_pts, acc_weights,
+                acc_coefs, acc_tone_map);
+
+  // Invoke the ISP
+  MAP_ARRAY_TO_ACCEL(ISP, "host_input", host_input,
+                     sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_result", host_result,
+                     sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
+  INVOKE_KERNEL(ISP, isp_hw, host_input, host_result, row_size, col_size,
+                acc_input, acc_result, acc_input_scaled, acc_result_scaled,
+                acc_TsTw, acc_ctrl_pts, acc_weights, acc_coefs, acc_tone_map,
+                acc_l2_dist);
+
+  free(acc_input);
+  free(acc_result);
+  free(acc_input_scaled);
+  free(acc_result_scaled);
+  free(host_TsTw);
+  free(host_ctrl_pts);
+  free(host_weights);
+  free(host_coefs);
+  free(host_tone_map);
+  free(acc_TsTw);
+  free(acc_ctrl_pts);
+  free(acc_weights);
+  free(acc_coefs);
+  free(acc_tone_map);
+  free(acc_l2_dist);
+}
+
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.h
new file mode 100644
index 0000000000000000000000000000000000000000..83a77e01a67886af64902eacfd5af69a0a8d48b0
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.h
@@ -0,0 +1,7 @@
+#ifndef _CAM_PIPE_H_
+#define _CAM_PIPE_H_
+
+void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
+              int col_size);
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.c
new file mode 100644
index 0000000000000000000000000000000000000000..f806e9ee1a2e288fabcb8ad658a47c3919fbb661
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.c
@@ -0,0 +1,82 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "cam_pipe_utility.h"
+//#include "pipe_stages.h"
+
+uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size) {
+  uint8_t *image;
+  FILE *fp = fopen(file_path, "r");
+  int chan_size;
+  if (fread(row_size, sizeof(int), 1, fp) != 1)
+    assert("Failed to read row size from binary file!");
+  if (fread(col_size, sizeof(int), 1, fp) != 1)
+    assert("Failed to read col size from binary file!");
+  if (fread(&chan_size, sizeof(int), 1, fp) != 1)
+    assert("Failed to read row size from binary file!");
+  assert(chan_size == CHAN_SIZE && "Channel size read from the binary file "
+                                   "doesn't equal to the default value!\n");
+
+  int size = *row_size * *col_size * CHAN_SIZE;
+  image = malloc_aligned(sizeof(uint8_t) * size);
+  if (fread(image, sizeof(uint8_t), size, fp) != size)
+    assert("Failed to read the image from binary file!");
+  fclose(fp);
+  return image;
+}
+
+void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int col_size) {
+  FILE *fp = fopen(file_path, "w");
+
+  int shape[3] = { row_size, col_size, CHAN_SIZE };
+  fwrite(shape, sizeof(int), 3, fp);
+
+  int size = row_size * col_size * CHAN_SIZE;
+  fwrite(image, sizeof(uint8_t), size, fp);
+  fclose(fp);
+}
+
+float *transpose_mat(float *inmat, int width, int height) {
+  // Define vectors
+  float *outmat;
+  int err =
+      posix_memalign((void **)&outmat, CACHELINE_SIZE, sizeof(float) * height * width);
+  assert(err == 0 && "Failed to allocate memory!");
+
+  // Transpose the matrix
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      outmat[j * height + i] = inmat[i * width + j];
+    }
+  }
+  return outmat;
+}
+
+void convert_hwc_to_chw(uint8_t *input, int row_size, int col_size,
+                        uint8_t **result) {
+  if (*result == NULL) {
+    *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE *
+                                        sizeof(uint8_t));
+  }
+  ARRAY_3D(uint8_t, _input, input, col_size, CHAN_SIZE);
+  ARRAY_3D(uint8_t, _result, *result, row_size, col_size);
+  for (int h = 0; h < row_size; h++)
+    for (int w = 0; w < col_size; w++)
+      for (int c = 0; c < CHAN_SIZE; c++)
+        _result[c][h][w] = _input[h][w][c];
+}
+
+void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size,
+                        uint8_t **result) {
+  if (*result == NULL) {
+    *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE *
+                                      sizeof(uint8_t));
+  }
+  ARRAY_3D(uint8_t, _input, input, row_size, col_size);
+  ARRAY_3D(uint8_t, _result, *result, col_size, CHAN_SIZE);
+  for (int c = 0; c < CHAN_SIZE; c++)
+    for (int h = 0; h < row_size; h++)
+      for (int w = 0; w < col_size; w++)
+        _result[h][w][c] = _input[c][h][w];
+}
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4fb6cde0c438b23c2b596cf0418953aaedca501
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.h
@@ -0,0 +1,16 @@
+#ifndef _CAM_PIPE_UTILITY_H_
+#define _CAM_PIPE_UTILITY_H_
+
+#include "utility.h"
+#include "pipe_stages.h"
+
+uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size);
+void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
+                           int col_size);
+float *transpose_mat(float *inmat, int width, int height);
+void convert_hwc_to_chw(uint8_t *input, int row_size, int col_size,
+                        uint8_t **result);
+void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size,
+                        uint8_t **result);
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/defs.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccc8acc857c36fd13115670932a38dc3a406dc29
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/defs.h
@@ -0,0 +1,224 @@
+#ifndef _COMMON_DEFS_H_
+#define _COMMON_DEFS_H_
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long uint64_t;
+
+#define CACHELINE_SIZE 64
+
+// Debugging message macros.
+#if DEBUG_LEVEL >= 1
+  #define INFO_MSG(args...) printf(args)
+
+  #if DEBUG_LEVEL >= 2
+    #define PRINT_MSG(args...) printf(args)
+    #define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
+        print_debug(hid, rows, cols, num_cols)
+    #define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
+        print_debug4d(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
+        print_debug4d_fp16(hid, num, height, rows, cols)
+
+    #if DEBUG_LEVEL >= 3
+      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
+          print_debug(hid, rows, cols, num_cols)
+      #define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
+          print_debug4d(hid, rows, cols, height)
+      #define PRINT_MSG_V(args...) printf(args)
+    #else
+      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)
+      #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+      #define PRINT_MSG_V(args...)
+    #endif
+  #else
+    #define PRINT_MSG(args...)
+    #define PRINT_DEBUG(hid, rows, cols, num_cols)
+    #define PRINT_DEBUG4D(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+    #define PRINT_DEBUG_V(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+    #define PRINT_MSG_V(args...)
+  #endif
+#else
+  #define INFO_MSG(args...)
+  #define PRINT_DEBUG(hid, rows, cols, num_cols)
+  #define PRINT_DEBUG4D(hid, rows, cols, height)
+  #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+  #define PRINT_MSG(args...)
+  #define PRINT_DEBUG_V(hid, rows, cols, height)
+  #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+  #define PRINT_MSG_V(args...)
+#endif
+
+#define STRING(arg) #arg
+
+// This is to avoid a ton of spurious unused variable warnings when
+// we're not building for gem5.
+#define UNUSED(x) (void)(x)
+
+// Macros for computing the maximum of a group of elements.
+//
+// Why macros and not functions (or a loop)? A loop takes O(n) cycles to
+// compute the maximum, when it could be done in O(log n) time with a tree
+// based implementation. But Aladdin regards function calls as a hard
+// dependency that it does not optimize across, so we would not get the
+// parallelism we expect from the tree. Thus, these must be macros.
+//
+// I've only implemented a few of these. These are only meant for the pooling
+// layers, and we shouldn't need more than a 3x3 pooling layer anyways.
+#define max2(A, B) (((A) > (B)) ? (A) : (B))
+#define max3(e0, e1, e2) max2(max2(e0, e1), e2)
+#define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3))
+#define max8(e0, e1, e2, e3, e4, e5, e6, e7)                                   \
+    max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
+#define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8)                               \
+    max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
+
+#define min2(A, B) (((A) < (B)) ? (A) : (B))
+
+#define FRAC_CEIL(A, B) ((A) / (B) + ((A) % (B) != 0))
+// Convenience macros to switch between invoking an accelerator (if building a
+// binary for gem5) or just calling the kernel function in software.
+//
+// Usage:
+//
+//  These macros expand differently based on whether the GEM5_HARNESS macro is
+//  defined. If so, then this binary is meant to be run under gem5, invoking
+//  accelerators; if not, this binary should run the pure software version of
+//  the accelerated kernels.
+//
+//  If GEM5_HARNESS is defined:
+//
+//     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
+//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize)
+//
+//     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
+//        ===>   invokeAcceleratorAndBlock(myReqCode)
+//
+//  Otherwise:
+//     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
+//        expands to nothing
+//
+//     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
+//        ===>  kernelFuncName(args)
+//
+#ifdef GEM5_HARNESS
+
+#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
+    mapArrayToAccelerator(req_code, name, base_addr, size)
+#define INVOKE_KERNEL(req_code, kernel_ptr, args...)                           \
+    do {                                                                       \
+        UNUSED(kernel_ptr);                                                    \
+        invokeAcceleratorAndBlock(req_code);                                   \
+    } while (0)
+#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
+    do {                                                                       \
+        UNUSED(kernel_ptr);                                                    \
+        invokeAcceleratorAndReturn2(req_code, finish_flag);                    \
+    } while (0)
+
+#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);       \
+    } while (0)
+#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);        \
+    } while (0)
+#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);        \
+    } while (0)
+#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);         \
+    } while (0)
+
+#else
+
+#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
+    do {                                                                       \
+        INFO_MSG("Mapping array %s @ %p, size %d.\n",                          \
+                 name, (void*)base_addr, (int)(size));                         \
+        UNUSED(req_code);                                                      \
+        UNUSED(name);                                                          \
+        UNUSED(base_addr);                                                     \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args)
+#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
+    kernel_ptr(args)
+#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+
+#endif
+
+// Simplified version of MAP_ARRAY_TO_ACCEL.
+//
+// This assumes that the current name of the base pointer is also the name of
+// the array in the top level function of the dynamic trace. THIS IS VERY
+// IMPORTANT - if the argument passed to a top level function has been renamed in
+// the function, then this WILL NOT WORK!
+//
+// MAP_ARRAY(myReqCode, myArray, mySize)
+//    ===>   MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize)
+#define MAP_ARRAY(req_code, name_and_base_addr, size)                          \
+    MAP_ARRAY_TO_ACCEL(                                                        \
+            req_code, STRING(name_and_base_addr), name_and_base_addr, size)
+
+// Use these convenience macros to cast a raw pointer into a multidimensional
+// variable-length array, which lets us use [] notation inside of the ugly
+// sub2ind syntax!
+//
+// Usage:
+//   If we have an array like array[5][4]:
+//      ARRAY_2D(TYPE, output_name, array, 4);
+//
+//   If we have an array like array[5][4][3]:
+//      ARRAY_3D(TYPE, output_name, array, 4, 3);
+//
+//   If we have an array like array[5][4][3][2]
+//      ARRAY_4D(TYPE, output_name, array, 4, 3, 2);
+//
+//   And so on...
+#define ARRAY_1D(TYPE, output_array_name, input_array_name)                    \
+    TYPE* output_array_name = (TYPE*)input_array_name
+
+#define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1)             \
+    TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
+
+#define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2)      \
+    TYPE(*output_array_name)[DIM_1][DIM_2] =                                   \
+        (TYPE(*)[DIM_1][DIM_2])input_array_name
+
+#define ARRAY_4D(                                                              \
+    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3)            \
+        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] =                        \
+            (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
+
+#define ARRAY_5D(                                                              \
+    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4)     \
+        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] =                 \
+            (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.c
new file mode 100644
index 0000000000000000000000000000000000000000..81bce54469886153170f994a77250a784cc9b7d7
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.c
@@ -0,0 +1,73 @@
+#include <assert.h>
+#include <string.h>
+#include "dma_interface.h"
+
+// All _dmaImplN functions must be always inlined or we'll get extra functions
+// in the trace.
+
+#if defined(DMA_INTERFACE_V3)
+
+// Starting with version 3, all versioning will be distinguished by the return
+// value of the DMA functions.
+
+__attribute__((__always_inline__))
+int _dmaImpl3(void* dst_addr, void* src_addr, size_t size) {
+  assert(size > 0);
+  memmove(dst_addr, src_addr, size);
+  return 3;
+}
+
+int dmaLoad(void* dst_addr, void* src_host_addr, size_t size) {
+  return _dmaImpl3(dst_addr, src_host_addr, size);
+}
+
+int dmaStore(void* dst_host_addr, void* src_addr, size_t size) {
+  return _dmaImpl3(dst_host_addr, src_addr, size);
+}
+
+int setReadyBits(void* start_addr, size_t size, unsigned value) {
+  asm("");
+  return 0;
+}
+
+#elif defined(DMA_INTERFACE_V2)
+
+// With version 2 and earlier, we return (void*)NULL and use the number of
+// function arguments to distinguish the DMA functions.
+
+__attribute__((__always_inline__))
+void* _dmaImpl2(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+  assert(size > 0);
+  memmove(base_addr + dst_off, base_addr + src_off, size);
+  return NULL;
+}
+
+void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+  return _dmaImpl2(base_addr, src_off, dst_off, size);
+}
+
+void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+  return _dmaImpl2(base_addr, src_off, dst_off, size);
+}
+
+#else
+
+__attribute__((__always_inline__))
+void* _dmaImpl1(void* base_addr, size_t offset, size_t size) {
+  assert(size > 0);
+  asm("");
+  return NULL;
+}
+
+void* dmaLoad(void* addr, size_t offset, size_t size) {
+  return _dmaImpl1(addr, offset, size);
+}
+
+void* dmaStore(void* addr, size_t offset, size_t size) {
+  return _dmaImpl1(addr, offset, size);
+}
+#endif
+
+void dmaFence() {
+  asm("");
+}
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..f23234eede4df99db84b144646530dfe240c6e62
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.h
@@ -0,0 +1,44 @@
+#ifndef __DMA_INTERFACE_H__
+#define __DMA_INTERFACE_H__
+
+#include <stddef.h>
+
+#define PAGE_SIZE 4096
+
+#if defined(DMA_INTERFACE_V3)
+
+// Version 3 of the DMA interface enables memcpy operations from arbitrary
+// source and destination addresses.
+
+int dmaLoad(void* dst_addr, void* src_host_addr, size_t size);
+int dmaStore(void* dst_host_addr, void* src_addr, size_t size);
+
+// The user can explicitly toggle the state of ready bits, if ready mode is
+// enabled. This requires support from DMA v3.
+int setReadyBits(void* start_addr, size_t size, unsigned value);
+
+#elif defined(DMA_INTERFACE_V2)
+
+#warning "DMA interface v2 is deprecated!"
+
+// Version 2 of the DMA interface separates source and destination offsets from
+// the base address into different fields, and on the host machine, memory is
+// actually copied from source to destination (the memory copy will not show up
+// in the trace).
+
+void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size);
+void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size);
+
+#else
+
+#warning "DMA interface v1 is deprecated!"
+
+// Version 1 of the DMA interface is now deprecated and will be removed entirely.
+
+void* dmaLoad(void* addr, size_t offset, size_t size);
+void* dmaStore(void* addr, size_t offset, size_t size);
+
+#endif
+void dmaFence();
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/gem5_harness.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/gem5_harness.h
new file mode 100644
index 0000000000000000000000000000000000000000..36859cfe1ba67bc8197d7ea3961adfbeb95c70b1
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/gem5_harness.h
@@ -0,0 +1,10 @@
+#ifndef _GEM5_HARNESS_H_
+#define _GEM5_HARNESS_H_
+
+/* One header to include them all. */
+
+//#include "aladdin_sys_connection.h"
+//#include "aladdin_sys_constants.h"
+#include "dma_interface.h"
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.c
new file mode 100644
index 0000000000000000000000000000000000000000..124fe0b7d175c2655feac562ecd6e2a5b73cc96a
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.c
@@ -0,0 +1,335 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "utility.h"
+#include "pipe_stages.h"
+#include "load_cam_model.h"
+
+// Get color space transform
+float* get_Ts(char* cam_model_path) {
+  float *Ts;
+  int err = posix_memalign((void **)&Ts, CACHELINE_SIZE, sizeof(float) * 9);
+  assert(err == 0 && "Failed to allocate memory!");
+  char *line;
+  char *str;
+  float line_data[3];
+  size_t len = 0;
+  int line_idx = 0;
+
+  // Open file for reading
+  char file_name[] = "raw2jpg_transform.txt";
+  char file_path[100];
+  strcpy(file_path, cam_model_path);
+  strcat(file_path, file_name);
+  FILE *fp = fopen(file_path, "r");
+  if (fp == NULL) {
+    printf("Didn't find the camera model file!\n");
+    exit(1);
+  }
+
+  while (getline(&line, &len, fp) != -1) {
+    str = strtok(line, " \n");
+    int i = 0;
+    while (str != NULL) {
+      line_data[i] = atof(str); 
+      str = strtok(NULL, " \n");
+      i++;
+    }
+
+    if (line_idx >= 1 && line_idx <= 3) {
+      for (int j = 0; j < 3; j++) {
+        Ts[(line_idx - 1) * 3 + j] = line_data[j];
+      }
+    }
+    line_idx = line_idx + 1;
+  }
+  fclose(fp);
+  free(line);
+  return Ts;
+}
+
+// Get white balance transform
+float* get_Tw(char* cam_model_path, int wb_index) {
+  float *Tw;
+  int err = posix_memalign((void **)&Tw, CACHELINE_SIZE, sizeof(float) * 9);
+  assert(err == 0 && "Failed to allocate memory!");
+  char *line;
+  char *str;
+  float line_data[3];
+  size_t len = 0;
+  int line_idx = 0;
+
+  // Calculate base for the white balance transform selected
+  // For more details see the camera model readme
+  int wb_base  = 8 + 5*(wb_index-1);
+
+  // Open file for reading
+  // Open file for reading
+  char file_name[] = "raw2jpg_transform.txt";
+  char file_path[100];
+  strcpy(file_path, cam_model_path);
+  strcat(file_path, file_name);
+  FILE *fp = fopen(file_path, "r");
+  if (fp == NULL) {
+    printf("Didn't find the camera model file!\n");
+    exit(1);
+  }
+
+  // Read a line at a time
+  while (getline(&line, &len, fp) != -1) {
+    str = strtok(line, " \n");
+    int i = 0;
+    while (str != NULL) {
+      line_data[i] = atof(str); 
+      str = strtok(NULL, " \n");
+      i++;
+    }
+
+    if (line_idx == wb_base) {
+      // Convert the white balance vector into a diagaonal matrix
+      for (int i=0; i<3; i++) {
+        for (int j=0; j<3; j++) {
+          if (i == j) {
+            Tw[i * 3 + j] = line_data[i];
+          } else {
+            Tw[i * 3 + j] = 0.0;
+          }
+        }
+      }
+    }
+    line_idx = line_idx + 1;
+  }
+  fclose(fp);
+  free(line);
+  return Tw;
+}
+
+
+// Get combined transforms for checking
+float* get_TsTw(char* cam_model_path, int wb_index) {
+  float *TsTw;
+  int err = posix_memalign((void **)&TsTw, CACHELINE_SIZE, sizeof(float) * 9);
+  assert(err == 0 && "Failed to allocate memory!");
+  char *line;
+  char *str;
+  float line_data[3];
+  size_t len = 0;
+  int line_idx = 0;
+
+  // Calculate base for the white balance transform selected
+  // For more details see the camera model readme
+  int wb_base  = 5 + 5*(wb_index-1);
+
+  // Open file for reading
+  char file_name[] = "raw2jpg_transform.txt";
+  char file_path[100];
+  strcpy(file_path, cam_model_path);
+  strcat(file_path, file_name);
+  FILE *fp = fopen(file_path, "r");
+  if (fp == NULL) {
+    printf("Didn't find the camera model file!\n");
+    exit(1);
+  }
+
+  // Read a line at a time
+  while (getline(&line, &len, fp) != -1) {
+    str = strtok(line, " \n");
+    int i = 0;
+    while (str != NULL) {
+      line_data[i] = atof(str); 
+      str = strtok(NULL, " \n");
+      i++;
+    }
+
+    if (line_idx >= wb_base && line_idx <= (wb_base + 2)) {
+      for (int j = 0; j < 3; j++) {
+        TsTw[(line_idx - wb_base) * 3 + j] = line_data[j];
+      }
+    }
+    line_idx = line_idx + 1;
+  }
+  fclose(fp);
+  free(line);
+  return TsTw;
+}
+
+// Get control points
+float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) {
+  float *ctrl_pnts;
+  int err = posix_memalign((void **)&ctrl_pnts, CACHELINE_SIZE,
+                           sizeof(float) * num_cntrl_pts * 3);
+  assert(err == 0 && "Failed to allocate memory!");
+  char *line;
+  char *str;
+  float line_data[3];
+  size_t len = 0;
+  int line_idx = 0;
+
+  // Open file for reading
+  char file_name[] = "raw2jpg_ctrlPoints.txt";
+  char file_path[100];
+  strcpy(file_path, cam_model_path);
+  strcat(file_path, file_name);
+  FILE *fp = fopen(file_path, "r");
+  if (fp == NULL) {
+    printf("Didn't find the camera model file!\n");
+    exit(1);
+  }
+
+  // Read a line at a time
+  while (getline(&line, &len, fp) != -1) {
+    str = strtok(line, " \n");
+    int i = 0;
+    while (str != NULL) {
+      line_data[i] = atof(str);
+      str = strtok(NULL, " \n");
+      i++;
+    }
+
+    if (line_idx >= 1 && line_idx <= num_cntrl_pts) {
+      for (int j = 0; j < 3; j++) {
+        ctrl_pnts[(line_idx - 1) * 3 + j] = line_data[j];
+      }
+    }
+    line_idx = line_idx + 1;
+  }
+  fclose(fp);
+  free(line);
+  return ctrl_pnts;
+}
+
+// Get weights
+float* get_weights(char* cam_model_path, int num_cntrl_pts) {
+  float *weights;
+  int err = posix_memalign((void **)&weights, CACHELINE_SIZE,
+                           sizeof(float) * num_cntrl_pts * 3);
+  assert(err == 0 && "Failed to allocate memory!");
+  char *line;
+  char *str;
+  float line_data[3];
+  size_t len = 0;
+  int line_idx = 0;
+
+  // Open file for reading
+  char file_name[] = "raw2jpg_coefs.txt";
+  char file_path[100];
+  strcpy(file_path, cam_model_path);
+  strcat(file_path, file_name);
+  FILE *fp = fopen(file_path, "r");
+  if (fp == NULL) {
+    printf("Didn't find the camera model file!\n");
+    exit(1);
+  }
+
+  // Read a line at a time
+  while (getline(&line, &len, fp) != -1) {
+    str = strtok(line, " \n");
+    int i = 0;
+    while (str != NULL) {
+      line_data[i] = atof(str);
+      str = strtok(NULL, " \n");
+      i++;
+    }
+
+    if (line_idx >= 1 && line_idx <= num_cntrl_pts) {
+      for (int j = 0; j < 3; j++) {
+        weights[(line_idx - 1) * 3 + j] = line_data[j];
+      }
+    }
+    line_idx = line_idx + 1;
+  }
+  fclose(fp);
+  free(line);
+  return weights;
+}
+
+// Get coeficients
+float* get_coefs(char* cam_model_path, int num_cntrl_pts) {
+  float *coefs;
+  int err = posix_memalign((void **)&coefs, CACHELINE_SIZE, sizeof(float) * 12);
+  assert(err == 0 && "Failed to allocate memory!");
+  char *line;
+  char *str;
+  float line_data[3];
+  size_t len = 0;
+  int line_idx = 0;
+
+  // Open file for reading
+  char file_name[] = "raw2jpg_coefs.txt";
+  char file_path[100];
+  strcpy(file_path, cam_model_path);
+  strcat(file_path, file_name);
+  FILE *fp = fopen(file_path, "r");
+  if (fp == NULL) {
+    printf("Didn't find the camera model file!\n");
+    exit(1);
+  }
+
+  // Read a line at a time
+  while (getline(&line, &len, fp) != -1) {
+    str = strtok(line, " \n");
+    int i = 0;
+    while (str != NULL) {
+      line_data[i] = atof(str);
+      str = strtok(NULL, " \n");
+      i++;
+    }
+
+    if (line_idx >= (num_cntrl_pts + 1) && line_idx <= (num_cntrl_pts + 4)) {
+      for (int j = 0; j < 3; j++) {
+        coefs[(line_idx - num_cntrl_pts - 1) * 3 + j] = line_data[j];
+      }
+    }
+    line_idx = line_idx + 1;
+  }
+  fclose(fp);
+  free(line);
+  return coefs;
+}
+
+
+// Get tone mapping table
+float* get_tone_map(char* cam_model_path) {
+  float *tone_map;
+  int err = posix_memalign((void **)&tone_map, CACHELINE_SIZE,
+                           sizeof(float) * 256 * CHAN_SIZE);
+  assert(err == 0 && "Failed to allocate memory!");
+  char *line;
+  char *str;
+  float line_data[3];
+  size_t len = 0;
+  int line_idx = 0;
+
+  // Open file for reading
+  char file_name[] = "raw2jpg_respFcns.txt";
+  char file_path[100];
+  strcpy(file_path, cam_model_path);
+  strcat(file_path, file_name);
+  FILE *fp = fopen(file_path, "r");
+  if (fp == NULL) {
+    printf("Didn't find the camera model file!\n");
+    exit(1);
+  }
+
+  // Read a line at a time
+  while (getline(&line, &len, fp) != -1) {
+    str = strtok(line, " \n");
+    int i = 0;
+    while (str != NULL) {
+      line_data[i] = atof(str);
+      str = strtok(NULL, " \n");
+      i++;
+    }
+
+    if (line_idx >= 1 && line_idx <= 256) {
+      for (int j = 0; j < 3; j++) {
+        tone_map[(line_idx - 1) * 3 + j] = line_data[j];
+      }
+    }
+    line_idx = line_idx + 1;
+  }
+  fclose(fp);
+  free(line);
+  return tone_map;
+}
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e5ee95217c901e57250bcce6b3cfc37cd9d6ce7
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.h
@@ -0,0 +1,25 @@
+#ifndef _LOAD_CAM_MODEL_H_
+#define _LOAD_CAM_MODEL_H_
+
+// Get color space transform
+float *get_Ts(char *cam_model_path);
+
+// Get white balance transform
+float *get_Tw(char *cam_model_path, int wb_index);
+
+// Get combined transforms for checking
+float *get_TsTw(char *cam_model_path, int wb_index);
+
+// Get control points
+float *get_ctrl_pts(char *cam_model_path, int num_cntrl_pts);
+
+// Get weights
+float *get_weights(char *cam_model_path, int num_cntrl_pts);
+
+// Get coeficients
+float *get_coefs(char *cam_model_path, int num_cntrl_pts);
+
+// Get tone mapping table
+float *get_tone_map(char *cam_model_path);
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc75beb66419ae3a23f2d9578d9b5c19a238c4fe
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main.c
@@ -0,0 +1,156 @@
+
+#include <stdlib.h>
+#include "utility.h"
+#include "visc.h"
+#include "defs.h"
+
+
+typedef struct __attribute__((__packed__)) {
+    float* input; size_t bytes_input;
+    float* result; size_t bytes_result;
+} 
+RootIn;
+
+/*
+typedef struct __attribute__((__packed__)) {
+    uint8_t *input; size_t bytes_input;
+    uint8_t *result; size_t bytes_result;
+    float *input_scaled; size_t bytes_input_scaled; 
+    float *result_scaled; size_t bytes_result_scaled;
+    float *demosaic_out; size_t bytes_demosaic_out;
+    float *denoise_out; size_t bytes_denoise_out;
+    float *transform_out; size_t bytes_transform_out;
+    float *gamut_out;size_t bytes_gamut_out;
+    float *TsTw; size_t bytes_TsTw;
+    float *ctrl_pts; size_t bytes_ctrl_pts;
+    float *weights; size_t bytes_weights;
+    float*coefs; size_t bytes_coefs;
+    float *l2_dist; size_t bytes_l2_dist;
+    float *tone_map; size_t bytes_tone_map;
+    int row_size; int col_size;
+} 
+RootIn;
+*/
+
+
+
+void scale_values(float* input, size_t num_elems) {
+
+  __visc__hint(DEVICE);
+  __visc__attributes(1, input, 1, input);
+  
+  for (int ind = 0; ind < num_elems; ind++){
+    input[ind] = input[ind] * 2.0;
+  }
+  
+  __visc__return(1, num_elems);
+}
+
+
+
+
+void graphRoot(/*0*/ float* input, /*1*/ size_t bytes_input, 
+               /*2*/ float* result, /*3*/ size_t bytes_result) {
+
+  //Specifies compilation target for current node
+  __visc__hint(CPU_TARGET);
+
+  __visc__attributes(2, input, result, 2, input, result);
+
+  // Create an 0D (specified by 1st argument) HPVM node - so a single node
+  // associated with node function ---_fxp_wrapper
+
+  void* scaleNode = __visc__createNodeND(0, scale_values);
+    
+  // BindIn binds inputs of current node with specified node
+  // - destination node
+  // - argument position in argument list of function of source node
+  // - argument position in argument list of function of destination node
+  // - streaming (1) or non-streaming (0)
+
+  // Edge transfers data between nodes within the same level of hierarchy.
+  // - source and destination dataflow nodes
+  // - edge type, all-all (1) or one-one(0)
+  // - source position (in output struct of source node)
+  // - destination position (in argument list of destination node)
+  // - streaming (1) or non-streaming (0)
+
+  // scale_fxp inputs
+  __visc__bindIn(scaleNode, 0, 0, 0); // input -> ScNode:input
+  __visc__bindIn(scaleNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
+    
+  // Similar to bindIn, but for the output. Output of a node is a struct, and
+  // we consider the fields in increasing ordering.
+  __visc__bindOut(scaleNode, 0, 0, 0);
+    
+}
+
+
+
+
+
+int main(int argc, char* argv[]) {
+
+    size_t input_size = 100;
+    size_t result_size = 100;
+
+    size_t input_bytes = input_size * sizeof(float);
+    size_t result_bytes = result_size * sizeof(float);
+
+    // This is host_input in cam_pipe()
+    float* input = (float*) malloc(input_bytes);
+    for(unsigned int i = 0; i < input_size; i++){
+      input[i] = 1.0;
+    }
+    // This is host_result in cam_pipe()
+    float* result = (float*) malloc(result_bytes);
+
+   
+    __visc__init();
+
+    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
+
+    // Set up HPVM DFG inputs in the rootArgs struct.
+    rootArgs->input = input;
+    rootArgs->bytes_input = input_size;
+
+    printf("input = %d input_bytes = %d \n", input, input_bytes);
+    
+    rootArgs->result = result;
+    rootArgs->bytes_result = result_size;
+
+  
+    llvm_visc_track_mem(input, input_bytes);
+    llvm_visc_track_mem(result, result_bytes);
+
+
+    void* testDFG = __visc__launch(0, graphRoot, (void*) rootArgs);
+    __visc__wait(testDFG);
+
+
+    printf("input = %d \n", input);
+    
+    llvm_visc_request_mem(input, input_bytes);
+    //llvm_visc_request_mem(result, result_bytes);
+
+    printf("requested mem \n");
+
+    for(unsigned int i = 0; i < input_size; i++){
+      printf("input[%d] = %f \n", i, input[i]);
+    }
+
+    //llvm_visc_untrack_mem(input);
+    //llvm_visc_untrack_mem(result);
+
+    printf ("untracked mem \n");
+    
+    __visc__cleanup();
+
+    printf ("cleaned up visc");
+
+    return 0;
+}
+
+
+
+    
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main_old.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main_old.c
new file mode 100644
index 0000000000000000000000000000000000000000..ea42ad0bf87fd8e0b337ea1e7d0ad803025e849e
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main_old.c
@@ -0,0 +1,865 @@
+#include <argp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <math.h>
+#include "utility.h"
+
+#include "cam_pipe_utility.h"
+#include "pipe_stages.h"
+#include "load_cam_model.h"
+
+#include "visc.h"
+
+int NUM_TEST_CASES;
+int NUM_CLASSES;
+int INPUT_DIM;
+int NUM_WORKER_THREADS;
+
+// Type of struct that is used to pass arguments to the HPVM dataflow graph
+// using the hpvm launch operation
+typedef struct __attribute__((__packed__)) {
+    uint8_t *input; size_t bytes_input;
+    uint8_t *result; size_t bytes_result;
+    float *input_scaled; size_t bytes_input_scaled; 
+    float *result_scaled; size_t bytes_result_scaled;
+    float *demosaic_out; size_t bytes_demosaic_out;
+    float *denoise_out; size_t bytes_denoise_out;
+    float *transform_out; size_t bytes_transform_out;
+    float *gamut_out;size_t bytes_gamut_out;
+    float *TsTw; size_t bytes_TsTw;
+    float *ctrl_pts; size_t bytes_ctrl_pts;
+    float *weights; size_t bytes_weights;
+    float*coefs; size_t bytes_coefs;
+    float *l2_dist; size_t bytes_l2_dist;
+    float *tone_map; size_t bytes_tone_map;
+    int row_size; int col_size;
+} 
+RootIn;
+
+typedef enum _argnum {
+    RAW_IMAGE_BIN,
+    OUTPUT_IMAGE_BIN,
+    NUM_REQUIRED_ARGS,
+    DATA_FILE = NUM_REQUIRED_ARGS,
+    NUM_ARGS,
+} argnum;
+
+typedef struct _arguments {
+    char* args[NUM_ARGS];
+    int num_inputs;
+    int num_threads;
+} arguments;
+
+static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n";
+static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary";
+static struct argp_option options[] = {
+    { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 },
+    { "data-file", 'f', "F", 0,
+      "File to read data and weights from (if data-init-mode == READ_FILE or "
+      "save-params is true). *.txt files are decoded as text files, while "
+      "*.bin files are decoded as binary files." },
+};
+
+static error_t parse_opt(int key, char* arg, struct argp_state* state) {
+    arguments* args = (arguments*)(state->input);
+    switch (key) {
+        case 'n': {
+            args->num_inputs = strtol(arg, NULL, 10);
+            break;
+        }
+        case 'f': {
+            args->args[DATA_FILE] = arg;
+            break;
+        }
+        case 't': {
+            args->num_threads = strtol(arg, NULL, 10);
+            break;
+        }
+        case ARGP_KEY_ARG: {
+            if (state->arg_num >= NUM_REQUIRED_ARGS)
+                argp_usage(state);
+            args->args[state->arg_num] = arg;
+            break;
+        }
+        case ARGP_KEY_END: {
+            if (state->arg_num < NUM_REQUIRED_ARGS) {
+                fprintf(stderr,
+                        "Not enough arguments! Got %d, require %d.\n",
+                        state->arg_num,
+                        NUM_REQUIRED_ARGS);
+                argp_usage(state);
+            }
+            break;
+        }
+        default:
+            return ARGP_ERR_UNKNOWN;
+    }
+    return 0;
+}
+
+void set_default_args(arguments* args) {
+    args->num_inputs = 1;
+    args->num_threads = 0;
+    for (int i = 0; i < NUM_ARGS; i++) {
+        args->args[i] = NULL;
+    }
+}
+
+static struct argp parser = { options, parse_opt, args_doc, prog_doc };
+
+// Helper function for printing intermediate results
+void descale_cpu(float *input, size_t bytes_input, 
+                 uint8_t *output, size_t bytes_result,
+                 int row_size, int col_size) {
+  
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++) {
+        int index = (chan*row_size + row) * col_size + col;
+        output[index] = min(max(input[index] * 255, 0), 255);
+      }
+}
+
+static void sort(float arr[], int n) {
+    int i, j;
+    for (i = 0; i < n - 1; i++)
+        for (j = 0; j < n - i - 1; j++)
+            if (arr[j] > arr[j + 1]) {
+                float temp = arr[j];
+                arr[j] = arr[j + 1];
+                arr[j + 1] = temp;
+            }
+}
+
+/**************************************************************/
+/*** HPVM Leaf node Functions - Performing the computations ***/
+/**************************************************************/
+
+// In this benchmark, no use of HPVM query intrinsics in the leaf node functions
+
+// Leaf HPVM node function for scale
+void scale_fxp(uint8_t *input, size_t bytes_input, 
+               float *output, size_t bytes_output,
+               int row_size, int col_size) {
+
+  //Specifies compilation target for current node
+  __visc__hint(DEVICE);
+
+  // Specifies pointer arguments that will be used as "in" and "out" arguments
+  // - count of "in" arguments
+  // - list of "in" argument , and similar for "out"
+  __visc__attributes(2, input, output, 1, output);
+  
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++){
+        int index = (chan*row_size + row) * col_size + col;
+        output[index] = input[index] * 1.0 / 255;
+      }
+  __visc__return(1, bytes_output);
+}
+
+// Leaf HPVM node function for descale
+void descale_fxp(float *input, size_t bytes_input, 
+                 uint8_t *output, size_t bytes_result,
+                 int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(2, input, output, 1, output);
+  
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++) {
+        int index = (chan*row_size + row) * col_size + col;
+        output[index] = min(max(input[index] * 255, 0), 255);
+      }
+  __visc__return(1, bytes_result);
+}
+
+// Leaf HPVM node function for demosaicing
+void demosaic_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(2, input, result, 1, result);
+  
+  for (int row = 1; row < row_size - 1; row++)
+    for (int col = 1; col < col_size - 1; col++) {
+        int index_0 = (0 * row_size + row) * col_size + col;
+        int index_1 = (1 * row_size + row) * col_size + col;
+        int index_2 = (2 * row_size + row) * col_size + col;
+        if (row % 2 == 0 && col % 2 == 0) {
+            // Green pixel
+            // Getting the R values
+            float R1 = input[index_0 - 1];
+            float R2 = input[index_0 + 1];
+            // Getting the B values
+            float B1 = input[index_2 - col_size];
+            float B2 = input[index_2 + col_size];
+            // R
+            result[index_0] = (R1 + R2) / 2;
+            // G
+            result[index_1] = input[index_1] * 2;
+            // B
+            result[index_2] = (B1 + B2) / 2;
+        } else if (row % 2 == 0 && col % 2 == 1) {
+            // Red pixel
+            // Getting the G values
+            float G1 = input[index_1 - col_size];
+            float G2 = input[index_1 + col_size];
+            float G3 = input[index_1 - 1];
+            float G4 = input[index_1 + 1];
+            // Getting the B values
+            float B1 = input[index_2 - col_size - 1];
+            float B2 = input[index_2 - col_size + 1];
+            float B3 = input[index_2 + col_size - 1];
+            float B4 = input[index_2 + col_size + 1];
+            // R
+            result[index_0] = input[index_0];
+            // G
+            result[index_1] = (G1 + G2 + G3 + G4) / 2;
+            // B (center pixel)
+            result[index_2] = (B1 + B2 + B3 + B4) / 4;
+        } else if (row % 2 == 1 && col % 2 == 0) {
+            // Blue pixel
+            // Getting the R values
+            float R1 = input[index_0 - col_size - 1];
+            float R2 = input[index_0 + col_size - 1];
+            float R3 = input[index_0 - col_size + 1];
+            float R4 = input[index_0 + col_size + 1];
+            // Getting the G values
+            float G1 = input[index_1 - col_size];
+            float G2 = input[index_1 + col_size];
+            float G3 = input[index_1 - 1];
+            float G4 = input[index_1 + 1];
+            // R
+            result[index_0] = (R1 + R2 + R3 + R4) / 4;
+            // G
+            result[index_1] = (G1 + G2 + G3 + G4) / 2;
+            // B
+            result[index_2] = input[index_2];
+        } else {
+            // Bottom Green pixel
+            // Getting the R values
+            float R1 = input[index_0 - col_size];
+            float R2 = input[index_0 + col_size];
+            // Getting the B values
+            float B1 = input[index_2 - 1];
+            float B2 = input[index_2 + 1];
+            // R
+            result[index_0] = (R1 + R2) / 2;
+            // G
+            result[index_1] = input[index_1] * 2;
+            // B
+            result[index_2] = (B1 + B2) / 2;
+        }
+      }
+  __visc__return(1, bytes_result);
+}
+
+// Leaf HPVM node function for denoise
+void denoise_fxp(float *input, size_t bytes_input, 
+                 float *result, size_t bytes_result,
+                 int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(2, input, result, 1, result);
+  
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++)
+        if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
+          float filter[9];
+          for (int i = -1; i < 2; i++)
+            for (int j = -1; j < 2; j++) {
+              int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1;
+              filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)];
+            }
+          sort(filter, 9);
+          result[(chan * row_size + row) * col_size + col] = filter[4];
+        } else {
+      result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col];
+        }
+  __visc__return(1, bytes_result);
+}
+
+// Leaf HPVM node function, for color map and white balance transform
+void transform_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *TsTw_tran, size_t bytes_TsTw,
+                   int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(3, input, result, TsTw_tran, 1, result);
+  
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++) {
+        int index = (chan * row_size + row) * col_size + col;
+        int index_0 = (0 * row_size + row) * col_size + col;
+        int index_1 = (1 * row_size + row) * col_size + col;
+        int index_2 = (2 * row_size + row) * col_size + col;
+        int index_2d_0 = 0 * CHAN_SIZE + chan;
+        int index_2d_1 = 1 * CHAN_SIZE + chan;
+        int index_2d_2 = 2 * CHAN_SIZE + chan;
+        result[index] =
+            max(input[index_0] * TsTw_tran[index_2d_0] +
+                input[index_1] * TsTw_tran[index_2d_1] +
+                input[index_2] * TsTw_tran[index_2d_2],
+                0);
+      }
+  __visc__return(1, bytes_result);
+}
+
+// Leaf HPVM node function, for gamut mapping
+void gamut_map_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights,
+                   float *coefs, size_t bytes_coefs,
+                   float *l2_dist, size_t bytes_l2_dist,
+                   int row_size, int col_size) {
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist);
+
+ // First, get the L2 norm from every pixel to the control points,
+ // Then, sum it and weight it. Finally, add the bias.
+
+  for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      for (int cp = 0; cp < 3702; cp++) {
+        int index_0 = (0 * row_size + row) * col_size + col;
+        int index_1 = (1 * row_size + row) * col_size + col;
+        int index_2 = (2 * row_size + row) * col_size + col;
+        float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); 
+        float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+        float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
+        float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
+        float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); 
+        float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+        float val = val1 * val2 + val3 * val4 + val5 * val6;
+        float sqrt_val = sqrt(val);
+        l2_dist[cp] = sqrt_val;
+      }
+      for (int chan = 0; chan < CHAN_SIZE; chan++) {
+        float chan_val = 0.0;
+        for (int cp = 0; cp < 3702; cp++) {
+          chan_val += l2_dist[cp] * weights[cp * CHAN_SIZE + chan];
+        }
+        chan_val += coefs[0 * CHAN_SIZE + chan] + 
+                    coefs[1 * CHAN_SIZE + chan] * input[(0 * row_size + row) * col_size + col] +
+                    coefs[2 * CHAN_SIZE + chan] * input[(1 * row_size + row) * col_size + col] +
+                    coefs[3 * CHAN_SIZE + chan] * input[(2 * row_size + row) * col_size + col];
+        result[(chan * row_size + row) * col_size + col] = max(chan_val, 0);
+      }
+    }
+  __visc__return(1, bytes_result);
+}
+
+// HPVM leaf node function, for tone mapping
+void tone_map_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  float *tone_map, size_t bytes_tone_map,
+                  int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(3, input, result, tone_map, 1, result);
+  
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    for (int row = 0; row < row_size; row++)
+      for (int col = 0; col < col_size; col++) {
+        int index = (chan * row_size + row) * col_size + col;
+        uint8_t x = input[index] * 255;
+        result[index] = tone_map[x * CHAN_SIZE + chan];
+      }
+  __visc__return(1, bytes_result);
+}
+
+/********************************************************************/
+/*** HPVM Internal node Functions - Determine the graph structure ***/
+/********************************************************************/
+
+// We create a wrapper node per leaf node - this is an implementation
+// requirement for the FPGA backend . The CPU backend also supports this,
+// so it does not cause a portability issue.
+
+void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       int row_size, int col_size) {
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(2, input, result, 1, result);
+
+  // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic
+  // instance (last argument) associated with node function scale_fxp
+  void *ScaleNode = __visc__createNodeND(1, scale_fxp, (size_t)1);
+
+  // Binds inputs of current node with specified node
+  // - destination node
+  // - argument position in argument list of function of source node
+  // - argument position in argument list of function of destination node
+  // - streaming (1) or non-streaming (0)
+  __visc__bindIn(ScaleNode, 0, 0, 0); // bind input
+  __visc__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(ScaleNode, 2, 2, 0); // bind result
+  __visc__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(ScaleNode, 4, 4, 0); // bind row_size
+  __visc__bindIn(ScaleNode, 5, 5, 0); // bind col_size
+
+  // Similar to bindIn, but for the output. Output of a node is a struct, and
+  // we consider the fields in increasing ordering.
+  __visc__bindOut(ScaleNode, 0, 0, 0);
+}
+
+void descale_fxp_wrapper(float *input, size_t bytes_input, 
+                       uint8_t *result, size_t bytes_result,
+                       int row_size, int col_size) {
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(2, input, result, 1, result);
+  void *DescaleNode = __visc__createNodeND(1, descale_fxp, (size_t)1);
+  __visc__bindIn(DescaleNode, 0, 0, 0); // bind input
+  __visc__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(DescaleNode, 2, 2, 0); // bind result
+  __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size
+  __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size
+  
+  __visc__bindOut(DescaleNode, 0, 0, 0);
+}
+
+void demosaic_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       int row_size, int col_size) {
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(2, input, result, 1, result);
+  void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, (size_t)1);
+  __visc__bindIn(DemosaicNode, 0, 0, 0); // bind input
+  __visc__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(DemosaicNode, 2, 2, 0); // bind result
+  __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
+  __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size
+  
+  __visc__bindOut(DemosaicNode, 0, 0, 0);
+}
+
+void denoise_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       int row_size, int col_size) {
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(2, input, result, 1, result);
+  void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, (size_t)1);
+  __visc__bindIn(DenoiseNode, 0, 0, 0); // bind input
+  __visc__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(DenoiseNode, 2, 2, 0); // bind result
+  __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
+  __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size
+  
+  __visc__bindOut(DenoiseNode, 0, 0, 0);
+}
+
+void transform_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       float *TsTw_tran, size_t bytes_TsTw,
+                       int row_size, int col_size) {
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(3, input, result, TsTw_tran, 1, result);
+  void *TransformNode = __visc__createNodeND(1, transform_fxp, (size_t)1);
+  __visc__bindIn(TransformNode, 0, 0, 0); // bind input
+  __visc__bindIn(TransformNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(TransformNode, 2, 2, 0); // bind result
+  __visc__bindIn(TransformNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(TransformNode, 4, 4, 0); // bind tstw
+  __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
+  __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size
+  __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size
+  
+  __visc__bindOut(TransformNode, 0, 0, 0);
+}
+
+void gamut_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       float *ctrl_pts, size_t bytes_ctrl_pts,
+                       float *weights, size_t bytes_weights,
+                       float *coefs, size_t bytes_coefs,
+                       float *l2_dist, size_t bytes_l2_dist,
+                       int row_size, int col_size) {
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
+  void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, (size_t)1);
+  __visc__bindIn(GamutNode, 0, 0, 0); // bind input
+  __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(GamutNode, 2, 2, 0); // bind result
+  __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts
+  __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts
+  __visc__bindIn(GamutNode, 6, 6, 0); // bind weights
+  __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights
+  __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs
+  __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs
+  __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
+  __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
+  __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size
+  __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size
+  
+  __visc__bindOut(GamutNode, 0, 0, 0);
+}
+void tone_map_fxp_wrapper(float *input, size_t bytes_input, 
+                       float *result, size_t bytes_result,
+                       float *tone_map, size_t bytes_tone_map,
+                       int row_size, int col_size) {
+
+  __visc__hint(CPU_TARGET);
+  __visc__attributes(3, input, result, tone_map, 1, result);
+  void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, (size_t)1);
+  __visc__bindIn(ToneMapNode, 0, 0, 0); // bind input
+  __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
+  __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result
+  __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
+  __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map 
+  __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
+  __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
+  __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size
+  
+  __visc__bindOut(ToneMapNode, 0, 0, 0);
+}
+
+
+/*** ROOT Node - Top Level of the Graph Hierarchy ***/
+void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input, 
+                 /*2*/ uint8_t *result,        /*3*/ size_t bytes_result,
+                 /*4*/ float *input_scaled,    /*5*/ size_t bytes_input_scaled,
+                 /*6*/ float *result_scaled,   /*7*/ size_t bytes_result_scaled,
+                 /*8*/ float *demosaic_out,    /*9*/ size_t bytes_demosaic_out,
+                 /*10*/ float *denoise_out,    /*11*/ size_t bytes_denoise_out,
+                 /*12*/ float *transform_out,  /*13*/ size_t bytes_transform_out,
+                 /*14*/ float *gamut_out,      /*15*/ size_t bytes_gamut_out,
+                 /*16*/ float *TsTw,           /*17*/ size_t bytes_TsTw,
+                 /*18*/ float *ctrl_pts,       /*19*/ size_t bytes_ctrl_pts,
+                 /*20*/ float *weights,        /*21*/ size_t bytes_weights,
+                 /*22*/ float*coefs,           /*23*/ size_t bytes_coefs,
+                 /*24*/ float *l2_dist,        /*25*/ size_t bytes_l2_dist,
+                 /*26*/ float *tone_map,       /*27*/ size_t bytes_tone_map,
+                 /*28*/ int row_size,          /*29*/ int col_size) {
+
+  //Specifies compilation target for current node
+    __visc__hint(CPU_TARGET);
+
+  // Specifies pointer arguments that will be used as "in" and "out" arguments
+  // - count of "in" arguments
+  // - list of "in" argument , and similar for "out"
+    __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, 
+                       transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, 
+                       5, result, demosaic_out, denoise_out, transform_out, gamut_out);
+
+  // Create an 0D (specified by 1st argument) HPVM node - so a single node
+  // associated with node function ---_fxp_wrapper
+    void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper);
+    void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper);
+    void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper);
+    void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper);
+    void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper);
+    void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper);
+    void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper);
+    
+  // BindIn binds inputs of current node with specified node
+  // - destination node
+  // - argument position in argument list of function of source node
+  // - argument position in argument list of function of destination node
+  // - streaming (1) or non-streaming (0)
+
+  // Edge transfers data between nodes within the same level of hierarchy.
+  // - source and destination dataflow nodes
+  // - edge type, all-all (1) or one-one(0)
+  // - source position (in output struct of source node)
+  // - destination position (in argument list of destination node)
+  // - streaming (1) or non-streaming (0)
+
+    // scale_fxp inputs
+    __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input
+    __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
+    __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result
+    __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result
+    __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
+    __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
+
+    // demosaic_fxp inputs
+    __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
+    __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input
+    __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result
+    __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result
+    __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size 
+    __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
+
+    // denoise_fxp inputs
+    __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
+    __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input
+    __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
+    __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
+    __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size 
+    __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
+    
+    // transform_fxp inputs
+    __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
+    __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input
+    __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
+    __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result
+    __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann
+    __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
+    __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size 
+    __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
+    
+    // gamut_fxp inputs
+    __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
+    __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input
+    __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
+    __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
+    __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
+    __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
+    __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
+    __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
+    __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
+    __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
+    __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
+    __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
+    __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size 
+    __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
+    
+    // tone_map_fxp inputs
+    __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
+    __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input
+    __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
+    __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
+    __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
+    __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
+    __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size 
+    __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
+
+    // descale_fxp inputs
+    __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
+    __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input
+    __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result
+    __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result
+    __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
+    __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
+
+  // Similar to bindIn, but for the output. Output of a node is a struct, and
+  // we consider the fields in increasing ordering.
+    __visc__bindOut(DsNode, 0, 0, 0);
+    
+}
+
+int main(int argc, char* argv[]) {
+    // Parse the arguments.
+    arguments args;
+    set_default_args(&args);
+    argp_parse(&parser, argc, argv, 0, 0, &args);
+
+    // Read a raw image.
+    // NOTE: We deliberately perform this file I/O outside of the kernel.
+    printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
+    int row_size, col_size;
+    uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
+
+    printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
+
+    // Allocate a buffer for storing the output image data.
+    // (This is currently the same size as the input image data.)
+    size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
+    size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
+    uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image);
+
+    __visc__init();
+
+    ///////////////////////////////////////////////////////////////
+    // Camera Model Parameters
+    ///////////////////////////////////////////////////////////////
+    // Path to the camera model to be used
+//    char cam_model_path[100];
+//    char cam_model_path = "cam_models/NikonD7000/";
+    // White balance index (select white balance from transform file)
+    // The first white balance in the file has a wb_index of 1
+    // For more information on model format see the readme
+    int wb_index = 6;
+
+    // Number of control points
+    int num_ctrl_pts = 3702;
+    uint8_t *input, *result;
+    float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out;
+    float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
+
+    TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
+    float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
+    free(TsTw);
+    TsTw = trans;
+    ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
+    weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
+    coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
+    tone_map = get_tone_map("cam_models/NikonD7000/");
+    
+    input_scaled = (float*) malloc_aligned(bytes_fimage);
+    result_scaled = (float*) malloc_aligned(bytes_fimage);
+    demosaic_out = (float*) malloc_aligned(bytes_fimage);
+    denoise_out = (float*) malloc_aligned(bytes_fimage);
+    transform_out  = (float*) malloc_aligned(bytes_fimage);
+    gamut_out = (float*) malloc_aligned(bytes_fimage);
+    l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);    
+    
+    // This is host_input in cam_pipe()
+    input = (uint8_t*) malloc_aligned(bytes_image);
+    convert_hwc_to_chw(image_in, row_size, col_size, &input);
+    
+    // This is host_result in cam_pipe()
+    result = (uint8_t*) malloc_aligned(bytes_image);
+
+    // Allocate struct to pass DFG inputs
+    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
+
+    // Set up HPVM DFG inputs in the rootArgs struct.
+    rootArgs->input = input;
+    rootArgs->bytes_input = bytes_image;
+    
+    rootArgs->result = result;
+    rootArgs->bytes_result = bytes_image;
+    
+    rootArgs->input_scaled = input_scaled;
+    rootArgs->bytes_input_scaled = bytes_fimage;
+    
+    rootArgs->result_scaled = result_scaled;
+    rootArgs->bytes_result_scaled = bytes_fimage;
+    
+    rootArgs->demosaic_out = demosaic_out;
+    rootArgs->bytes_demosaic_out = bytes_fimage;
+    
+    rootArgs->denoise_out = denoise_out;
+    rootArgs->bytes_denoise_out = bytes_fimage;
+    
+    rootArgs->transform_out = transform_out;
+    rootArgs->bytes_transform_out = bytes_fimage;
+
+    rootArgs->gamut_out = gamut_out;
+    rootArgs->bytes_gamut_out = bytes_fimage;
+
+    rootArgs->TsTw = TsTw;
+    rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->ctrl_pts = ctrl_pts;
+    rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->weights = weights;
+    rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->coefs = coefs;
+    rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->tone_map = tone_map;
+    rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
+    
+    rootArgs->l2_dist = l2_dist;
+    rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
+    
+    rootArgs->row_size = row_size;
+    rootArgs->col_size = col_size;
+
+    // Memory tracking is required for pointer arguments.
+    // Nodes can be scheduled on different targets, and 
+    // dataflow edge implementation needs to request data.
+    // The pair (pointer, size) is inserted in memory tracker using this call
+    llvm_visc_track_mem(input, bytes_image);
+    llvm_visc_track_mem(result, bytes_image);
+    llvm_visc_track_mem(input_scaled, bytes_fimage);
+    llvm_visc_track_mem(result_scaled, bytes_fimage);
+    llvm_visc_track_mem(demosaic_out, bytes_fimage);
+    llvm_visc_track_mem(denoise_out, bytes_fimage);
+    llvm_visc_track_mem(transform_out, bytes_fimage);
+    llvm_visc_track_mem(gamut_out, bytes_fimage);
+    llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); 
+    llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+    llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+    llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float));
+    llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
+    llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
+    
+    printf("\n\nLaunching CAVA pipeline!\n");
+
+    void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs);
+    __visc__wait(camPipeDFG);
+
+    printf("\n\nPipeline execution completed!\n");
+    printf("\n\nRequesting memory!\n");
+
+    // Request data from graph.    
+    llvm_visc_request_mem(result, bytes_image);
+    llvm_visc_request_mem(demosaic_out, bytes_fimage);
+    llvm_visc_request_mem(denoise_out, bytes_fimage);
+    llvm_visc_request_mem(transform_out, bytes_fimage);
+    llvm_visc_request_mem(gamut_out, bytes_fimage);
+    printf("\n\nDone requesting memory!\n");
+
+
+    uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+  uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+    uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
+    
+  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size);
+    descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size);
+    descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size);
+    descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size);
+    
+    convert_chw_to_hwc(result, row_size, col_size, &image_out);
+   convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
+    convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic);
+    convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise);
+    convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform);
+
+    
+    // Remove tracked pointers.
+    llvm_visc_untrack_mem(input);
+    llvm_visc_untrack_mem(result);
+    llvm_visc_untrack_mem(input_scaled);
+    llvm_visc_untrack_mem(result_scaled);
+    llvm_visc_untrack_mem(demosaic_out);
+    llvm_visc_untrack_mem(denoise_out);
+    llvm_visc_untrack_mem(transform_out);
+    llvm_visc_untrack_mem(gamut_out);
+    
+    llvm_visc_untrack_mem(TsTw); 
+    llvm_visc_untrack_mem(ctrl_pts);
+    llvm_visc_untrack_mem(weights);
+    llvm_visc_untrack_mem(coefs);
+    llvm_visc_untrack_mem(tone_map);
+    llvm_visc_untrack_mem(l2_dist);
+
+    // Output the image.
+    // NOTE: We deliberately perform this file I/O outside of the kernel.
+  char str[50], base_str[50];
+  strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]);
+  strcpy(str, base_str);
+  strcat(str, ".bin");
+  printf("Writing output image to %s\n", str);
+  write_image_to_binary(str, image_out, row_size, col_size);
+  strcpy(str, base_str);
+  strcat(str, "_gamut.bin");
+  printf("Writing output image to %s\n", str);
+  write_image_to_binary(str, image_out_gamut, row_size, col_size);
+  strcpy(str, base_str);
+  strcat(str, "_demosaic.bin");
+  printf("Writing output image to %s\n", str);
+  write_image_to_binary(str, image_out_demosaic, row_size, col_size);
+  strcpy(str, base_str);
+  strcat(str, "_denoise.bin");
+  printf("Writing output image to %s\n", str);
+  write_image_to_binary(str, image_out_denoise, row_size, col_size);
+  strcpy(str, base_str);
+  strcat(str, "_transform.bin");
+  printf("Writing output image to %s\n", str);
+  write_image_to_binary(str, image_out_transform, row_size, col_size);
+
+    __visc__cleanup();
+
+    return 0;
+}
+
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/new_main.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/new_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6e36ba1db0d4d2e7806256b4c31e5955917e68c
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/new_main.c
@@ -0,0 +1,115 @@
+
+
+#include <stdlib.h>
+
+
+typedef struct __attribute__((__packed__)) {
+    float* input; size_t bytes_input;
+    float* result; size_t bytes_result;
+} 
+RootIn;
+
+
+
+
+void scale_values(float* input, size_t bytes_input) {
+
+  __visc__hint(DEVICE);
+  __visc__attributes(1, input, 1, input);
+  
+  for (int ind = 0; ind < bytes_input; ind++){
+    input[ind] = input[ind] * 2.0;
+  }
+  
+  __visc__return(1, bytes_input);
+}
+
+
+
+
+void graphRoot(/*0*/ float* input, /*1*/ size_t bytes_input, 
+               /*2*/ float* result, /*3*/ size_t bytes_result) {
+
+  //Specifies compilation target for current node
+  __visc__hint(CPU_TARGET);
+
+  __visc__attributes(2, input, result, 2, input, result);
+
+  // Create an 0D (specified by 1st argument) HPVM node - so a single node
+  // associated with node function ---_fxp_wrapper
+
+  void* scaleNode = __visc__createNodeND(0, scale_values);
+    
+  // BindIn binds inputs of current node with specified node
+  // - destination node
+  // - argument position in argument list of function of source node
+  // - argument position in argument list of function of destination node
+  // - streaming (1) or non-streaming (0)
+
+  // Edge transfers data between nodes within the same level of hierarchy.
+  // - source and destination dataflow nodes
+  // - edge type, all-all (1) or one-one(0)
+  // - source position (in output struct of source node)
+  // - destination position (in argument list of destination node)
+  // - streaming (1) or non-streaming (0)
+
+  // scale_fxp inputs
+  __visc__bindIn(scaleNode, 0, 0, 0); // input -> ScNode:input
+  __visc__bindIn(scaleNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
+    
+  // Similar to bindIn, but for the output. Output of a node is a struct, and
+  // we consider the fields in increasing ordering.
+  __visc__bindOut(scaleNode, 0, 0, 0);
+    
+}
+
+
+
+
+
+int main(int argc, char* argv[]) {
+
+    __visc__init();
+
+    size_t input_size = 100;
+    size_t result_size = 100;
+
+    size_t input_bytes = input_size * sizeof(float);
+    size_t result_bytes = result_size * sizeof(float);
+
+    // This is host_input in cam_pipe()
+    input = (float*) malloc(input_bytes);
+    // This is host_result in cam_pipe()
+    result = (float*) malloc(result_bytes);
+
+
+    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
+
+    // Set up HPVM DFG inputs in the rootArgs struct.
+    rootArgs->input = input;
+    rootArgs->bytes_input = input_bytes;
+    
+    rootArgs->result = result;
+    rootArgs->bytes_result = result_bytes;
+
+    llvm_visc_track_mem(input, input_bytes);
+    llvm_visc_track_mem(result, result_bytes);
+
+
+    void* testDFG = __visc__launch(0, graphRoot, (void*) rootArgs);
+    __visc__wait(testDFG);
+
+
+    llvm_visc_request_mem(result, input_bytes);
+    llvm_visc_request_mem(result, result_bytes);
+
+
+
+    __visc__cleanup();
+
+    return 0;
+}
+
+
+
+    
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ebedec936915b5e7f11881c5001c84b6db26474
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.c
@@ -0,0 +1,290 @@
+#include <stdio.h>
+#include <math.h>
+#include "pipe_stages.h"
+#include "cam_pipe_utility.h"
+
+//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
+void scale_fxp(uint8_t *input, size_t bytes_input, 
+               float *output, size_t bytes_output,
+               int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(2, input, output, 1, output);
+  
+  ARRAY_3D(uint8_t, _input, input, row_size, col_size);
+  ARRAY_3D(float, _output, output, row_size, col_size);
+  sl_chan:
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    sl_row:
+    for (int row = 0; row < row_size; row++)
+      sl_col:
+      for (int col = 0; col < col_size; col++)
+        _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255;
+
+  __visc__return(1, bytes_output);
+}
+
+//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
+void descale_fxp(float *input, size_t bytes_input, 
+                 uint8_t *output, size_t bytes_result,
+                 int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(2, input, output, 1, output);
+  
+  ARRAY_3D(float, _input, input, row_size, col_size);
+  ARRAY_3D(uint8_t, _output, output, row_size, col_size);
+  dsl_chan:
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    dsl_row:
+    for (int row = 0; row < row_size; row++)
+      dsl_col:
+      for (int col = 0; col < col_size; col++)
+        _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255);
+
+  __visc__return(1, bytes_output);
+}
+
+// Demosaicing stage
+// G R
+// B G
+//void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
+void demosaic_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(2, input, result, 1, result);
+  
+  printf("Demosaicing.\n");
+  ARRAY_3D(float, _input, input, row_size, col_size);
+  ARRAY_3D(float, _result, result, row_size, col_size);
+
+  dm_row:
+  for (int row = 1; row < row_size - 1; row++)
+    dm_col:
+    for (int col = 1; col < col_size - 1; col++)
+        if (row % 2 == 0 && col % 2 == 0) {
+            // Green pixel
+            // Getting the R values
+            float R1 = _input[0][row][col - 1];
+            float R2 = _input[0][row][col + 1];
+            // Getting the B values
+            float B1 = _input[2][row - 1][col];
+            float B2 = _input[2][row + 1][col];
+            // R
+            _result[0][row][col] = (R1 + R2) / 2;
+            // G
+            _result[1][row][col] = _input[1][row][col] * 2;
+            // B
+            _result[2][row][col] = (B1 + B2) / 2;
+        } else if (row % 2 == 0 && col % 2 == 1) {
+            // Red pixel
+            // Getting the G values
+            float G1 = _input[1][row - 1][col];
+            float G2 = _input[1][row + 1][col];
+            float G3 = _input[1][row][col - 1];
+            float G4 = _input[1][row][col + 1];
+            // Getting the B values
+            float B1 = _input[2][row - 1][col - 1];
+            float B2 = _input[2][row - 1][col + 1];
+            float B3 = _input[2][row + 1][col - 1];
+            float B4 = _input[2][row + 1][col + 1];
+            // R
+            _result[0][row][col] = _input[0][row][col];
+            // G
+            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+            // B (center pixel)
+            _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
+        } else if (row % 2 == 1 && col % 2 == 0) {
+            // Blue pixel
+            // Getting the R values
+            float R1 = _input[0][row - 1][col - 1];
+            float R2 = _input[0][row + 1][col - 1];
+            float R3 = _input[0][row - 1][col + 1];
+            float R4 = _input[0][row + 1][col + 1];
+            // Getting the G values
+            float G1 = _input[1][row - 1][col];
+            float G2 = _input[1][row + 1][col];
+            float G3 = _input[1][row][col - 1];
+            float G4 = _input[1][row][col + 1];
+            // R
+            _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
+            // G
+            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+            // B
+            _result[2][row][col] = _input[2][row][col];
+        } else {
+            // Bottom Green pixel
+            // Getting the R values
+            float R1 = _input[0][row - 1][col];
+            float R2 = _input[0][row + 1][col];
+            // Getting the B values
+            float B1 = _input[2][row][col - 1];
+            float B2 = _input[2][row][col + 1];
+            // R
+            _result[0][row][col] = (R1 + R2) / 2;
+            // G
+            _result[1][row][col] = _input[1][row][col] * 2;
+            // B
+            _result[2][row][col] = (B1 + B2) / 2;
+        }
+
+  __visc__return(1, bytes_result);
+}
+
+static void sort(float arr[], int n) {
+    int i, j;
+    dn_sort_i:
+    for (i = 0; i < n - 1; i++)
+        dn_sort_j:
+        for (j = 0; j < n - i - 1; j++)
+            if (arr[j] > arr[j + 1]) {
+                float temp = arr[j];
+                arr[j] = arr[j + 1];
+                arr[j + 1] = temp;
+            }
+}
+
+// Simple denoise
+//void denoise_fxp(float *input, int row_size, int col_size, float *result) {
+void denoise_fxp(float *input, size_t bytes_input, 
+                 float *result, size_t bytes_result,
+                 int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(2, input, result, 1, result);
+  
+  printf("Denoising.\n");
+  ARRAY_3D(float, _input, input, row_size, col_size);
+  ARRAY_3D(float, _result, result, row_size, col_size);
+
+  dn_chan:
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    dn_row:
+    for (int row = 0; row < row_size; row++)
+      dn_col:
+      for (int col = 0; col < col_size; col++)
+        if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
+          float filter[9];
+          dn_slide_row:
+          for (int i = row-1; i < row+2; i++)
+            dn_slide_col:
+            for (int j = col-1; j < col+2; j++) {
+              int index = (i - row + 1) * 3 + j - col + 1;
+              filter[index] = _input[chan][i][j];
+            }
+          sort(filter, 9);
+          _result[chan][row][col] = filter[4];
+        } else {
+          _result[chan][row][col] = _input[chan][row][col];
+        }
+  __visc__return(1, bytes_result);
+}
+
+// Color map and white balance transform
+//void transform_fxp(float *input, int row_size, int col_size, float *result,
+//                   float *TsTw_tran) {
+void transform_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *TsTw_tran, size_t bytes_TsTw,
+                   int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(3, input, result, TsTw_tran, 1, result);
+  
+  printf("Color mapping.\n");
+  ARRAY_3D(float, _input, input, row_size, col_size);
+  ARRAY_3D(float, _result, result, row_size, col_size);
+  ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3);
+
+  tr_chan:
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    tr_row:
+    for (int row = 0; row < row_size; row++)
+      tr_col:
+      for (int col = 0; col < col_size; col++)
+        _result[chan][row][col] =
+            max(_input[0][row][col] * _TsTw_tran[0][chan] +
+                    _input[1][row][col] * _TsTw_tran[1][chan] +
+                    _input[2][row][col] * _TsTw_tran[2][chan],
+                0);
+  __visc__return(1, bytes_result);
+}
+
+//
+// Weighted radial basis function for gamut mapping
+//
+//void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
+//                   float *ctrl_pts, float *weights, float *coefs, float *l2_dist) {
+void gamut_map_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights,
+                   float *coefs, size_t bytes_coefs,
+                   float *l2_dist, size_t bytes_l2_dist,
+                   int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
+  
+  printf("Gamut mapping.\n");
+  ARRAY_3D(float, _input, input, row_size, col_size);
+  ARRAY_3D(float, _result, result, row_size, col_size);
+  ARRAY_2D(float, _ctrl_pts, ctrl_pts, 3);
+  ARRAY_2D(float, _weights, weights, 3);
+  ARRAY_2D(float, _coefs, coefs, 3);
+
+  // First, get the L2 norm from every pixel to the control points,
+  // Then, sum it and weight it. Finally, add the bias.
+  gm_rbf_row:
+  for (int row = 0; row < row_size; row++)
+    gm_rbf_col:
+    for (int col = 0; col < col_size; col++) {
+      gm_rbf_cp0:
+      for (int cp = 0; cp < num_ctrl_pts; cp++) {
+        l2_dist[cp] =
+            sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
+                     (_input[0][row][col] - _ctrl_pts[cp][0]) +
+                 (_input[1][row][col] - _ctrl_pts[cp][1]) *
+                     (_input[1][row][col] - _ctrl_pts[cp][1]) +
+                 (_input[2][row][col] - _ctrl_pts[cp][2]) *
+                     (_input[2][row][col] - _ctrl_pts[cp][2]));
+      }
+      gm_rbf_chan:
+      for (int chan = 0; chan < CHAN_SIZE; chan++) {
+        float chan_val = 0.0;
+        gm_rbf_cp1:
+        for (int cp = 0; cp < num_ctrl_pts; cp++) {
+          chan_val += l2_dist[cp] * _weights[cp][chan];
+        }
+        // Add on the biases for the RBF
+        chan_val += _coefs[0][chan] + _coefs[1][chan] * _input[0][row][col] +
+                    _coefs[2][chan] * _input[1][row][col] +
+                    _coefs[3][chan] * _input[2][row][col];
+        _result[chan][row][col] = max(chan_val, 0);
+      }
+    }
+  __visc__return(1, bytes_result);
+}
+
+// Tone mapping
+//void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
+//                  float *result) {
+void tone_map_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  float *tone_map, size_t bytes_tone_map,
+                  int row_size, int col_size) {
+  __visc__hint(DEVICE);
+  __visc__attributes(3, input, result, tone_map, 1, result);
+  
+  printf("Tone mapping.\n");
+  ARRAY_3D(float, _input, input, row_size, col_size);
+  ARRAY_3D(float, _result, result, row_size, col_size);
+  ARRAY_2D(float, _tone_map, tone_map, 3);
+
+  tm_chan:
+  for (int chan = 0; chan < CHAN_SIZE; chan++)
+    tm_row:
+    for (int row = 0; row < row_size; row++)
+      tm_col:
+      for (int col = 0; col < col_size; col++) {
+        uint8_t x = _input[chan][row][col] * 255;
+        _result[chan][row][col] = _tone_map[x][chan];
+      }
+  __visc__return(1, bytes_result);
+}
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.h
new file mode 100644
index 0000000000000000000000000000000000000000..eae4347b991fe948173fc85334c65f084d40b745
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.h
@@ -0,0 +1,63 @@
+#ifndef _PIPE_STAGES_H_
+#define _PIPE_STAGES_H_
+
+#include "defs.h"
+
+#define CHAN_SIZE 3
+
+#define ISP 0x4
+
+#define max(a,b) \
+  ({ __typeof__ (a) _a = (a); \
+      __typeof__ (b) _b = (b); \
+    _a > _b ? _a : _b; })
+
+#define min(a,b) \
+  ({ __typeof__ (a) _a = (a); \
+      __typeof__ (b) _b = (b); \
+    _a < _b ? _a : _b; })
+
+#define abs(a) \
+  ({ __typeof__ (a) _a = (a); \
+    _a < 0 ? -_a : _a; })
+
+extern int num_ctrl_pts;
+
+void scale_fxp(uint8_t *input, size_t bytes_input, 
+               float *output, size_t bytes_output,
+               int row_size, int col_size);
+
+void descale_fxp(float *input, size_t bytes_input, 
+                 uint8_t *output, size_t bytes_result,
+                 int row_size, int col_size);
+
+void demosaic_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  int row_size, int col_size);
+
+void denoise_fxp(float *input, size_t bytes_input, 
+                 float *result, size_t bytes_result,
+                 int row_size, int col_size);
+
+void transform_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *TsTw_tran, size_t bytes_TsTw,
+                   int row_size, int col_size);
+
+void gamut_map_fxp(float *input, size_t bytes_input, 
+                   float *result, size_t bytes_result,
+                   float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights,
+                   float *coefs, size_t bytes_coefs,
+                   float *l2_dist, size_t bytes_l2_dist,
+                   int row_size, int col_size);
+
+void tone_map_fxp(float *input, size_t bytes_input, 
+                  float *result, size_t bytes_result,
+                  float *tone_map, size_t bytes_tone_map,
+                  int row_size, int col_size);
+
+void tone_map_approx_fxp(float *input, int row_size, int col_size,
+                         float *result);
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/rename.sh b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/rename.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3b6faac63aef89cc51f776ed7e5f1e048f8da815
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/rename.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+for f in *.cc; do 
+    mv -- "$f" "${f%.cc}.c"
+done
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.c
new file mode 100644
index 0000000000000000000000000000000000000000..c1eaee3333c2afffdcae827f956efa4e25705352
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.c
@@ -0,0 +1,11 @@
+#include <stdlib.h>
+#include <assert.h>
+#include "defs.h"
+#include "utility.h"
+
+void *malloc_aligned(size_t size) {
+  void *ptr = NULL;
+  int err = posix_memalign((void **)&ptr, CACHELINE_SIZE, size);
+  assert(err == 0 && "Failed to allocate memory!");
+  return ptr;
+}
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc407c72a3c251dc5628bc90e23e71040a074a32
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.h
@@ -0,0 +1,8 @@
+#ifndef _COMMON_UTILITY_H_
+#define _COMMON_UTILITY_H_
+
+#include <stddef.h>
+
+void *malloc_aligned(size_t size);
+
+#endif
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/visc.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/visc.h
new file mode 100644
index 0000000000000000000000000000000000000000..a263e352523431a868637b1849ac532d7ed716a9
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/visc.h
@@ -0,0 +1,110 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef DEVICE
+#define DEVICE GPU_TARGET
+#endif
+
+#include "llvm/SupportVISC/VISCHint.h"
+
+#ifdef __cplusplus
+extern "C" {
+void __visc__hint(visc::Target);
+//void __visc__wait(void*);
+#else
+void __visc__hint(enum Target);
+//void __visc__wait(unsigned);
+#endif
+
+#ifdef __cplusplus
+//void* __visc__node(...);
+//void* __visc__createNode(...);
+//void* __visc__createNode1D(...);
+//void* __visc__createNode2D(...);
+//void* __visc__createNode3D(...);
+//void __visc__return(...);
+#endif
+
+void* __visc__createNodeND(unsigned,...);
+void __visc__return(unsigned, ...);
+
+void __visc__attributes(unsigned, ...);
+void __visc__init();
+void __visc__cleanup();
+
+void __visc__bindIn(void*, unsigned, unsigned, unsigned);
+void __visc__bindOut(void*, unsigned, unsigned, unsigned);
+void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned);
+void __visc__push(void*, void*);
+void* __visc__pop(void*);
+void* __visc__launch(unsigned, ...);
+void __visc__wait(void*);
+
+void* __visc__getNode();
+void* __visc__getParentNode(void*);
+void __visc__barrier();
+void* __visc__malloc(long);
+long __visc__getNodeInstanceID_x(void*);
+long __visc__getNodeInstanceID_y(void*);
+long __visc__getNodeInstanceID_z(void*);
+long __visc__getNumNodeInstances_x(void*);
+long __visc__getNumNodeInstances_y(void*);
+long __visc__getNumNodeInstances_z(void*);
+
+// Atomic
+// signed int
+int __visc__atomic_cmpxchg(int*, int, int);
+int __visc__atomic_add(int*, int);
+int __visc__atomic_sub(int*, int);
+int __visc__atomic_xchg(int*, int);
+int __visc__atomic_inc(int*);
+int __visc__atomic_dec(int*);
+int __visc__atomic_min(int*, int);
+int __visc__atomic_max(int*, int);
+int __visc__atomic_umax(int*, int);
+int __visc__atomic_umin(int*, int);
+int __visc__atomic_and(int*, int);
+int __visc__atomic_or(int*, int);
+int __visc__atomic_xor(int*, int);
+
+// Special Func
+float __visc__floor(float);
+float __visc__rsqrt(float);
+float __visc__sqrt(float);
+float __visc__sin(float);
+float __visc__cos(float);
+// unsigned int
+//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
+//unsigned __visc__atomic_add(unsigned*, unsigned);
+//unsigned __visc__atomic_sub(unsigned*, unsigned);
+//unsigned __visc__atomic_xchg(unsigned*, unsigned);
+//unsigned __visc__atomic_inc(unsigned*);
+//unsigned __visc__atomic_dec(unsigned*);
+//unsigned __visc__atomic_min(unsigned*, unsigned);
+//unsigned __visc__atomic_max(unsigned*, unsigned);
+//unsigned __visc__atomic_and(unsigned*, unsigned);
+//unsigned __visc__atomic_or(unsigned*, unsigned);
+//unsigned __visc__atomic_xor(unsigned*, unsigned);
+
+
+#include <unistd.h>
+
+long get_global_id(int);
+long get_group_id(int);
+long get_local_id(int);
+long get_local_size(int);
+
+
+void llvm_visc_track_mem(void*, size_t);
+void llvm_visc_untrack_mem(void*);
+void llvm_visc_request_mem(void*, size_t);
+
+#ifdef __cplusplus
+}
+#endif
+