diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f6a82b3e3d84b98630f45bc1c95961ea6968d4e9 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/Makefile @@ -0,0 +1,254 @@ +# This Makefile compiles the HPVM-CAVA pilot project. +# It builds HPVM-related dependencies, then the native camera pipeline ISP code. +# +# Paths to some dependencies (e.g., HPVM, LLVM) must exist in Makefile.config, +# which can be copied from Makefile.config.example for a start. + +CONFIG_FILE := Makefile.config + +ifeq ($(wildcard $(CONFIG_FILE)),) + $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example) +endif +include $(CONFIG_FILE) + +# Compiler Flags + +DLEVEL ?= 0 +#CAM_CFLAGS?=-O0 \ + #-Wno-psabi \ + #-Wno-unused-label -Wno-unused-but-set-variable \ + #-Wno-maybe-uninitialized -DARCHITECTURE=SMV -DTRANSPOSE_WEIGHTS=1 \ + -DDEBUG_LEVEL=$(DLEVEL) +LFLAGS += -lm -lrt + +# Build dirs +ifeq ($(VERSION),) + VERSION = Default +endif +SRC_DIR = src/ +CAM_PIPE_SRC_DIR = $(SRC_DIR) +BUILD_DIR = build/$(TARGET)_$(VERSION) +CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +# Source files for the frontend camera pipeline +COMMON_SRCS = main.c \ + utility.c + + +CAM_PIPE_SRCS = load_cam_model.c \ + cam_pipe_utility.c \ + dma_interface.c # FIXME: This is a hack until external C++ files can be included in build. + + +# NOTE: We have temporarily removed gem5 and other dependencies for simplicity. +SRCS = $(COMMON_SRCS) $(CAM_PIPE_SRCS) +#GEM5_DMA_SRC = gem5/dma_interface.c +#GEM5_SYS_SRCS = gem5/aladdin_sys_connection.cpp gem5/aladdin_sys_constants.cpp +#GEM5_UTIL_SRCS = ../../util/m5/m5op_x86.S + +# NATIVE_FULL_PATH_SRCS contains all the full path source files for the camera vision pipeline. +NATIVE_FULL_PATH_SRCS = $(patsubst %, $(SRC_DIR)/%, $(COMMON_SRCS)) +NATIVE_FULL_PATH_SRCS += $(patsubst %, $(CAM_PIPE_SRC_DIR)/%, $(CAM_PIPE_SRCS)) + +# GEM5_FULL_PATH_SRCS contains all the full path source files for the gem5 files. +#GEM5_FULL_PATH_SRCS = $(patsubst %, $(ALADDIN_HOME)/%, $(GEM5_DMA_SRC) $(GEM5_SYS_SRCS) $(GEM5_UTIL_SRCS)) + +#INCLUDES += -I$(ALADDIN_HOME) \ +# -I$(ALADDIN_HOME)/../../include \ +# -I$(ALADDIN_HOME)/gem5 +INCLUDES += -I$(SRC_DIR) \ + -I$(CAM_PIPE_SRC_DIR) \ + +ifneq ($(CONFUSE_ROOT),) +INCLUDES += -I$(CONFUSE_ROOT)/include +LFLAGS += -L$(CONFUSE_ROOT)/lib +endif + +EXE = cava-visc-$(VERSION)-$(TARGET) + +CAM_CFLAGS += -mf16c -flax-vector-conversions +LFLAGS += -pthread + +#$(DEBUG): $(NATIVE_FULL_PATH_SRCS) $(GEM5_FULL_PATH_SRCS) +# @echo Building benchmark for native machine with debug support. +# #@mkdir -p $(BUILD_DIR) +# @$(CC) $(CAM_CFLAGS) -ggdb3 $(INCLUDES) -DGEM5 -DDMA_MODE -DDMA_INTERFACE_V3 -o $(DEBUG) $^ $(LFLAGS) +# +#clean-native: +# rm -f $(NATIVE) $(DEBUG) + +## BEGIN HPVM MAKEFILE +LANGUAGE=visc +SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll +OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c +VISC_OBJS=main.visc.ll +APP = $(EXE) +APP_CUDALDFLAGS=-lm -lstdc++ +APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3 +APP_CXXFLAGS=-ffast-math -O0 -I/opt/opencv/include +APP_LDFLAGS=$(LFLAGS) +OPT_FLAGS = -tti -targetlibinfo -tbaa -scoped-noalias -assumption-cache-tracker -profile-summary-info -forceattrs -inferattrs -ipsccp -globalopt -domtree -mem2reg -deadargelim -domtree -basicaa -aa -simplifycfg -pgo-icall-prom -basiccg -globals-aa -prune-eh -always-inline -functionattrs -domtree -sroa -early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -domtree -basicaa -aa -libcalls-shrinkwrap -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -licm -loop-unswitch -simplifycfg -domtree -basicaa -aa -loops -loop-simplify -lcssa-verification -lcssa -scalar-evolution -indvars -loop-idiom -loop-deletion -memdep -memcpyopt -sccp -domtree -demanded-bits -bdce -basicaa -aa -lazy-value-info -jump-threading -correlated-propagation -domtree -basicaa -aa -memdep -dse -loops -loop-simplify -lcssa-verification -lcssa -aa -scalar-evolution -licm -postdomtree -adce -simplifycfg -domtree -basicaa -aa -barrier -basiccg -rpo-functionattrs -globals-aa -float2int -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -loop-accesses -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-distribute -loop-simplify -lcssa-verification -lcssa -branch-prob -block-freq -scalar-evolution -basicaa -aa -loop-accesses -demanded-bits -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-vectorize -loop-simplify -scalar-evolution -aa -loop-accesses -loop-load-elim -basicaa -aa -simplifycfg -domtree -basicaa -aa -loops -scalar-evolution -alignment-from-assumptions -strip-dead-prototypes -domtree -loops -branch-prob -block-freq -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -branch-prob -block-freq -loop-sink -instsimplify + +CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS) +OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS) +CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS) +LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS) + +LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs +VISC_RT_PATH = $(LLVM_SRC_ROOT)/projects/visc-rt + +VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll +#LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc +LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc +#LIBCLC_NVPTX_LIB = nvptx64--nvidiacl.bc + +LLVM_34_AS = $(LLVM_34_ROOT)/build/bin/llvm-as + +TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce +KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl + +ifeq ($(TARGET),x86) + DEVICE = SPIR_TARGET + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_SPIR.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-spir -dfg2llvm-x86 -clearDFG + CFLAGS += -DOPENCL_CPU + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir +else ifeq ($(TARGET),seq) + DEVICE = CPU_TARGET + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS += -visc-timers-x86 +else ifeq ($(TARGET),fpga) + DEVICE = FPGA_TARGET + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_FPGA.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-fpga -dfg2llvm-x86 -clearDFG + CFLAGS += -DOPENCL_CPU + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-fpga +else + DEVICE = GPU_TARGET + VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMLocalMem.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -localmem -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG + VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx +endif + TESTGEN_OPTFLAGS += -visc-timers-gen + +CFLAGS += -DDEVICE=$(DEVICE) +CXXFLAGS += -DDEVICE=$(DEVICE) + +#ifeq ($(TIMER),x86) +# VISC_OPTFLAGS += -visc-timers-x86 +#else ifeq ($(TIMER),ptx) +# VISC_OPTFLAGS += -visc-timers-ptx +#else ifeq ($(TIMER),gen) +# TESTGEN_OPTFLAGS += -visc-timers-gen +#else ifeq ($(TIMER),spir) +# TESTGEN_OPTFLAGS += -visc-timers-spir +#else ifeq ($(TIMER),fpga) +# TESTGEN_OPTFLAGS += -visc-timers-fpga +#else ifeq ($(TIMER),no) +#else +# ifeq ($(TARGET),x86) +# VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir +# else ifeq ($(TARGET),seq) +# VISC_OPTFLAGS += -visc-timers-x86 +# else ifeq ($(TARGET),fpga) +# VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-fpga +# else ifeq ($(TARGET),seqx86) +# VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-spir +# else ifeq ($(TARGET),seqgpu) +# VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx +# else +# VISC_OPTFLAGS += -visc-timers-x86 -visc-timers-ptx +# endif +# TESTGEN_OPTFLAGS += -visc-timers-gen +#endif + +# Add BUILDDIR as a prefix to each element of $1 +INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1)) + +# Add SRCDIR as a prefix to each element of $1 +#INSRCDIR=$(addprefix $(SRCDIR)/,$(1)) + +PYTHON_LLVM_40_34 = ../llvm-40-34.py + +.PRECIOUS: $(BUILD_DIR)/%.ll + +OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS)) +TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) +KERNEL = $(TEST_OBJS).kernels.ll + +ifeq ($(TARGET),x86) + SPIR_ASSEMBLY = $(TEST_OBJS).kernels.bc +else ifeq ($(TARGET),seq) +else ifeq ($(TARGET),fpga) + AOC_CL = $(TEST_OBJS).kernels.cl + AOCL_ASSEMBLY = $(TEST_OBJS).kernels.aocx + BOARD = a10gx + ifeq ($(EMULATION),1) + EXE = cava-visc-emu + AOC_EMU = -march=emulator + BUILD_DIR = build/$(TARGET)-emu + endif +else + KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll + #KERNEL = $(TEST_OBJS).kernels.ll + PTX_ASSEMBLY = $(TEST_OBJS).nvptx.s +endif + +HOST_LINKED = $(BUILD_DIR)/$(APP).linked.ll +HOST = $(BUILD_DIR)/$(APP).host.ll + +ifeq ($(OPENCL_PATH),) +FAILSAFE=no_opencl +else +FAILSAFE= +endif + +# Targets +default: $(FAILSAFE) $(BUILD_DIR) $(EXE) +#default: $(FAILSAFE) $(BUILD_DIR) $(PTX_ASSEMBLY) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE) + +$(PTX_ASSEMBLY) : $(KERNEL_LINKED) + $(CC) $(KERNEL_GEN_FLAGS) -S $< -o $@ + +$(KERNEL_LINKED) : $(KERNEL) + $(LLVM_LINK) $(LIBCLC_NVPTX_LIB) -S $< -o $@ + +$(SPIR_ASSEMBLY) : $(KERNEL) + python $(PYTHON_LLVM_40_34) $< $(BUILD_DIR)/kernel_34.ll + $(LLVM_34_AS) $(BUILD_DIR)/kernel_34.ll -o $@ + +$(AOCL_ASSEMBLY) : $(AOC_CL) + aoc --report $(AOC_EMU) $(AOC_CL) -o $(AOCL_ASSEMBLY) -board=$(BOARD) + +$(AOC_CL) : $(KERNEL) + llvm-cbe --debug $(KERNEL) + +$(EXE) : $(HOST_LINKED) + $(CXX) -O3 $(LDFLAGS) $< -o $@ + +$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB) + $(LLVM_LINK) $^ -S -o $@ + +$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp + make -C $(LLVM_LIB_PATH) + +$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS) + $(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST) +# mv *.ll $(BUILD_DIR) +# $(OPT) -debug-only=DFG2LLVM_SPIR,DFG2LLVM_X86,DFG2LLVM_FPGA,GENVISC $(VISC_OPTFLAGS) -S $< -o $(HOST) +#$(OBJS): $(OBJS_SRC) +# $(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $< + +$(BUILD_DIR): + mkdir -p $(BUILD_DIR) + +$(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c + $(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $< + +$(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c + $(CC) $(CFLAGS) -emit-llvm -S -o $@ $< + +#$(BUILD_DIR)/main.opt.ll : $(BUILD_DIR)/main.ll +# $(OPT) $(OPT_FLAGS) $< -S -o $@ + +$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll + $(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@ + +## END HPVM MAKEFILE diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam-vision-native b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam-vision-native new file mode 100755 index 0000000000000000000000000000000000000000..938e1b1a15d86e71b9b044e594472dad10fd9bc2 Binary files /dev/null and b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam-vision-native differ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.c new file mode 100644 index 0000000000000000000000000000000000000000..7874ff9d529afebc40d1660637e85b3a1e00f23e --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.c @@ -0,0 +1,139 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include "pipe_stages.h" +#include "load_cam_model.h" +#include "cam_pipe_utility.h" +#include "dma_interface.h" +#ifdef DMA_MODE +#include "gem5_harness.h" +#endif + +// FIXME: Include gem5/dma_interface.cc/h separately +#ifndef DMA_INTERFACE_V3 +#define DMA_INTERFACE_V3 +#endif//DMA_INTERFACE_V3 + +/////////////////////////////////////////////////////////////// +// Camera Model Parameters +/////////////////////////////////////////////////////////////// + +// Path to the camera model to be used +char cam_model_path[100]; + +// White balance index (select white balance from transform file) +// The first white balance in the file has a wb_index of 1 +// For more information on model format see the readme +int wb_index = 6; + +// Number of control points +int num_ctrl_pts = 3702; + +void load_cam_params_hw(float *host_TsTw, float *host_ctrl_pts, + float *host_weights, float *host_coefs, + float *host_tone_map, float *acc_TsTw, + float *acc_ctrl_pts, float *acc_weights, + float *acc_coefs, float *acc_tone_map) { + dmaLoad(acc_TsTw, host_TsTw, 9 * sizeof(float)); + dmaLoad(acc_ctrl_pts, host_ctrl_pts, + num_ctrl_pts * CHAN_SIZE * sizeof(float)); + dmaLoad(acc_weights, host_weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + dmaLoad(acc_coefs, host_coefs, 4 * CHAN_SIZE * sizeof(float)); + dmaLoad(acc_tone_map, host_tone_map, 256 * CHAN_SIZE * sizeof(float)); +} + +void isp_hw(uint8_t *host_input, uint8_t *host_result, int row_size, + int col_size, uint8_t *acc_input, uint8_t *acc_result, + float *acc_input_scaled, float *acc_result_scaled, float *acc_TsTw, + float *acc_ctrl_pts, float *acc_weights, float *acc_coefs, + float *acc_tone_map, float *acc_l2_dist) { + dmaLoad(acc_input, host_input, + row_size * col_size * CHAN_SIZE * sizeof(uint8_t)); + scale_fxp(acc_input, row_size, col_size, acc_input_scaled); + demosaic_fxp(acc_input_scaled, row_size, col_size, acc_result_scaled); + denoise_fxp(acc_result_scaled, row_size, col_size, acc_input_scaled); + transform_fxp(acc_input_scaled, row_size, col_size, acc_result_scaled, + acc_TsTw); + gamut_map_fxp(acc_result_scaled, row_size, col_size, acc_input_scaled, + acc_ctrl_pts, acc_weights, acc_coefs, acc_l2_dist); + tone_map_fxp(acc_input_scaled, row_size, col_size, acc_tone_map, + acc_result_scaled); + // tone_map_approx_fxp(acc_input_scaled, row_size, col_size, + // acc_result_scaled); + descale_fxp(acc_result_scaled, row_size, col_size, acc_result); + dmaStore(host_result, acc_result, + row_size * col_size * CHAN_SIZE * sizeof(uint8_t)); +} + +void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size, + int col_size) { + uint8_t *acc_input, *acc_result; + float *acc_input_scaled, *acc_result_scaled; + float *host_TsTw, *host_ctrl_pts, *host_weights, *host_coefs, *host_tone_map; + float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map, *acc_l2_dist; + + strcat(cam_model_path, "cam_models/NikonD7000/"); + + host_TsTw = get_TsTw(cam_model_path, wb_index); + float *trans = transpose_mat(host_TsTw, CHAN_SIZE, CHAN_SIZE); + free(host_TsTw); + host_TsTw = trans; + host_ctrl_pts = get_ctrl_pts(cam_model_path, num_ctrl_pts); + host_weights = get_weights(cam_model_path, num_ctrl_pts); + host_coefs = get_coefs(cam_model_path, num_ctrl_pts); + host_tone_map = get_tone_map(cam_model_path); + + acc_input = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE); + acc_result = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE); + acc_input_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE); + acc_result_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE); + acc_TsTw = (float*) malloc_aligned(sizeof(float) * 9); + acc_ctrl_pts = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE); + acc_weights = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE); + acc_coefs = (float*) malloc_aligned(sizeof(float) * 12); + acc_tone_map = (float*) malloc_aligned(sizeof(float) * 256 * CHAN_SIZE); + acc_l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts); + + // Load camera model parameters for the ISP + MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw, + sizeof(float) * 9); + MAP_ARRAY_TO_ACCEL(ISP, "host_ctrl_pts", host_ctrl_pts, + sizeof(float) * num_ctrl_pts * CHAN_SIZE); + MAP_ARRAY_TO_ACCEL(ISP, "host_weights", host_weights, + sizeof(float) * num_ctrl_pts * CHAN_SIZE); + MAP_ARRAY_TO_ACCEL(ISP, "host_coefs", host_coefs, + sizeof(float) * 4 * CHAN_SIZE); + MAP_ARRAY_TO_ACCEL(ISP, "host_tone_map", host_tone_map, + sizeof(float) * 256 * CHAN_SIZE); + INVOKE_KERNEL(ISP, load_cam_params_hw, host_TsTw, host_ctrl_pts, host_weights, + host_coefs, host_tone_map, acc_TsTw, acc_ctrl_pts, acc_weights, + acc_coefs, acc_tone_map); + + // Invoke the ISP + MAP_ARRAY_TO_ACCEL(ISP, "host_input", host_input, + sizeof(uint8_t) * row_size * col_size * CHAN_SIZE); + MAP_ARRAY_TO_ACCEL(ISP, "host_result", host_result, + sizeof(uint8_t) * row_size * col_size * CHAN_SIZE); + INVOKE_KERNEL(ISP, isp_hw, host_input, host_result, row_size, col_size, + acc_input, acc_result, acc_input_scaled, acc_result_scaled, + acc_TsTw, acc_ctrl_pts, acc_weights, acc_coefs, acc_tone_map, + acc_l2_dist); + + free(acc_input); + free(acc_result); + free(acc_input_scaled); + free(acc_result_scaled); + free(host_TsTw); + free(host_ctrl_pts); + free(host_weights); + free(host_coefs); + free(host_tone_map); + free(acc_TsTw); + free(acc_ctrl_pts); + free(acc_weights); + free(acc_coefs); + free(acc_tone_map); + free(acc_l2_dist); +} + diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.h new file mode 100644 index 0000000000000000000000000000000000000000..83a77e01a67886af64902eacfd5af69a0a8d48b0 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe.h @@ -0,0 +1,7 @@ +#ifndef _CAM_PIPE_H_ +#define _CAM_PIPE_H_ + +void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size, + int col_size); + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.c new file mode 100644 index 0000000000000000000000000000000000000000..f806e9ee1a2e288fabcb8ad658a47c3919fbb661 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.c @@ -0,0 +1,82 @@ +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include "cam_pipe_utility.h" +//#include "pipe_stages.h" + +uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size) { + uint8_t *image; + FILE *fp = fopen(file_path, "r"); + int chan_size; + if (fread(row_size, sizeof(int), 1, fp) != 1) + assert("Failed to read row size from binary file!"); + if (fread(col_size, sizeof(int), 1, fp) != 1) + assert("Failed to read col size from binary file!"); + if (fread(&chan_size, sizeof(int), 1, fp) != 1) + assert("Failed to read row size from binary file!"); + assert(chan_size == CHAN_SIZE && "Channel size read from the binary file " + "doesn't equal to the default value!\n"); + + int size = *row_size * *col_size * CHAN_SIZE; + image = malloc_aligned(sizeof(uint8_t) * size); + if (fread(image, sizeof(uint8_t), size, fp) != size) + assert("Failed to read the image from binary file!"); + fclose(fp); + return image; +} + +void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int col_size) { + FILE *fp = fopen(file_path, "w"); + + int shape[3] = { row_size, col_size, CHAN_SIZE }; + fwrite(shape, sizeof(int), 3, fp); + + int size = row_size * col_size * CHAN_SIZE; + fwrite(image, sizeof(uint8_t), size, fp); + fclose(fp); +} + +float *transpose_mat(float *inmat, int width, int height) { + // Define vectors + float *outmat; + int err = + posix_memalign((void **)&outmat, CACHELINE_SIZE, sizeof(float) * height * width); + assert(err == 0 && "Failed to allocate memory!"); + + // Transpose the matrix + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + outmat[j * height + i] = inmat[i * width + j]; + } + } + return outmat; +} + +void convert_hwc_to_chw(uint8_t *input, int row_size, int col_size, + uint8_t **result) { + if (*result == NULL) { + *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE * + sizeof(uint8_t)); + } + ARRAY_3D(uint8_t, _input, input, col_size, CHAN_SIZE); + ARRAY_3D(uint8_t, _result, *result, row_size, col_size); + for (int h = 0; h < row_size; h++) + for (int w = 0; w < col_size; w++) + for (int c = 0; c < CHAN_SIZE; c++) + _result[c][h][w] = _input[h][w][c]; +} + +void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size, + uint8_t **result) { + if (*result == NULL) { + *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE * + sizeof(uint8_t)); + } + ARRAY_3D(uint8_t, _input, input, row_size, col_size); + ARRAY_3D(uint8_t, _result, *result, col_size, CHAN_SIZE); + for (int c = 0; c < CHAN_SIZE; c++) + for (int h = 0; h < row_size; h++) + for (int w = 0; w < col_size; w++) + _result[h][w][c] = _input[c][h][w]; +} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.h new file mode 100644 index 0000000000000000000000000000000000000000..b4fb6cde0c438b23c2b596cf0418953aaedca501 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/cam_pipe_utility.h @@ -0,0 +1,16 @@ +#ifndef _CAM_PIPE_UTILITY_H_ +#define _CAM_PIPE_UTILITY_H_ + +#include "utility.h" +#include "pipe_stages.h" + +uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size); +void write_image_to_binary(char *file_path, uint8_t *image, int row_size, + int col_size); +float *transpose_mat(float *inmat, int width, int height); +void convert_hwc_to_chw(uint8_t *input, int row_size, int col_size, + uint8_t **result); +void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size, + uint8_t **result); + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/defs.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/defs.h new file mode 100644 index 0000000000000000000000000000000000000000..ccc8acc857c36fd13115670932a38dc3a406dc29 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/defs.h @@ -0,0 +1,224 @@ +#ifndef _COMMON_DEFS_H_ +#define _COMMON_DEFS_H_ + +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long uint64_t; + +#define CACHELINE_SIZE 64 + +// Debugging message macros. +#if DEBUG_LEVEL >= 1 + #define INFO_MSG(args...) printf(args) + + #if DEBUG_LEVEL >= 2 + #define PRINT_MSG(args...) printf(args) + #define PRINT_DEBUG(hid, rows, cols, num_cols) \ + print_debug(hid, rows, cols, num_cols) + #define PRINT_DEBUG4D(hid, rows, cols, height) \ + print_debug4d(hid, rows, cols, height) + #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) \ + print_debug4d_fp16(hid, num, height, rows, cols) + + #if DEBUG_LEVEL >= 3 + #define PRINT_DEBUG_V(hid, rows, cols, num_cols) \ + print_debug(hid, rows, cols, num_cols) + #define PRINT_DEBUG4D_V(hid, rows, cols, height) \ + print_debug4d(hid, rows, cols, height) + #define PRINT_MSG_V(args...) printf(args) + #else + #define PRINT_DEBUG_V(hid, rows, cols, num_cols) + #define PRINT_DEBUG4D_V(hid, rows, cols, height) + #define PRINT_MSG_V(args...) + #endif + #else + #define PRINT_MSG(args...) + #define PRINT_DEBUG(hid, rows, cols, num_cols) + #define PRINT_DEBUG4D(hid, rows, cols, height) + #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) + #define PRINT_DEBUG_V(hid, rows, cols, height) + #define PRINT_DEBUG4D_V(hid, rows, cols, height) + #define PRINT_MSG_V(args...) + #endif +#else + #define INFO_MSG(args...) + #define PRINT_DEBUG(hid, rows, cols, num_cols) + #define PRINT_DEBUG4D(hid, rows, cols, height) + #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols) + #define PRINT_MSG(args...) + #define PRINT_DEBUG_V(hid, rows, cols, height) + #define PRINT_DEBUG4D_V(hid, rows, cols, height) + #define PRINT_MSG_V(args...) +#endif + +#define STRING(arg) #arg + +// This is to avoid a ton of spurious unused variable warnings when +// we're not building for gem5. +#define UNUSED(x) (void)(x) + +// Macros for computing the maximum of a group of elements. +// +// Why macros and not functions (or a loop)? A loop takes O(n) cycles to +// compute the maximum, when it could be done in O(log n) time with a tree +// based implementation. But Aladdin regards function calls as a hard +// dependency that it does not optimize across, so we would not get the +// parallelism we expect from the tree. Thus, these must be macros. +// +// I've only implemented a few of these. These are only meant for the pooling +// layers, and we shouldn't need more than a 3x3 pooling layer anyways. +#define max2(A, B) (((A) > (B)) ? (A) : (B)) +#define max3(e0, e1, e2) max2(max2(e0, e1), e2) +#define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3)) +#define max8(e0, e1, e2, e3, e4, e5, e6, e7) \ + max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7)) +#define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8) \ + max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8) + +#define min2(A, B) (((A) < (B)) ? (A) : (B)) + +#define FRAC_CEIL(A, B) ((A) / (B) + ((A) % (B) != 0)) +// Convenience macros to switch between invoking an accelerator (if building a +// binary for gem5) or just calling the kernel function in software. +// +// Usage: +// +// These macros expand differently based on whether the GEM5_HARNESS macro is +// defined. If so, then this binary is meant to be run under gem5, invoking +// accelerators; if not, this binary should run the pure software version of +// the accelerated kernels. +// +// If GEM5_HARNESS is defined: +// +// MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize) +// ===> mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize) +// +// INVOKE_KERNEL(myReqCode, kernelFuncName, args...) +// ===> invokeAcceleratorAndBlock(myReqCode) +// +// Otherwise: +// MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize) +// expands to nothing +// +// INVOKE_KERNEL(myReqCode, kernelFuncName, args...) +// ===> kernelFuncName(args) +// +#ifdef GEM5_HARNESS + +#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size) \ + mapArrayToAccelerator(req_code, name, base_addr, size) +#define INVOKE_KERNEL(req_code, kernel_ptr, args...) \ + do { \ + UNUSED(kernel_ptr); \ + invokeAcceleratorAndBlock(req_code); \ + } while (0) +#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...) \ + do { \ + UNUSED(kernel_ptr); \ + invokeAcceleratorAndReturn2(req_code, finish_flag); \ + } while (0) + +#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size) \ + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, false, false); \ + } while (0) +#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size) \ + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, true, false); \ + } while (0) +#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size) \ + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, false, true); \ + } while (0) +#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size) \ + do { \ + invokeAladdinTrafficGenAndBlock(start_addr, size, true, true); \ + } while (0) + +#else + +#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size) \ + do { \ + INFO_MSG("Mapping array %s @ %p, size %d.\n", \ + name, (void*)base_addr, (int)(size)); \ + UNUSED(req_code); \ + UNUSED(name); \ + UNUSED(base_addr); \ + UNUSED(size); \ + } while (0) +#define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args) +#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...) \ + kernel_ptr(args) +#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size) \ + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) +#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size) \ + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) +#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size) \ + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) +#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size) \ + do { \ + UNUSED(start_addr); \ + UNUSED(size); \ + } while (0) + +#endif + +// Simplified version of MAP_ARRAY_TO_ACCEL. +// +// This assumes that the current name of the base pointer is also the name of +// the array in the top level function of the dynamic trace. THIS IS VERY +// IMPORTANT - if the argument passed to a top level function has been renamed in +// the function, then this WILL NOT WORK! +// +// MAP_ARRAY(myReqCode, myArray, mySize) +// ===> MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize) +#define MAP_ARRAY(req_code, name_and_base_addr, size) \ + MAP_ARRAY_TO_ACCEL( \ + req_code, STRING(name_and_base_addr), name_and_base_addr, size) + +// Use these convenience macros to cast a raw pointer into a multidimensional +// variable-length array, which lets us use [] notation inside of the ugly +// sub2ind syntax! +// +// Usage: +// If we have an array like array[5][4]: +// ARRAY_2D(TYPE, output_name, array, 4); +// +// If we have an array like array[5][4][3]: +// ARRAY_3D(TYPE, output_name, array, 4, 3); +// +// If we have an array like array[5][4][3][2] +// ARRAY_4D(TYPE, output_name, array, 4, 3, 2); +// +// And so on... +#define ARRAY_1D(TYPE, output_array_name, input_array_name) \ + TYPE* output_array_name = (TYPE*)input_array_name + +#define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1) \ + TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name + +#define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2) \ + TYPE(*output_array_name)[DIM_1][DIM_2] = \ + (TYPE(*)[DIM_1][DIM_2])input_array_name + +#define ARRAY_4D( \ + TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3) \ + TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] = \ + (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name + +#define ARRAY_5D( \ + TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4) \ + TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] = \ + (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.c new file mode 100644 index 0000000000000000000000000000000000000000..81bce54469886153170f994a77250a784cc9b7d7 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.c @@ -0,0 +1,73 @@ +#include <assert.h> +#include <string.h> +#include "dma_interface.h" + +// All _dmaImplN functions must be always inlined or we'll get extra functions +// in the trace. + +#if defined(DMA_INTERFACE_V3) + +// Starting with version 3, all versioning will be distinguished by the return +// value of the DMA functions. + +__attribute__((__always_inline__)) +int _dmaImpl3(void* dst_addr, void* src_addr, size_t size) { + assert(size > 0); + memmove(dst_addr, src_addr, size); + return 3; +} + +int dmaLoad(void* dst_addr, void* src_host_addr, size_t size) { + return _dmaImpl3(dst_addr, src_host_addr, size); +} + +int dmaStore(void* dst_host_addr, void* src_addr, size_t size) { + return _dmaImpl3(dst_host_addr, src_addr, size); +} + +int setReadyBits(void* start_addr, size_t size, unsigned value) { + asm(""); + return 0; +} + +#elif defined(DMA_INTERFACE_V2) + +// With version 2 and earlier, we return (void*)NULL and use the number of +// function arguments to distinguish the DMA functions. + +__attribute__((__always_inline__)) +void* _dmaImpl2(void* base_addr, size_t src_off, size_t dst_off, size_t size) { + assert(size > 0); + memmove(base_addr + dst_off, base_addr + src_off, size); + return NULL; +} + +void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size) { + return _dmaImpl2(base_addr, src_off, dst_off, size); +} + +void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size) { + return _dmaImpl2(base_addr, src_off, dst_off, size); +} + +#else + +__attribute__((__always_inline__)) +void* _dmaImpl1(void* base_addr, size_t offset, size_t size) { + assert(size > 0); + asm(""); + return NULL; +} + +void* dmaLoad(void* addr, size_t offset, size_t size) { + return _dmaImpl1(addr, offset, size); +} + +void* dmaStore(void* addr, size_t offset, size_t size) { + return _dmaImpl1(addr, offset, size); +} +#endif + +void dmaFence() { + asm(""); +} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..f23234eede4df99db84b144646530dfe240c6e62 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/dma_interface.h @@ -0,0 +1,44 @@ +#ifndef __DMA_INTERFACE_H__ +#define __DMA_INTERFACE_H__ + +#include <stddef.h> + +#define PAGE_SIZE 4096 + +#if defined(DMA_INTERFACE_V3) + +// Version 3 of the DMA interface enables memcpy operations from arbitrary +// source and destination addresses. + +int dmaLoad(void* dst_addr, void* src_host_addr, size_t size); +int dmaStore(void* dst_host_addr, void* src_addr, size_t size); + +// The user can explicitly toggle the state of ready bits, if ready mode is +// enabled. This requires support from DMA v3. +int setReadyBits(void* start_addr, size_t size, unsigned value); + +#elif defined(DMA_INTERFACE_V2) + +#warning "DMA interface v2 is deprecated!" + +// Version 2 of the DMA interface separates source and destination offsets from +// the base address into different fields, and on the host machine, memory is +// actually copied from source to destination (the memory copy will not show up +// in the trace). + +void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size); +void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size); + +#else + +#warning "DMA interface v1 is deprecated!" + +// Version 1 of the DMA interface is now deprecated and will be removed entirely. + +void* dmaLoad(void* addr, size_t offset, size_t size); +void* dmaStore(void* addr, size_t offset, size_t size); + +#endif +void dmaFence(); + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/gem5_harness.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/gem5_harness.h new file mode 100644 index 0000000000000000000000000000000000000000..36859cfe1ba67bc8197d7ea3961adfbeb95c70b1 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/gem5_harness.h @@ -0,0 +1,10 @@ +#ifndef _GEM5_HARNESS_H_ +#define _GEM5_HARNESS_H_ + +/* One header to include them all. */ + +//#include "aladdin_sys_connection.h" +//#include "aladdin_sys_constants.h" +#include "dma_interface.h" + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.c new file mode 100644 index 0000000000000000000000000000000000000000..124fe0b7d175c2655feac562ecd6e2a5b73cc96a --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.c @@ -0,0 +1,335 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include "utility.h" +#include "pipe_stages.h" +#include "load_cam_model.h" + +// Get color space transform +float* get_Ts(char* cam_model_path) { + float *Ts; + int err = posix_memalign((void **)&Ts, CACHELINE_SIZE, sizeof(float) * 9); + assert(err == 0 && "Failed to allocate memory!"); + char *line; + char *str; + float line_data[3]; + size_t len = 0; + int line_idx = 0; + + // Open file for reading + char file_name[] = "raw2jpg_transform.txt"; + char file_path[100]; + strcpy(file_path, cam_model_path); + strcat(file_path, file_name); + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) { + printf("Didn't find the camera model file!\n"); + exit(1); + } + + while (getline(&line, &len, fp) != -1) { + str = strtok(line, " \n"); + int i = 0; + while (str != NULL) { + line_data[i] = atof(str); + str = strtok(NULL, " \n"); + i++; + } + + if (line_idx >= 1 && line_idx <= 3) { + for (int j = 0; j < 3; j++) { + Ts[(line_idx - 1) * 3 + j] = line_data[j]; + } + } + line_idx = line_idx + 1; + } + fclose(fp); + free(line); + return Ts; +} + +// Get white balance transform +float* get_Tw(char* cam_model_path, int wb_index) { + float *Tw; + int err = posix_memalign((void **)&Tw, CACHELINE_SIZE, sizeof(float) * 9); + assert(err == 0 && "Failed to allocate memory!"); + char *line; + char *str; + float line_data[3]; + size_t len = 0; + int line_idx = 0; + + // Calculate base for the white balance transform selected + // For more details see the camera model readme + int wb_base = 8 + 5*(wb_index-1); + + // Open file for reading + // Open file for reading + char file_name[] = "raw2jpg_transform.txt"; + char file_path[100]; + strcpy(file_path, cam_model_path); + strcat(file_path, file_name); + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) { + printf("Didn't find the camera model file!\n"); + exit(1); + } + + // Read a line at a time + while (getline(&line, &len, fp) != -1) { + str = strtok(line, " \n"); + int i = 0; + while (str != NULL) { + line_data[i] = atof(str); + str = strtok(NULL, " \n"); + i++; + } + + if (line_idx == wb_base) { + // Convert the white balance vector into a diagaonal matrix + for (int i=0; i<3; i++) { + for (int j=0; j<3; j++) { + if (i == j) { + Tw[i * 3 + j] = line_data[i]; + } else { + Tw[i * 3 + j] = 0.0; + } + } + } + } + line_idx = line_idx + 1; + } + fclose(fp); + free(line); + return Tw; +} + + +// Get combined transforms for checking +float* get_TsTw(char* cam_model_path, int wb_index) { + float *TsTw; + int err = posix_memalign((void **)&TsTw, CACHELINE_SIZE, sizeof(float) * 9); + assert(err == 0 && "Failed to allocate memory!"); + char *line; + char *str; + float line_data[3]; + size_t len = 0; + int line_idx = 0; + + // Calculate base for the white balance transform selected + // For more details see the camera model readme + int wb_base = 5 + 5*(wb_index-1); + + // Open file for reading + char file_name[] = "raw2jpg_transform.txt"; + char file_path[100]; + strcpy(file_path, cam_model_path); + strcat(file_path, file_name); + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) { + printf("Didn't find the camera model file!\n"); + exit(1); + } + + // Read a line at a time + while (getline(&line, &len, fp) != -1) { + str = strtok(line, " \n"); + int i = 0; + while (str != NULL) { + line_data[i] = atof(str); + str = strtok(NULL, " \n"); + i++; + } + + if (line_idx >= wb_base && line_idx <= (wb_base + 2)) { + for (int j = 0; j < 3; j++) { + TsTw[(line_idx - wb_base) * 3 + j] = line_data[j]; + } + } + line_idx = line_idx + 1; + } + fclose(fp); + free(line); + return TsTw; +} + +// Get control points +float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) { + float *ctrl_pnts; + int err = posix_memalign((void **)&ctrl_pnts, CACHELINE_SIZE, + sizeof(float) * num_cntrl_pts * 3); + assert(err == 0 && "Failed to allocate memory!"); + char *line; + char *str; + float line_data[3]; + size_t len = 0; + int line_idx = 0; + + // Open file for reading + char file_name[] = "raw2jpg_ctrlPoints.txt"; + char file_path[100]; + strcpy(file_path, cam_model_path); + strcat(file_path, file_name); + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) { + printf("Didn't find the camera model file!\n"); + exit(1); + } + + // Read a line at a time + while (getline(&line, &len, fp) != -1) { + str = strtok(line, " \n"); + int i = 0; + while (str != NULL) { + line_data[i] = atof(str); + str = strtok(NULL, " \n"); + i++; + } + + if (line_idx >= 1 && line_idx <= num_cntrl_pts) { + for (int j = 0; j < 3; j++) { + ctrl_pnts[(line_idx - 1) * 3 + j] = line_data[j]; + } + } + line_idx = line_idx + 1; + } + fclose(fp); + free(line); + return ctrl_pnts; +} + +// Get weights +float* get_weights(char* cam_model_path, int num_cntrl_pts) { + float *weights; + int err = posix_memalign((void **)&weights, CACHELINE_SIZE, + sizeof(float) * num_cntrl_pts * 3); + assert(err == 0 && "Failed to allocate memory!"); + char *line; + char *str; + float line_data[3]; + size_t len = 0; + int line_idx = 0; + + // Open file for reading + char file_name[] = "raw2jpg_coefs.txt"; + char file_path[100]; + strcpy(file_path, cam_model_path); + strcat(file_path, file_name); + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) { + printf("Didn't find the camera model file!\n"); + exit(1); + } + + // Read a line at a time + while (getline(&line, &len, fp) != -1) { + str = strtok(line, " \n"); + int i = 0; + while (str != NULL) { + line_data[i] = atof(str); + str = strtok(NULL, " \n"); + i++; + } + + if (line_idx >= 1 && line_idx <= num_cntrl_pts) { + for (int j = 0; j < 3; j++) { + weights[(line_idx - 1) * 3 + j] = line_data[j]; + } + } + line_idx = line_idx + 1; + } + fclose(fp); + free(line); + return weights; +} + +// Get coeficients +float* get_coefs(char* cam_model_path, int num_cntrl_pts) { + float *coefs; + int err = posix_memalign((void **)&coefs, CACHELINE_SIZE, sizeof(float) * 12); + assert(err == 0 && "Failed to allocate memory!"); + char *line; + char *str; + float line_data[3]; + size_t len = 0; + int line_idx = 0; + + // Open file for reading + char file_name[] = "raw2jpg_coefs.txt"; + char file_path[100]; + strcpy(file_path, cam_model_path); + strcat(file_path, file_name); + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) { + printf("Didn't find the camera model file!\n"); + exit(1); + } + + // Read a line at a time + while (getline(&line, &len, fp) != -1) { + str = strtok(line, " \n"); + int i = 0; + while (str != NULL) { + line_data[i] = atof(str); + str = strtok(NULL, " \n"); + i++; + } + + if (line_idx >= (num_cntrl_pts + 1) && line_idx <= (num_cntrl_pts + 4)) { + for (int j = 0; j < 3; j++) { + coefs[(line_idx - num_cntrl_pts - 1) * 3 + j] = line_data[j]; + } + } + line_idx = line_idx + 1; + } + fclose(fp); + free(line); + return coefs; +} + + +// Get tone mapping table +float* get_tone_map(char* cam_model_path) { + float *tone_map; + int err = posix_memalign((void **)&tone_map, CACHELINE_SIZE, + sizeof(float) * 256 * CHAN_SIZE); + assert(err == 0 && "Failed to allocate memory!"); + char *line; + char *str; + float line_data[3]; + size_t len = 0; + int line_idx = 0; + + // Open file for reading + char file_name[] = "raw2jpg_respFcns.txt"; + char file_path[100]; + strcpy(file_path, cam_model_path); + strcat(file_path, file_name); + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) { + printf("Didn't find the camera model file!\n"); + exit(1); + } + + // Read a line at a time + while (getline(&line, &len, fp) != -1) { + str = strtok(line, " \n"); + int i = 0; + while (str != NULL) { + line_data[i] = atof(str); + str = strtok(NULL, " \n"); + i++; + } + + if (line_idx >= 1 && line_idx <= 256) { + for (int j = 0; j < 3; j++) { + tone_map[(line_idx - 1) * 3 + j] = line_data[j]; + } + } + line_idx = line_idx + 1; + } + fclose(fp); + free(line); + return tone_map; +} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.h new file mode 100644 index 0000000000000000000000000000000000000000..8e5ee95217c901e57250bcce6b3cfc37cd9d6ce7 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/load_cam_model.h @@ -0,0 +1,25 @@ +#ifndef _LOAD_CAM_MODEL_H_ +#define _LOAD_CAM_MODEL_H_ + +// Get color space transform +float *get_Ts(char *cam_model_path); + +// Get white balance transform +float *get_Tw(char *cam_model_path, int wb_index); + +// Get combined transforms for checking +float *get_TsTw(char *cam_model_path, int wb_index); + +// Get control points +float *get_ctrl_pts(char *cam_model_path, int num_cntrl_pts); + +// Get weights +float *get_weights(char *cam_model_path, int num_cntrl_pts); + +// Get coeficients +float *get_coefs(char *cam_model_path, int num_cntrl_pts); + +// Get tone mapping table +float *get_tone_map(char *cam_model_path); + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main.c new file mode 100644 index 0000000000000000000000000000000000000000..cc75beb66419ae3a23f2d9578d9b5c19a238c4fe --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main.c @@ -0,0 +1,156 @@ + +#include <stdlib.h> +#include "utility.h" +#include "visc.h" +#include "defs.h" + + +typedef struct __attribute__((__packed__)) { + float* input; size_t bytes_input; + float* result; size_t bytes_result; +} +RootIn; + +/* +typedef struct __attribute__((__packed__)) { + uint8_t *input; size_t bytes_input; + uint8_t *result; size_t bytes_result; + float *input_scaled; size_t bytes_input_scaled; + float *result_scaled; size_t bytes_result_scaled; + float *demosaic_out; size_t bytes_demosaic_out; + float *denoise_out; size_t bytes_denoise_out; + float *transform_out; size_t bytes_transform_out; + float *gamut_out;size_t bytes_gamut_out; + float *TsTw; size_t bytes_TsTw; + float *ctrl_pts; size_t bytes_ctrl_pts; + float *weights; size_t bytes_weights; + float*coefs; size_t bytes_coefs; + float *l2_dist; size_t bytes_l2_dist; + float *tone_map; size_t bytes_tone_map; + int row_size; int col_size; +} +RootIn; +*/ + + + +void scale_values(float* input, size_t num_elems) { + + __visc__hint(DEVICE); + __visc__attributes(1, input, 1, input); + + for (int ind = 0; ind < num_elems; ind++){ + input[ind] = input[ind] * 2.0; + } + + __visc__return(1, num_elems); +} + + + + +void graphRoot(/*0*/ float* input, /*1*/ size_t bytes_input, + /*2*/ float* result, /*3*/ size_t bytes_result) { + + //Specifies compilation target for current node + __visc__hint(CPU_TARGET); + + __visc__attributes(2, input, result, 2, input, result); + + // Create an 0D (specified by 1st argument) HPVM node - so a single node + // associated with node function ---_fxp_wrapper + + void* scaleNode = __visc__createNodeND(0, scale_values); + + // BindIn binds inputs of current node with specified node + // - destination node + // - argument position in argument list of function of source node + // - argument position in argument list of function of destination node + // - streaming (1) or non-streaming (0) + + // Edge transfers data between nodes within the same level of hierarchy. + // - source and destination dataflow nodes + // - edge type, all-all (1) or one-one(0) + // - source position (in output struct of source node) + // - destination position (in argument list of destination node) + // - streaming (1) or non-streaming (0) + + // scale_fxp inputs + __visc__bindIn(scaleNode, 0, 0, 0); // input -> ScNode:input + __visc__bindIn(scaleNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input + + // Similar to bindIn, but for the output. Output of a node is a struct, and + // we consider the fields in increasing ordering. + __visc__bindOut(scaleNode, 0, 0, 0); + +} + + + + + +int main(int argc, char* argv[]) { + + size_t input_size = 100; + size_t result_size = 100; + + size_t input_bytes = input_size * sizeof(float); + size_t result_bytes = result_size * sizeof(float); + + // This is host_input in cam_pipe() + float* input = (float*) malloc(input_bytes); + for(unsigned int i = 0; i < input_size; i++){ + input[i] = 1.0; + } + // This is host_result in cam_pipe() + float* result = (float*) malloc(result_bytes); + + + __visc__init(); + + RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn)); + + // Set up HPVM DFG inputs in the rootArgs struct. + rootArgs->input = input; + rootArgs->bytes_input = input_size; + + printf("input = %d input_bytes = %d \n", input, input_bytes); + + rootArgs->result = result; + rootArgs->bytes_result = result_size; + + + llvm_visc_track_mem(input, input_bytes); + llvm_visc_track_mem(result, result_bytes); + + + void* testDFG = __visc__launch(0, graphRoot, (void*) rootArgs); + __visc__wait(testDFG); + + + printf("input = %d \n", input); + + llvm_visc_request_mem(input, input_bytes); + //llvm_visc_request_mem(result, result_bytes); + + printf("requested mem \n"); + + for(unsigned int i = 0; i < input_size; i++){ + printf("input[%d] = %f \n", i, input[i]); + } + + //llvm_visc_untrack_mem(input); + //llvm_visc_untrack_mem(result); + + printf ("untracked mem \n"); + + __visc__cleanup(); + + printf ("cleaned up visc"); + + return 0; +} + + + + diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main_old.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main_old.c new file mode 100644 index 0000000000000000000000000000000000000000..ea42ad0bf87fd8e0b337ea1e7d0ad803025e849e --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/main_old.c @@ -0,0 +1,865 @@ +#include <argp.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <string.h> +#include <math.h> +#include "utility.h" + +#include "cam_pipe_utility.h" +#include "pipe_stages.h" +#include "load_cam_model.h" + +#include "visc.h" + +int NUM_TEST_CASES; +int NUM_CLASSES; +int INPUT_DIM; +int NUM_WORKER_THREADS; + +// Type of struct that is used to pass arguments to the HPVM dataflow graph +// using the hpvm launch operation +typedef struct __attribute__((__packed__)) { + uint8_t *input; size_t bytes_input; + uint8_t *result; size_t bytes_result; + float *input_scaled; size_t bytes_input_scaled; + float *result_scaled; size_t bytes_result_scaled; + float *demosaic_out; size_t bytes_demosaic_out; + float *denoise_out; size_t bytes_denoise_out; + float *transform_out; size_t bytes_transform_out; + float *gamut_out;size_t bytes_gamut_out; + float *TsTw; size_t bytes_TsTw; + float *ctrl_pts; size_t bytes_ctrl_pts; + float *weights; size_t bytes_weights; + float*coefs; size_t bytes_coefs; + float *l2_dist; size_t bytes_l2_dist; + float *tone_map; size_t bytes_tone_map; + int row_size; int col_size; +} +RootIn; + +typedef enum _argnum { + RAW_IMAGE_BIN, + OUTPUT_IMAGE_BIN, + NUM_REQUIRED_ARGS, + DATA_FILE = NUM_REQUIRED_ARGS, + NUM_ARGS, +} argnum; + +typedef struct _arguments { + char* args[NUM_ARGS]; + int num_inputs; + int num_threads; +} arguments; + +static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n"; +static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary"; +static struct argp_option options[] = { + { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 }, + { "data-file", 'f', "F", 0, + "File to read data and weights from (if data-init-mode == READ_FILE or " + "save-params is true). *.txt files are decoded as text files, while " + "*.bin files are decoded as binary files." }, +}; + +static error_t parse_opt(int key, char* arg, struct argp_state* state) { + arguments* args = (arguments*)(state->input); + switch (key) { + case 'n': { + args->num_inputs = strtol(arg, NULL, 10); + break; + } + case 'f': { + args->args[DATA_FILE] = arg; + break; + } + case 't': { + args->num_threads = strtol(arg, NULL, 10); + break; + } + case ARGP_KEY_ARG: { + if (state->arg_num >= NUM_REQUIRED_ARGS) + argp_usage(state); + args->args[state->arg_num] = arg; + break; + } + case ARGP_KEY_END: { + if (state->arg_num < NUM_REQUIRED_ARGS) { + fprintf(stderr, + "Not enough arguments! Got %d, require %d.\n", + state->arg_num, + NUM_REQUIRED_ARGS); + argp_usage(state); + } + break; + } + default: + return ARGP_ERR_UNKNOWN; + } + return 0; +} + +void set_default_args(arguments* args) { + args->num_inputs = 1; + args->num_threads = 0; + for (int i = 0; i < NUM_ARGS; i++) { + args->args[i] = NULL; + } +} + +static struct argp parser = { options, parse_opt, args_doc, prog_doc }; + +// Helper function for printing intermediate results +void descale_cpu(float *input, size_t bytes_input, + uint8_t *output, size_t bytes_result, + int row_size, int col_size) { + + for (int chan = 0; chan < CHAN_SIZE; chan++) + for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan*row_size + row) * col_size + col; + output[index] = min(max(input[index] * 255, 0), 255); + } +} + +static void sort(float arr[], int n) { + int i, j; + for (i = 0; i < n - 1; i++) + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } +} + +/**************************************************************/ +/*** HPVM Leaf node Functions - Performing the computations ***/ +/**************************************************************/ + +// In this benchmark, no use of HPVM query intrinsics in the leaf node functions + +// Leaf HPVM node function for scale +void scale_fxp(uint8_t *input, size_t bytes_input, + float *output, size_t bytes_output, + int row_size, int col_size) { + + //Specifies compilation target for current node + __visc__hint(DEVICE); + + // Specifies pointer arguments that will be used as "in" and "out" arguments + // - count of "in" arguments + // - list of "in" argument , and similar for "out" + __visc__attributes(2, input, output, 1, output); + + for (int chan = 0; chan < CHAN_SIZE; chan++) + for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++){ + int index = (chan*row_size + row) * col_size + col; + output[index] = input[index] * 1.0 / 255; + } + __visc__return(1, bytes_output); +} + +// Leaf HPVM node function for descale +void descale_fxp(float *input, size_t bytes_input, + uint8_t *output, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, output, 1, output); + + for (int chan = 0; chan < CHAN_SIZE; chan++) + for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan*row_size + row) * col_size + col; + output[index] = min(max(input[index] * 255, 0), 255); + } + __visc__return(1, bytes_result); +} + +// Leaf HPVM node function for demosaicing +void demosaic_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, result, 1, result); + + for (int row = 1; row < row_size - 1; row++) + for (int col = 1; col < col_size - 1; col++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + float R1 = input[index_0 - 1]; + float R2 = input[index_0 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size]; + float B2 = input[index_2 + col_size]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 2 == 1) { + // Red pixel + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // Getting the B values + float B1 = input[index_2 - col_size - 1]; + float B2 = input[index_2 - col_size + 1]; + float B3 = input[index_2 + col_size - 1]; + float B4 = input[index_2 + col_size + 1]; + // R + result[index_0] = input[index_0]; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + result[index_2] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = input[index_0 - col_size - 1]; + float R2 = input[index_0 + col_size - 1]; + float R3 = input[index_0 - col_size + 1]; + float R4 = input[index_0 + col_size + 1]; + // Getting the G values + float G1 = input[index_1 - col_size]; + float G2 = input[index_1 + col_size]; + float G3 = input[index_1 - 1]; + float G4 = input[index_1 + 1]; + // R + result[index_0] = (R1 + R2 + R3 + R4) / 4; + // G + result[index_1] = (G1 + G2 + G3 + G4) / 2; + // B + result[index_2] = input[index_2]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = input[index_0 - col_size]; + float R2 = input[index_0 + col_size]; + // Getting the B values + float B1 = input[index_2 - 1]; + float B2 = input[index_2 + 1]; + // R + result[index_0] = (R1 + R2) / 2; + // G + result[index_1] = input[index_1] * 2; + // B + result[index_2] = (B1 + B2) / 2; + } + } + __visc__return(1, bytes_result); +} + +// Leaf HPVM node function for denoise +void denoise_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, result, 1, result); + + for (int chan = 0; chan < CHAN_SIZE; chan++) + for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) + if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { + float filter[9]; + for (int i = -1; i < 2; i++) + for (int j = -1; j < 2; j++) { + int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1; + filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)]; + } + sort(filter, 9); + result[(chan * row_size + row) * col_size + col] = filter[4]; + } else { + result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col]; + } + __visc__return(1, bytes_result); +} + +// Leaf HPVM node function, for color map and white balance transform +void transform_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *TsTw_tran, size_t bytes_TsTw, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(3, input, result, TsTw_tran, 1, result); + + for (int chan = 0; chan < CHAN_SIZE; chan++) + for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + int index_2d_0 = 0 * CHAN_SIZE + chan; + int index_2d_1 = 1 * CHAN_SIZE + chan; + int index_2d_2 = 2 * CHAN_SIZE + chan; + result[index] = + max(input[index_0] * TsTw_tran[index_2d_0] + + input[index_1] * TsTw_tran[index_2d_1] + + input[index_2] * TsTw_tran[index_2d_2], + 0); + } + __visc__return(1, bytes_result); +} + +// Leaf HPVM node function, for gamut mapping +void gamut_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, + float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, + int row_size, int col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist); + + // First, get the L2 norm from every pixel to the control points, + // Then, sum it and weight it. Finally, add the bias. + + for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + for (int cp = 0; cp < 3702; cp++) { + int index_0 = (0 * row_size + row) * col_size + col; + int index_1 = (1 * row_size + row) * col_size + col; + int index_2 = (2 * row_size + row) * col_size + col; + float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]); + float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); + float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]); + float val = val1 * val2 + val3 * val4 + val5 * val6; + float sqrt_val = sqrt(val); + l2_dist[cp] = sqrt_val; + } + for (int chan = 0; chan < CHAN_SIZE; chan++) { + float chan_val = 0.0; + for (int cp = 0; cp < 3702; cp++) { + chan_val += l2_dist[cp] * weights[cp * CHAN_SIZE + chan]; + } + chan_val += coefs[0 * CHAN_SIZE + chan] + + coefs[1 * CHAN_SIZE + chan] * input[(0 * row_size + row) * col_size + col] + + coefs[2 * CHAN_SIZE + chan] * input[(1 * row_size + row) * col_size + col] + + coefs[3 * CHAN_SIZE + chan] * input[(2 * row_size + row) * col_size + col]; + result[(chan * row_size + row) * col_size + col] = max(chan_val, 0); + } + } + __visc__return(1, bytes_result); +} + +// HPVM leaf node function, for tone mapping +void tone_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *tone_map, size_t bytes_tone_map, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(3, input, result, tone_map, 1, result); + + for (int chan = 0; chan < CHAN_SIZE; chan++) + for (int row = 0; row < row_size; row++) + for (int col = 0; col < col_size; col++) { + int index = (chan * row_size + row) * col_size + col; + uint8_t x = input[index] * 255; + result[index] = tone_map[x * CHAN_SIZE + chan]; + } + __visc__return(1, bytes_result); +} + +/********************************************************************/ +/*** HPVM Internal node Functions - Determine the graph structure ***/ +/********************************************************************/ + +// We create a wrapper node per leaf node - this is an implementation +// requirement for the FPGA backend . The CPU backend also supports this, +// so it does not cause a portability issue. + +void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + + // Create an 1D (specified by 1st argument) HPVM node with 1 dynamic + // instance (last argument) associated with node function scale_fxp + void *ScaleNode = __visc__createNodeND(1, scale_fxp, (size_t)1); + + // Binds inputs of current node with specified node + // - destination node + // - argument position in argument list of function of source node + // - argument position in argument list of function of destination node + // - streaming (1) or non-streaming (0) + __visc__bindIn(ScaleNode, 0, 0, 0); // bind input + __visc__bindIn(ScaleNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(ScaleNode, 2, 2, 0); // bind result + __visc__bindIn(ScaleNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(ScaleNode, 4, 4, 0); // bind row_size + __visc__bindIn(ScaleNode, 5, 5, 0); // bind col_size + + // Similar to bindIn, but for the output. Output of a node is a struct, and + // we consider the fields in increasing ordering. + __visc__bindOut(ScaleNode, 0, 0, 0); +} + +void descale_fxp_wrapper(float *input, size_t bytes_input, + uint8_t *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + void *DescaleNode = __visc__createNodeND(1, descale_fxp, (size_t)1); + __visc__bindIn(DescaleNode, 0, 0, 0); // bind input + __visc__bindIn(DescaleNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(DescaleNode, 2, 2, 0); // bind result + __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size + __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size + + __visc__bindOut(DescaleNode, 0, 0, 0); +} + +void demosaic_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, (size_t)1); + __visc__bindIn(DemosaicNode, 0, 0, 0); // bind input + __visc__bindIn(DemosaicNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(DemosaicNode, 2, 2, 0); // bind result + __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size + __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size + + __visc__bindOut(DemosaicNode, 0, 0, 0); +} + +void denoise_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(2, input, result, 1, result); + void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, (size_t)1); + __visc__bindIn(DenoiseNode, 0, 0, 0); // bind input + __visc__bindIn(DenoiseNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(DenoiseNode, 2, 2, 0); // bind result + __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size + __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size + + __visc__bindOut(DenoiseNode, 0, 0, 0); +} + +void transform_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *TsTw_tran, size_t bytes_TsTw, + int row_size, int col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(3, input, result, TsTw_tran, 1, result); + void *TransformNode = __visc__createNodeND(1, transform_fxp, (size_t)1); + __visc__bindIn(TransformNode, 0, 0, 0); // bind input + __visc__bindIn(TransformNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(TransformNode, 2, 2, 0); // bind result + __visc__bindIn(TransformNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(TransformNode, 4, 4, 0); // bind tstw + __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw + __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size + __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size + + __visc__bindOut(TransformNode, 0, 0, 0); +} + +void gamut_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, + float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, + int row_size, int col_size) { + __visc__hint(CPU_TARGET); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); + void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, (size_t)1); + __visc__bindIn(GamutNode, 0, 0, 0); // bind input + __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(GamutNode, 2, 2, 0); // bind result + __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts + __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts + __visc__bindIn(GamutNode, 6, 6, 0); // bind weights + __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights + __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs + __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs + __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist + __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist + __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size + __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size + + __visc__bindOut(GamutNode, 0, 0, 0); +} +void tone_map_fxp_wrapper(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *tone_map, size_t bytes_tone_map, + int row_size, int col_size) { + + __visc__hint(CPU_TARGET); + __visc__attributes(3, input, result, tone_map, 1, result); + void *ToneMapNode = __visc__createNodeND(1, tone_map_fxp, (size_t)1); + __visc__bindIn(ToneMapNode, 0, 0, 0); // bind input + __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input + __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result + __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result + __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map + __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map + __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size + __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size + + __visc__bindOut(ToneMapNode, 0, 0, 0); +} + + +/*** ROOT Node - Top Level of the Graph Hierarchy ***/ +void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input, + /*2*/ uint8_t *result, /*3*/ size_t bytes_result, + /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled, + /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled, + /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out, + /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out, + /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out, + /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out, + /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw, + /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts, + /*20*/ float *weights, /*21*/ size_t bytes_weights, + /*22*/ float*coefs, /*23*/ size_t bytes_coefs, + /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist, + /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map, + /*28*/ int row_size, /*29*/ int col_size) { + + //Specifies compilation target for current node + __visc__hint(CPU_TARGET); + + // Specifies pointer arguments that will be used as "in" and "out" arguments + // - count of "in" arguments + // - list of "in" argument , and similar for "out" + __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, + transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, + 5, result, demosaic_out, denoise_out, transform_out, gamut_out); + + // Create an 0D (specified by 1st argument) HPVM node - so a single node + // associated with node function ---_fxp_wrapper + void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper); + void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper); + void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper); + void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper); + void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper); + void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper); + void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper); + + // BindIn binds inputs of current node with specified node + // - destination node + // - argument position in argument list of function of source node + // - argument position in argument list of function of destination node + // - streaming (1) or non-streaming (0) + + // Edge transfers data between nodes within the same level of hierarchy. + // - source and destination dataflow nodes + // - edge type, all-all (1) or one-one(0) + // - source position (in output struct of source node) + // - destination position (in argument list of destination node) + // - streaming (1) or non-streaming (0) + + // scale_fxp inputs + __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input + __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input + __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result + __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result + __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size + __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size + + // demosaic_fxp inputs + __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input + __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input + __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result + __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result + __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size + __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size + + // denoise_fxp inputs + __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input + __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input + __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result + __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result + __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size + __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size + + // transform_fxp inputs + __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input + __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input + __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result + __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result + __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann + __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw + __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size + __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size + + // gamut_fxp inputs + __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input + __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input + __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result + __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result + __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts + __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts + __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights + __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights + __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs + __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs + __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist + __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist + __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size + __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size + + // tone_map_fxp inputs + __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input + __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input + __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result + __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result + __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map + __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map + __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size + __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size + + // descale_fxp inputs + __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input + __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input + __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result + __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result + __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size + __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size + + // Similar to bindIn, but for the output. Output of a node is a struct, and + // we consider the fields in increasing ordering. + __visc__bindOut(DsNode, 0, 0, 0); + +} + +int main(int argc, char* argv[]) { + // Parse the arguments. + arguments args; + set_default_args(&args); + argp_parse(&parser, argc, argv, 0, 0, &args); + + // Read a raw image. + // NOTE: We deliberately perform this file I/O outside of the kernel. + printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]); + int row_size, col_size; + uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size); + + printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE); + + // Allocate a buffer for storing the output image data. + // (This is currently the same size as the input image data.) + size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE; + size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE; + uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image); + uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image); + + __visc__init(); + + /////////////////////////////////////////////////////////////// + // Camera Model Parameters + /////////////////////////////////////////////////////////////// + // Path to the camera model to be used +// char cam_model_path[100]; +// char cam_model_path = "cam_models/NikonD7000/"; + // White balance index (select white balance from transform file) + // The first white balance in the file has a wb_index of 1 + // For more information on model format see the readme + int wb_index = 6; + + // Number of control points + int num_ctrl_pts = 3702; + uint8_t *input, *result; + float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out; + float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist; + + TsTw = get_TsTw("cam_models/NikonD7000/", wb_index); + float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE); + free(TsTw); + TsTw = trans; + ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts); + weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts); + coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts); + tone_map = get_tone_map("cam_models/NikonD7000/"); + + input_scaled = (float*) malloc_aligned(bytes_fimage); + result_scaled = (float*) malloc_aligned(bytes_fimage); + demosaic_out = (float*) malloc_aligned(bytes_fimage); + denoise_out = (float*) malloc_aligned(bytes_fimage); + transform_out = (float*) malloc_aligned(bytes_fimage); + gamut_out = (float*) malloc_aligned(bytes_fimage); + l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts); + + // This is host_input in cam_pipe() + input = (uint8_t*) malloc_aligned(bytes_image); + convert_hwc_to_chw(image_in, row_size, col_size, &input); + + // This is host_result in cam_pipe() + result = (uint8_t*) malloc_aligned(bytes_image); + + // Allocate struct to pass DFG inputs + RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn)); + + // Set up HPVM DFG inputs in the rootArgs struct. + rootArgs->input = input; + rootArgs->bytes_input = bytes_image; + + rootArgs->result = result; + rootArgs->bytes_result = bytes_image; + + rootArgs->input_scaled = input_scaled; + rootArgs->bytes_input_scaled = bytes_fimage; + + rootArgs->result_scaled = result_scaled; + rootArgs->bytes_result_scaled = bytes_fimage; + + rootArgs->demosaic_out = demosaic_out; + rootArgs->bytes_demosaic_out = bytes_fimage; + + rootArgs->denoise_out = denoise_out; + rootArgs->bytes_denoise_out = bytes_fimage; + + rootArgs->transform_out = transform_out; + rootArgs->bytes_transform_out = bytes_fimage; + + rootArgs->gamut_out = gamut_out; + rootArgs->bytes_gamut_out = bytes_fimage; + + rootArgs->TsTw = TsTw; + rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float); + + rootArgs->ctrl_pts = ctrl_pts; + rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->weights = weights; + rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float); + + rootArgs->coefs = coefs; + rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float); + + rootArgs->tone_map = tone_map; + rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float); + + rootArgs->l2_dist = l2_dist; + rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float); + + rootArgs->row_size = row_size; + rootArgs->col_size = col_size; + + // Memory tracking is required for pointer arguments. + // Nodes can be scheduled on different targets, and + // dataflow edge implementation needs to request data. + // The pair (pointer, size) is inserted in memory tracker using this call + llvm_visc_track_mem(input, bytes_image); + llvm_visc_track_mem(result, bytes_image); + llvm_visc_track_mem(input_scaled, bytes_fimage); + llvm_visc_track_mem(result_scaled, bytes_fimage); + llvm_visc_track_mem(demosaic_out, bytes_fimage); + llvm_visc_track_mem(denoise_out, bytes_fimage); + llvm_visc_track_mem(transform_out, bytes_fimage); + llvm_visc_track_mem(gamut_out, bytes_fimage); + llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float)); + llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float)); + llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float)); + + printf("\n\nLaunching CAVA pipeline!\n"); + + void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs); + __visc__wait(camPipeDFG); + + printf("\n\nPipeline execution completed!\n"); + printf("\n\nRequesting memory!\n"); + + // Request data from graph. + llvm_visc_request_mem(result, bytes_image); + llvm_visc_request_mem(demosaic_out, bytes_fimage); + llvm_visc_request_mem(denoise_out, bytes_fimage); + llvm_visc_request_mem(transform_out, bytes_fimage); + llvm_visc_request_mem(gamut_out, bytes_fimage); + printf("\n\nDone requesting memory!\n"); + + + uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image); + + descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size); + descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size); + descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size); + descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size); + + convert_chw_to_hwc(result, row_size, col_size, &image_out); + convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut); + convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic); + convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise); + convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform); + + + // Remove tracked pointers. + llvm_visc_untrack_mem(input); + llvm_visc_untrack_mem(result); + llvm_visc_untrack_mem(input_scaled); + llvm_visc_untrack_mem(result_scaled); + llvm_visc_untrack_mem(demosaic_out); + llvm_visc_untrack_mem(denoise_out); + llvm_visc_untrack_mem(transform_out); + llvm_visc_untrack_mem(gamut_out); + + llvm_visc_untrack_mem(TsTw); + llvm_visc_untrack_mem(ctrl_pts); + llvm_visc_untrack_mem(weights); + llvm_visc_untrack_mem(coefs); + llvm_visc_untrack_mem(tone_map); + llvm_visc_untrack_mem(l2_dist); + + // Output the image. + // NOTE: We deliberately perform this file I/O outside of the kernel. + char str[50], base_str[50]; + strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]); + strcpy(str, base_str); + strcat(str, ".bin"); + printf("Writing output image to %s\n", str); + write_image_to_binary(str, image_out, row_size, col_size); + strcpy(str, base_str); + strcat(str, "_gamut.bin"); + printf("Writing output image to %s\n", str); + write_image_to_binary(str, image_out_gamut, row_size, col_size); + strcpy(str, base_str); + strcat(str, "_demosaic.bin"); + printf("Writing output image to %s\n", str); + write_image_to_binary(str, image_out_demosaic, row_size, col_size); + strcpy(str, base_str); + strcat(str, "_denoise.bin"); + printf("Writing output image to %s\n", str); + write_image_to_binary(str, image_out_denoise, row_size, col_size); + strcpy(str, base_str); + strcat(str, "_transform.bin"); + printf("Writing output image to %s\n", str); + write_image_to_binary(str, image_out_transform, row_size, col_size); + + __visc__cleanup(); + + return 0; +} + diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/new_main.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/new_main.c new file mode 100644 index 0000000000000000000000000000000000000000..e6e36ba1db0d4d2e7806256b4c31e5955917e68c --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/new_main.c @@ -0,0 +1,115 @@ + + +#include <stdlib.h> + + +typedef struct __attribute__((__packed__)) { + float* input; size_t bytes_input; + float* result; size_t bytes_result; +} +RootIn; + + + + +void scale_values(float* input, size_t bytes_input) { + + __visc__hint(DEVICE); + __visc__attributes(1, input, 1, input); + + for (int ind = 0; ind < bytes_input; ind++){ + input[ind] = input[ind] * 2.0; + } + + __visc__return(1, bytes_input); +} + + + + +void graphRoot(/*0*/ float* input, /*1*/ size_t bytes_input, + /*2*/ float* result, /*3*/ size_t bytes_result) { + + //Specifies compilation target for current node + __visc__hint(CPU_TARGET); + + __visc__attributes(2, input, result, 2, input, result); + + // Create an 0D (specified by 1st argument) HPVM node - so a single node + // associated with node function ---_fxp_wrapper + + void* scaleNode = __visc__createNodeND(0, scale_values); + + // BindIn binds inputs of current node with specified node + // - destination node + // - argument position in argument list of function of source node + // - argument position in argument list of function of destination node + // - streaming (1) or non-streaming (0) + + // Edge transfers data between nodes within the same level of hierarchy. + // - source and destination dataflow nodes + // - edge type, all-all (1) or one-one(0) + // - source position (in output struct of source node) + // - destination position (in argument list of destination node) + // - streaming (1) or non-streaming (0) + + // scale_fxp inputs + __visc__bindIn(scaleNode, 0, 0, 0); // input -> ScNode:input + __visc__bindIn(scaleNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input + + // Similar to bindIn, but for the output. Output of a node is a struct, and + // we consider the fields in increasing ordering. + __visc__bindOut(scaleNode, 0, 0, 0); + +} + + + + + +int main(int argc, char* argv[]) { + + __visc__init(); + + size_t input_size = 100; + size_t result_size = 100; + + size_t input_bytes = input_size * sizeof(float); + size_t result_bytes = result_size * sizeof(float); + + // This is host_input in cam_pipe() + input = (float*) malloc(input_bytes); + // This is host_result in cam_pipe() + result = (float*) malloc(result_bytes); + + + RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn)); + + // Set up HPVM DFG inputs in the rootArgs struct. + rootArgs->input = input; + rootArgs->bytes_input = input_bytes; + + rootArgs->result = result; + rootArgs->bytes_result = result_bytes; + + llvm_visc_track_mem(input, input_bytes); + llvm_visc_track_mem(result, result_bytes); + + + void* testDFG = __visc__launch(0, graphRoot, (void*) rootArgs); + __visc__wait(testDFG); + + + llvm_visc_request_mem(result, input_bytes); + llvm_visc_request_mem(result, result_bytes); + + + + __visc__cleanup(); + + return 0; +} + + + + diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.c new file mode 100644 index 0000000000000000000000000000000000000000..2ebedec936915b5e7f11881c5001c84b6db26474 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.c @@ -0,0 +1,290 @@ +#include <stdio.h> +#include <math.h> +#include "pipe_stages.h" +#include "cam_pipe_utility.h" + +//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) { +void scale_fxp(uint8_t *input, size_t bytes_input, + float *output, size_t bytes_output, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, output, 1, output); + + ARRAY_3D(uint8_t, _input, input, row_size, col_size); + ARRAY_3D(float, _output, output, row_size, col_size); + sl_chan: + for (int chan = 0; chan < CHAN_SIZE; chan++) + sl_row: + for (int row = 0; row < row_size; row++) + sl_col: + for (int col = 0; col < col_size; col++) + _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255; + + __visc__return(1, bytes_output); +} + +//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) { +void descale_fxp(float *input, size_t bytes_input, + uint8_t *output, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, output, 1, output); + + ARRAY_3D(float, _input, input, row_size, col_size); + ARRAY_3D(uint8_t, _output, output, row_size, col_size); + dsl_chan: + for (int chan = 0; chan < CHAN_SIZE; chan++) + dsl_row: + for (int row = 0; row < row_size; row++) + dsl_col: + for (int col = 0; col < col_size; col++) + _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255); + + __visc__return(1, bytes_output); +} + +// Demosaicing stage +// G R +// B G +//void demosaic_fxp(float *input, int row_size, int col_size, float *result) { +void demosaic_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, result, 1, result); + + printf("Demosaicing.\n"); + ARRAY_3D(float, _input, input, row_size, col_size); + ARRAY_3D(float, _result, result, row_size, col_size); + + dm_row: + for (int row = 1; row < row_size - 1; row++) + dm_col: + for (int col = 1; col < col_size - 1; col++) + if (row % 2 == 0 && col % 2 == 0) { + // Green pixel + // Getting the R values + float R1 = _input[0][row][col - 1]; + float R2 = _input[0][row][col + 1]; + // Getting the B values + float B1 = _input[2][row - 1][col]; + float B2 = _input[2][row + 1][col]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } else if (row % 2 == 0 && col % 2 == 1) { + // Red pixel + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // Getting the B values + float B1 = _input[2][row - 1][col - 1]; + float B2 = _input[2][row - 1][col + 1]; + float B3 = _input[2][row + 1][col - 1]; + float B4 = _input[2][row + 1][col + 1]; + // R + _result[0][row][col] = _input[0][row][col]; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B (center pixel) + _result[2][row][col] = (B1 + B2 + B3 + B4) / 4; + } else if (row % 2 == 1 && col % 2 == 0) { + // Blue pixel + // Getting the R values + float R1 = _input[0][row - 1][col - 1]; + float R2 = _input[0][row + 1][col - 1]; + float R3 = _input[0][row - 1][col + 1]; + float R4 = _input[0][row + 1][col + 1]; + // Getting the G values + float G1 = _input[1][row - 1][col]; + float G2 = _input[1][row + 1][col]; + float G3 = _input[1][row][col - 1]; + float G4 = _input[1][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2 + R3 + R4) / 4; + // G + _result[1][row][col] = (G1 + G2 + G3 + G4) / 2; + // B + _result[2][row][col] = _input[2][row][col]; + } else { + // Bottom Green pixel + // Getting the R values + float R1 = _input[0][row - 1][col]; + float R2 = _input[0][row + 1][col]; + // Getting the B values + float B1 = _input[2][row][col - 1]; + float B2 = _input[2][row][col + 1]; + // R + _result[0][row][col] = (R1 + R2) / 2; + // G + _result[1][row][col] = _input[1][row][col] * 2; + // B + _result[2][row][col] = (B1 + B2) / 2; + } + + __visc__return(1, bytes_result); +} + +static void sort(float arr[], int n) { + int i, j; + dn_sort_i: + for (i = 0; i < n - 1; i++) + dn_sort_j: + for (j = 0; j < n - i - 1; j++) + if (arr[j] > arr[j + 1]) { + float temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } +} + +// Simple denoise +//void denoise_fxp(float *input, int row_size, int col_size, float *result) { +void denoise_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(2, input, result, 1, result); + + printf("Denoising.\n"); + ARRAY_3D(float, _input, input, row_size, col_size); + ARRAY_3D(float, _result, result, row_size, col_size); + + dn_chan: + for (int chan = 0; chan < CHAN_SIZE; chan++) + dn_row: + for (int row = 0; row < row_size; row++) + dn_col: + for (int col = 0; col < col_size; col++) + if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) { + float filter[9]; + dn_slide_row: + for (int i = row-1; i < row+2; i++) + dn_slide_col: + for (int j = col-1; j < col+2; j++) { + int index = (i - row + 1) * 3 + j - col + 1; + filter[index] = _input[chan][i][j]; + } + sort(filter, 9); + _result[chan][row][col] = filter[4]; + } else { + _result[chan][row][col] = _input[chan][row][col]; + } + __visc__return(1, bytes_result); +} + +// Color map and white balance transform +//void transform_fxp(float *input, int row_size, int col_size, float *result, +// float *TsTw_tran) { +void transform_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *TsTw_tran, size_t bytes_TsTw, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(3, input, result, TsTw_tran, 1, result); + + printf("Color mapping.\n"); + ARRAY_3D(float, _input, input, row_size, col_size); + ARRAY_3D(float, _result, result, row_size, col_size); + ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3); + + tr_chan: + for (int chan = 0; chan < CHAN_SIZE; chan++) + tr_row: + for (int row = 0; row < row_size; row++) + tr_col: + for (int col = 0; col < col_size; col++) + _result[chan][row][col] = + max(_input[0][row][col] * _TsTw_tran[0][chan] + + _input[1][row][col] * _TsTw_tran[1][chan] + + _input[2][row][col] * _TsTw_tran[2][chan], + 0); + __visc__return(1, bytes_result); +} + +// +// Weighted radial basis function for gamut mapping +// +//void gamut_map_fxp(float *input, int row_size, int col_size, float *result, +// float *ctrl_pts, float *weights, float *coefs, float *l2_dist) { +void gamut_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, + float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result); + + printf("Gamut mapping.\n"); + ARRAY_3D(float, _input, input, row_size, col_size); + ARRAY_3D(float, _result, result, row_size, col_size); + ARRAY_2D(float, _ctrl_pts, ctrl_pts, 3); + ARRAY_2D(float, _weights, weights, 3); + ARRAY_2D(float, _coefs, coefs, 3); + + // First, get the L2 norm from every pixel to the control points, + // Then, sum it and weight it. Finally, add the bias. + gm_rbf_row: + for (int row = 0; row < row_size; row++) + gm_rbf_col: + for (int col = 0; col < col_size; col++) { + gm_rbf_cp0: + for (int cp = 0; cp < num_ctrl_pts; cp++) { + l2_dist[cp] = + sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) * + (_input[0][row][col] - _ctrl_pts[cp][0]) + + (_input[1][row][col] - _ctrl_pts[cp][1]) * + (_input[1][row][col] - _ctrl_pts[cp][1]) + + (_input[2][row][col] - _ctrl_pts[cp][2]) * + (_input[2][row][col] - _ctrl_pts[cp][2])); + } + gm_rbf_chan: + for (int chan = 0; chan < CHAN_SIZE; chan++) { + float chan_val = 0.0; + gm_rbf_cp1: + for (int cp = 0; cp < num_ctrl_pts; cp++) { + chan_val += l2_dist[cp] * _weights[cp][chan]; + } + // Add on the biases for the RBF + chan_val += _coefs[0][chan] + _coefs[1][chan] * _input[0][row][col] + + _coefs[2][chan] * _input[1][row][col] + + _coefs[3][chan] * _input[2][row][col]; + _result[chan][row][col] = max(chan_val, 0); + } + } + __visc__return(1, bytes_result); +} + +// Tone mapping +//void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map, +// float *result) { +void tone_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *tone_map, size_t bytes_tone_map, + int row_size, int col_size) { + __visc__hint(DEVICE); + __visc__attributes(3, input, result, tone_map, 1, result); + + printf("Tone mapping.\n"); + ARRAY_3D(float, _input, input, row_size, col_size); + ARRAY_3D(float, _result, result, row_size, col_size); + ARRAY_2D(float, _tone_map, tone_map, 3); + + tm_chan: + for (int chan = 0; chan < CHAN_SIZE; chan++) + tm_row: + for (int row = 0; row < row_size; row++) + tm_col: + for (int col = 0; col < col_size; col++) { + uint8_t x = _input[chan][row][col] * 255; + _result[chan][row][col] = _tone_map[x][chan]; + } + __visc__return(1, bytes_result); +} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.h new file mode 100644 index 0000000000000000000000000000000000000000..eae4347b991fe948173fc85334c65f084d40b745 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/pipe_stages.h @@ -0,0 +1,63 @@ +#ifndef _PIPE_STAGES_H_ +#define _PIPE_STAGES_H_ + +#include "defs.h" + +#define CHAN_SIZE 3 + +#define ISP 0x4 + +#define max(a,b) \ + ({ __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a > _b ? _a : _b; }) + +#define min(a,b) \ + ({ __typeof__ (a) _a = (a); \ + __typeof__ (b) _b = (b); \ + _a < _b ? _a : _b; }) + +#define abs(a) \ + ({ __typeof__ (a) _a = (a); \ + _a < 0 ? -_a : _a; }) + +extern int num_ctrl_pts; + +void scale_fxp(uint8_t *input, size_t bytes_input, + float *output, size_t bytes_output, + int row_size, int col_size); + +void descale_fxp(float *input, size_t bytes_input, + uint8_t *output, size_t bytes_result, + int row_size, int col_size); + +void demosaic_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size); + +void denoise_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + int row_size, int col_size); + +void transform_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *TsTw_tran, size_t bytes_TsTw, + int row_size, int col_size); + +void gamut_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *ctrl_pts, size_t bytes_ctrl_pts, + float *weights, size_t bytes_weights, + float *coefs, size_t bytes_coefs, + float *l2_dist, size_t bytes_l2_dist, + int row_size, int col_size); + +void tone_map_fxp(float *input, size_t bytes_input, + float *result, size_t bytes_result, + float *tone_map, size_t bytes_tone_map, + int row_size, int col_size); + +void tone_map_approx_fxp(float *input, int row_size, int col_size, + float *result); + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/rename.sh b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/rename.sh new file mode 100755 index 0000000000000000000000000000000000000000..3b6faac63aef89cc51f776ed7e5f1e048f8da815 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/rename.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +for f in *.cc; do + mv -- "$f" "${f%.cc}.c" +done diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.c b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.c new file mode 100644 index 0000000000000000000000000000000000000000..c1eaee3333c2afffdcae827f956efa4e25705352 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.c @@ -0,0 +1,11 @@ +#include <stdlib.h> +#include <assert.h> +#include "defs.h" +#include "utility.h" + +void *malloc_aligned(size_t size) { + void *ptr = NULL; + int err = posix_memalign((void **)&ptr, CACHELINE_SIZE, size); + assert(err == 0 && "Failed to allocate memory!"); + return ptr; +} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..fc407c72a3c251dc5628bc90e23e71040a074a32 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/utility.h @@ -0,0 +1,8 @@ +#ifndef _COMMON_UTILITY_H_ +#define _COMMON_UTILITY_H_ + +#include <stddef.h> + +void *malloc_aligned(size_t size); + +#endif diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/visc.h b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/visc.h new file mode 100644 index 0000000000000000000000000000000000000000..a263e352523431a868637b1849ac532d7ed716a9 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/cava_test/src/visc.h @@ -0,0 +1,110 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef DEVICE +#define DEVICE GPU_TARGET +#endif + +#include "llvm/SupportVISC/VISCHint.h" + +#ifdef __cplusplus +extern "C" { +void __visc__hint(visc::Target); +//void __visc__wait(void*); +#else +void __visc__hint(enum Target); +//void __visc__wait(unsigned); +#endif + +#ifdef __cplusplus +//void* __visc__node(...); +//void* __visc__createNode(...); +//void* __visc__createNode1D(...); +//void* __visc__createNode2D(...); +//void* __visc__createNode3D(...); +//void __visc__return(...); +#endif + +void* __visc__createNodeND(unsigned,...); +void __visc__return(unsigned, ...); + +void __visc__attributes(unsigned, ...); +void __visc__init(); +void __visc__cleanup(); + +void __visc__bindIn(void*, unsigned, unsigned, unsigned); +void __visc__bindOut(void*, unsigned, unsigned, unsigned); +void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned); +void __visc__push(void*, void*); +void* __visc__pop(void*); +void* __visc__launch(unsigned, ...); +void __visc__wait(void*); + +void* __visc__getNode(); +void* __visc__getParentNode(void*); +void __visc__barrier(); +void* __visc__malloc(long); +long __visc__getNodeInstanceID_x(void*); +long __visc__getNodeInstanceID_y(void*); +long __visc__getNodeInstanceID_z(void*); +long __visc__getNumNodeInstances_x(void*); +long __visc__getNumNodeInstances_y(void*); +long __visc__getNumNodeInstances_z(void*); + +// Atomic +// signed int +int __visc__atomic_cmpxchg(int*, int, int); +int __visc__atomic_add(int*, int); +int __visc__atomic_sub(int*, int); +int __visc__atomic_xchg(int*, int); +int __visc__atomic_inc(int*); +int __visc__atomic_dec(int*); +int __visc__atomic_min(int*, int); +int __visc__atomic_max(int*, int); +int __visc__atomic_umax(int*, int); +int __visc__atomic_umin(int*, int); +int __visc__atomic_and(int*, int); +int __visc__atomic_or(int*, int); +int __visc__atomic_xor(int*, int); + +// Special Func +float __visc__floor(float); +float __visc__rsqrt(float); +float __visc__sqrt(float); +float __visc__sin(float); +float __visc__cos(float); +// unsigned int +//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned); +//unsigned __visc__atomic_add(unsigned*, unsigned); +//unsigned __visc__atomic_sub(unsigned*, unsigned); +//unsigned __visc__atomic_xchg(unsigned*, unsigned); +//unsigned __visc__atomic_inc(unsigned*); +//unsigned __visc__atomic_dec(unsigned*); +//unsigned __visc__atomic_min(unsigned*, unsigned); +//unsigned __visc__atomic_max(unsigned*, unsigned); +//unsigned __visc__atomic_and(unsigned*, unsigned); +//unsigned __visc__atomic_or(unsigned*, unsigned); +//unsigned __visc__atomic_xor(unsigned*, unsigned); + + +#include <unistd.h> + +long get_global_id(int); +long get_group_id(int); +long get_local_id(int); +long get_local_size(int); + + +void llvm_visc_track_mem(void*, size_t); +void llvm_visc_untrack_mem(void*); +void llvm_visc_request_mem(void*, size_t); + +#ifdef __cplusplus +} +#endif +