From fc36236a449b1ac088e4c0d394f3d3b53e148242 Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Fri, 6 Dec 2019 01:51:13 -0600
Subject: [PATCH] Adding simple Saxpy test

---
 .../parboil/benchmarks/saxpy_test/Makefile    | 176 ++++++++++++++
 .../parboil/benchmarks/saxpy_test/src/defs.h  | 224 ++++++++++++++++++
 .../parboil/benchmarks/saxpy_test/src/main.c  | 135 +++++++++++
 .../parboil/benchmarks/saxpy_test/src/visc.h  | 110 +++++++++
 4 files changed, 645 insertions(+)
 create mode 100644 llvm/test/VISC/parboil/benchmarks/saxpy_test/Makefile
 create mode 100644 llvm/test/VISC/parboil/benchmarks/saxpy_test/src/defs.h
 create mode 100644 llvm/test/VISC/parboil/benchmarks/saxpy_test/src/main.c
 create mode 100644 llvm/test/VISC/parboil/benchmarks/saxpy_test/src/visc.h

diff --git a/llvm/test/VISC/parboil/benchmarks/saxpy_test/Makefile b/llvm/test/VISC/parboil/benchmarks/saxpy_test/Makefile
new file mode 100644
index 0000000000..f452c0b2cc
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/saxpy_test/Makefile
@@ -0,0 +1,176 @@
+# This Makefile compiles the HPVM-CAVA pilot project. 
+# It builds HPVM-related dependencies, then the native camera pipeline ISP code.
+#
+# Paths to some dependencies (e.g., HPVM, LLVM) must exist in Makefile.config,
+# which can be copied from Makefile.config.example for a start.
+
+CONFIG_FILE := Makefile.config
+
+ifeq ($(wildcard $(CONFIG_FILE)),)
+    $(error $(CONFIG_FILE) not found. See $(CONFIG_FILE).example)
+endif
+include $(CONFIG_FILE)
+
+# Compiler Flags
+
+DLEVEL ?= 0
+LFLAGS += -lm -lrt
+
+# Build dirs
+ifeq ($(VERSION),)
+    VERSION = IR_modules
+endif
+SRC_DIR = src/
+CAM_PIPE_SRC_DIR = $(SRC_DIR)
+BUILD_DIR = build/$(TARGET)_$(VERSION)
+CURRENT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+
+# Source files for the frontend camera pipeline
+COMMON_SRCS = main.c \
+
+
+#CAM_PIPE_SRCS = load_cam_model.c \
+        cam_pipe_utility.c \
+	dma_interface.c # FIXME: This is a hack until external C++ files can be included in build.
+
+
+# NOTE: We have temporarily removed gem5 and other dependencies for simplicity.
+SRCS = $(COMMON_SRCS)
+
+# NATIVE_FULL_PATH_SRCS contains all the full path source files for the camera vision pipeline.
+NATIVE_FULL_PATH_SRCS = $(patsubst %, $(SRC_DIR)/%, $(COMMON_SRCS))
+NATIVE_FULL_PATH_SRCS += $(patsubst %, $(CAM_PIPE_SRC_DIR)/%, $(CAM_PIPE_SRCS))
+
+INCLUDES +=  -I$(SRC_DIR) \
+            -I$(CAM_PIPE_SRC_DIR) \
+
+ifneq ($(CONFUSE_ROOT),)
+INCLUDES += -I$(CONFUSE_ROOT)/include
+LFLAGS += -L$(CONFUSE_ROOT)/lib
+endif
+
+EXE = vec_add
+
+CAM_CFLAGS += -mf16c -flax-vector-conversions
+LFLAGS += -pthread
+
+
+## BEGIN HPVM MAKEFILE
+LANGUAGE=visc
+#SRCDIR_OBJS= load_cam_model.ll cam_pipe_utility.ll dma_interface.ll utility.ll
+#OBJS_SRC=src/cam_pipe.c src/pipe_stages.c src/load_cam_model.c src/cam_pipe_utility.c src/dma_interface.c src/utility.c
+VISC_OBJS=main.visc.ll
+APP = $(EXE)
+APP_CUDALDFLAGS=-lm -lstdc++
+APP_CFLAGS= $(INCLUDES) -DDMA_MODE -DDMA_INTERFACE_V3
+APP_CXXFLAGS=-ffast-math -O0 -I/opt/opencv/include
+APP_LDFLAGS=$(LFLAGS)
+OPT_FLAGS = -tti -targetlibinfo -tbaa -scoped-noalias -assumption-cache-tracker -profile-summary-info -forceattrs -inferattrs -ipsccp -globalopt -domtree -mem2reg -deadargelim -domtree -basicaa -aa -simplifycfg -pgo-icall-prom -basiccg -globals-aa -prune-eh -always-inline -functionattrs -domtree -sroa -early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -domtree -basicaa -aa -libcalls-shrinkwrap -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -licm -loop-unswitch -simplifycfg -domtree -basicaa -aa -loops -loop-simplify -lcssa-verification -lcssa -scalar-evolution -indvars -loop-idiom -loop-deletion -memdep -memcpyopt -sccp -domtree -demanded-bits -bdce -basicaa -aa -lazy-value-info -jump-threading -correlated-propagation -domtree -basicaa -aa -memdep -dse -loops -loop-simplify -lcssa-verification -lcssa -aa -scalar-evolution -licm -postdomtree -adce -simplifycfg -domtree -basicaa -aa -barrier -basiccg -rpo-functionattrs -globals-aa -float2int -domtree -loops -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -loop-rotate -loop-accesses -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-distribute -loop-simplify -lcssa-verification -lcssa -branch-prob -block-freq -scalar-evolution -basicaa -aa -loop-accesses -demanded-bits -lazy-branch-prob -lazy-block-freq -opt-remark-emitter -loop-vectorize -loop-simplify -scalar-evolution -aa -loop-accesses -loop-load-elim -basicaa -aa -simplifycfg -domtree -basicaa -aa -loops -scalar-evolution -alignment-from-assumptions -strip-dead-prototypes -domtree -loops -branch-prob -block-freq -loop-simplify -lcssa-verification -lcssa -basicaa -aa -scalar-evolution -branch-prob -block-freq -loop-sink -instsimplify 
+
+CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
+OBJS_CFLAGS = -O1 $(APP_CFLAGS) $(PLATFORM_CFLAGS)
+CXXFLAGS = $(APP_CXXFLAGS) $(PLATFORM_CXXFLAGS)
+LDFLAGS= $(APP_LDFLAGS) $(PLATFORM_LDFLAGS)
+
+LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../libclc/built_libs
+VISC_RT_PATH = $(LLVM_SRC_ROOT)/projects/visc-rt
+
+VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
+LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx64--nvidiacl.bc
+
+LLVM_34_AS = $(LLVM_34_ROOT)/build/bin/llvm-as
+
+TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
+KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl
+
+
+DEVICE = CPU_TARGET
+VISC_OPTFLAGS = -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG
+VISC_OPTFLAGS += -visc-timers-x86
+TESTGEN_OPTFLAGS += -visc-timers-gen
+
+CFLAGS += -DDEVICE=$(DEVICE)
+CXXFLAGS += -DDEVICE=$(DEVICE)
+
+
+# Add BUILDDIR as a prefix to each element of $1
+INBUILDDIR=$(addprefix $(BUILD_DIR)/,$(1))
+
+PYTHON_LLVM_40_34 = ../llvm-40-34.py
+
+.PRECIOUS: $(BUILD_DIR)/%.ll
+
+OBJS = $(call INBUILDDIR,$(SRCDIR_OBJS))
+TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
+KERNEL = $(TEST_OBJS).kernels.ll
+
+ifeq ($(TARGET),x86)
+  SPIR_ASSEMBLY = $(TEST_OBJS).kernels.bc
+else ifeq ($(TARGET),seq)
+else ifeq ($(TARGET),fpga)
+  AOC_CL = $(TEST_OBJS).kernels.cl
+  AOCL_ASSEMBLY = $(TEST_OBJS).kernels.aocx
+  BOARD = a10gx
+  ifeq ($(EMULATION),1)
+    EXE = cava-visc-emu
+    AOC_EMU = -march=emulator
+    BUILD_DIR = build/$(TARGET)-emu
+  endif
+else
+  KERNEL_LINKED = $(BUILD_DIR)/$(APP).kernels.linked.ll
+  PTX_ASSEMBLY = $(TEST_OBJS).nvptx.s
+endif
+
+HOST_LINKED = $(BUILD_DIR)/$(APP).linked.ll
+HOST = $(BUILD_DIR)/$(APP).host.ll
+
+ifeq ($(OPENCL_PATH),)
+FAILSAFE=no_opencl
+else 
+FAILSAFE=
+endif
+
+# Targets
+default: $(FAILSAFE) $(BUILD_DIR) $(EXE)
+#default: $(FAILSAFE) $(BUILD_DIR) $(PTX_ASSEMBLY) $(SPIR_ASSEMBLY) $(AOC_CL) $(AOCL_ASSEMBLY) $(EXE)
+
+$(PTX_ASSEMBLY) : $(KERNEL_LINKED)
+	$(CC) $(KERNEL_GEN_FLAGS) -S $< -o $@
+
+$(KERNEL_LINKED) : $(KERNEL)
+	$(LLVM_LINK) $(LIBCLC_NVPTX_LIB) -S $< -o $@
+
+$(SPIR_ASSEMBLY) : $(KERNEL)
+	python $(PYTHON_LLVM_40_34) $< $(BUILD_DIR)/kernel_34.ll
+	$(LLVM_34_AS) $(BUILD_DIR)/kernel_34.ll -o $@
+
+$(AOCL_ASSEMBLY) : $(AOC_CL)
+	aoc --report $(AOC_EMU) $(AOC_CL) -o $(AOCL_ASSEMBLY) -board=$(BOARD)
+
+$(AOC_CL) : $(KERNEL)
+	llvm-cbe --debug $(KERNEL)
+
+$(EXE) : $(HOST_LINKED)
+	$(CXX) -O3 $(LDFLAGS) $< -o $@
+
+$(HOST_LINKED) : $(HOST) $(OBJS) $(VISC_RT_LIB)
+	$(LLVM_LINK) $^ -S -o $@
+
+$(VISC_RT_LIB) : $(VISC_RT_PATH)/visc-rt.cpp
+	make -C $(LLVM_LIB_PATH)
+
+$(HOST) $(KERNEL): $(BUILD_DIR)/$(VISC_OBJS)
+	$(OPT) -debug $(VISC_OPTFLAGS) -S $< -o $(HOST)
+
+$(BUILD_DIR):
+	mkdir -p $(BUILD_DIR)
+
+$(BUILD_DIR)/%.ll : $(SRC_DIR)/%.c
+	$(CC) $(OBJS_CFLAGS) -emit-llvm -S -o $@ $<
+
+$(BUILD_DIR)/main.ll : $(SRC_DIR)/main.c
+	$(CC) $(CFLAGS) -emit-llvm -S -o $@ $<
+
+$(BUILD_DIR)/main.visc.ll : $(BUILD_DIR)/main.ll
+	$(OPT) -debug-only=genvisc $(TESTGEN_OPTFLAGS) $< -S -o $@
+
diff --git a/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/defs.h b/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/defs.h
new file mode 100644
index 0000000000..ccc8acc857
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/defs.h
@@ -0,0 +1,224 @@
+#ifndef _COMMON_DEFS_H_
+#define _COMMON_DEFS_H_
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long uint64_t;
+
+#define CACHELINE_SIZE 64
+
+// Debugging message macros.
+#if DEBUG_LEVEL >= 1
+  #define INFO_MSG(args...) printf(args)
+
+  #if DEBUG_LEVEL >= 2
+    #define PRINT_MSG(args...) printf(args)
+    #define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
+        print_debug(hid, rows, cols, num_cols)
+    #define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
+        print_debug4d(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
+        print_debug4d_fp16(hid, num, height, rows, cols)
+
+    #if DEBUG_LEVEL >= 3
+      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
+          print_debug(hid, rows, cols, num_cols)
+      #define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
+          print_debug4d(hid, rows, cols, height)
+      #define PRINT_MSG_V(args...) printf(args)
+    #else
+      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)
+      #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+      #define PRINT_MSG_V(args...)
+    #endif
+  #else
+    #define PRINT_MSG(args...)
+    #define PRINT_DEBUG(hid, rows, cols, num_cols)
+    #define PRINT_DEBUG4D(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+    #define PRINT_DEBUG_V(hid, rows, cols, height)
+    #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+    #define PRINT_MSG_V(args...)
+  #endif
+#else
+  #define INFO_MSG(args...)
+  #define PRINT_DEBUG(hid, rows, cols, num_cols)
+  #define PRINT_DEBUG4D(hid, rows, cols, height)
+  #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+  #define PRINT_MSG(args...)
+  #define PRINT_DEBUG_V(hid, rows, cols, height)
+  #define PRINT_DEBUG4D_V(hid, rows, cols, height)
+  #define PRINT_MSG_V(args...)
+#endif
+
+#define STRING(arg) #arg
+
+// This is to avoid a ton of spurious unused variable warnings when
+// we're not building for gem5.
+#define UNUSED(x) (void)(x)
+
+// Macros for computing the maximum of a group of elements.
+//
+// Why macros and not functions (or a loop)? A loop takes O(n) cycles to
+// compute the maximum, when it could be done in O(log n) time with a tree
+// based implementation. But Aladdin regards function calls as a hard
+// dependency that it does not optimize across, so we would not get the
+// parallelism we expect from the tree. Thus, these must be macros.
+//
+// I've only implemented a few of these. These are only meant for the pooling
+// layers, and we shouldn't need more than a 3x3 pooling layer anyways.
+#define max2(A, B) (((A) > (B)) ? (A) : (B))
+#define max3(e0, e1, e2) max2(max2(e0, e1), e2)
+#define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3))
+#define max8(e0, e1, e2, e3, e4, e5, e6, e7)                                   \
+    max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
+#define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8)                               \
+    max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
+
+#define min2(A, B) (((A) < (B)) ? (A) : (B))
+
+#define FRAC_CEIL(A, B) ((A) / (B) + ((A) % (B) != 0))
+// Convenience macros to switch between invoking an accelerator (if building a
+// binary for gem5) or just calling the kernel function in software.
+//
+// Usage:
+//
+//  These macros expand differently based on whether the GEM5_HARNESS macro is
+//  defined. If so, then this binary is meant to be run under gem5, invoking
+//  accelerators; if not, this binary should run the pure software version of
+//  the accelerated kernels.
+//
+//  If GEM5_HARNESS is defined:
+//
+//     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
+//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize)
+//
+//     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
+//        ===>   invokeAcceleratorAndBlock(myReqCode)
+//
+//  Otherwise:
+//     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
+//        expands to nothing
+//
+//     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
+//        ===>  kernelFuncName(args)
+//
+#ifdef GEM5_HARNESS
+
+#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
+    mapArrayToAccelerator(req_code, name, base_addr, size)
+#define INVOKE_KERNEL(req_code, kernel_ptr, args...)                           \
+    do {                                                                       \
+        UNUSED(kernel_ptr);                                                    \
+        invokeAcceleratorAndBlock(req_code);                                   \
+    } while (0)
+#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
+    do {                                                                       \
+        UNUSED(kernel_ptr);                                                    \
+        invokeAcceleratorAndReturn2(req_code, finish_flag);                    \
+    } while (0)
+
+#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);       \
+    } while (0)
+#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);        \
+    } while (0)
+#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);        \
+    } while (0)
+#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);         \
+    } while (0)
+
+#else
+
+#define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
+    do {                                                                       \
+        INFO_MSG("Mapping array %s @ %p, size %d.\n",                          \
+                 name, (void*)base_addr, (int)(size));                         \
+        UNUSED(req_code);                                                      \
+        UNUSED(name);                                                          \
+        UNUSED(base_addr);                                                     \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args)
+#define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
+    kernel_ptr(args)
+#define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+#define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
+    do {                                                                       \
+        UNUSED(start_addr);                                                    \
+        UNUSED(size);                                                          \
+    } while (0)
+
+#endif
+
+// Simplified version of MAP_ARRAY_TO_ACCEL.
+//
+// This assumes that the current name of the base pointer is also the name of
+// the array in the top level function of the dynamic trace. THIS IS VERY
+// IMPORTANT - if the argument passed to a top level function has been renamed in
+// the function, then this WILL NOT WORK!
+//
+// MAP_ARRAY(myReqCode, myArray, mySize)
+//    ===>   MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize)
+#define MAP_ARRAY(req_code, name_and_base_addr, size)                          \
+    MAP_ARRAY_TO_ACCEL(                                                        \
+            req_code, STRING(name_and_base_addr), name_and_base_addr, size)
+
+// Use these convenience macros to cast a raw pointer into a multidimensional
+// variable-length array, which lets us use [] notation inside of the ugly
+// sub2ind syntax!
+//
+// Usage:
+//   If we have an array like array[5][4]:
+//      ARRAY_2D(TYPE, output_name, array, 4);
+//
+//   If we have an array like array[5][4][3]:
+//      ARRAY_3D(TYPE, output_name, array, 4, 3);
+//
+//   If we have an array like array[5][4][3][2]
+//      ARRAY_4D(TYPE, output_name, array, 4, 3, 2);
+//
+//   And so on...
+#define ARRAY_1D(TYPE, output_array_name, input_array_name)                    \
+    TYPE* output_array_name = (TYPE*)input_array_name
+
+#define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1)             \
+    TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
+
+#define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2)      \
+    TYPE(*output_array_name)[DIM_1][DIM_2] =                                   \
+        (TYPE(*)[DIM_1][DIM_2])input_array_name
+
+#define ARRAY_4D(                                                              \
+    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3)            \
+        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] =                        \
+            (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
+
+#define ARRAY_5D(                                                              \
+    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4)     \
+        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] =                 \
+            (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
+
+#endif
diff --git a/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/main.c b/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/main.c
new file mode 100644
index 0000000000..ef4e620be0
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/main.c
@@ -0,0 +1,135 @@
+
+#include <stdlib.h>
+//#include "utility.h"
+#include "visc.h"
+#include "defs.h"
+
+
+typedef struct __attribute__((__packed__)) {
+    float* input; size_t bytes_input;
+    float* result; size_t bytes_result;
+} 
+RootIn;
+
+
+
+void scale_values(float* input, size_t num_elems) {
+
+  __visc__hint(DEVICE);
+  __visc__attributes(1, input, 1, input);
+  
+  for (int ind = 0; ind < num_elems; ind++){
+    input[ind] = input[ind] * 2.0;
+  }
+  
+  __visc__return(1, num_elems);
+}
+
+
+
+
+void graphRoot(/*0*/ float* input, /*1*/ size_t bytes_input, 
+               /*2*/ float* result, /*3*/ size_t bytes_result) {
+
+  //Specifies compilation target for current node
+  __visc__hint(CPU_TARGET);
+
+  __visc__attributes(2, input, result, 2, input, result);
+
+  // Create an 0D (specified by 1st argument) HPVM node - so a single node
+  // associated with node function ---_fxp_wrapper
+
+  void* scaleNode = __visc__createNodeND(0, scale_values);
+    
+  // BindIn binds inputs of current node with specified node
+  // - destination node
+  // - argument position in argument list of function of source node
+  // - argument position in argument list of function of destination node
+  // - streaming (1) or non-streaming (0)
+
+  // Edge transfers data between nodes within the same level of hierarchy.
+  // - source and destination dataflow nodes
+  // - edge type, all-all (1) or one-one(0)
+  // - source position (in output struct of source node)
+  // - destination position (in argument list of destination node)
+  // - streaming (1) or non-streaming (0)
+
+  // scale_fxp inputs
+  __visc__bindIn(scaleNode, 0, 0, 0); // input -> ScNode:input
+  __visc__bindIn(scaleNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
+    
+  // Similar to bindIn, but for the output. Output of a node is a struct, and
+  // we consider the fields in increasing ordering.
+  __visc__bindOut(scaleNode, 0, 0, 0);
+    
+}
+
+
+
+
+
+int main(int argc, char* argv[]) {
+
+    size_t input_size = 100;
+    size_t result_size = 100;
+
+    size_t input_bytes = input_size * sizeof(float);
+    size_t result_bytes = result_size * sizeof(float);
+
+    // This is host_input in cam_pipe()
+    float* input = (float*) malloc(input_bytes);
+    for(unsigned int i = 0; i < input_size; i++){
+      input[i] = 1.0;
+    }
+    // This is host_result in cam_pipe()
+    float* result = (float*) malloc(result_bytes);
+
+   
+    __visc__init();
+
+    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
+
+    // Set up HPVM DFG inputs in the rootArgs struct.
+    rootArgs->input = input;
+    rootArgs->bytes_input = input_size;
+
+    printf("input = %d input_bytes = %d \n", input, input_bytes);
+    
+    rootArgs->result = result;
+    rootArgs->bytes_result = result_size;
+
+  
+    llvm_visc_track_mem(input, input_bytes);
+    llvm_visc_track_mem(result, result_bytes);
+
+
+    void* testDFG = __visc__launch(0, graphRoot, (void*) rootArgs);
+    __visc__wait(testDFG);
+
+
+    printf("input = %d \n", input);
+    
+    llvm_visc_request_mem(input, input_bytes);
+    //llvm_visc_request_mem(result, result_bytes);
+
+    printf("requested mem \n");
+
+    for(unsigned int i = 0; i < input_size; i++){
+      printf("input[%d] = %f \n", i, input[i]);
+    }
+
+    //llvm_visc_untrack_mem(input);
+    //llvm_visc_untrack_mem(result);
+
+    printf ("untracked mem \n");
+    
+    __visc__cleanup();
+
+    printf ("cleaned up visc");
+
+    return 0;
+}
+
+
+
+    
diff --git a/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/visc.h b/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/visc.h
new file mode 100644
index 0000000000..a263e35252
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/saxpy_test/src/visc.h
@@ -0,0 +1,110 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef DEVICE
+#define DEVICE GPU_TARGET
+#endif
+
+#include "llvm/SupportVISC/VISCHint.h"
+
+#ifdef __cplusplus
+extern "C" {
+void __visc__hint(visc::Target);
+//void __visc__wait(void*);
+#else
+void __visc__hint(enum Target);
+//void __visc__wait(unsigned);
+#endif
+
+#ifdef __cplusplus
+//void* __visc__node(...);
+//void* __visc__createNode(...);
+//void* __visc__createNode1D(...);
+//void* __visc__createNode2D(...);
+//void* __visc__createNode3D(...);
+//void __visc__return(...);
+#endif
+
+void* __visc__createNodeND(unsigned,...);
+void __visc__return(unsigned, ...);
+
+void __visc__attributes(unsigned, ...);
+void __visc__init();
+void __visc__cleanup();
+
+void __visc__bindIn(void*, unsigned, unsigned, unsigned);
+void __visc__bindOut(void*, unsigned, unsigned, unsigned);
+void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned);
+void __visc__push(void*, void*);
+void* __visc__pop(void*);
+void* __visc__launch(unsigned, ...);
+void __visc__wait(void*);
+
+void* __visc__getNode();
+void* __visc__getParentNode(void*);
+void __visc__barrier();
+void* __visc__malloc(long);
+long __visc__getNodeInstanceID_x(void*);
+long __visc__getNodeInstanceID_y(void*);
+long __visc__getNodeInstanceID_z(void*);
+long __visc__getNumNodeInstances_x(void*);
+long __visc__getNumNodeInstances_y(void*);
+long __visc__getNumNodeInstances_z(void*);
+
+// Atomic
+// signed int
+int __visc__atomic_cmpxchg(int*, int, int);
+int __visc__atomic_add(int*, int);
+int __visc__atomic_sub(int*, int);
+int __visc__atomic_xchg(int*, int);
+int __visc__atomic_inc(int*);
+int __visc__atomic_dec(int*);
+int __visc__atomic_min(int*, int);
+int __visc__atomic_max(int*, int);
+int __visc__atomic_umax(int*, int);
+int __visc__atomic_umin(int*, int);
+int __visc__atomic_and(int*, int);
+int __visc__atomic_or(int*, int);
+int __visc__atomic_xor(int*, int);
+
+// Special Func
+float __visc__floor(float);
+float __visc__rsqrt(float);
+float __visc__sqrt(float);
+float __visc__sin(float);
+float __visc__cos(float);
+// unsigned int
+//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
+//unsigned __visc__atomic_add(unsigned*, unsigned);
+//unsigned __visc__atomic_sub(unsigned*, unsigned);
+//unsigned __visc__atomic_xchg(unsigned*, unsigned);
+//unsigned __visc__atomic_inc(unsigned*);
+//unsigned __visc__atomic_dec(unsigned*);
+//unsigned __visc__atomic_min(unsigned*, unsigned);
+//unsigned __visc__atomic_max(unsigned*, unsigned);
+//unsigned __visc__atomic_and(unsigned*, unsigned);
+//unsigned __visc__atomic_or(unsigned*, unsigned);
+//unsigned __visc__atomic_xor(unsigned*, unsigned);
+
+
+#include <unistd.h>
+
+long get_global_id(int);
+long get_group_id(int);
+long get_local_id(int);
+long get_local_size(int);
+
+
+void llvm_visc_track_mem(void*, size_t);
+void llvm_visc_untrack_mem(void*);
+void llvm_visc_request_mem(void*, size_t);
+
+#ifdef __cplusplus
+}
+#endif
+
-- 
GitLab