From c7c9bcea1c004d089fc3a10eeafa249549fd2c99 Mon Sep 17 00:00:00 2001 From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu> Date: Tue, 26 Nov 2019 16:07:59 -0600 Subject: [PATCH] CPU version of visc-rt - DSOC --- llvm/projects/visc-cpu-rt/CMakeLists.txt | 43 + .../deviceStatusSwitchIntervals.txt | 2 + .../projects/visc-cpu-rt/device_abstraction.h | 82 + llvm/projects/visc-cpu-rt/makefile | 31 + llvm/projects/visc-cpu-rt/policy.h | 103 ++ llvm/projects/visc-cpu-rt/visc-rt.cpp | 1648 +++++++++++++++++ llvm/projects/visc-cpu-rt/visc-rt.h | 302 +++ 7 files changed, 2211 insertions(+) create mode 100644 llvm/projects/visc-cpu-rt/CMakeLists.txt create mode 100644 llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt create mode 100644 llvm/projects/visc-cpu-rt/device_abstraction.h create mode 100644 llvm/projects/visc-cpu-rt/makefile create mode 100644 llvm/projects/visc-cpu-rt/policy.h create mode 100644 llvm/projects/visc-cpu-rt/visc-rt.cpp create mode 100644 llvm/projects/visc-cpu-rt/visc-rt.h diff --git a/llvm/projects/visc-cpu-rt/CMakeLists.txt b/llvm/projects/visc-cpu-rt/CMakeLists.txt new file mode 100644 index 0000000000..4887abbaf9 --- /dev/null +++ b/llvm/projects/visc-cpu-rt/CMakeLists.txt @@ -0,0 +1,43 @@ +add_custom_target(visc-cpu-rt ALL) +add_custom_command( + TARGET visc-cpu-rt PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/deviceStatusSwitchIntervals.txt + ${CMAKE_CURRENT_BINARY_DIR}/deviceStatusSwitchIntervals.txt + DEPENDS deviceStatusSwitchIntervals.txt + COMMENT "Copying deviceStatusSwitchIntervals.txt") +add_custom_command( + TARGET visc-cpu-rt PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/device_abstraction.h + ${CMAKE_CURRENT_BINARY_DIR}/device_abstraction.h + DEPENDS device_abstraction.h + COMMENT "Copying device_abstraction.h") +add_custom_command( + TARGET visc-cpu-rt PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/policy.h + ${CMAKE_CURRENT_BINARY_DIR}/policy.h + DEPENDS policy.h + COMMENT "Copying policy.h") +add_custom_command( + TARGET visc-cpu-rt PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.h + ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.h + DEPENDS visc-rt.h + COMMENT "Copying visc-rt.h") +add_custom_command( + TARGET visc-cpu-rt PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.cpp + ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.cpp + DEPENDS visc-rt.cpp + COMMENT "Copying visc-rt.cpp") +add_custom_command( + TARGET visc-cpu-rt PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/makefile + ${CMAKE_CURRENT_BINARY_DIR}/makefile + DEPENDS makefile + COMMENT "Copying makefile") diff --git a/llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt b/llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt new file mode 100644 index 0000000000..7069470a1a --- /dev/null +++ b/llvm/projects/visc-cpu-rt/deviceStatusSwitchIntervals.txt @@ -0,0 +1,2 @@ +10 +10 15 10 16 15 30 15 25 20 15 diff --git a/llvm/projects/visc-cpu-rt/device_abstraction.h b/llvm/projects/visc-cpu-rt/device_abstraction.h new file mode 100644 index 0000000000..68748c7ab7 --- /dev/null +++ b/llvm/projects/visc-cpu-rt/device_abstraction.h @@ -0,0 +1,82 @@ +#ifndef __DEVICE_ABSTRACTION__ +#define __DEVICE_ABSTRACTION__ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <time.h> +#include <thread> +#include <vector> +#include <iostream> +#include <fstream> + +#define MIN_INTERVAL 2 +#define MAX_INTERVAL 8 +#define NUM_INTERVALS 10 + +// Device status variable: true if the device is available for use +volatile bool deviceStatus = true; +// Intervals at which to change the device status +std::vector<unsigned> Intervals; + +// Set to true when program execution ends and so we can end the device +// simulation +volatile bool executionEnd = false; + + +void initializeDeviceStatusIntervals() { + + unsigned sz = 0; + unsigned tmp = 0; + + const char *fn = + "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/deviceStatusSwitchIntervals.txt"; + std::ifstream infile; + infile.open(fn); + if (!infile.is_open()) { + std::cout << "Failed to open " << fn << " for reading\n"; + return; + } + infile >> sz; + + if (sz) { + // We have data. Read them into the vector + for (unsigned i = 0; i < sz; i++) { + infile >> tmp; + Intervals.push_back(tmp); + } + infile.close(); + } else { + // We have no data. Create random data and write them into the file + infile.close(); + std::ofstream outfile; + outfile.open(fn); + if (!outfile.is_open()) { + std::cout << "Failed to open " << fn << " for writing\n"; + return; + } + sz = 1 + rand()%NUM_INTERVALS; + outfile << sz; + for (unsigned i = 0; i < sz; i++) { + Intervals.push_back(MIN_INTERVAL + rand()%(MAX_INTERVAL - MIN_INTERVAL)); + outfile << Intervals[i]; + } + outfile.close(); + } + + return; +} + +void updateDeviceStatus() { + + unsigned i = 0; + while (!executionEnd) { + std::this_thread::sleep_for (std::chrono::seconds(Intervals[i])); + deviceStatus = !deviceStatus; + std::cout << "Changed device status to " << deviceStatus << "\n"; + i = (i+1) % Intervals.size(); + } + +} + +#endif // __DEVICE_ABSTRACTION__ diff --git a/llvm/projects/visc-cpu-rt/makefile b/llvm/projects/visc-cpu-rt/makefile new file mode 100644 index 0000000000..a1fd6e1e01 --- /dev/null +++ b/llvm/projects/visc-cpu-rt/makefile @@ -0,0 +1,31 @@ +LLVM_SRC_ROOT = ../../../llvm +LLVM_BUILD_ROOT = ../.. + +OPENCL_INC_PATH = /opt/intel/opencl-sdk/include + +ifeq ($(NUM_CORES),) + NUM_CORES=8 +endif + +CPP_FLAGS = -I $(LLVM_SRC_ROOT)/include -I $(LLVM_BUILD_ROOT)/include -I $(OPENCL_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS +TARGET:=visc-rt + +#LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang +#LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++ + +LLVM_CC=clang-4.0 +LLVM_CXX=clang++-4.0 + +OPTS = + +ifeq ($(DEBUG),1) + OPTS+=-DDEBUG_BUILD +endif + +all: $(TARGET:%=%.ll) + +$(TARGET:%=%.ll):%.ll:%.cpp %.h + $(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@ + +clean : + rm -f $(TARGET).ll diff --git a/llvm/projects/visc-cpu-rt/policy.h b/llvm/projects/visc-cpu-rt/policy.h new file mode 100644 index 0000000000..436ad39295 --- /dev/null +++ b/llvm/projects/visc-cpu-rt/policy.h @@ -0,0 +1,103 @@ +#ifndef __POLICY__ +#define __POLICY__ + +#include <string> +#include "device_abstraction.h" + + /************************* Policies *************************************/ +class Policy { + public: + virtual int getVersion(const char *, int64_t) = 0; + virtual ~Policy() {}; +}; + +class NodePolicy : public Policy { + virtual int getVersion(const char *name, int64_t it) override { + std::string s(name); + //std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" }; + std::string NodeNames[] = { + "WrapperGaussianSmoothing_cloned", + "WrapperlaplacianEstimate_cloned", + "WrapperComputeZeroCrossings_cloned", + "WrapperComputeGradient_cloned", + "WrapperComputeMaxGradient_cloned", + "WrapperRejectZeroCrossings_cloned", + }; + //if (!s.compare(NodeNames[4])) { + // std::cout << s << ": CPU" << "\n"; + // return 0; + //} + return 2; + } +}; + +class IterationPolicy : public Policy { + virtual int getVersion(const char *name, int64_t it) override { + if ((it % 10 == 0) || (it % 10 == 1)) + return 0; + else + return 2; + } +}; + +class DeviceStatusPolicy : public Policy { + virtual int getVersion(const char *name, int64_t it) override { + if (deviceStatus) { + //std::cout << "Returning GPU\n"; + return 2; + } + else { + //std::cout << "Returning CPU\n"; + return 0; + } + } +}; + +/* ------------------------------------------------------------------------- */ +// Added for the CFAR interactive policy demo. + +class InteractivePolicy : public Policy { +private: + // 0 :for CPU, 1 for GPU, 2 for Vector + unsigned int userTargetDeviceChoice; + // Used to end thread execution + bool end; + // Thread that will update userTargetDeviceChoice + + //std::thread userTargetDeviceChoiceThread; + + // Thread function + void updateUserTargetChoice() { + while (!end) { + std::cout << "Select target device (0 for CPU, 1 fpr GPU): "; + + std::cin >> userTargetDeviceChoice; + + if (userTargetDeviceChoice > 1) { + std::cout << "Invalid target device. Selecting GPU instead.\n"; + userTargetDeviceChoice = 1; + } + + } + } + +public: + // Inherited method, erquired for every policy object + virtual int getVersion(const char *name, int64_t it) { + return userTargetDeviceChoice; + } + + InteractivePolicy() { + userTargetDeviceChoice = 1; + end = false; + //userTargetDeviceChoiceThread = + // std::thread(&InteractivePolicy::updateUserTargetChoice, this); + } + + ~InteractivePolicy() { + end = true; + //userTargetDeviceChoiceThread.join(); + } +}; + +#endif // __POLICY__ diff --git a/llvm/projects/visc-cpu-rt/visc-rt.cpp b/llvm/projects/visc-cpu-rt/visc-rt.cpp new file mode 100644 index 0000000000..b4e9405efb --- /dev/null +++ b/llvm/projects/visc-cpu-rt/visc-rt.cpp @@ -0,0 +1,1648 @@ + +#include <iostream> +#include <string> +#include <pthread.h> +#include <cstdlib> +#include <cstdio> +#include <cstring> +#include <cassert> +#include <map> +#include <CL/cl.h> + +#include <unistd.h> + +#if _POSIX_VERSION >= 200112L +# include <sys/time.h> +#endif +#include "visc-rt.h" + +#ifndef DEBUG_BUILD +#define DEBUG(s) {} +#else +#define DEBUG(s) s +#endif + +#define BILLION 1000000000LL + +using namespace std; + +typedef struct { + pthread_t threadID; + std::vector<pthread_t>* threads; + // Map from InputPort to Size + std::map<unsigned, uint64_t>* ArgInPortSizeMap; + //std::vector<uint64_t>* BindInSizes; + std::vector<unsigned>* BindInSourcePort; + std::vector<uint64_t>* BindOutSizes; + std::vector<uint64_t>* EdgeSizes; + std::vector<CircularBuffer<uint64_t>*>* BindInputBuffers; + std::vector<CircularBuffer<uint64_t>*>* BindOutputBuffers; + std::vector<CircularBuffer<uint64_t>*>* EdgeBuffers; + std::vector<CircularBuffer<uint64_t>*>* isLastInputBuffers; +} DFNodeContext_X86; + +typedef struct { + cl_context clOCLContext; + cl_command_queue clCommandQue; + cl_program clProgram; + cl_kernel clKernel; +} DFNodeContext_OCL; + +cl_context globalOCLContext; +cl_device_id* clDevices; +cl_command_queue globalCommandQue; + +Policy *policy = NULL; +MemTracker MTracker; +vector<DFGDepth> DStack; +// Mutex to prevent concurrent access by multiple thereads in pipeline +pthread_mutex_t ocl_mtx; + +#define NUM_TESTS 1 +visc_TimerSet kernel_timer; + +static inline void checkErr(cl_int err, cl_int success, const char * name) { + if (err != success) { + cout << "ERROR: " << name << flush << "\n"; + cout << "ErrorCode: " << err << flush << "\n"; + exit(EXIT_FAILURE); + } +} + +/************************* Policies *************************************/ +void llvm_visc_policy_init() { + cout << "Initializing policy object ...\n"; +// policy = new NodePolicy(); +// policy = new IterationPolicy(); +// policy = new DeviceStatusPolicy(); + policy = new InteractivePolicy(); + cout << "DONE: Initializing policy object.\n"; +} + +void llvm_visc_policy_clear() { + if (policy) free(policy); +} + +int llvm_visc_policy_getVersion(const char *name, int64_t i) { + return policy->getVersion(name, i); +} + +/******************** Device Abstraction ********************************/ +//std::thread deviceStatusThread; +/* +void llvm_visc_deviceAbstraction_start() { + cout << "Starting device status simulation ...\n"; + // Initialize vector with points where ti switch device status + initializeDeviceStatusIntervals(); + // Create a thread that performs the changes + deviceStatusThread = std::thread(updateDeviceStatus); + cout << "Started device status simulation thread ...\n"; + return; +} +*/ + +void llvm_visc_deviceAbstraction_end() { + cout << "Ending device status simulation thread ...\n"; + // Set the variable that allows the thread to know that execution has ended + executionEnd = true; + // Wait for the thread that manages device status to terminate + //deviceStatusThread.join(); + cout << "Ended device status simulation.\n"; + return; +} + +void llvm_visc_deviceAbstraction_waitOnDeviceStatus() { + while (!deviceStatus) { }; + return; +} + +/************************* Depth Stack Routines ***************************/ + +void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY, + uint64_t iY, uint64_t limitZ, uint64_t iZ) { + //DEBUG(cout << "Pushing node information on stack:\n"); + //DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY << ", "<< limitZ <<")\n"); + //DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", "<< iZ <<")\n"); + //DFGDepth nodeInfo (n, limitX, iX, limitY, iY, limitZ, iZ); + //DStack.push_back(nodeInfo); + //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); +} + +void llvm_visc_x86_dstack_pop() { + //DEBUG(cout << "Popping from depth stack\n"); + //DStack.pop_back(); + //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); +} + +uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) { + //DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n"); + //unsigned size = DStack.size(); + //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n"); + //return DStack[size-level-1].getDimLimit(dim); + return 0; +} + +uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) { + //DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n"); + //unsigned size = DStack.size(); + //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n"); + //return DStack[size-level-1].getDimInstance(dim); + return 0; +} + +/********************** Memory Tracking Routines **************************/ + +void llvm_visc_track_mem(void* ptr, size_t size) { + DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n"); + MemTrackerEntry* MTE = MTracker.lookup(ptr); + if(MTE != NULL) { + DEBUG(cout << "ID " << ptr << " already present in the MemTracker Table\n"); + return; + } + DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n"); + MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr); + DEBUG(MTracker.print()); +} + +void llvm_visc_untrack_mem(void* ptr) { + DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n"); + MemTrackerEntry* MTE = MTracker.lookup(ptr); + if(MTE == NULL) { + cout << "WARNING: Trying to remove ID " << ptr << " not present in the MemTracker Table\n"; + return; + } + DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n"); + if(MTE->getLocation() == MemTrackerEntry::DEVICE) + clReleaseMemObject((cl_mem) MTE->getAddress()); + MTracker.remove(ptr); + DEBUG(MTracker.print()); +} + + +static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clOCLContext << flush << "\n"); + MemTrackerEntry* MTE = MTracker.lookup(ptr); + if (MTE == NULL) { + MTracker.print(); + cout << "ERROR: Requesting memory not present in Table\n"; + exit(EXIT_FAILURE); + } + // If already on device + if (MTE->getLocation() == MemTrackerEntry::DEVICE && + ((DFNodeContext_OCL*)MTE->getContext())->clOCLContext == Context->clOCLContext) { + DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); + pthread_mutex_unlock(&ocl_mtx); + return MTE->getAddress(); + } + + DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); + DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n"); + // Else copy and update the latest copy + cl_mem_flags clFlags; + cl_int errcode; + + if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE; + else if(isInput) clFlags = CL_MEM_READ_ONLY; + else if(isOutput) clFlags = CL_MEM_WRITE_ONLY; + else clFlags = CL_MEM_READ_ONLY; + + visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); + //pthread_mutex_lock(&ocl_mtx); + cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); + DEBUG(cout<< "\nMemory allocated on device: " << d_input << flush << "\n"); + if(isInput) { + DEBUG(cout << "\tCopying ..."); + //pthread_mutex_lock(&ocl_mtx); + errcode = clEnqueueWriteBuffer(Context->clCommandQue, + d_input, + CL_TRUE, + 0, + size,MTE->getAddress(), + 0,NULL,NULL); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device"); + } + + visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); + DEBUG(cout << " done\n"); + MTE->update(MemTrackerEntry::DEVICE, (void*) d_input, Context); + DEBUG(cout << "Updated Table\n"); + DEBUG(MTracker.print()); + pthread_mutex_unlock(&ocl_mtx); + return d_input; +} + +void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) { + return llvm_visc_request_mem(ptr, size); +} + +void* llvm_visc_request_mem(void* ptr, size_t size) { + // Ignore objects whose size is 0 - no memory is requested. + if (size == 0) { + DEBUG(cout << "[X86] Request memory (ignored): " << ptr << flush << "\n"); + return ptr; + } + + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n"); + MemTrackerEntry* MTE = MTracker.lookup(ptr); + if(MTE == NULL) { + cout << "ERROR: Requesting memory not present in Table\n"; + pthread_mutex_unlock(&ocl_mtx); + exit(EXIT_FAILURE); + } + // If already on host + if(MTE->getLocation() == MemTrackerEntry::HOST) { + DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); + pthread_mutex_unlock(&ocl_mtx); + return MTE->getAddress(); + } + + // Else copy from device and update table + DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); + DEBUG(cout << "\tCopying ..."); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); + //pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue, + (cl_mem) MTE->getAddress(), + CL_TRUE, + 0, + size, + ptr, + 0, NULL, NULL); + //pthread_mutex_unlock(&ocl_mtx); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); + DEBUG(cout << " done\n"); + checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output"); + DEBUG(cout << "Free mem object on device\n"); + clReleaseMemObject((cl_mem) MTE->getAddress()); + DEBUG(cout << "Updated Table\n"); + MTE->update(MemTrackerEntry::HOST, ptr); + DEBUG(MTracker.print()); + pthread_mutex_unlock(&ocl_mtx); + return ptr; +} + +/*************************** Timer Routines **********************************/ + +static int is_async(enum visc_TimerID timer) +{ + return (timer == visc_TimerID_KERNEL) || + (timer == visc_TimerID_COPY_ASYNC); +} + +static int is_blocking(enum visc_TimerID timer) +{ + return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE); +} + +#define INVALID_TIMERID visc_TimerID_LAST + +static int asyncs_outstanding(struct visc_TimerSet* timers) +{ + return (timers->async_markers != NULL) && + (timers->async_markers->timerID != INVALID_TIMERID); +} + +static struct visc_async_time_marker_list * +get_last_async(struct visc_TimerSet* timers) +{ + /* Find the last event recorded thus far */ + struct visc_async_time_marker_list * last_event = timers->async_markers; + if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { + while(last_event->next != NULL && + last_event->next->timerID != INVALID_TIMERID) + last_event = last_event->next; + return last_event; + } else + return NULL; +} + +static void insert_marker(struct visc_TimerSet* tset, enum visc_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct visc_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct visc_async_time_marker_list *) + malloc(sizeof(struct visc_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* + // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating Marker [%d]\n", timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = NULL; + (*new_event)->timerID = timer; + //pthread_mutex_lock(&ocl_mtx); + ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); + //pthread_mutex_unlock(&ocl_mtx); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + +static void insert_submarker(struct visc_TimerSet* tset, char *label, enum visc_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct visc_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct visc_async_time_marker_list *) + malloc(sizeof(struct visc_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = label; + (*new_event)->timerID = timer; + //pthread_mutex_lock(&ocl_mtx); + ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); + //pthread_mutex_unlock(&ocl_mtx); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + + +/* Assumes that all recorded events have completed */ +static visc_Timestamp record_async_times(struct visc_TimerSet* tset) +{ + struct visc_async_time_marker_list * next_interval = NULL; + struct visc_async_time_marker_list * last_marker = get_last_async(tset); + visc_Timestamp total_async_time = 0; + + for(next_interval = tset->async_markers; next_interval != last_marker; + next_interval = next_interval->next) { + cl_ulong command_start=0, command_end=0; + cl_int ciErrNum = CL_SUCCESS; + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); + } + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum); + } + + visc_Timestamp interval = (visc_Timestamp) (((double)(command_end - command_start))); + tset->timers[next_interval->timerID].elapsed += interval; + if (next_interval->label != NULL) { + struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; + while (subtimer != NULL) { + if ( strcmp(subtimer->label, next_interval->label) == 0) { + subtimer->timer.elapsed += interval; + break; + } + subtimer = subtimer->next; + } + } + total_async_time += interval; + next_interval->timerID = INVALID_TIMERID; + } + + if(next_interval != NULL) + next_interval->timerID = INVALID_TIMERID; + + return total_async_time; +} + +static void +accumulate_time(visc_Timestamp *accum, + visc_Timestamp start, + visc_Timestamp end) +{ +#if _POSIX_VERSION >= 200112L + *accum += end - start; +#else +# error "Timestamps not implemented for this system" +#endif +} + +#if _POSIX_VERSION >= 200112L +static visc_Timestamp get_time() +{ + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + return (visc_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec); +} +#else +# error "no supported time libraries are available on this platform" +#endif + +void +visc_ResetTimer(struct visc_Timer *timer) +{ + timer->state = visc_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + timer->elapsed = 0; +#else +# error "visc_ResetTimer: not implemented for this system" +#endif +} + +void +visc_StartTimer(struct visc_Timer *timer) +{ + if (timer->state != visc_Timer_STOPPED) { + // FIXME: Removing warning statement to avoid printing this error + // fputs("Ignoring attempt to start a running timer\n", stderr); + return; + } + + timer->state = visc_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + timer->init = tv.tv_sec * BILLION + tv.tv_nsec; + } +#else +# error "visc_StartTimer: not implemented for this system" +#endif +} + +void +visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) +{ + + unsigned int numNotStopped = 0x3; // 11 + if (timer->state != visc_Timer_STOPPED) { + fputs("Warning: Timer was not stopped\n", stderr); + numNotStopped &= 0x1; // Zero out 2^1 + } + if (subtimer->state != visc_Timer_STOPPED) { + fputs("Warning: Subtimer was not stopped\n", stderr); + numNotStopped &= 0x2; // Zero out 2^0 + } + if (numNotStopped == 0x0) { + //fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + return; + } + + timer->state = visc_Timer_RUNNING; + subtimer->state = visc_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + + if (numNotStopped & 0x2) { + timer->init = tv.tv_sec * BILLION + tv.tv_nsec; + } + + if (numNotStopped & 0x1) { + subtimer->init = tv.tv_sec * BILLION + tv.tv_nsec; + } + } +#else +# error "visc_StartTimer: not implemented for this system" +#endif + +} + +void +visc_StopTimer(struct visc_Timer *timer) +{ + visc_Timestamp fini; + + if (timer->state != visc_Timer_RUNNING) { + //fputs("Ignoring attempt to stop a stopped timer\n", stderr); + return; + } + + timer->state = visc_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + fini = tv.tv_sec * BILLION + tv.tv_nsec; + } +#else +# error "visc_StopTimer: not implemented for this system" +#endif + + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; +} + +void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) { + + visc_Timestamp fini; + + unsigned int numNotRunning = 0x3; // 11 + if (timer->state != visc_Timer_RUNNING) { + fputs("Warning: Timer was not running\n", stderr); + numNotRunning &= 0x1; // Zero out 2^1 + } + if (subtimer->state != visc_Timer_RUNNING) { + fputs("Warning: Subtimer was not running\n", stderr); + numNotRunning &= 0x2; // Zero out 2^0 + } + if (numNotRunning == 0x0) { + //fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + return; + } + + + timer->state = visc_Timer_STOPPED; + subtimer->state = visc_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + fini = tv.tv_sec * BILLION + tv.tv_nsec; + } +#else +# error "visc_StopTimer: not implemented for this system" +#endif + + if (numNotRunning & 0x2) { + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + } + + if (numNotRunning & 0x1) { + accumulate_time(&subtimer->elapsed, subtimer->init, fini); + subtimer->init = fini; + } + +} + +/* Get the elapsed time in seconds. */ +double +visc_GetElapsedTime(struct visc_Timer *timer) +{ + double ret; + + if (timer->state != visc_Timer_STOPPED) { + fputs("Elapsed time from a running timer is inaccurate\n", stderr); + } + +#if _POSIX_VERSION >= 200112L + ret = timer->elapsed / 1e9; +#else +# error "visc_GetElapsedTime: not implemented for this system" +#endif + return ret; +} + +void +visc_InitializeTimerSet(struct visc_TimerSet *timers) +{ + int n; + + timers->wall_begin = get_time(); + timers->current = visc_TimerID_NONE; + + timers->async_markers = NULL; + + for (n = 0; n < visc_TimerID_LAST; n++) { + visc_ResetTimer(&timers->timers[n]); + timers->sub_timer_list[n] = NULL; + } +} + + +void +visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category) { + + struct visc_SubTimer *subtimer = (struct visc_SubTimer *) malloc + (sizeof(struct visc_SubTimer)); + + int len = strlen(label); + + subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + sprintf(subtimer->label, "%s", label); + + visc_ResetTimer(&subtimer->timer); + subtimer->next = NULL; + + struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[visc_Category]; + if (subtimerlist == NULL) { + subtimerlist = (struct visc_SubTimerList *) calloc + (1, sizeof(struct visc_SubTimerList)); + subtimerlist->subtimer_list = subtimer; + timers->sub_timer_list[visc_Category] = subtimerlist; + } else { + // Append to list + struct visc_SubTimer *element = subtimerlist->subtimer_list; + while (element->next != NULL) { + element = element->next; + } + element->next = subtimer; + } + +} + +void +visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) +{ + //cerr << "Switch to timer: " << timer << flush << "\n"; + /* Stop the currently running timer */ + if (timers->current != visc_TimerID_NONE) { + struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (!is_async(timers->current) ) { + if (timers->current != timer) { + if (currSubTimer != NULL) { + visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + } else { + visc_StopTimer(&timers->timers[timers->current]); + } + } else { + if (currSubTimer != NULL) { + visc_StopTimer(&currSubTimer->timer); + } + } + } else { + insert_marker(timers, timer); + if (!is_async(timer)) { // if switching to async too, keep driver going + visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); + } + } + } + + visc_Timestamp currentTime = get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(timer) ) ) { + + struct visc_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stdout, "Error Querying EventInfo1!\n"); + } + + + if(is_blocking(timer)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + if(async_done != CL_COMPLETE) { + accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + } + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + + visc_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + if(async_done == CL_COMPLETE) { + //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time); + timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; + } + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE ) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + } + + /* Start the new timer */ + if (timer != visc_TimerID_NONE) { + if(!is_async(timer)) { + visc_StartTimer(&timers->timers[timer]); + } else { + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + + insert_marker(timers, timer); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct visc_async_time_marker_list * last_event = get_last_async(timers); + last_event->label = NULL; + last_event->timerID = timer; + } + if (!is_async(timers->current)) { + visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); + } + } + } + timers->current = timer; + +} + +void +visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category) +{ + struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct visc_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (timers->current != visc_TimerID_NONE) { + if (!is_async(timers->current) ) { + if (timers->current != category) { + if (curr != NULL) { + visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); + } else { + visc_StopTimer(&timers->timers[timers->current]); + } + } else { + if (curr != NULL) { + visc_StopTimer(&curr->timer); + } + } + } else { + insert_submarker(timers, label, category); + if (!is_async(category)) { // if switching to async too, keep driver going + visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); + } + } + } + + visc_Timestamp currentTime = get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(category) ) ) { + + struct visc_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stdout, "Error Querying EventInfo2!\n"); + } + + if(is_blocking(category)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP + // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization + // it seems that the extra sync wall time isn't being recorded anywhere + if(async_done != CL_COMPLETE) + accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + visc_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + // If it did finish, then accumulate all the async time that did happen into OVERLAP + // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. + if(async_done == CL_COMPLETE /*cudaSuccess*/) + timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE /*cudaSuccess*/) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + // else, this isn't blocking, so just check the next time around + } + + subtimerlist = timers->sub_timer_list[category]; + struct visc_SubTimer *subtimer = NULL; + + if (label != NULL) { + subtimer = subtimerlist->subtimer_list; + while (subtimer != NULL) { + if (strcmp(subtimer->label, label) == 0) { + break; + } else { + subtimer = subtimer->next; + } + } + } + + /* Start the new timer */ + if (category != visc_TimerID_NONE) { + if(!is_async(category)) { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + if (category != timers->current && subtimer != NULL) { + visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + } else if (subtimer != NULL) { + visc_StartTimer(&subtimer->timer); + } else { + visc_StartTimer(&timers->timers[category]); + } + } else { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + insert_submarker(timers, label, category); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct visc_async_time_marker_list * last_event = get_last_async(timers); + last_event->timerID = category; + last_event->label = label; + } // else, marker for switchToThis was already inserted + + //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running + if (!is_async(timers->current)) { + visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); + } + } + } + + timers->current = category; +} + +void +visc_PrintTimerSet(struct visc_TimerSet *timers) +{ + visc_Timestamp wall_end = get_time(); + + struct visc_Timer *t = timers->timers; + struct visc_SubTimer* sub = NULL; + + int maxSubLength; + + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap", + "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", + "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc", + "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack" + + }; + + const int maxCategoryLength = 20; + + int i; + for(i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format + if(visc_GetElapsedTime(&t[i]) != 0 || true) { + + // Print Category Timer + printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], visc_GetElapsedTime(&t[i])); + + if (timers->sub_timer_list[i] != NULL) { + sub = timers->sub_timer_list[i]->subtimer_list; + maxSubLength = 0; + while (sub != NULL) { + // Find longest SubTimer label + if (strlen(sub->label) > (unsigned long) maxSubLength) { + maxSubLength = strlen(sub->label); + } + sub = sub->next; + } + + // Fit to Categories + if (maxSubLength <= maxCategoryLength) { + maxSubLength = maxCategoryLength; + } + + sub = timers->sub_timer_list[i]->subtimer_list; + + // Print SubTimers + while (sub != NULL) { + printf(" -%-*s: %.9f\n", maxSubLength, sub->label, visc_GetElapsedTime(&sub->timer)); + sub = sub->next; + } + } + } + } + + if(visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %.9f\n", visc_GetElapsedTime(&t[visc_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin)/ 1e9; + printf("Timer Wall Time: %.9f\n", walltime); + +} + +void visc_DestroyTimerSet(struct visc_TimerSet * timers) +{ + /* clean up all of the async event markers */ + struct visc_async_time_marker_list* event = timers->async_markers; + while(event != NULL) { + + cl_int ciErrNum = CL_SUCCESS; + ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); + if (ciErrNum != CL_SUCCESS) { + //fprintf(stderr, "Error Waiting for Events!\n"); + } + + ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) ); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Release Events!\n"); + } + + free((event)->marker); + struct visc_async_time_marker_list* next = ((event)->next); + + free(event); + + // (*event) = NULL; + event = next; + } + + int i = 0; + for(i = 0; i < visc_TimerID_LAST; ++i) { + if (timers->sub_timer_list[i] != NULL) { + struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct visc_SubTimer *prev = NULL; + while (subtimer != NULL) { + free(subtimer->label); + prev = subtimer; + subtimer = subtimer->next; + free(prev); + } + free(timers->sub_timer_list[i]); + } + } +} + +/**************************** Pipeline API ************************************/ +#define BUFFER_SIZE 1 + +// Launch API for a streaming dataflow graph +void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void* args) { + DFNodeContext_X86* Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86)); + + Context->threads = new std::vector<pthread_t>(); + Context->ArgInPortSizeMap = new std::map<unsigned, uint64_t>(); + //Context->BindInSizes = new std::vector<uint64_t>(); + Context->BindInSourcePort = new std::vector<unsigned>(); + Context->BindOutSizes = new std::vector<uint64_t>(); + Context->EdgeSizes = new std::vector<uint64_t>(); + Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); + Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); + Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t>*>(); + Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); + + DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args << flush << "\n"); + LaunchFunc(args, Context); + return Context; +} + +// Push API for a streaming dataflow graph +void llvm_visc_streamPush(void* graphID, void* args) { + DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n"); + DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; + unsigned offset = 0; + for (unsigned i=0; i< Ctx->ArgInPortSizeMap->size(); i++) { + uint64_t element; + memcpy(&element, (char*)args+offset, Ctx->ArgInPortSizeMap->at(i)); + offset += Ctx->ArgInPortSizeMap->at(i); + for(unsigned j=0; j<Ctx->BindInputBuffers->size();j++) { + if(Ctx->BindInSourcePort->at(j) == i) { + // Push to all bind buffers connected to parent node at this port + //DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); + llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element); + } + } + } + // Push 0 in isLastInput buffers of all child nodes + for (CircularBuffer<uint64_t>* buffer: *(Ctx->isLastInputBuffers)) + llvm_visc_bufferPush(buffer, 0); +} + +// Pop API for a streaming dataflow graph +void* llvm_visc_streamPop(void* graphID) { + DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n"); + DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; + unsigned totalBytes = 0; + for(uint64_t size: *(Ctx->BindOutSizes)) + totalBytes+= size; + void* output = malloc(totalBytes); + unsigned offset = 0; + for (unsigned i=0; i< Ctx->BindOutputBuffers->size(); i++) { + uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i)); + //DEBUG(cout << "\tPopped Value " << element << " from buffer\n"); + memcpy((char*)output+offset, &element, Ctx->BindOutSizes->at(i)); + offset += Ctx->BindOutSizes->at(i); + } + return output; +} + +// Wait API for a streaming dataflow graph +void llvm_visc_streamWait(void* graphID) { + DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n"); + DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; + // Push garbage to all other input buffers + for (unsigned i=0; i< Ctx->BindInputBuffers->size(); i++) { + uint64_t element = 0; + //DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); + llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element); + } + // Push 1 in isLastInput buffers of all child nodes + for (unsigned i=0; i < Ctx->isLastInputBuffers->size(); i++) + llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1); + + llvm_visc_freeThreads(graphID); +} + +// Create a buffer and return the bufferID +void* llvm_visc_createBindInBuffer(void* graphID, uint64_t size, unsigned inArgPort) { + DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); + DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; + CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn"); + DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); + Context->BindInputBuffers->push_back(bufferID); + (*(Context->ArgInPortSizeMap))[inArgPort] = size; + Context->BindInSourcePort->push_back(inArgPort); + //Context->BindInSizes->push_back(size); + return bufferID; +} + +void* llvm_visc_createBindOutBuffer(void* graphID, uint64_t size) { + DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); + DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; + //Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size()); + CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut"); + DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); + Context->BindOutputBuffers->push_back(bufferID); + Context->BindOutSizes->push_back(size); + return bufferID; +} +void* llvm_visc_createEdgeBuffer(void* graphID, uint64_t size) { + DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); + DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; + //Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size()); + CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge"); + DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); + Context->EdgeBuffers->push_back(bufferID); + Context->EdgeSizes->push_back(size); + return bufferID; +} + +void* llvm_visc_createLastInputBuffer(void* graphID, uint64_t size) { + DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); + DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; + //Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size()); + CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput"); + DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); + Context->isLastInputBuffers->push_back(bufferID); + return bufferID; +} + +// Free buffers +void llvm_visc_freeBuffers(void* graphID) { + DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n"); + DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; + for(CircularBuffer<uint64_t>* bufferID: *(Context->BindInputBuffers)) + delete bufferID; + for(CircularBuffer<uint64_t>* bufferID: *(Context->BindOutputBuffers)) + delete bufferID; + for(CircularBuffer<uint64_t>* bufferID: *(Context->EdgeBuffers)) + delete bufferID; + for(CircularBuffer<uint64_t>* bufferID: *(Context->isLastInputBuffers)) + delete bufferID; +} + +// Pop an element from the buffer +uint64_t llvm_visc_bufferPop(void* bufferID) { + CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID; + return buffer->pop(); +} + +// Push an element into the buffer +void llvm_visc_bufferPush(void* bufferID, uint64_t element) { + CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID; + buffer->push(element); +} + +// Create a thread +void llvm_visc_createThread(void* graphID, void* (*Func)(void*), void* arguments) { + DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n"); + DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; + int err; + pthread_t threadID; + if((err = pthread_create(&threadID, NULL, Func, arguments)) != 0) + cout << "Failed to create thread. Error code = " << err << flush << "\n"; + + Ctx->threads->push_back(threadID); +} + +// Wait for thread to finish +void llvm_visc_freeThreads(void* graphID) { + DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n"); + DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; + for(pthread_t thread: *(Ctx->threads)) + pthread_join(thread, NULL); +} + +/************************ OPENCL & PTHREAD API ********************************/ + +void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) { + DFNodeContext_X86 *Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86)); + //int err; + //if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0) + //cout << "Failed to create pthread. Error code = " << err << flush << "\n"; + rootFunc(arguments); + return Context; +} + +void llvm_visc_x86_wait(void* graphID) { + DEBUG(cout << "Waiting for pthread to finish ...\n"); + //DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; + //pthread_join(Context->threadID, NULL); + free(graphID); + DEBUG(cout << "\t... pthread Done!\n"); +} + +void* llvm_visc_ocl_initContext(enum visc::Target T) { + pthread_mutex_lock(&ocl_mtx); + //DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR"); + DEBUG(const char* Target = T == visc::GPU_TARGET? "GPU" : "SPIR"); + DEBUG(cout << "Initializing Context for " << Target << " device\n"); + cl_uint numPlatforms; + cl_int errcode; + errcode = clGetPlatformIDs(0, NULL, &numPlatforms); + checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); + + // now get all the platform IDs + cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms); + errcode = clGetPlatformIDs(numPlatforms, platforms, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs"); + + + for(unsigned i=0; i < numPlatforms; i++) { + char buffer[10240]; + DEBUG(cout << "Device " << i << " Info -->\n"); + clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL); + DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n"); + clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL); + DEBUG(cout << "\tVERSION = "<< buffer << flush << "\n"); + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL); + DEBUG(cout << "\tNAME = " << buffer << flush << "\n"); + clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL); + DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n"); + clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); + DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n"); + } + // set platform property - just pick the first one + //cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + //(long) platforms[0], + //0}; + //globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, + //NULL, NULL, &errcode); + assert(numPlatforms >= 2 && "Expecting two OpenCL platforms"); + // Choose second one which is X86 AVX + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + (long) platforms[T == visc::GPU_TARGET? 0 : 1], + 0}; + globalOCLContext = clCreateContextFromType(properties, + T == visc::GPU_TARGET? + CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, + NULL, NULL, &errcode); + // get the list of OCL devices associated with context + size_t dataBytes; + errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0, + NULL, &dataBytes); + checkErr(errcode, CL_SUCCESS, "Failure to get context info length"); + + clDevices = (cl_device_id *) malloc(dataBytes); + errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes, + clDevices, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to get context info"); + if(false && T == visc::SPIR_TARGET) { + cl_device_partition_property props[4]; + props[0] = CL_DEVICE_PARTITION_BY_COUNTS; + props[1] = NUM_CORES; + props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END; + props[3] = 0; + cl_device_id subdevice_id[8]; + cl_uint num_entries = 8; + + cl_uint numDevices; + clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, &numDevices); + //printf("Num of devices = %d\n", numDevices); + //for(unsigned i =0 ; i< numDevices; i++) + //printf("Subdevice id %d = %p\n", i, subdevice_id[i]); + clDevices[0] = subdevice_id[0]; + globalOCLContext = clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode); + checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); + } + + free(platforms); + DEBUG(cout << "\tContext " << globalOCLContext << flush << "\n"); + checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); + + DEBUG(cout << "Initialize Kernel Timer\n"); + visc_InitializeTimerSet(&kernel_timer); + + pthread_mutex_unlock(&ocl_mtx); + return globalOCLContext; +} + +void llvm_visc_ocl_clearContext(void* graphID) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Clear Context\n"); + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + // FIXME: Have separate function to release command queue and clear context. + // Would be useful when a context has multiple command queues + clReleaseKernel(Context->clKernel); + //clReleaseProgram(Context->clProgram); + //clReleaseCommandQueue(Context->clCommandQue); + //clReleaseContext(globalOCLContext); + //DEBUG(cout << "Released context at: " << globalOCLContext); + free(Context); + DEBUG(cout << "Done with OCL kernel\n"); + cout << "Printing VISC Timer: KernelTimer\n"; + visc_PrintTimerSet(&kernel_timer); + pthread_mutex_unlock(&ocl_mtx); + +} + +void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Set Shared Memory Input:"); + DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n"); + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + DEBUG(cout << "Using Context: " << Context << flush << "\n"); + DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); + //pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument"); + pthread_mutex_unlock(&ocl_mtx); +} + +void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Set Scalar Input:"); + DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n"); + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + DEBUG(cout << "Using Context: " << Context << flush << "\n"); + DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); + //pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); + pthread_mutex_unlock(&ocl_mtx); +} + +void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Set Pointer Input:"); + DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << flush << "\n"); + // Size should be non-zero + assert(size != 0 && "Size of data pointed to has to be non-zero!"); + DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << flush << "\n"); + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + + pthread_mutex_unlock(&ocl_mtx); + // Check with runtime the location of this memory + cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput); + + pthread_mutex_lock(&ocl_mtx); + // Set Kernel Argument + //pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); + DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n"); + pthread_mutex_unlock(&ocl_mtx); + return d_input; +} + +void* llvm_visc_ocl_output_ptr(void* graphID, int arg_index, size_t size) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Set device memory for Output Struct:"); + DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = "<< size << flush << "\n"); + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + cl_int errcode; + //pthread_mutex_lock(&ocl_mtx); + cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, size, NULL, &errcode); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device"); + //pthread_mutex_lock(&ocl_mtx); + errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); + DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n"); + pthread_mutex_unlock(&ocl_mtx); + return d_output; +} + +void llvm_visc_ocl_free(void* ptr) { + //DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n"); + //cl_mem d_ptr = (cl_mem) ptr; + //clReleaseMemObject(d_ptr); +} + +void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Get Output:\n"); + DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << flush << "\n"); + if(h_output == NULL) + h_output = malloc(size); + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + //pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, + h_output, 0, NULL, NULL); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output"); + pthread_mutex_unlock(&ocl_mtx); + return h_output; +} + +void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t* + localWorkSize, const size_t* globalWorkSize) { + pthread_mutex_lock(&ocl_mtx); + + size_t GlobalWG[3]; + size_t LocalWG[3]; + + // OpenCL EnqeueNDRangeKernel function results in segementation fault if we + // directly use local and global work groups arguments. Hence, allocating it + // on stack and copying. + for(unsigned i=0; i<workDim; i++) { + GlobalWG[i] = globalWorkSize[i]; + } + + // OpenCL allows local workgroup to be null. + if(localWorkSize != NULL) { + for(unsigned i=0; i<workDim; i++) { + LocalWG[i] = localWorkSize[i]; + } + } + + DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; + // TODO: Would like to use event to ensure better scheduling of kernels. + // Currently passing the event paratemeter results in seg fault with + // clEnqueueNDRangeKernel. + cl_event* event; + DEBUG(cout << "Enqueuing kernel:\n"); + DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n"); + DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n"); + DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n"); + DEBUG(cout << "\tGlobal Work Group: ( "); + for(unsigned i = 0; i<workDim; i++) { + DEBUG(cout << GlobalWG[i] << " "); + } + DEBUG(cout << ")\n"); + if(localWorkSize != NULL) { + DEBUG(cout << "\tLocal Work Group: ( "); + for(unsigned i = 0; i<workDim; i++) { + DEBUG(cout << LocalWG[i] << " "); + } + DEBUG(cout << ")\n"); + } + //pthread_mutex_lock(&ocl_mtx); + clFinish(Context->clCommandQue); + //pthread_mutex_unlock(&ocl_mtx); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION); + //for(int i=0 ;i < NUM_TESTS; i++) { + //cout << "Iteration = " << i << flush << "\n"; + //pthread_mutex_lock(&ocl_mtx); + cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, + Context->clKernel, workDim, NULL, GlobalWG, (localWorkSize == NULL)? NULL : LocalWG, 0, NULL, NULL); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); + //} + //pthread_mutex_lock(&ocl_mtx); + clFinish(Context->clCommandQue); + //pthread_mutex_unlock(&ocl_mtx); + visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); + + pthread_mutex_unlock(&ocl_mtx); + return event; +} + + +////////////////////////////////////////////////////////////////////////////// +//! Loads a Program binary file. +//! +//! @return the source string if succeeded, 0 otherwise +//! @param Filename program filename +//! @param szFinalLength returned length of the code string +////////////////////////////////////////////////////////////////////////////// +static char* LoadProgSource(const char* Filename, size_t* szFinalLength) +{ + DEBUG(cout << "Load Prog Source\n"); + // locals + FILE* pFileStream = NULL; + size_t szSourceLength; + + // open the OpenCL source code file + pFileStream = fopen(Filename, "rb"); + if(pFileStream == 0) + { + return NULL; + } + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char* cSourceString = (char *)malloc(szSourceLength + 1); + if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) + { + fclose(pFileStream); + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + source) string + fclose(pFileStream); + if(szFinalLength != 0) + { + *szFinalLength = szSourceLength; + } + cSourceString[szSourceLength] = '\0'; + + return cSourceString; +} + +void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Launch OCL Kernel\n"); + // Initialize OpenCL + + // OpenCL specific variables + DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL)); + + size_t kernelLength; + cl_int errcode; + + // For a single context for all kernels + Context->clOCLContext = globalOCLContext; + + //Create a command-queue + //pthread_mutex_lock(&ocl_mtx); + Context->clCommandQue = clCreateCommandQueue(Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); + globalCommandQue = Context->clCommandQue; + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to create command queue"); + + DEBUG(cout << "Loading program binary: " << FileName << flush << "\n"); + char *programSource = LoadProgSource(FileName, &kernelLength); + checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary"); + + cl_int binaryStatus; + //pthread_mutex_lock(&ocl_mtx); + Context->clProgram = clCreateProgramWithBinary(Context->clOCLContext, 1, &clDevices[0], + &kernelLength, + (const unsigned char **)&programSource, + &binaryStatus, &errcode); + //pthread_mutex_unlock(&ocl_mtx); + checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); + + DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName << flush << "\n"); + errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL); + // If build fails, get build log from device + if(errcode != CL_SUCCESS) { + cout << "ERROR: Failure to build program\n"; + size_t len = 0; + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0, + NULL, &len); + cout << "LOG LENGTH: " << len << flush << "\n"; + checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length"); + char *log = (char*) malloc(len*sizeof(char)); + errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len, + log, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); + + cout << "Device Build Log:\n" << log << flush << "\n"; + free(log); + pthread_mutex_unlock(&ocl_mtx); + exit(EXIT_FAILURE); + } + + Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); + checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); + + DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n"); + //free(clDevices); + free(programSource); + + pthread_mutex_unlock(&ocl_mtx); + return Context; +} + + +void llvm_visc_ocl_wait(void* graphID) { + pthread_mutex_lock(&ocl_mtx); + DEBUG(cout << "Wait\n"); + DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID; + //pthread_mutex_lock(&ocl_mtx); + clFinish(Context->clCommandQue); + //pthread_mutex_unlock(&ocl_mtx); + pthread_mutex_unlock(&ocl_mtx); +} + +void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID timer) { + //cout << "Switching to timer " << timer << flush << "\n"; + pthread_mutex_lock(&ocl_mtx); + //visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer); + pthread_mutex_unlock(&ocl_mtx); +} +void llvm_visc_printTimerSet(void** timerSet, char* timerName) { + pthread_mutex_lock(&ocl_mtx); + cout << "Printing VISC Timer: "; + if(timerName != NULL) + cout << timerName << flush << "\n"; + else + cout << "Anonymous\n"; + visc_PrintTimerSet((visc_TimerSet*) (*timerSet)); + pthread_mutex_unlock(&ocl_mtx); +} + +void* llvm_visc_initializeTimerSet() { + pthread_mutex_lock(&ocl_mtx); + visc_TimerSet* TS = (visc_TimerSet*) malloc (sizeof(visc_TimerSet)); + visc_InitializeTimerSet(TS); + pthread_mutex_unlock(&ocl_mtx); + return TS; +} + + + diff --git a/llvm/projects/visc-cpu-rt/visc-rt.h b/llvm/projects/visc-cpu-rt/visc-rt.h new file mode 100644 index 0000000000..5e7546f582 --- /dev/null +++ b/llvm/projects/visc-cpu-rt/visc-rt.h @@ -0,0 +1,302 @@ +/* + * + * (c) 2010 The Board of Trustees of the University of Illinois. + */ +#ifndef VISC_RT_HEADER +#define VISC_RT_HEADER + +#include <iostream> +#include <map> +#include <ctime> +#include <vector> +#include <pthread.h> +#include <string> +//#include <condition_variable> + +#include "llvm/SupportVISC/VISCHint.h" +#include "llvm/SupportVISC/VISCTimer.h" +#include "device_abstraction.h" +#include "policy.h" +#ifndef DEBUG_BUILD +#define DEBUG(s) {} +#else +#define DEBUG(s) s +#endif + + + +using namespace std; + +extern "C" { + +/************************* Policies *************************************/ + +void llvm_visc_policy_init(); +void llvm_visc_policy_clear(); +int llvm_visc_policy_getVersion(const char *, int64_t); + +/******************** Device Abstraction ********************************/ +void llvm_visc_deviceAbstraction_start(); +void llvm_visc_deviceAbstraction_end(); +void llvm_visc_deviceAbstraction_waitOnDeviceStatus(); + +/********************* DFG Depth Stack **********************************/ +class DFGDepth { + private: + unsigned numDim; + unsigned dimLimit[3]; + unsigned dimInstance[3]; + public: + DFGDepth() {} + DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, unsigned iY = 0, + unsigned dimZ = 0, unsigned iZ = 0) { + assert(n <= 3 && "Error! More than 3 dimensions not supported"); + numDim = n; + dimLimit[0] = dimX; + dimLimit[1] = dimY; + dimLimit[2] = dimZ; + dimInstance[0] = iX; + dimInstance[1] = iY; + dimInstance[2] = iZ; + } + unsigned getDimLimit(unsigned dim) { + assert(dim <= numDim && "Error! Requested dimension limit is not specified"); + return dimLimit[dim]; + } + + unsigned getDimInstance(unsigned dim) { + assert(dim <= numDim && "Error! Requested dimension instance is not specified"); + return dimInstance[dim]; + } + + unsigned getNumDim() { + return numDim; + } +}; + +void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0, + uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0); +void llvm_visc_x86_dstack_pop(); +uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim); +uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim); + + +/********************* Memory Tracker **********************************/ +class MemTrackerEntry { +public: + enum Location {HOST, DEVICE}; + private: + size_t size; + Location loc; + void* addr; + void* Context; + + public: + MemTrackerEntry(size_t _size, Location _loc, void* _addr, void* _Context): + size(_size), loc(_loc), addr(_addr), Context(_Context) { + } + + size_t getSize() { + return size; + } + + Location getLocation() { + return loc; + } + + void* getAddress() { + return addr; + } + + void* getContext() { + return Context; + } + + void update(Location _loc, void* _addr, void* _Context = NULL) { + loc = _loc; + addr = _addr; + Context = _Context; + } + + void print() { + cout << "Size = " << size << "\tLocation = " << loc << "\tAddress = " << addr << "\tContext = " << Context; + } +}; + + +class MemTracker { + +private: + std::map<void*, MemTrackerEntry*> Table; + +public: + MemTracker() { + } + + bool insert(void* ID, size_t size, MemTrackerEntry::Location loc, void* addr, void* Context = NULL) { + MemTrackerEntry* MTE = new MemTrackerEntry(size, loc, addr, Context); + Table.insert(std::pair<void*, MemTrackerEntry*>(ID, MTE)); + return MTE != NULL; + } + + MemTrackerEntry* lookup(void* ID) { + if(Table.count(ID) == 0) + return NULL; + return Table[ID]; + } + + void remove(void* ID) { + MemTrackerEntry* MTE = Table[ID]; + free(MTE); + Table.erase(ID); + } + + void print() { + cout << "Printing Table ... Size = " << Table.size() << flush << "\n"; + for(auto& Entry: Table) { + cout << Entry.first << ":\t" ; + Entry.second->print(); + cout << flush << "\n"; + } + } + +}; + +void llvm_visc_track_mem(void*, size_t); +void llvm_visc_untrack_mem(void*); +void* llvm_visc_request_mem(void*, size_t); + +/*********************** OPENCL & PTHREAD API **************************/ +void* llvm_visc_x86_launch(void* (void*), void*); +void llvm_visc_x86_wait(void*); +void* llvm_visc_ocl_initContext(enum visc::Target); + +void* llvm_visc_x86_argument_ptr(void*, size_t); + +void llvm_visc_ocl_clearContext(void*); +void llvm_visc_ocl_argument_shared(void*, int, size_t); +void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t); +void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool); +void* llvm_visc_ocl_output_ptr(void*, int, size_t); +void llvm_visc_ocl_free(void*); +void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t); +void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*); +void* llvm_visc_ocl_launch(const char*, const char*); +void llvm_visc_ocl_wait(void*); + +void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID); +void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL); +void* llvm_visc_initializeTimerSet(); + +} + +/*************************** Pipeline API ******************************/ +// Circular Buffer class +unsigned counter = 0; +template <class ElementType> +class CircularBuffer { +private: + int numElements; + int bufferSize; + int Head; + int Tail; + pthread_mutex_t mtx; + pthread_cond_t cv; + vector<ElementType> buffer; + std::string name; + unsigned ID; + +public: + CircularBuffer(int maxElements, std::string _name = "ANON") { + ID = counter; + Head = 0; + Tail = 0; + numElements = 0; + name = _name; + bufferSize = maxElements+1; + buffer.reserve(bufferSize); + pthread_mutex_init(&mtx, NULL); + pthread_cond_init(&cv, NULL); + counter++; + + } + + bool push(ElementType E); + ElementType pop(); + +}; + +template <class ElementType> +bool CircularBuffer<ElementType>::push(ElementType E) { + //DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << "\n"); + //unique_lock<mutex> lk(mtx); + pthread_mutex_lock(&mtx); + if((Head +1) % bufferSize == Tail) { + //DEBUG(cout << name << " Buffer[" << ID << "]: Push going to sleep ...\n"); + //cv.wait(lk); + pthread_cond_wait(&cv, &mtx); + //DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n"); + } + buffer[Head] = E; + Head = (Head+1) % bufferSize; + numElements++; + //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n"); + //lk.unlock(); + pthread_mutex_unlock(&mtx); + //cv.notify_one(); + pthread_cond_signal(&cv); + return true; +} + +template <class ElementType> +ElementType CircularBuffer<ElementType>::pop() { + //unique_lock<mutex> lk(mtx); + //DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n"); + pthread_mutex_lock(&mtx); + if(Tail == Head) { + //DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n"); + //cv.wait(lk); + pthread_cond_wait(&cv, &mtx); + //DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n"); + } + ElementType E = buffer[Tail]; + Tail = (Tail + 1) % bufferSize; + numElements--; + //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n"); + //lk.unlock(); + pthread_mutex_unlock(&mtx); + //cv.notify_one(); + pthread_cond_signal(&cv); + return E; +} + +extern "C" { +// Functions to push and pop values from pipeline buffers +uint64_t llvm_visc_bufferPop(void*); +void llvm_visc_bufferPush(void*, uint64_t); + +// Functions to create and destroy buffers +void* llvm_visc_createBindInBuffer(void*, uint64_t, unsigned); +void* llvm_visc_createBindOutBuffer(void*, uint64_t); +void* llvm_visc_createEdgeBuffer(void*, uint64_t); +void* llvm_visc_createLastInputBuffer(void*, uint64_t); + +void llvm_visc_freeBuffers(void*); + +// Functions to create and destroy threads +void llvm_visc_createThread(void* graphID, void*(*Func)(void*), void*); +void llvm_visc_freeThreads(void*); + +// Launch API for a streaming graph. +// Arguments: +// (1) Launch Function: void* (void*, void*) +// (2) Push Function: void (void*, std::vector<uint64_t>**, unsgined) +// (3) Pop Function: void* (std::vector<uint64_t>**, unsigned) +void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void*); +void llvm_visc_streamPush(void* graphID, void* args); +void* llvm_visc_streamPop(void* graphID); +void llvm_visc_streamWait(void* graphID); + +} + +#endif //VISC_RT_HEADER -- GitLab