From 21496d80b142bec1a44cccd358d8ef9eb79bc4d1 Mon Sep 17 00:00:00 2001 From: akashk4 <akashk4@illinois.edu> Date: Wed, 8 Jan 2020 21:02:09 -0600 Subject: [PATCH] Remove redundant visc-rt --- hpvm/visc-rt/CMakeLists.txt | 43 - hpvm/visc-rt/deviceStatusSwitchIntervals.txt | 2 - hpvm/visc-rt/device_abstraction.h | 82 - hpvm/visc-rt/makefile | 28 - hpvm/visc-rt/policy.h | 98 -- hpvm/visc-rt/visc-rt.cpp | 1645 ------------------ hpvm/visc-rt/visc-rt.h | 303 ---- 7 files changed, 2201 deletions(-) delete mode 100644 hpvm/visc-rt/CMakeLists.txt delete mode 100644 hpvm/visc-rt/deviceStatusSwitchIntervals.txt delete mode 100644 hpvm/visc-rt/device_abstraction.h delete mode 100644 hpvm/visc-rt/makefile delete mode 100644 hpvm/visc-rt/policy.h delete mode 100644 hpvm/visc-rt/visc-rt.cpp delete mode 100644 hpvm/visc-rt/visc-rt.h diff --git a/hpvm/visc-rt/CMakeLists.txt b/hpvm/visc-rt/CMakeLists.txt deleted file mode 100644 index 5767d82cdc..0000000000 --- a/hpvm/visc-rt/CMakeLists.txt +++ /dev/null @@ -1,43 +0,0 @@ -add_custom_target(visc-rt ALL) -add_custom_command( - TARGET visc-rt PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/deviceStatusSwitchIntervals.txt - ${CMAKE_CURRENT_BINARY_DIR}/deviceStatusSwitchIntervals.txt - DEPENDS deviceStatusSwitchIntervals.txt - COMMENT "Copying deviceStatusSwitchIntervals.txt") -add_custom_command( - TARGET visc-rt PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/device_abstraction.h - ${CMAKE_CURRENT_BINARY_DIR}/device_abstraction.h - DEPENDS device_abstraction.h - COMMENT "Copying device_abstraction.h") -add_custom_command( - TARGET visc-rt PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/policy.h - ${CMAKE_CURRENT_BINARY_DIR}/policy.h - DEPENDS policy.h - COMMENT "Copying policy.h") -add_custom_command( - TARGET visc-rt PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.h - ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.h - DEPENDS visc-rt.h - COMMENT "Copying visc-rt.h") -add_custom_command( - TARGET visc-rt PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/visc-rt.cpp - ${CMAKE_CURRENT_BINARY_DIR}/visc-rt.cpp - DEPENDS visc-rt.cpp - COMMENT "Copying visc-rt.cpp") -add_custom_command( - TARGET visc-rt PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/makefile - ${CMAKE_CURRENT_BINARY_DIR}/makefile - DEPENDS makefile - COMMENT "Copying makefile") diff --git a/hpvm/visc-rt/deviceStatusSwitchIntervals.txt b/hpvm/visc-rt/deviceStatusSwitchIntervals.txt deleted file mode 100644 index 7069470a1a..0000000000 --- a/hpvm/visc-rt/deviceStatusSwitchIntervals.txt +++ /dev/null @@ -1,2 +0,0 @@ -10 -10 15 10 16 15 30 15 25 20 15 diff --git a/hpvm/visc-rt/device_abstraction.h b/hpvm/visc-rt/device_abstraction.h deleted file mode 100644 index 68748c7ab7..0000000000 --- a/hpvm/visc-rt/device_abstraction.h +++ /dev/null @@ -1,82 +0,0 @@ -#ifndef __DEVICE_ABSTRACTION__ -#define __DEVICE_ABSTRACTION__ - -#include <stdio.h> -#include <stdlib.h> -#include <time.h> -#include <time.h> -#include <thread> -#include <vector> -#include <iostream> -#include <fstream> - -#define MIN_INTERVAL 2 -#define MAX_INTERVAL 8 -#define NUM_INTERVALS 10 - -// Device status variable: true if the device is available for use -volatile bool deviceStatus = true; -// Intervals at which to change the device status -std::vector<unsigned> Intervals; - -// Set to true when program execution ends and so we can end the device -// simulation -volatile bool executionEnd = false; - - -void initializeDeviceStatusIntervals() { - - unsigned sz = 0; - unsigned tmp = 0; - - const char *fn = - "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/deviceStatusSwitchIntervals.txt"; - std::ifstream infile; - infile.open(fn); - if (!infile.is_open()) { - std::cout << "Failed to open " << fn << " for reading\n"; - return; - } - infile >> sz; - - if (sz) { - // We have data. Read them into the vector - for (unsigned i = 0; i < sz; i++) { - infile >> tmp; - Intervals.push_back(tmp); - } - infile.close(); - } else { - // We have no data. Create random data and write them into the file - infile.close(); - std::ofstream outfile; - outfile.open(fn); - if (!outfile.is_open()) { - std::cout << "Failed to open " << fn << " for writing\n"; - return; - } - sz = 1 + rand()%NUM_INTERVALS; - outfile << sz; - for (unsigned i = 0; i < sz; i++) { - Intervals.push_back(MIN_INTERVAL + rand()%(MAX_INTERVAL - MIN_INTERVAL)); - outfile << Intervals[i]; - } - outfile.close(); - } - - return; -} - -void updateDeviceStatus() { - - unsigned i = 0; - while (!executionEnd) { - std::this_thread::sleep_for (std::chrono::seconds(Intervals[i])); - deviceStatus = !deviceStatus; - std::cout << "Changed device status to " << deviceStatus << "\n"; - i = (i+1) % Intervals.size(); - } - -} - -#endif // __DEVICE_ABSTRACTION__ diff --git a/hpvm/visc-rt/makefile b/hpvm/visc-rt/makefile deleted file mode 100644 index 01cc2b7b3f..0000000000 --- a/hpvm/visc-rt/makefile +++ /dev/null @@ -1,28 +0,0 @@ -#LLVM_SRC_ROOT = ../../../llvm -LLVM_BUILD_ROOT = ${LLVM_SRC_ROOT}/../build/ - -OPENCL_INC_PATH = /opt/intel/opencl-sdk/include - -ifeq ($(NUM_CORES),) - NUM_CORES=8 -endif - -CPP_FLAGS = -I $(LLVM_SRC_ROOT)/include -I $(LLVM_BUILD_ROOT)/include -I $(OPENCL_INC_PATH) -std=c++11 -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS -TARGET:=visc-rt - -LLVM_CC:=$(LLVM_BUILD_ROOT)/bin/clang -LLVM_CXX:=$(LLVM_BUILD_ROOT)/bin/clang++ - -OPTS = - -ifeq ($(DEBUG),1) - OPTS+=-DDEBUG_BUILD -endif - -all: $(TARGET:%=%.ll) - -$(TARGET:%=%.ll):%.ll:%.cpp %.h - $(LLVM_CXX) -DNUM_CORES=$(NUM_CORES) -O3 -S -emit-llvm $(CPP_FLAGS) $(OPTS) $< -o $@ - -clean : - rm -f $(TARGET).ll diff --git a/hpvm/visc-rt/policy.h b/hpvm/visc-rt/policy.h deleted file mode 100644 index f30c310c1a..0000000000 --- a/hpvm/visc-rt/policy.h +++ /dev/null @@ -1,98 +0,0 @@ -#ifndef __POLICY__ -#define __POLICY__ - -#include <string> -#include "device_abstraction.h" - - /************************* Policies *************************************/ -class Policy { - public: - virtual int getVersion(const char *, int64_t) = 0; - virtual ~Policy() {}; -}; - -class NodePolicy : public Policy { - virtual int getVersion(const char *name, int64_t it) override { - std::string s(name); - //std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" }; - std::string NodeNames[] = { - "WrapperGaussianSmoothing_cloned", - "WrapperlaplacianEstimate_cloned", - "WrapperComputeZeroCrossings_cloned", - "WrapperComputeGradient_cloned", - "WrapperComputeMaxGradient_cloned", - "WrapperRejectZeroCrossings_cloned", - }; - //if (!s.compare(NodeNames[4])) { - // std::cout << s << ": CPU" << "\n"; - // return 0; - //} - return 2; - } -}; - -class IterationPolicy : public Policy { - virtual int getVersion(const char *name, int64_t it) override { - if ((it % 10 == 0) || (it % 10 == 1)) - return 0; - else - return 2; - } -}; - -class DeviceStatusPolicy : public Policy { - virtual int getVersion(const char *name, int64_t it) override { - if (deviceStatus) { - //std::cout << "Returning GPU\n"; - return 2; - } - else { - //std::cout << "Returning CPU\n"; - return 0; - } - } -}; - -/* ------------------------------------------------------------------------- */ -// Added for the CFAR interactive policy demo. - -class InteractivePolicy : public Policy { -private: - // 0 :for CPU, 1 for GPU, 2 for Vector - unsigned int userTargetDeviceChoice; - // Used to end thread execution - bool end; - // Thread that will update userTargetDeviceChoice - std::thread userTargetDeviceChoiceThread; - // Thread function - void updateUserTargetChoice() { - while (!end) { - std::cout << "Select target device (0 for CPU, 1 fpr GPU): "; - std::cin >> userTargetDeviceChoice; - if (userTargetDeviceChoice > 1) { - std::cout << "Invalid target device. Selecting GPU instead.\n"; - userTargetDeviceChoice = 1; - } - } - } - -public: - // Inherited method, erquired for every policy object - virtual int getVersion(const char *name, int64_t it) { - return userTargetDeviceChoice; - } - - InteractivePolicy() { - userTargetDeviceChoice = 1; - end = false; - userTargetDeviceChoiceThread = - std::thread(&InteractivePolicy::updateUserTargetChoice, this); - } - - ~InteractivePolicy() { - end = true; - userTargetDeviceChoiceThread.join(); - } -}; - -#endif // __POLICY__ diff --git a/hpvm/visc-rt/visc-rt.cpp b/hpvm/visc-rt/visc-rt.cpp deleted file mode 100644 index f05aa2ac24..0000000000 --- a/hpvm/visc-rt/visc-rt.cpp +++ /dev/null @@ -1,1645 +0,0 @@ -#include <iostream> -#include <string> -#include <pthread.h> -#include <cstdlib> -#include <cstdio> -#include <cstring> -#include <cassert> -#include <map> -#include <CL/cl.h> - -#include <unistd.h> - -#if _POSIX_VERSION >= 200112L -# include <sys/time.h> -#endif -#include "visc-rt.h" - -#ifndef DEBUG_BUILD -#define DEBUG(s) {} -#else -#define DEBUG(s) s -#endif - -#define BILLION 1000000000LL - -using namespace std; - -typedef struct { - pthread_t threadID; - std::vector<pthread_t>* threads; - // Map from InputPort to Size - std::map<unsigned, uint64_t>* ArgInPortSizeMap; - //std::vector<uint64_t>* BindInSizes; - std::vector<unsigned>* BindInSourcePort; - std::vector<uint64_t>* BindOutSizes; - std::vector<uint64_t>* EdgeSizes; - std::vector<CircularBuffer<uint64_t>*>* BindInputBuffers; - std::vector<CircularBuffer<uint64_t>*>* BindOutputBuffers; - std::vector<CircularBuffer<uint64_t>*>* EdgeBuffers; - std::vector<CircularBuffer<uint64_t>*>* isLastInputBuffers; -} DFNodeContext_X86; - -typedef struct { - cl_context clOCLContext; - cl_command_queue clCommandQue; - cl_program clProgram; - cl_kernel clKernel; -} DFNodeContext_OCL; - -cl_context globalOCLContext; -cl_device_id* clDevices; -cl_command_queue globalCommandQue; - -Policy *policy = NULL; -MemTracker MTracker; -vector<DFGDepth> DStack; -// Mutex to prevent concurrent access by multiple thereads in pipeline -pthread_mutex_t ocl_mtx; - -#define NUM_TESTS 1 -visc_TimerSet kernel_timer; - -static inline void checkErr(cl_int err, cl_int success, const char * name) { - if (err != success) { - cout << "ERROR: " << name << flush << "\n"; - cout << "ErrorCode: " << err << flush << "\n"; - exit(EXIT_FAILURE); - } -} - -/************************* Policies *************************************/ -void llvm_visc_policy_init() { - cout << "Initializing policy object ...\n"; -// policy = new NodePolicy(); -// policy = new IterationPolicy(); -// policy = new DeviceStatusPolicy(); - policy = new InteractivePolicy(); - cout << "DONE: Initializing policy object.\n"; -} - -void llvm_visc_policy_clear() { - if (policy) free(policy); -} - -int llvm_visc_policy_getVersion(const char *name, int64_t i) { - return policy->getVersion(name, i); -} - -/******************** Device Abstraction ********************************/ -std::thread deviceStatusThread; - -void llvm_visc_deviceAbstraction_start() { - cout << "Starting device status simulation ...\n"; - // Initialize vector with points where ti switch device status - initializeDeviceStatusIntervals(); - // Create a thread that performs the changes - deviceStatusThread = std::thread(updateDeviceStatus); - cout << "Started device status simulation thread ...\n"; - return; -} - -void llvm_visc_deviceAbstraction_end() { - cout << "Ending device status simulation thread ...\n"; - // Set the variable that allows the thread to know that execution has ended - executionEnd = true; - // Wait for the thread that manages device status to terminate - deviceStatusThread.join(); - cout << "Ended device status simulation.\n"; - return; -} - -void llvm_visc_deviceAbstraction_waitOnDeviceStatus() { - while (!deviceStatus) { }; - return; -} - -/************************* Depth Stack Routines ***************************/ - -void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY, - uint64_t iY, uint64_t limitZ, uint64_t iZ) { - //DEBUG(cout << "Pushing node information on stack:\n"); - //DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY << ", "<< limitZ <<")\n"); - //DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", "<< iZ <<")\n"); - //DFGDepth nodeInfo (n, limitX, iX, limitY, iY, limitZ, iZ); - //DStack.push_back(nodeInfo); - //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); -} - -void llvm_visc_x86_dstack_pop() { - //DEBUG(cout << "Popping from depth stack\n"); - //DStack.pop_back(); - //DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n"); -} - -uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) { - //DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n"); - //unsigned size = DStack.size(); - //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n"); - //return DStack[size-level-1].getDimLimit(dim); - return 0; -} - -uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) { - //DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n"); - //unsigned size = DStack.size(); - //DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n"); - //return DStack[size-level-1].getDimInstance(dim); - return 0; -} - -/********************** Memory Tracking Routines **************************/ - -void llvm_visc_track_mem(void* ptr, size_t size) { - DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); - if(MTE != NULL) { - DEBUG(cout << "ID " << ptr << " already present in the MemTracker Table\n"); - return; - } - DEBUG(cout << "Inserting ID " << ptr << " in the MemTracker Table\n"); - MTracker.insert(ptr, size, MemTrackerEntry::HOST, ptr); - DEBUG(MTracker.print()); -} - -void llvm_visc_untrack_mem(void* ptr) { - DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); - if(MTE == NULL) { - cout << "WARNING: Trying to remove ID " << ptr << " not present in the MemTracker Table\n"; - return; - } - DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n"); - if(MTE->getLocation() == MemTrackerEntry::DEVICE) - clReleaseMemObject((cl_mem) MTE->getAddress()); - MTracker.remove(ptr); - DEBUG(MTracker.print()); -} - - -static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clOCLContext << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); - if (MTE == NULL) { - MTracker.print(); - cout << "ERROR: Requesting memory not present in Table\n"; - exit(EXIT_FAILURE); - } - // If already on device - if (MTE->getLocation() == MemTrackerEntry::DEVICE && - ((DFNodeContext_OCL*)MTE->getContext())->clOCLContext == Context->clOCLContext) { - DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); - return MTE->getAddress(); - } - - DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); - DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n"); - // Else copy and update the latest copy - cl_mem_flags clFlags; - cl_int errcode; - - if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE; - else if(isInput) clFlags = CL_MEM_READ_ONLY; - else if(isOutput) clFlags = CL_MEM_WRITE_ONLY; - else clFlags = CL_MEM_READ_ONLY; - - visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); - //pthread_mutex_lock(&ocl_mtx); - cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); - DEBUG(cout<< "\nMemory allocated on device: " << d_input << flush << "\n"); - if(isInput) { - DEBUG(cout << "\tCopying ..."); - //pthread_mutex_lock(&ocl_mtx); - errcode = clEnqueueWriteBuffer(Context->clCommandQue, - d_input, - CL_TRUE, - 0, - size,MTE->getAddress(), - 0,NULL,NULL); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device"); - } - - visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); - DEBUG(cout << " done\n"); - MTE->update(MemTrackerEntry::DEVICE, (void*) d_input, Context); - DEBUG(cout << "Updated Table\n"); - DEBUG(MTracker.print()); - pthread_mutex_unlock(&ocl_mtx); - return d_input; -} - -void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) { - return llvm_visc_request_mem(ptr, size); -} - -void* llvm_visc_request_mem(void* ptr, size_t size) { - // Ignore objects whose size is 0 - no memory is requested. - if (size == 0) { - DEBUG(cout << "[X86] Request memory (ignored): " << ptr << flush << "\n"); - return ptr; - } - - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n"); - MemTrackerEntry* MTE = MTracker.lookup(ptr); - if(MTE == NULL) { - cout << "ERROR: Requesting memory not present in Table\n"; - pthread_mutex_unlock(&ocl_mtx); - exit(EXIT_FAILURE); - } - // If already on host - if(MTE->getLocation() == MemTrackerEntry::HOST) { - DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); - return MTE->getAddress(); - } - - // Else copy from device and update table - DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); - DEBUG(cout << "\tCopying ..."); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY); - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue, - (cl_mem) MTE->getAddress(), - CL_TRUE, - 0, - size, - ptr, - 0, NULL, NULL); - //pthread_mutex_unlock(&ocl_mtx); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); - DEBUG(cout << " done\n"); - checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output"); - DEBUG(cout << "Free mem object on device\n"); - clReleaseMemObject((cl_mem) MTE->getAddress()); - DEBUG(cout << "Updated Table\n"); - MTE->update(MemTrackerEntry::HOST, ptr); - DEBUG(MTracker.print()); - pthread_mutex_unlock(&ocl_mtx); - return ptr; -} - -/*************************** Timer Routines **********************************/ - -static int is_async(enum visc_TimerID timer) -{ - return (timer == visc_TimerID_KERNEL) || - (timer == visc_TimerID_COPY_ASYNC); -} - -static int is_blocking(enum visc_TimerID timer) -{ - return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE); -} - -#define INVALID_TIMERID visc_TimerID_LAST - -static int asyncs_outstanding(struct visc_TimerSet* timers) -{ - return (timers->async_markers != NULL) && - (timers->async_markers->timerID != INVALID_TIMERID); -} - -static struct visc_async_time_marker_list * -get_last_async(struct visc_TimerSet* timers) -{ - /* Find the last event recorded thus far */ - struct visc_async_time_marker_list * last_event = timers->async_markers; - if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { - while(last_event->next != NULL && - last_event->next->timerID != INVALID_TIMERID) - last_event = last_event->next; - return last_event; - } else - return NULL; -} - -static void insert_marker(struct visc_TimerSet* tset, enum visc_TimerID timer) -{ - cl_int ciErrNum = CL_SUCCESS; - struct visc_async_time_marker_list ** new_event = &(tset->async_markers); - - while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { - new_event = &((*new_event)->next); - } - - if(*new_event == NULL) { - *new_event = (struct visc_async_time_marker_list *) - malloc(sizeof(struct visc_async_time_marker_list)); - (*new_event)->marker = calloc(1, sizeof(cl_event)); - /* - // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event -#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) -fprintf(stderr, "Creating Marker [%d]\n", timer); - *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Creating User Event Object!\n"); - } - ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Setting User Event Status!\n"); - } -#endif -*/ - (*new_event)->next = NULL; - } - - /* valid event handle now aquired: insert the event record */ - (*new_event)->label = NULL; - (*new_event)->timerID = timer; - //pthread_mutex_lock(&ocl_mtx); - ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); - //pthread_mutex_unlock(&ocl_mtx); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); - } - -} - -static void insert_submarker(struct visc_TimerSet* tset, char *label, enum visc_TimerID timer) -{ - cl_int ciErrNum = CL_SUCCESS; - struct visc_async_time_marker_list ** new_event = &(tset->async_markers); - - while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { - new_event = &((*new_event)->next); - } - - if(*new_event == NULL) { - *new_event = (struct visc_async_time_marker_list *) - malloc(sizeof(struct visc_async_time_marker_list)); - (*new_event)->marker = calloc(1, sizeof(cl_event)); - /* -#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) -fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); - *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Creating User Event Object!\n"); - } - ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Setting User Event Status!\n"); - } -#endif -*/ - (*new_event)->next = NULL; - } - - /* valid event handle now aquired: insert the event record */ - (*new_event)->label = label; - (*new_event)->timerID = timer; - //pthread_mutex_lock(&ocl_mtx); - ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker); - //pthread_mutex_unlock(&ocl_mtx); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Enqueueing Marker!\n"); - } - -} - - -/* Assumes that all recorded events have completed */ -static visc_Timestamp record_async_times(struct visc_TimerSet* tset) -{ - struct visc_async_time_marker_list * next_interval = NULL; - struct visc_async_time_marker_list * last_marker = get_last_async(tset); - visc_Timestamp total_async_time = 0; - - for(next_interval = tset->async_markers; next_interval != last_marker; - next_interval = next_interval->next) { - cl_ulong command_start=0, command_end=0; - cl_int ciErrNum = CL_SUCCESS; - - ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); - } - - ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum); - } - - visc_Timestamp interval = (visc_Timestamp) (((double)(command_end - command_start))); - tset->timers[next_interval->timerID].elapsed += interval; - if (next_interval->label != NULL) { - struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; - while (subtimer != NULL) { - if ( strcmp(subtimer->label, next_interval->label) == 0) { - subtimer->timer.elapsed += interval; - break; - } - subtimer = subtimer->next; - } - } - total_async_time += interval; - next_interval->timerID = INVALID_TIMERID; - } - - if(next_interval != NULL) - next_interval->timerID = INVALID_TIMERID; - - return total_async_time; -} - -static void -accumulate_time(visc_Timestamp *accum, - visc_Timestamp start, - visc_Timestamp end) -{ -#if _POSIX_VERSION >= 200112L - *accum += end - start; -#else -# error "Timestamps not implemented for this system" -#endif -} - -#if _POSIX_VERSION >= 200112L -static visc_Timestamp get_time() -{ - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - return (visc_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec); -} -#else -# error "no supported time libraries are available on this platform" -#endif - -void -visc_ResetTimer(struct visc_Timer *timer) -{ - timer->state = visc_Timer_STOPPED; - -#if _POSIX_VERSION >= 200112L - timer->elapsed = 0; -#else -# error "visc_ResetTimer: not implemented for this system" -#endif -} - -void -visc_StartTimer(struct visc_Timer *timer) -{ - if (timer->state != visc_Timer_STOPPED) { - // FIXME: Removing warning statement to avoid printing this error - // fputs("Ignoring attempt to start a running timer\n", stderr); - return; - } - - timer->state = visc_Timer_RUNNING; - -#if _POSIX_VERSION >= 200112L - { - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - timer->init = tv.tv_sec * BILLION + tv.tv_nsec; - } -#else -# error "visc_StartTimer: not implemented for this system" -#endif -} - -void -visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) -{ - - unsigned int numNotStopped = 0x3; // 11 - if (timer->state != visc_Timer_STOPPED) { - fputs("Warning: Timer was not stopped\n", stderr); - numNotStopped &= 0x1; // Zero out 2^1 - } - if (subtimer->state != visc_Timer_STOPPED) { - fputs("Warning: Subtimer was not stopped\n", stderr); - numNotStopped &= 0x2; // Zero out 2^0 - } - if (numNotStopped == 0x0) { - //fputs("Ignoring attempt to start running timer and subtimer\n", stderr); - return; - } - - timer->state = visc_Timer_RUNNING; - subtimer->state = visc_Timer_RUNNING; - -#if _POSIX_VERSION >= 200112L - { - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - - if (numNotStopped & 0x2) { - timer->init = tv.tv_sec * BILLION + tv.tv_nsec; - } - - if (numNotStopped & 0x1) { - subtimer->init = tv.tv_sec * BILLION + tv.tv_nsec; - } - } -#else -# error "visc_StartTimer: not implemented for this system" -#endif - -} - -void -visc_StopTimer(struct visc_Timer *timer) -{ - visc_Timestamp fini; - - if (timer->state != visc_Timer_RUNNING) { - //fputs("Ignoring attempt to stop a stopped timer\n", stderr); - return; - } - - timer->state = visc_Timer_STOPPED; - -#if _POSIX_VERSION >= 200112L - { - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - fini = tv.tv_sec * BILLION + tv.tv_nsec; - } -#else -# error "visc_StopTimer: not implemented for this system" -#endif - - accumulate_time(&timer->elapsed, timer->init, fini); - timer->init = fini; -} - -void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) { - - visc_Timestamp fini; - - unsigned int numNotRunning = 0x3; // 11 - if (timer->state != visc_Timer_RUNNING) { - fputs("Warning: Timer was not running\n", stderr); - numNotRunning &= 0x1; // Zero out 2^1 - } - if (subtimer->state != visc_Timer_RUNNING) { - fputs("Warning: Subtimer was not running\n", stderr); - numNotRunning &= 0x2; // Zero out 2^0 - } - if (numNotRunning == 0x0) { - //fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); - return; - } - - - timer->state = visc_Timer_STOPPED; - subtimer->state = visc_Timer_STOPPED; - -#if _POSIX_VERSION >= 200112L - { - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - fini = tv.tv_sec * BILLION + tv.tv_nsec; - } -#else -# error "visc_StopTimer: not implemented for this system" -#endif - - if (numNotRunning & 0x2) { - accumulate_time(&timer->elapsed, timer->init, fini); - timer->init = fini; - } - - if (numNotRunning & 0x1) { - accumulate_time(&subtimer->elapsed, subtimer->init, fini); - subtimer->init = fini; - } - -} - -/* Get the elapsed time in seconds. */ -double -visc_GetElapsedTime(struct visc_Timer *timer) -{ - double ret; - - if (timer->state != visc_Timer_STOPPED) { - fputs("Elapsed time from a running timer is inaccurate\n", stderr); - } - -#if _POSIX_VERSION >= 200112L - ret = timer->elapsed / 1e9; -#else -# error "visc_GetElapsedTime: not implemented for this system" -#endif - return ret; -} - -void -visc_InitializeTimerSet(struct visc_TimerSet *timers) -{ - int n; - - timers->wall_begin = get_time(); - timers->current = visc_TimerID_NONE; - - timers->async_markers = NULL; - - for (n = 0; n < visc_TimerID_LAST; n++) { - visc_ResetTimer(&timers->timers[n]); - timers->sub_timer_list[n] = NULL; - } -} - - -void -visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category) { - - struct visc_SubTimer *subtimer = (struct visc_SubTimer *) malloc - (sizeof(struct visc_SubTimer)); - - int len = strlen(label); - - subtimer->label = (char *) malloc (sizeof(char)*(len+1)); - sprintf(subtimer->label, "%s", label); - - visc_ResetTimer(&subtimer->timer); - subtimer->next = NULL; - - struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[visc_Category]; - if (subtimerlist == NULL) { - subtimerlist = (struct visc_SubTimerList *) calloc - (1, sizeof(struct visc_SubTimerList)); - subtimerlist->subtimer_list = subtimer; - timers->sub_timer_list[visc_Category] = subtimerlist; - } else { - // Append to list - struct visc_SubTimer *element = subtimerlist->subtimer_list; - while (element->next != NULL) { - element = element->next; - } - element->next = subtimer; - } - -} - -void -visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) -{ - //cerr << "Switch to timer: " << timer << flush << "\n"; - /* Stop the currently running timer */ - if (timers->current != visc_TimerID_NONE) { - struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; - - if (!is_async(timers->current) ) { - if (timers->current != timer) { - if (currSubTimer != NULL) { - visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); - } else { - visc_StopTimer(&timers->timers[timers->current]); - } - } else { - if (currSubTimer != NULL) { - visc_StopTimer(&currSubTimer->timer); - } - } - } else { - insert_marker(timers, timer); - if (!is_async(timer)) { // if switching to async too, keep driver going - visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); - } - } - } - - visc_Timestamp currentTime = get_time(); - - /* The only cases we check for asynchronous task completion is - * when an overlapping CPU operation completes, or the next - * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(timer) ) ) { - - struct visc_async_time_marker_list * last_event = get_last_async(timers); - /* CL_COMPLETE if completed */ - - cl_int ciErrNum = CL_SUCCESS; - cl_int async_done = CL_COMPLETE; - - ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stdout, "Error Querying EventInfo1!\n"); - } - - - if(is_blocking(timer)) { - /* Async operations completed after previous CPU operations: - * overlapped time is the total CPU time since this set of async - * operations were first issued */ - - // timer to switch to is COPY or NONE - if(async_done != CL_COMPLETE) { - accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); - } - - /* Wait on async operation completion */ - ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Waiting for Events!\n"); - } - - visc_Timestamp total_async_time = record_async_times(timers); - - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - if(async_done == CL_COMPLETE) { - //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time); - timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; - } - - } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == CL_COMPLETE ) { - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); - } - } - - /* Start the new timer */ - if (timer != visc_TimerID_NONE) { - if(!is_async(timer)) { - visc_StartTimer(&timers->timers[timer]); - } else { - // toSwitchTo Is Async (KERNEL/COPY_ASYNC) - if (!asyncs_outstanding(timers)) { - /* No asyncs outstanding, insert a fresh async marker */ - - insert_marker(timers, timer); - timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { - /* Previous asyncs still in flight, but a previous SwitchTo - * already marked the end of the most recent async operation, - * so we can rename that marker as the beginning of this async - * operation */ - - struct visc_async_time_marker_list * last_event = get_last_async(timers); - last_event->label = NULL; - last_event->timerID = timer; - } - if (!is_async(timers->current)) { - visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); - } - } - } - timers->current = timer; - -} - -void -visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category) -{ - struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; - struct visc_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; - - if (timers->current != visc_TimerID_NONE) { - if (!is_async(timers->current) ) { - if (timers->current != category) { - if (curr != NULL) { - visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); - } else { - visc_StopTimer(&timers->timers[timers->current]); - } - } else { - if (curr != NULL) { - visc_StopTimer(&curr->timer); - } - } - } else { - insert_submarker(timers, label, category); - if (!is_async(category)) { // if switching to async too, keep driver going - visc_StopTimer(&timers->timers[visc_TimerID_DRIVER]); - } - } - } - - visc_Timestamp currentTime = get_time(); - - /* The only cases we check for asynchronous task completion is - * when an overlapping CPU operation completes, or the next - * segment blocks on completion of previous async operations */ - if( asyncs_outstanding(timers) && - (!is_async(timers->current) || is_blocking(category) ) ) { - - struct visc_async_time_marker_list * last_event = get_last_async(timers); - /* CL_COMPLETE if completed */ - - cl_int ciErrNum = CL_SUCCESS; - cl_int async_done = CL_COMPLETE; - - ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); - if (ciErrNum != CL_SUCCESS) { - fprintf(stdout, "Error Querying EventInfo2!\n"); - } - - if(is_blocking(category)) { - /* Async operations completed after previous CPU operations: - * overlapped time is the total CPU time since this set of async - * operations were first issued */ - - // timer to switch to is COPY or NONE - // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP - // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization - // it seems that the extra sync wall time isn't being recorded anywhere - if(async_done != CL_COMPLETE) - accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed), - timers->async_begin,currentTime); - - /* Wait on async operation completion */ - ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Waiting for Events!\n"); - } - visc_Timestamp total_async_time = record_async_times(timers); - - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - // If it did finish, then accumulate all the async time that did happen into OVERLAP - // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. - if(async_done == CL_COMPLETE /*cudaSuccess*/) - timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time; - - } else - /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ - // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding - // so something is deeper in stack - if(async_done == CL_COMPLETE /*cudaSuccess*/) { - /* Async operations completed before previous CPU operations: - * overlapped time is the total async time */ - timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers); - } - // else, this isn't blocking, so just check the next time around - } - - subtimerlist = timers->sub_timer_list[category]; - struct visc_SubTimer *subtimer = NULL; - - if (label != NULL) { - subtimer = subtimerlist->subtimer_list; - while (subtimer != NULL) { - if (strcmp(subtimer->label, label) == 0) { - break; - } else { - subtimer = subtimer->next; - } - } - } - - /* Start the new timer */ - if (category != visc_TimerID_NONE) { - if(!is_async(category)) { - if (subtimerlist != NULL) { - subtimerlist->current = subtimer; - } - - if (category != timers->current && subtimer != NULL) { - visc_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); - } else if (subtimer != NULL) { - visc_StartTimer(&subtimer->timer); - } else { - visc_StartTimer(&timers->timers[category]); - } - } else { - if (subtimerlist != NULL) { - subtimerlist->current = subtimer; - } - - // toSwitchTo Is Async (KERNEL/COPY_ASYNC) - if (!asyncs_outstanding(timers)) { - /* No asyncs outstanding, insert a fresh async marker */ - insert_submarker(timers, label, category); - timers->async_begin = currentTime; - } else if(!is_async(timers->current)) { - /* Previous asyncs still in flight, but a previous SwitchTo - * already marked the end of the most recent async operation, - * so we can rename that marker as the beginning of this async - * operation */ - - struct visc_async_time_marker_list * last_event = get_last_async(timers); - last_event->timerID = category; - last_event->label = label; - } // else, marker for switchToThis was already inserted - - //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running - if (!is_async(timers->current)) { - visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]); - } - } - } - - timers->current = category; -} - -void -visc_PrintTimerSet(struct visc_TimerSet *timers) -{ - visc_Timestamp wall_end = get_time(); - - struct visc_Timer *t = timers->timers; - struct visc_SubTimer* sub = NULL; - - int maxSubLength; - - const char *categories[] = { - "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap", - "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free", - "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc", - "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack" - - }; - - const int maxCategoryLength = 20; - - int i; - for(i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format - if(visc_GetElapsedTime(&t[i]) != 0 || true) { - - // Print Category Timer - printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], visc_GetElapsedTime(&t[i])); - - if (timers->sub_timer_list[i] != NULL) { - sub = timers->sub_timer_list[i]->subtimer_list; - maxSubLength = 0; - while (sub != NULL) { - // Find longest SubTimer label - if (strlen(sub->label) > (unsigned long) maxSubLength) { - maxSubLength = strlen(sub->label); - } - sub = sub->next; - } - - // Fit to Categories - if (maxSubLength <= maxCategoryLength) { - maxSubLength = maxCategoryLength; - } - - sub = timers->sub_timer_list[i]->subtimer_list; - - // Print SubTimers - while (sub != NULL) { - printf(" -%-*s: %.9f\n", maxSubLength, sub->label, visc_GetElapsedTime(&sub->timer)); - sub = sub->next; - } - } - } - } - - if(visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0) - printf("CPU/Kernel Overlap: %.9f\n", visc_GetElapsedTime(&t[visc_TimerID_OVERLAP])); - - float walltime = (wall_end - timers->wall_begin)/ 1e9; - printf("Timer Wall Time: %.9f\n", walltime); - -} - -void visc_DestroyTimerSet(struct visc_TimerSet * timers) -{ - /* clean up all of the async event markers */ - struct visc_async_time_marker_list* event = timers->async_markers; - while(event != NULL) { - - cl_int ciErrNum = CL_SUCCESS; - ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); - if (ciErrNum != CL_SUCCESS) { - //fprintf(stderr, "Error Waiting for Events!\n"); - } - - ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) ); - if (ciErrNum != CL_SUCCESS) { - fprintf(stderr, "Error Release Events!\n"); - } - - free((event)->marker); - struct visc_async_time_marker_list* next = ((event)->next); - - free(event); - - // (*event) = NULL; - event = next; - } - - int i = 0; - for(i = 0; i < visc_TimerID_LAST; ++i) { - if (timers->sub_timer_list[i] != NULL) { - struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; - struct visc_SubTimer *prev = NULL; - while (subtimer != NULL) { - free(subtimer->label); - prev = subtimer; - subtimer = subtimer->next; - free(prev); - } - free(timers->sub_timer_list[i]); - } - } -} - -/**************************** Pipeline API ************************************/ -#define BUFFER_SIZE 1 - -// Launch API for a streaming dataflow graph -void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void* args) { - DFNodeContext_X86* Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86)); - - Context->threads = new std::vector<pthread_t>(); - Context->ArgInPortSizeMap = new std::map<unsigned, uint64_t>(); - //Context->BindInSizes = new std::vector<uint64_t>(); - Context->BindInSourcePort = new std::vector<unsigned>(); - Context->BindOutSizes = new std::vector<uint64_t>(); - Context->EdgeSizes = new std::vector<uint64_t>(); - Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); - Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); - Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t>*>(); - Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t>*>(); - - DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args << flush << "\n"); - LaunchFunc(args, Context); - return Context; -} - -// Push API for a streaming dataflow graph -void llvm_visc_streamPush(void* graphID, void* args) { - DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; - unsigned offset = 0; - for (unsigned i=0; i< Ctx->ArgInPortSizeMap->size(); i++) { - uint64_t element; - memcpy(&element, (char*)args+offset, Ctx->ArgInPortSizeMap->at(i)); - offset += Ctx->ArgInPortSizeMap->at(i); - for(unsigned j=0; j<Ctx->BindInputBuffers->size();j++) { - if(Ctx->BindInSourcePort->at(j) == i) { - // Push to all bind buffers connected to parent node at this port - //DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); - llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element); - } - } - } - // Push 0 in isLastInput buffers of all child nodes - for (CircularBuffer<uint64_t>* buffer: *(Ctx->isLastInputBuffers)) - llvm_visc_bufferPush(buffer, 0); -} - -// Pop API for a streaming dataflow graph -void* llvm_visc_streamPop(void* graphID) { - DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; - unsigned totalBytes = 0; - for(uint64_t size: *(Ctx->BindOutSizes)) - totalBytes+= size; - void* output = malloc(totalBytes); - unsigned offset = 0; - for (unsigned i=0; i< Ctx->BindOutputBuffers->size(); i++) { - uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i)); - //DEBUG(cout << "\tPopped Value " << element << " from buffer\n"); - memcpy((char*)output+offset, &element, Ctx->BindOutSizes->at(i)); - offset += Ctx->BindOutSizes->at(i); - } - return output; -} - -// Wait API for a streaming dataflow graph -void llvm_visc_streamWait(void* graphID) { - DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; - // Push garbage to all other input buffers - for (unsigned i=0; i< Ctx->BindInputBuffers->size(); i++) { - uint64_t element = 0; - //DEBUG(cout << "\tPushing Value " << element << " to buffer\n"); - llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element); - } - // Push 1 in isLastInput buffers of all child nodes - for (unsigned i=0; i < Ctx->isLastInputBuffers->size(); i++) - llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1); - - llvm_visc_freeThreads(graphID); -} - -// Create a buffer and return the bufferID -void* llvm_visc_createBindInBuffer(void* graphID, uint64_t size, unsigned inArgPort) { - DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn"); - DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); - Context->BindInputBuffers->push_back(bufferID); - (*(Context->ArgInPortSizeMap))[inArgPort] = size; - Context->BindInSourcePort->push_back(inArgPort); - //Context->BindInSizes->push_back(size); - return bufferID; -} - -void* llvm_visc_createBindOutBuffer(void* graphID, uint64_t size) { - DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size()); - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut"); - DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); - Context->BindOutputBuffers->push_back(bufferID); - Context->BindOutSizes->push_back(size); - return bufferID; -} -void* llvm_visc_createEdgeBuffer(void* graphID, uint64_t size) { - DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size()); - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge"); - DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); - Context->EdgeBuffers->push_back(bufferID); - Context->EdgeSizes->push_back(size); - return bufferID; -} - -void* llvm_visc_createLastInputBuffer(void* graphID, uint64_t size) { - DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size()); - CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput"); - DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n"); - Context->isLastInputBuffers->push_back(bufferID); - return bufferID; -} - -// Free buffers -void llvm_visc_freeBuffers(void* graphID) { - DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - for(CircularBuffer<uint64_t>* bufferID: *(Context->BindInputBuffers)) - delete bufferID; - for(CircularBuffer<uint64_t>* bufferID: *(Context->BindOutputBuffers)) - delete bufferID; - for(CircularBuffer<uint64_t>* bufferID: *(Context->EdgeBuffers)) - delete bufferID; - for(CircularBuffer<uint64_t>* bufferID: *(Context->isLastInputBuffers)) - delete bufferID; -} - -// Pop an element from the buffer -uint64_t llvm_visc_bufferPop(void* bufferID) { - CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID; - return buffer->pop(); -} - -// Push an element into the buffer -void llvm_visc_bufferPush(void* bufferID, uint64_t element) { - CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID; - buffer->push(element); -} - -// Create a thread -void llvm_visc_createThread(void* graphID, void* (*Func)(void*), void* arguments) { - DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; - int err; - pthread_t threadID; - if((err = pthread_create(&threadID, NULL, Func, arguments)) != 0) - cout << "Failed to create thread. Error code = " << err << flush << "\n"; - - Ctx->threads->push_back(threadID); -} - -// Wait for thread to finish -void llvm_visc_freeThreads(void* graphID) { - DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n"); - DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID; - for(pthread_t thread: *(Ctx->threads)) - pthread_join(thread, NULL); -} - -/************************ OPENCL & PTHREAD API ********************************/ - -void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) { - DFNodeContext_X86 *Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86)); - //int err; - //if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0) - //cout << "Failed to create pthread. Error code = " << err << flush << "\n"; - rootFunc(arguments); - return Context; -} - -void llvm_visc_x86_wait(void* graphID) { - DEBUG(cout << "Waiting for pthread to finish ...\n"); - //DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; - //pthread_join(Context->threadID, NULL); - free(graphID); - DEBUG(cout << "\t... pthread Done!\n"); -} - -void* llvm_visc_ocl_initContext(enum visc::Target T) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR"); - DEBUG(cout << "Initializing Context for " << Target << " device\n"); - cl_uint numPlatforms; - cl_int errcode; - errcode = clGetPlatformIDs(0, NULL, &numPlatforms); - checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); - - // now get all the platform IDs - cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms); - errcode = clGetPlatformIDs(numPlatforms, platforms, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs"); - - - for(unsigned i=0; i < numPlatforms; i++) { - char buffer[10240]; - DEBUG(cout << "Device " << i << " Info -->\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL); - DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL); - DEBUG(cout << "\tVERSION = "<< buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL); - DEBUG(cout << "\tNAME = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL); - DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n"); - clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); - DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n"); - } - // set platform property - just pick the first one - //cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, - //(long) platforms[0], - //0}; - //globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, - //NULL, NULL, &errcode); - assert(numPlatforms >= 2 && "Expecting two OpenCL platforms"); - // Choose second one which is X86 AVX - cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, - (long) platforms[T == visc::GPU_TARGET? 0 : 1], - 0}; - globalOCLContext = clCreateContextFromType(properties, - T == visc::GPU_TARGET? - CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, - NULL, NULL, &errcode); - // get the list of OCL devices associated with context - size_t dataBytes; - errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0, - NULL, &dataBytes); - checkErr(errcode, CL_SUCCESS, "Failure to get context info length"); - - clDevices = (cl_device_id *) malloc(dataBytes); - errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes, - clDevices, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to get context info"); - if(false && T == visc::SPIR_TARGET) { - cl_device_partition_property props[4]; - props[0] = CL_DEVICE_PARTITION_BY_COUNTS; - props[1] = NUM_CORES; - props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END; - props[3] = 0; - cl_device_id subdevice_id[8]; - cl_uint num_entries = 8; - - cl_uint numDevices; - clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, &numDevices); - //printf("Num of devices = %d\n", numDevices); - //for(unsigned i =0 ; i< numDevices; i++) - //printf("Subdevice id %d = %p\n", i, subdevice_id[i]); - clDevices[0] = subdevice_id[0]; - globalOCLContext = clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); - } - - free(platforms); - DEBUG(cout << "\tContext " << globalOCLContext << flush << "\n"); - checkErr(errcode, CL_SUCCESS, "Failure to create OCL context"); - - DEBUG(cout << "Initialize Kernel Timer\n"); - visc_InitializeTimerSet(&kernel_timer); - - pthread_mutex_unlock(&ocl_mtx); - return globalOCLContext; -} - -void llvm_visc_ocl_clearContext(void* graphID) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Clear Context\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - // FIXME: Have separate function to release command queue and clear context. - // Would be useful when a context has multiple command queues - clReleaseKernel(Context->clKernel); - //clReleaseProgram(Context->clProgram); - //clReleaseCommandQueue(Context->clCommandQue); - //clReleaseContext(globalOCLContext); - //DEBUG(cout << "Released context at: " << globalOCLContext); - free(Context); - DEBUG(cout << "Done with OCL kernel\n"); - cout << "Printing VISC Timer: KernelTimer\n"; - visc_PrintTimerSet(&kernel_timer); - pthread_mutex_unlock(&ocl_mtx); - -} - -void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set Shared Memory Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - DEBUG(cout << "Using Context: " << Context << flush << "\n"); - DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument"); - pthread_mutex_unlock(&ocl_mtx); -} - -void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set Scalar Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - DEBUG(cout << "Using Context: " << Context << flush << "\n"); - DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n"); - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); - pthread_mutex_unlock(&ocl_mtx); -} - -void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set Pointer Input:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << flush << "\n"); - // Size should be non-zero - assert(size != 0 && "Size of data pointed to has to be non-zero!"); - DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - - pthread_mutex_unlock(&ocl_mtx); - // Check with runtime the location of this memory - cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput); - - pthread_mutex_lock(&ocl_mtx); - // Set Kernel Argument - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); - DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); - return d_input; -} - -void* llvm_visc_ocl_output_ptr(void* graphID, int arg_index, size_t size) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Set device memory for Output Struct:"); - DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = "<< size << flush << "\n"); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - cl_int errcode; - //pthread_mutex_lock(&ocl_mtx); - cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, size, NULL, &errcode); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device"); - //pthread_mutex_lock(&ocl_mtx); - errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); - DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n"); - pthread_mutex_unlock(&ocl_mtx); - return d_output; -} - -void llvm_visc_ocl_free(void* ptr) { - //DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n"); - //cl_mem d_ptr = (cl_mem) ptr; - //clReleaseMemObject(d_ptr); -} - -void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Get Output:\n"); - DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << flush << "\n"); - if(h_output == NULL) - h_output = malloc(size); - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, - h_output, 0, NULL, NULL); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output"); - pthread_mutex_unlock(&ocl_mtx); - return h_output; -} - -void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t* - localWorkSize, const size_t* globalWorkSize) { - pthread_mutex_lock(&ocl_mtx); - - size_t GlobalWG[3]; - size_t LocalWG[3]; - - // OpenCL EnqeueNDRangeKernel function results in segementation fault if we - // directly use local and global work groups arguments. Hence, allocating it - // on stack and copying. - for(unsigned i=0; i<workDim; i++) { - GlobalWG[i] = globalWorkSize[i]; - } - - // OpenCL allows local workgroup to be null. - if(localWorkSize != NULL) { - for(unsigned i=0; i<workDim; i++) { - LocalWG[i] = localWorkSize[i]; - } - } - - DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID; - // TODO: Would like to use event to ensure better scheduling of kernels. - // Currently passing the event paratemeter results in seg fault with - // clEnqueueNDRangeKernel. - cl_event* event; - DEBUG(cout << "Enqueuing kernel:\n"); - DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n"); - DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n"); - DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n"); - DEBUG(cout << "\tGlobal Work Group: ( "); - for(unsigned i = 0; i<workDim; i++) { - DEBUG(cout << GlobalWG[i] << " "); - } - DEBUG(cout << ")\n"); - if(localWorkSize != NULL) { - DEBUG(cout << "\tLocal Work Group: ( "); - for(unsigned i = 0; i<workDim; i++) { - DEBUG(cout << LocalWG[i] << " "); - } - DEBUG(cout << ")\n"); - } - //pthread_mutex_lock(&ocl_mtx); - clFinish(Context->clCommandQue); - //pthread_mutex_unlock(&ocl_mtx); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION); - //for(int i=0 ;i < NUM_TESTS; i++) { - //cout << "Iteration = " << i << flush << "\n"; - //pthread_mutex_lock(&ocl_mtx); - cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, - Context->clKernel, workDim, NULL, GlobalWG, (localWorkSize == NULL)? NULL : LocalWG, 0, NULL, NULL); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); - //} - //pthread_mutex_lock(&ocl_mtx); - clFinish(Context->clCommandQue); - //pthread_mutex_unlock(&ocl_mtx); - visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE); - - pthread_mutex_unlock(&ocl_mtx); - return event; -} - - -////////////////////////////////////////////////////////////////////////////// -//! Loads a Program binary file. -//! -//! @return the source string if succeeded, 0 otherwise -//! @param Filename program filename -//! @param szFinalLength returned length of the code string -////////////////////////////////////////////////////////////////////////////// -static char* LoadProgSource(const char* Filename, size_t* szFinalLength) -{ - DEBUG(cout << "Load Prog Source\n"); - // locals - FILE* pFileStream = NULL; - size_t szSourceLength; - - // open the OpenCL source code file - pFileStream = fopen(Filename, "rb"); - if(pFileStream == 0) - { - return NULL; - } - - // get the length of the source code - fseek(pFileStream, 0, SEEK_END); - szSourceLength = ftell(pFileStream); - fseek(pFileStream, 0, SEEK_SET); - - // allocate a buffer for the source code string and read it in - char* cSourceString = (char *)malloc(szSourceLength + 1); - if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) - { - fclose(pFileStream); - free(cSourceString); - return 0; - } - - // close the file and return the total length of the combined (preamble + source) string - fclose(pFileStream); - if(szFinalLength != 0) - { - *szFinalLength = szSourceLength; - } - cSourceString[szSourceLength] = '\0'; - - return cSourceString; -} - -void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Launch OCL Kernel\n"); - // Initialize OpenCL - - // OpenCL specific variables - DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL)); - - size_t kernelLength; - cl_int errcode; - - // For a single context for all kernels - Context->clOCLContext = globalOCLContext; - - //Create a command-queue - //pthread_mutex_lock(&ocl_mtx); - Context->clCommandQue = clCreateCommandQueue(Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); - globalCommandQue = Context->clCommandQue; - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to create command queue"); - - DEBUG(cout << "Loading program binary: " << FileName << flush << "\n"); - char *programSource = LoadProgSource(FileName, &kernelLength); - checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary"); - - cl_int binaryStatus; - //pthread_mutex_lock(&ocl_mtx); - Context->clProgram = clCreateProgramWithBinary(Context->clOCLContext, 1, &clDevices[0], - &kernelLength, - (const unsigned char **)&programSource, - &binaryStatus, &errcode); - //pthread_mutex_unlock(&ocl_mtx); - checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); - - DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName << flush << "\n"); - errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL); - // If build fails, get build log from device - if(errcode != CL_SUCCESS) { - cout << "ERROR: Failure to build program\n"; - size_t len = 0; - errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0, - NULL, &len); - cout << "LOG LENGTH: " << len << flush << "\n"; - checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length"); - char *log = (char*) malloc(len*sizeof(char)); - errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len, - log, NULL); - checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); - - cout << "Device Build Log:\n" << log << flush << "\n"; - free(log); - pthread_mutex_unlock(&ocl_mtx); - exit(EXIT_FAILURE); - } - - Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); - - DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n"); - //free(clDevices); - free(programSource); - - pthread_mutex_unlock(&ocl_mtx); - return Context; -} - - -void llvm_visc_ocl_wait(void* graphID) { - pthread_mutex_lock(&ocl_mtx); - DEBUG(cout << "Wait\n"); - DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID; - //pthread_mutex_lock(&ocl_mtx); - clFinish(Context->clCommandQue); - //pthread_mutex_unlock(&ocl_mtx); - pthread_mutex_unlock(&ocl_mtx); -} - -void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID timer) { - //cout << "Switching to timer " << timer << flush << "\n"; - pthread_mutex_lock(&ocl_mtx); - //visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer); - pthread_mutex_unlock(&ocl_mtx); -} -void llvm_visc_printTimerSet(void** timerSet, char* timerName) { - pthread_mutex_lock(&ocl_mtx); - cout << "Printing VISC Timer: "; - if(timerName != NULL) - cout << timerName << flush << "\n"; - else - cout << "Anonymous\n"; - visc_PrintTimerSet((visc_TimerSet*) (*timerSet)); - pthread_mutex_unlock(&ocl_mtx); -} - -void* llvm_visc_initializeTimerSet() { - pthread_mutex_lock(&ocl_mtx); - visc_TimerSet* TS = (visc_TimerSet*) malloc (sizeof(visc_TimerSet)); - visc_InitializeTimerSet(TS); - pthread_mutex_unlock(&ocl_mtx); - return TS; -} - - - diff --git a/hpvm/visc-rt/visc-rt.h b/hpvm/visc-rt/visc-rt.h deleted file mode 100644 index 0a6cadebd3..0000000000 --- a/hpvm/visc-rt/visc-rt.h +++ /dev/null @@ -1,303 +0,0 @@ -/* - * - * (c) 2010 The Board of Trustees of the University of Illinois. - */ -#ifndef VISC_RT_HEADER -#define VISC_RT_HEADER - -#include <iostream> -#include <map> -#include <ctime> -#include <vector> -#include <pthread.h> -#include <string> -//#include <condition_variable> - -#include "../../include/SupportVISC/VISCHint.h" -#include "../../include/SupportVISC/VISCTimer.h" -#include "device_abstraction.h" -#include "policy.h" - -#ifndef DEBUG_BUILD -#define DEBUG(s) {} -#else -#define DEBUG(s) s -#endif - - - -using namespace std; - -extern "C" { - -/************************* Policies *************************************/ - -void llvm_visc_policy_init(); -void llvm_visc_policy_clear(); -int llvm_visc_policy_getVersion(const char *, int64_t); - -/******************** Device Abstraction ********************************/ -void llvm_visc_deviceAbstraction_start(); -void llvm_visc_deviceAbstraction_end(); -void llvm_visc_deviceAbstraction_waitOnDeviceStatus(); - -/********************* DFG Depth Stack **********************************/ -class DFGDepth { - private: - unsigned numDim; - unsigned dimLimit[3]; - unsigned dimInstance[3]; - public: - DFGDepth() {} - DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, unsigned iY = 0, - unsigned dimZ = 0, unsigned iZ = 0) { - assert(n <= 3 && "Error! More than 3 dimensions not supported"); - numDim = n; - dimLimit[0] = dimX; - dimLimit[1] = dimY; - dimLimit[2] = dimZ; - dimInstance[0] = iX; - dimInstance[1] = iY; - dimInstance[2] = iZ; - } - unsigned getDimLimit(unsigned dim) { - assert(dim <= numDim && "Error! Requested dimension limit is not specified"); - return dimLimit[dim]; - } - - unsigned getDimInstance(unsigned dim) { - assert(dim <= numDim && "Error! Requested dimension instance is not specified"); - return dimInstance[dim]; - } - - unsigned getNumDim() { - return numDim; - } -}; - -void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0, - uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0); -void llvm_visc_x86_dstack_pop(); -uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim); -uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim); - - -/********************* Memory Tracker **********************************/ -class MemTrackerEntry { -public: - enum Location {HOST, DEVICE}; - private: - size_t size; - Location loc; - void* addr; - void* Context; - - public: - MemTrackerEntry(size_t _size, Location _loc, void* _addr, void* _Context): - size(_size), loc(_loc), addr(_addr), Context(_Context) { - } - - size_t getSize() { - return size; - } - - Location getLocation() { - return loc; - } - - void* getAddress() { - return addr; - } - - void* getContext() { - return Context; - } - - void update(Location _loc, void* _addr, void* _Context = NULL) { - loc = _loc; - addr = _addr; - Context = _Context; - } - - void print() { - cout << "Size = " << size << "\tLocation = " << loc << "\tAddress = " << addr << "\tContext = " << Context; - } -}; - - -class MemTracker { - -private: - std::map<void*, MemTrackerEntry*> Table; - -public: - MemTracker() { - } - - bool insert(void* ID, size_t size, MemTrackerEntry::Location loc, void* addr, void* Context = NULL) { - MemTrackerEntry* MTE = new MemTrackerEntry(size, loc, addr, Context); - Table.insert(std::pair<void*, MemTrackerEntry*>(ID, MTE)); - return MTE != NULL; - } - - MemTrackerEntry* lookup(void* ID) { - if(Table.count(ID) == 0) - return NULL; - return Table[ID]; - } - - void remove(void* ID) { - MemTrackerEntry* MTE = Table[ID]; - free(MTE); - Table.erase(ID); - } - - void print() { - cout << "Printing Table ... Size = " << Table.size() << flush << "\n"; - for(auto& Entry: Table) { - cout << Entry.first << ":\t" ; - Entry.second->print(); - cout << flush << "\n"; - } - } - -}; - -void llvm_visc_track_mem(void*, size_t); -void llvm_visc_untrack_mem(void*); -void* llvm_visc_request_mem(void*, size_t); - -/*********************** OPENCL & PTHREAD API **************************/ -void* llvm_visc_x86_launch(void* (void*), void*); -void llvm_visc_x86_wait(void*); -void* llvm_visc_ocl_initContext(enum visc::Target); - -void* llvm_visc_x86_argument_ptr(void*, size_t); - -void llvm_visc_ocl_clearContext(void*); -void llvm_visc_ocl_argument_shared(void*, int, size_t); -void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t); -void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool); -void* llvm_visc_ocl_output_ptr(void*, int, size_t); -void llvm_visc_ocl_free(void*); -void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t); -void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*); -void* llvm_visc_ocl_launch(const char*, const char*); -void llvm_visc_ocl_wait(void*); - -void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID); -void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL); -void* llvm_visc_initializeTimerSet(); - -} - -/*************************** Pipeline API ******************************/ -// Circular Buffer class -unsigned counter = 0; -template <class ElementType> -class CircularBuffer { -private: - int numElements; - int bufferSize; - int Head; - int Tail; - pthread_mutex_t mtx; - pthread_cond_t cv; - vector<ElementType> buffer; - std::string name; - unsigned ID; - -public: - CircularBuffer(int maxElements, std::string _name = "ANON") { - ID = counter; - Head = 0; - Tail = 0; - numElements = 0; - name = _name; - bufferSize = maxElements+1; - buffer.reserve(bufferSize); - pthread_mutex_init(&mtx, NULL); - pthread_cond_init(&cv, NULL); - counter++; - - } - - bool push(ElementType E); - ElementType pop(); - -}; - -template <class ElementType> -bool CircularBuffer<ElementType>::push(ElementType E) { - //DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << "\n"); - //unique_lock<mutex> lk(mtx); - pthread_mutex_lock(&mtx); - if((Head +1) % bufferSize == Tail) { - //DEBUG(cout << name << " Buffer[" << ID << "]: Push going to sleep ...\n"); - //cv.wait(lk); - pthread_cond_wait(&cv, &mtx); - //DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n"); - } - buffer[Head] = E; - Head = (Head+1) % bufferSize; - numElements++; - //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n"); - //lk.unlock(); - pthread_mutex_unlock(&mtx); - //cv.notify_one(); - pthread_cond_signal(&cv); - return true; -} - -template <class ElementType> -ElementType CircularBuffer<ElementType>::pop() { - //unique_lock<mutex> lk(mtx); - //DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n"); - pthread_mutex_lock(&mtx); - if(Tail == Head) { - //DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n"); - //cv.wait(lk); - pthread_cond_wait(&cv, &mtx); - //DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n"); - } - ElementType E = buffer[Tail]; - Tail = (Tail + 1) % bufferSize; - numElements--; - //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n"); - //lk.unlock(); - pthread_mutex_unlock(&mtx); - //cv.notify_one(); - pthread_cond_signal(&cv); - return E; -} - -extern "C" { -// Functions to push and pop values from pipeline buffers -uint64_t llvm_visc_bufferPop(void*); -void llvm_visc_bufferPush(void*, uint64_t); - -// Functions to create and destroy buffers -void* llvm_visc_createBindInBuffer(void*, uint64_t, unsigned); -void* llvm_visc_createBindOutBuffer(void*, uint64_t); -void* llvm_visc_createEdgeBuffer(void*, uint64_t); -void* llvm_visc_createLastInputBuffer(void*, uint64_t); - -void llvm_visc_freeBuffers(void*); - -// Functions to create and destroy threads -void llvm_visc_createThread(void* graphID, void*(*Func)(void*), void*); -void llvm_visc_freeThreads(void*); - -// Launch API for a streaming graph. -// Arguments: -// (1) Launch Function: void* (void*, void*) -// (2) Push Function: void (void*, std::vector<uint64_t>**, unsgined) -// (3) Pop Function: void* (std::vector<uint64_t>**, unsigned) -void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void*); -void llvm_visc_streamPush(void* graphID, void* args); -void* llvm_visc_streamPop(void* graphID); -void llvm_visc_streamWait(void* graphID); - -} - -#endif //VISC_RT_HEADER -- GitLab